In [28]:
import numpy as np
import pandas as pd
import re

# pd.set_option('display.max_rows', None)        # Show all rows
# pd.set_option('display.max_columns', None)     # Show all columns
# pd.set_option('display.width', None)           # No width limit
pd.set_option('display.max_colwidth', 50)      # Limit column width


In [29]:
def parse_temp_file_to_numpy():
    """Parse temp.txt and organize data into numpy arrays with embedding models, chat models, and scores."""
    
    # Read and parse the data
    embedding_models = []
    chat_models = []
    scores = []
    raw_scores = []  # New: store raw x/90 scores
    
    with open('temp.txt', 'r') as file:
        for line in file:
            line = line.strip()
            if line.startswith('Model:'):
                # Extract model names and score using regex
                match = re.search(r'Model: (.+?)--(.+?)_quiz_results\.json, Correct: (\d+)/(\d+)', line)
                if match:
                    embedding_model = match.group(1)
                    chat_model = match.group(2)
                    correct = int(match.group(3))
                    total = int(match.group(4))
                    accuracy = (correct / total) * 100
                    raw_score = f"{correct}/{total}"  # New: raw score format
                    
                    embedding_models.append(embedding_model)
                    chat_models.append(chat_model)
                    scores.append(accuracy)
                    raw_scores.append(raw_score)  # New: add raw score
    
    # Get unique models for matrix creation
    unique_embedding_models = sorted(list(set(embedding_models)))
    unique_chat_models = sorted(list(set(chat_models)))
    
    print(f"📊 Data Summary:")
    print(f"Total entries: {len(scores)}")
    print(f"Unique embedding models: {len(unique_embedding_models)}")
    print(f"Unique chat models: {len(unique_chat_models)}")
    
    # Create a matrix with embedding models as rows and chat models as columns
    score_matrix = np.full((len(unique_embedding_models), len(unique_chat_models)), np.nan)
    
    # Fill the matrix with scores
    for i, (emb, chat, score) in enumerate(zip(embedding_models, chat_models, scores)):
        emb_idx = unique_embedding_models.index(emb)
        chat_idx = unique_chat_models.index(chat)
        score_matrix[emb_idx, chat_idx] = score
    
    # Create structured arrays for easier access - Updated with raw_score
    data_array = np.array(list(zip(embedding_models, chat_models, scores, raw_scores)), 
                         dtype=[('embedding_model', 'U50'), ('chat_model', 'U50'), ('accuracy', 'f4'), ('raw_score', 'U10')])
    
    print(f"\n🏆 Top 10 Performing Combinations:")
    sorted_indices = np.argsort(data_array['accuracy'])[::-1]
    for i, idx in enumerate(sorted_indices[:10]):
        entry = data_array[idx]
        print(f"{i+1:2d}. {entry['embedding_model']} + {entry['chat_model']}: {entry['accuracy']:.1f}% ({entry['raw_score']})")
    
    print(f"\n📉 Bottom 5 Performing Combinations:")
    for i, idx in enumerate(sorted_indices[-5:]):
        entry = data_array[idx]
        print(f"{i+1:2d}. {entry['embedding_model']} + {entry['chat_model']}: {entry['accuracy']:.1f}% ({entry['raw_score']})")
    
    return {
        'data_array': data_array,
        'score_matrix': score_matrix,
        'embedding_models': unique_embedding_models,
        'chat_models': unique_chat_models,
        'raw_data': {
            'embedding_models': embedding_models,
            'chat_models': chat_models,
            'scores': scores,
            'raw_scores': raw_scores  # New: include raw scores
        }
    }

def analyze_performance(data_dict):
    """Analyze performance statistics from the numpy data."""
    data_array = data_dict['data_array']
    score_matrix = data_dict['score_matrix']
    embedding_models = data_dict['embedding_models']
    chat_models = data_dict['chat_models']
    
    print(f"\n📈 Performance Analysis:")
    print(f"Overall average accuracy: {np.mean(data_array['accuracy']):.2f}%")
    print(f"Standard deviation: {np.std(data_array['accuracy']):.2f}%")
    print(f"Best performance: {np.max(data_array['accuracy']):.1f}%")
    print(f"Worst performance: {np.min(data_array['accuracy']):.1f}%")
    
    # Average performance by embedding model
    print(f"\n🎯 Average Performance by Embedding Model:")
    for i, emb_model in enumerate(embedding_models):
        row_scores = score_matrix[i, :]
        valid_scores = row_scores[~np.isnan(row_scores)]
        if len(valid_scores) > 0:
            avg_score = np.mean(valid_scores)
            # Calculate average raw score
            emb_data = data_array[data_array['embedding_model'] == emb_model]
            avg_correct = np.mean([int(score.split('/')[0]) for score in emb_data['raw_score']])
            print(f"{emb_model:30s}: {avg_score:5.1f}% ({avg_correct:.1f}/90 avg) from {len(valid_scores)} models")
    
    # Average performance by chat model
    print(f"\n💬 Average Performance by Chat Model:")
    for j, chat_model in enumerate(chat_models):
        col_scores = score_matrix[:, j]
        valid_scores = col_scores[~np.isnan(col_scores)]
        if len(valid_scores) > 0:
            avg_score = np.mean(valid_scores)
            # Calculate average raw score
            chat_data = data_array[data_array['chat_model'] == chat_model]
            avg_correct = np.mean([int(score.split('/')[0]) for score in chat_data['raw_score']])
            print(f"{chat_model:30s}: {avg_score:5.1f}% ({avg_correct:.1f}/90 avg) from {len(valid_scores)} embeddings")
    
    return score_matrix

def create_pandas_dataframe(data_dict):
    """Convert numpy data to pandas DataFrame for easier manipulation."""
    data_array = data_dict['data_array']
    
    # Create DataFrame with both percentage and raw scores
    df = pd.DataFrame({
        'embedding_model': data_array['embedding_model'],
        'chat_model': data_array['chat_model'],
        'accuracy_percent': data_array['accuracy'],
        'raw_score': data_array['raw_score']  # New: raw score column
    })
    
    # Create pivot tables for both metrics
    pivot_df_percent = df.pivot(index='embedding_model', columns='chat_model', values='accuracy_percent')
    pivot_df_raw = df.pivot(index='embedding_model', columns='chat_model', values='raw_score')
    
    print(f"\n📋 Data as Pandas DataFrame:")
    print(df.head(10))
    
    print(f"\n📊 Pivot Table - Percentages (first 5x5):")
    print(pivot_df_percent.iloc[:5, :5])
    
    print(f"\n📊 Pivot Table - Raw Scores (first 5x5):")
    print(pivot_df_raw.iloc[:5, :5])
    
    return df, pivot_df_percent, pivot_df_raw

In [30]:
# Run the analysis
print("🔍 Parsing temp.txt file...")
data_dict = parse_temp_file_to_numpy()

print("\n" + "="*70)
analyze_performance(data_dict)

print("\n" + "="*70)
df, pivot_df_percent, pivot_df_raw = create_pandas_dataframe(data_dict)

# Access the numpy arrays
data_array = data_dict['data_array']
score_matrix = data_dict['score_matrix']
embedding_models = data_dict['embedding_models']
chat_models = data_dict['chat_models']

print(f"\n✅ Data successfully loaded into numpy arrays!")
print(f"📐 Score matrix shape: {score_matrix.shape}")
print(f"📏 Data array length: {len(data_array)}")

# Show the updated DataFrame structure
print(f"\n🔍 DataFrame columns: {list(df.columns)}")
print(f"📊 Sample data:")
print(df.head())

🔍 Parsing temp.txt file...
📊 Data Summary:
Total entries: 183
Unique embedding models: 13
Unique chat models: 15

🏆 Top 10 Performing Combinations:
 1. instructor_xl + gemma_3_1b_it: 78.9% (71/90)
 2. instructor_xl + Llama_3.2_1B: 78.9% (71/90)
 3. instructor_xl + flan_t5_large: 78.9% (71/90)
 4. instructor_xl + falcon_7b: 78.9% (71/90)
 5. instructor_xl + gemma_2_2b: 78.9% (71/90)
 6. instructor_xl + Falcon_H1_0.5B_Instruct: 78.9% (71/90)
 7. instructor_xl + falcon_7b_instruct: 78.9% (71/90)
 8. instructor_xl + Llama_3.1_8B_Instruct: 78.9% (71/90)
 9. instructor_xl + Llama_3.2_3B_Instruct: 78.9% (71/90)
10. instructor_xl + zephyr_7b_beta: 78.9% (71/90)

📉 Bottom 5 Performing Combinations:
 1. static_retrieval_mrl_en_v1 + flan_t5_small: 58.9% (53/90)
 2. static_retrieval_mrl_en_v1 + falcon_7b: 58.9% (53/90)
 3. static_retrieval_mrl_en_v1 + zephyr_7b_beta: 58.9% (53/90)
 4. static_retrieval_mrl_en_v1 + Llama_3.2_3B_Instruct: 58.9% (53/90)
 5. static_retrieval_mrl_en_v1 + flan_t5_base: 5

In [31]:
df

Unnamed: 0,embedding_model,chat_model,accuracy_percent,raw_score
0,multi_qa_mpnet_base_dot_v1,gemma_2_2b,66.666664,60/90
1,bge_large_en,gemma_2_2b_it,73.333336,66/90
2,bge_base_en_v1.5,Llama_3.1_8B_Instruct,74.444443,67/90
3,bge_large_en,flan_t5_base,62.222221,56/90
4,all_mpnet_base_v2,Llama_3.1_8B_Instruct,67.777779,61/90
5,multi_qa_mpnet_base_cos_v1,flan_t5_xl,68.888885,62/90
6,static_retrieval_mrl_en_v1,flan_t5_large,58.888889,53/90
7,bge_large_en,gemma_3_1b_it,73.333336,66/90
8,all_mpnet_base_v2,gemma_3_1b_it,67.777779,61/90
9,instructor_large,gemma_3_1b_it,68.888885,62/90


In [32]:
# df.to_csv("temp2.txt", index=False)

In [33]:
# Group by embedding model and calculate average scores
embedding_stats = df.groupby('embedding_model').agg({
    'accuracy_percent': ['mean', 'std', 'count'],
    'raw_score': lambda x: f"{x.apply(lambda s: int(s.split('/')[0])).mean():.1f}/90"
}).round(2)

# Flatten column names
embedding_stats.columns = ['avg_accuracy', 'std_accuracy', 'count_models', 'avg_raw_score']

# Sort by average accuracy (descending)
embedding_stats_sorted = embedding_stats.sort_values('avg_accuracy', ascending=False)

print("🎯 Embedding Model Performance Rankings:")
print("=" * 80)
print(f"{'Rank':<4} {'Embedding Model':<35} {'Avg %':<8} {'Std':<6} {'Count':<6} {'Avg Raw':<10}")
print("-" * 80)

for rank, (model, row) in enumerate(embedding_stats_sorted.iterrows(), 1):
    print(f"{rank:<4} {model:<35} {row['avg_accuracy']:<8} {row['std_accuracy']:<6} {row['count_models']:<6} {row['avg_raw_score']:<10}")

# Display as DataFrame
print(f"\n📊 Detailed Embedding Model Statistics:")
embedding_stats_sorted

🎯 Embedding Model Performance Rankings:
Rank Embedding Model                     Avg %    Std    Count  Avg Raw   
--------------------------------------------------------------------------------
1    instructor_xl                       77.62000274658203 3.26   14     69.9/90   
2    bge_base_en_v1.5                    73.19000244140625 2.91   15     65.9/90   
3    all_MiniLM_L6_v2                    72.80999755859375 1.25   15     65.5/90   
4    e5_base_v2                          72.5199966430664 1.7    15     65.3/90   
5    bge_large_en                        71.4800033569336 3.89   15     64.3/90   
6    bge_large_en_v1.5                   71.26000213623047 2.65   15     64.1/90   
7    multi_qa_mpnet_base_cos_v1          71.26000213623047 2.05   15     64.1/90   
8    bge_m3                              70.0     2.34   15     63.0/90   
9    instructor_large                    68.30000305175781 2.68   15     61.5/90   
10   all_mpnet_base_v2                   67.62999725341797 

Unnamed: 0_level_0,avg_accuracy,std_accuracy,count_models,avg_raw_score
embedding_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
instructor_xl,77.620003,3.26,14,69.9/90
bge_base_en_v1.5,73.190002,2.91,15,65.9/90
all_MiniLM_L6_v2,72.809998,1.25,15,65.5/90
e5_base_v2,72.519997,1.7,15,65.3/90
bge_large_en,71.480003,3.89,15,64.3/90
bge_large_en_v1.5,71.260002,2.65,15,64.1/90
multi_qa_mpnet_base_cos_v1,71.260002,2.05,15,64.1/90
bge_m3,70.0,2.34,15,63.0/90
instructor_large,68.300003,2.68,15,61.5/90
all_mpnet_base_v2,67.629997,1.32,15,60.9/90


In [34]:
# Group by chat model and calculate average scores
chat_stats = df.groupby('chat_model').agg({
    'accuracy_percent': ['mean', 'std', 'count'],
    'raw_score': lambda x: f"{x.apply(lambda s: int(s.split('/')[0])).mean():.1f}/90"
}).round(2)

# Flatten column names
chat_stats.columns = ['avg_accuracy', 'std_accuracy', 'count_embeddings', 'avg_raw_score']

# Sort by average accuracy (descending)
chat_stats_sorted = chat_stats.sort_values('avg_accuracy', ascending=False)

print("💬 Chat Model Performance Rankings:")
print("=" * 80)
print(f"{'Rank':<4} {'Chat Model':<35} {'Avg %':<8} {'Std':<6} {'Count':<6} {'Avg Raw':<10}")
print("-" * 80)

for rank, (model, row) in enumerate(chat_stats_sorted.iterrows(), 1):
    print(f"{rank:<4} {model:<35} {row['avg_accuracy']:<8} {row['std_accuracy']:<6} {row['count_embeddings']:<6} {row['avg_raw_score']:<10}")

# Display as DataFrame
print(f"\n📊 Detailed Chat Model Statistics:")
chat_stats_sorted

💬 Chat Model Performance Rankings:
Rank Chat Model                          Avg %    Std    Count  Avg Raw   
--------------------------------------------------------------------------------
1    Falcon_H1_0.5B_Instruct             70.83000183105469 4.99   12     63.8/90   
2    Llama_3.2_1B                        70.83000183105469 4.99   12     63.8/90   
3    Llama_3.2_3B_Instruct               70.83000183105469 4.99   12     63.8/90   
4    Meta_Llama_3_8B_Instruct            70.83000183105469 4.99   12     63.8/90   
5    falcon_7b_instruct                  70.83000183105469 4.99   12     63.8/90   
6    flan_t5_large                       70.83000183105469 4.99   12     63.8/90   
7    gemma_2_2b                          70.83000183105469 4.99   12     63.8/90   
8    gemma_2_2b_it                       70.83000183105469 4.99   12     63.8/90   
9    Llama_3.1_8B_Instruct               70.08999633789062 5.49   13     63.1/90   
10   falcon_7b                           70.089996337

Unnamed: 0_level_0,avg_accuracy,std_accuracy,count_embeddings,avg_raw_score
chat_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Falcon_H1_0.5B_Instruct,70.830002,4.99,12,63.8/90
Llama_3.2_1B,70.830002,4.99,12,63.8/90
Llama_3.2_3B_Instruct,70.830002,4.99,12,63.8/90
Meta_Llama_3_8B_Instruct,70.830002,4.99,12,63.8/90
falcon_7b_instruct,70.830002,4.99,12,63.8/90
flan_t5_large,70.830002,4.99,12,63.8/90
gemma_2_2b,70.830002,4.99,12,63.8/90
gemma_2_2b_it,70.830002,4.99,12,63.8/90
Llama_3.1_8B_Instruct,70.089996,5.49,13,63.1/90
falcon_7b,70.089996,5.49,13,63.1/90
