In [1]:
import json

# Load the task-3 summary file. The encoding is passed explicitly so the read
# does not depend on the platform's default locale encoding.
with open('../data/task_3/summary.json', 'r', encoding='utf-8') as f:
    summary_data = json.load(f)

# Map each entry of summary_data['gt'] from its id to the two fields used
# later for scoring: the task mode and the correct answer.
summary_mapping = {
    item['id']: {
        'task_mode': item['task_mode'],
        'correct_answer': item['correct_answer'],
    }
    for item in summary_data['gt']
}


In [2]:
import pandas as pd
from pathlib import Path

# Walk ../evaluation/task_3/<pe_type>/<model>/<modality>.csv and score every
# answer row against the ground truth held in `summary_mapping`.
evaluation_path = Path('../evaluation/task_3/')
records = []
unmatched_rows = 0  # rows whose question id has no entry in summary_mapping

for pe_type_dir in evaluation_path.iterdir():
    if not pe_type_dir.is_dir():
        continue
    for model_dir in pe_type_dir.iterdir():
        if not model_dir.is_dir():
            continue
        for file_path in model_dir.glob('*.csv'):
            # The CSV file name (without extension) is used as the modality label.
            modality = file_path.stem.lower()
            df = pd.read_csv(file_path)

            # Only the 'file' and 'answer' columns are needed, so iterate over
            # them directly instead of materialising a Series per row.
            for file_value, raw_answer in zip(df['file'], df['answer']):
                # The question id is the part of the file name before the first '_'.
                question_id = Path(file_value).name.split('_')[0]
                metadata = summary_mapping.get(question_id)
                if metadata is None:
                    unmatched_rows += 1
                    continue

                # An empty CSV cell is read as NaN (a float) and has no .strip();
                # record it as an empty selection rather than crashing. str()
                # also covers columns pandas parsed as a non-string dtype.
                if pd.isna(raw_answer):
                    selected_option = ''
                else:
                    selected_option = str(raw_answer).strip().upper()

                expected_option = metadata['correct_answer'].strip().upper()
                records.append({
                    'model': model_dir.name,
                    'pe_type': pe_type_dir.name,
                    'modality': modality,
                    'task_mode': metadata['task_mode'],
                    'is_correct': selected_option == expected_option,
                    'selected_option': selected_option
                })

# Surface rows that were dropped so a mismatch between the evaluation files
# and the summary file does not silently shrink the denominators.
if unmatched_rows:
    print(f"Skipped {unmatched_rows} row(s) whose question id is not in summary_mapping")


In [3]:
# One row per scored answer.
analysis_df = pd.DataFrame(records)

# Correct / total answer counts for every (model, pe_type, modality, task_mode).
performance_task = (
    analysis_df
    .groupby(['model', 'pe_type', 'modality', 'task_mode'])['is_correct']
    .agg(['sum', 'count'])
    .rename(columns={'sum': 'correct', 'count': 'total'})
    .reset_index()
)

# Number of scored answers per (model, pe_type, modality); this is the
# denominator for the selection rates below.
total_questions = (
    analysis_df
    .groupby(['model', 'pe_type', 'modality'])
    .size()
    .reset_index(name='total_questions')
)

# How often each option was selected within a run, as a count and as a rate.
bias_options = (
    analysis_df
    .groupby(['model', 'pe_type', 'modality', 'selected_option'])
    .size()
    .reset_index(name='selected_count')
    .merge(total_questions, on=['model', 'pe_type', 'modality'])
)
bias_options['selection_rate'] = (
    bias_options['selected_count'] / bias_options['total_questions']
)

# Show both tables.
print("=== Performance by task ===")
display(performance_task)

print("\n=== Bias towards specific options ===")
display(bias_options)



=== Performance by task ===


Unnamed: 0,model,pe_type,modality,task_mode,correct,total
0,gemma3-4b,null-shot,image,prediction_diff_block,21,100
1,gemma3-4b,null-shot,image,prediction_diff_id,24,100
2,gemma3-4b,null-shot,image,prediction_diff_level,15,100
3,gemma3-4b,null-shot,json,prediction_diff_block,24,100
4,gemma3-4b,null-shot,json,prediction_diff_id,18,100
...,...,...,...,...,...,...
211,qwen2.5-vl-7b,zero-shot-cot,json,prediction_diff_id,20,100
212,qwen2.5-vl-7b,zero-shot-cot,json,prediction_diff_level,18,100
213,qwen2.5-vl-7b,zero-shot-cot,xml,prediction_diff_block,54,100
214,qwen2.5-vl-7b,zero-shot-cot,xml,prediction_diff_id,58,100



=== Bias towards specific options ===


Unnamed: 0,model,pe_type,modality,selected_option,selected_count,total_questions,selection_rate
0,gemma3-4b,null-shot,image,A,30,300,0.100000
1,gemma3-4b,null-shot,image,B,52,300,0.173333
2,gemma3-4b,null-shot,image,C,83,300,0.276667
3,gemma3-4b,null-shot,image,D,115,300,0.383333
4,gemma3-4b,null-shot,image,INVALID,20,300,0.066667
...,...,...,...,...,...,...,...
328,qwen2.5-vl-7b,zero-shot-cot,xml,A,116,300,0.386667
329,qwen2.5-vl-7b,zero-shot-cot,xml,B,83,300,0.276667
330,qwen2.5-vl-7b,zero-shot-cot,xml,C,36,300,0.120000
331,qwen2.5-vl-7b,zero-shot-cot,xml,D,63,300,0.210000


In [4]:
# Write the two summary tables to one workbook, one sheet per table.
# NOTE(review): a later cell opens 'task_3_analysis.xlsx' with pd.ExcelWriter
# again and writes a different set of sheets, so this workbook is presumably
# replaced when that cell runs - confirm which export is the intended one.
with pd.ExcelWriter('task_3_analysis.xlsx') as writer:
    for sheet_name, table in (
        ('Performance_Hidden_Task', performance_task),
        ('Bias_Options', bias_options),
    ):
        table.to_excel(writer, sheet_name=sheet_name, index=False)


In [5]:
# Correct / total answer counts per (model, pe_type, modality, task_mode),
# with the output columns named directly in the aggregation.
performance_task = (
    analysis_df
    .groupby(['model', 'pe_type', 'modality', 'task_mode'])['is_correct']
    .agg(correct='sum', total='count')
    .reset_index()
)


In [None]:
# Export the wide per-task summary to its own workbook.
#
# `performance_pivot` used to be defined only in a later cell, so this cell
# raised NameError on Restart Kernel -> Run All. Build the pivot here from
# `performance_task` so the notebook runs top to bottom.
performance_pivot = performance_task.pivot(
    index=['model', 'pe_type', 'modality'],
    columns='task_mode',
    values=['correct', 'total'],
)
# Flatten the (metric, task_mode) column pairs into "<task_mode>_<metric>".
performance_pivot.columns = [
    f"{task_mode}_{metric}"
    for metric, task_mode in performance_pivot.columns
]
performance_pivot = performance_pivot.reset_index()
# Fixed column order: run identifiers first, then correct/total per task mode.
performance_pivot = performance_pivot[[
    'model', 'pe_type', 'modality',
    'prediction_diff_block_correct', 'prediction_diff_block_total',
    'prediction_diff_level_correct', 'prediction_diff_level_total',
    'prediction_diff_id_correct', 'prediction_diff_id_total'
]]

performance_pivot.to_excel('task_3_performance_summary.xlsx', index=False)


In [None]:
# Reshape performance_task to one row per (model, pe_type, modality), with a
# correct/total column pair for each task mode.
performance_pivot = performance_task.pivot(
    index=['model', 'pe_type', 'modality'],
    columns='task_mode',
    values=['correct', 'total'],
)

# The pivot yields (metric, task_mode) column pairs; flatten each pair to
# "<task_mode>_<metric>".
performance_pivot.columns = [
    "_".join((task_mode, metric))
    for metric, task_mode in performance_pivot.columns
]
performance_pivot = performance_pivot.reset_index()

# Run identifiers first, then block / level / id, each as correct then total.
desired_order = [
    'model',
    'pe_type',
    'modality',
    'prediction_diff_block_correct',
    'prediction_diff_block_total',
    'prediction_diff_level_correct',
    'prediction_diff_level_total',
    'prediction_diff_id_correct',
    'prediction_diff_id_total',
]
performance_pivot = performance_pivot.loc[:, desired_order]

# Create the additional tables.
# The four tables below differ only in the model filter and the grouping
# columns, so they share one helper instead of four copy-pasted chains.
def _mean_accuracy(frame, model_name, group_cols):
    """Return the mean of `is_correct` for one model, grouped by `group_cols`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with at least the columns 'model', 'is_correct' and every
        column named in `group_cols`.
    model_name : str
        Value of the 'model' column to keep.
    group_cols : list of str
        Columns to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping columns followed by 'accuracy'.
    """
    return (
        frame[frame['model'] == model_name]
        .groupby(group_cols)['is_correct']
        .mean()
        .reset_index()
        .rename(columns={'is_correct': 'accuracy'})
    )


# GPT-4o: accuracy per (pe_type, modality) and per (modality, task_mode).
gpt4o_option_performance = _mean_accuracy(analysis_df, 'gpt_4o', ['pe_type', 'modality'])
gpt4o_subtask = _mean_accuracy(analysis_df, 'gpt_4o', ['modality', 'task_mode'])

# GPT-4-1-mini: the same two breakdowns.
gpt4_1_mini_option_performance = _mean_accuracy(analysis_df, 'gpt_4-1-mini', ['pe_type', 'modality'])
gpt4_1_mini_subtask = _mean_accuracy(analysis_df, 'gpt_4-1-mini', ['modality', 'task_mode'])

# Show the four GPT accuracy tables, each under its own heading.
# performance_pivot and bias_options are not displayed in this cell
# (their display calls were commented out).
for heading, table in (
    ("\n=== GPT-4o Option Performance ===", gpt4o_option_performance),
    ("\n=== GPT-4o Subtask Performance ===", gpt4o_subtask),
    ("\n=== GPT-4-1-mini Option Performance ===", gpt4_1_mini_option_performance),
    ("\n=== GPT-4-1-mini Subtask Performance ===", gpt4_1_mini_subtask),
):
    print(heading)
    display(table)

# Write every table to 'task_3_analysis.xlsx', one sheet per table, in the
# order listed here.
sheets = {
    'Performance_Summary': performance_pivot,
    'Bias_Options': bias_options,
    'GPT4o_Option_Performance': gpt4o_option_performance,
    'GPT4o_Subtask': gpt4o_subtask,
    'GPT4_1_mini_Option_Performance': gpt4_1_mini_option_performance,
    'GPT4_1_mini_Subtask': gpt4_1_mini_subtask,
}
with pd.ExcelWriter('task_3_analysis.xlsx') as writer:
    for sheet_name, table in sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, index=False)


=== Focus Models Performance by task ===


Unnamed: 0,model,pe_type,modality,task_mode,correct,total
0,gpt_4-1-mini,null-shot,image,prediction_diff_block,46,100
1,gpt_4-1-mini,null-shot,image,prediction_diff_id,50,100
2,gpt_4-1-mini,null-shot,image,prediction_diff_level,35,100
3,gpt_4-1-mini,null-shot,json,prediction_diff_block,74,100
4,gpt_4-1-mini,null-shot,json,prediction_diff_id,71,100
...,...,...,...,...,...,...
67,gpt_4o,zero-shot-cot,json,prediction_diff_id,57,100
68,gpt_4o,zero-shot-cot,json,prediction_diff_level,58,100
69,gpt_4o,zero-shot-cot,xml,prediction_diff_block,72,100
70,gpt_4o,zero-shot-cot,xml,prediction_diff_id,45,100



=== Focus Models Bias towards specific options ===


Unnamed: 0,model,pe_type,modality,selected_option,selected_count,total_questions,selection_rate
0,gpt_4-1-mini,null-shot,image,A,112,300,0.373333
1,gpt_4-1-mini,null-shot,image,B,66,300,0.220000
2,gpt_4-1-mini,null-shot,image,C,61,300,0.203333
3,gpt_4-1-mini,null-shot,image,D,61,300,0.203333
4,gpt_4-1-mini,null-shot,json,A,92,300,0.306667
...,...,...,...,...,...,...,...
91,gpt_4o,zero-shot-cot,json,D,32,300,0.106667
92,gpt_4o,zero-shot-cot,xml,A,162,300,0.540000
93,gpt_4o,zero-shot-cot,xml,B,61,300,0.203333
94,gpt_4o,zero-shot-cot,xml,C,53,300,0.176667



=== Focus Models Performance Pivot ===


Unnamed: 0,model,pe_type,modality,prediction_diff_block_correct,prediction_diff_block_total,prediction_diff_level_correct,prediction_diff_level_total,prediction_diff_id_correct,prediction_diff_id_total
0,gpt_4-1-mini,null-shot,image,46,100,35,100,50,100
1,gpt_4-1-mini,null-shot,json,74,100,66,100,71,100
2,gpt_4-1-mini,null-shot,xml,75,100,73,100,81,100
3,gpt_4-1-mini,null-shot-cot,image,42,100,33,100,51,100
4,gpt_4-1-mini,null-shot-cot,json,75,100,63,100,75,100
5,gpt_4-1-mini,null-shot-cot,xml,80,100,71,100,83,100
6,gpt_4-1-mini,zero-shot,image,51,100,42,100,57,100
7,gpt_4-1-mini,zero-shot,json,72,100,67,100,81,100
8,gpt_4-1-mini,zero-shot,xml,82,100,78,100,89,100
9,gpt_4-1-mini,zero-shot-cot,image,48,100,40,100,53,100
