In [2]:
import json

# Load the task-2 summary file. The encoding is stated explicitly so the read
# does not depend on the platform's locale default.
with open('../data/task_2/summary.json', 'r', encoding='utf-8') as f:
    summary_data = json.load(f)

# Map each question ID (from the entries under the 'gt' key) to the metadata
# used later in the notebook:
#   task_mode      - copied from the entry as-is
#   correct_answer - copied from the entry as-is
#   options_count  - number of items in the entry's 'options' collection
summary_mapping = {
    item['id']: {
        'task_mode': item['task_mode'],
        'correct_answer': item['correct_answer'],
        'options_count': len(item['options']),
    }
    for item in summary_data['gt']
}


In [3]:
import pandas as pd
from pathlib import Path

evaluation_path = Path('../evaluation/task_2/')

# Label recorded when a CSV row has no answer at all. 'INVALID' is the label
# that already shows up in the selected_option column of the results.
# NOTE(review): confirm that missing answers should share that bucket.
MISSING_ANSWER_LABEL = 'INVALID'


def _normalize_answer(raw_answer):
    """Return the answer as a stripped, upper-cased string.

    pandas parses empty cells (and tokens such as "NA" or "null") as float NaN,
    on which ``.strip()`` would raise AttributeError; those values are mapped to
    MISSING_ANSWER_LABEL instead. Any other non-string value is converted with
    ``str`` before normalisation.
    """
    if pd.isna(raw_answer):
        return MISSING_ANSWER_LABEL
    return str(raw_answer).strip().upper()


records = []
unmatched_rows = 0  # rows whose question ID has no entry in summary_mapping

# Layout walked here: <evaluation_path>/<pe_type>/<model>/<modality>.csv
# Directory and file listings are sorted so the order of `records` does not
# depend on the operating system's directory ordering.
for pe_type_dir in sorted(evaluation_path.iterdir()):
    if not pe_type_dir.is_dir():
        continue
    for model_dir in sorted(pe_type_dir.iterdir()):
        if not model_dir.is_dir():
            continue
        for file_path in sorted(model_dir.glob('*.csv')):
            # The CSV's base name (lower-cased) is used as the modality label.
            modality = file_path.stem.lower()
            df = pd.read_csv(file_path)

            for file_value, raw_answer in zip(df['file'], df['answer']):
                # The question ID is the part of the file name before the
                # first underscore.
                question_id = Path(file_value).name.split('_')[0]
                metadata = summary_mapping.get(question_id)
                if metadata is None:
                    unmatched_rows += 1
                    continue

                selected_option = _normalize_answer(raw_answer)
                expected_option = metadata['correct_answer'].strip().upper()
                records.append({
                    'model': model_dir.name,
                    'pe_type': pe_type_dir.name,
                    'modality': modality,
                    'options_count': metadata['options_count'],
                    'task_mode': metadata['task_mode'],
                    'is_correct': selected_option == expected_option,
                    'selected_option': selected_option
                })

# Surface dropped rows instead of discarding them silently.
if unmatched_rows:
    print(f"Skipped {unmatched_rows} evaluation rows whose question ID is not in summary_mapping")


In [4]:
# One row per evaluated answer, built from the collected `records`.
analysis_df = pd.DataFrame(records)

# Grouping keys shared by the aggregations below.
run_keys = ['model', 'pe_type', 'modality']
option_keys = run_keys + ['options_count']

# Performance by number of options: correct answers and total answers per group.
performance_options = (
    analysis_df
    .groupby(option_keys)['is_correct']
    .agg(correct='sum', total='count')
    .reset_index()
)

# Performance by task category: same counts, grouped by task_mode instead.
performance_category = (
    analysis_df
    .groupby(run_keys + ['task_mode'])['is_correct']
    .agg(correct='sum', total='count')
    .reset_index()
)

# Bias towards specific options: how often each option was selected, relative
# to the number of answers in the same (model, pe_type, modality, options_count) group.
selection_counts = (
    analysis_df
    .groupby(option_keys + ['selected_option'])
    .size()
    .reset_index(name='selected_count')
)
total_questions = analysis_df.groupby(option_keys).size().reset_index(name='total_questions')
bias_options = selection_counts.merge(total_questions, on=option_keys)
bias_options['selection_rate'] = bias_options['selected_count'] / bias_options['total_questions']

# Display each table under its heading.
for heading, table in (
    ("=== Performance by number of options ===", performance_options),
    ("\n=== Performance by task category ===", performance_category),
    ("\n=== Bias towards specific options ===", bias_options),
):
    print(heading)
    display(table)


=== Performance by number of options ===


Unnamed: 0,model,pe_type,modality,options_count,correct,total
0,gemma3-4b,null-shot,image,2,55,100
1,gemma3-4b,null-shot,image,3,38,100
2,gemma3-4b,null-shot,image,4,28,100
3,gemma3-4b,null-shot,json,2,45,100
4,gemma3-4b,null-shot,json,3,30,100
...,...,...,...,...,...,...
211,qwen2.5-vl-7b,zero-shot-cot,json,3,38,100
212,qwen2.5-vl-7b,zero-shot-cot,json,4,20,100
213,qwen2.5-vl-7b,zero-shot-cot,xml,2,44,100
214,qwen2.5-vl-7b,zero-shot-cot,xml,3,46,100



=== Performance by task category ===


Unnamed: 0,model,pe_type,modality,task_mode,correct,total
0,gemma3-4b,null-shot,image,multiple_stable,63,150
1,gemma3-4b,null-shot,image,multiple_unstable,58,150
2,gemma3-4b,null-shot,json,multiple_stable,46,150
3,gemma3-4b,null-shot,json,multiple_unstable,45,150
4,gemma3-4b,null-shot,xml,multiple_stable,57,150
...,...,...,...,...,...,...
139,qwen2.5-vl-7b,zero-shot-cot,image,multiple_unstable,55,150
140,qwen2.5-vl-7b,zero-shot-cot,json,multiple_stable,57,150
141,qwen2.5-vl-7b,zero-shot-cot,json,multiple_unstable,55,150
142,qwen2.5-vl-7b,zero-shot-cot,xml,multiple_stable,62,150



=== Bias towards specific options ===


Unnamed: 0,model,pe_type,modality,options_count,selected_option,selected_count,total_questions,selection_rate
0,gemma3-4b,null-shot,image,2,A,1,100,0.01
1,gemma3-4b,null-shot,image,2,B,95,100,0.95
2,gemma3-4b,null-shot,image,2,INVALID,4,100,0.04
3,gemma3-4b,null-shot,image,3,A,15,100,0.15
4,gemma3-4b,null-shot,image,3,B,66,100,0.66
...,...,...,...,...,...,...,...,...
725,qwen2.5-vl-7b,zero-shot-cot,xml,3,C,36,100,0.36
726,qwen2.5-vl-7b,zero-shot-cot,xml,4,A,12,100,0.12
727,qwen2.5-vl-7b,zero-shot-cot,xml,4,B,22,100,0.22
728,qwen2.5-vl-7b,zero-shot-cot,xml,4,C,20,100,0.20


In [5]:
# GPT-4o only: keep the rows whose model label is 'gpt_4o'.
gpt4o_df = analysis_df.loc[analysis_df['model'] == 'gpt_4o']

# Mean accuracy per (modality, options_count), pivoted so each options_count
# becomes its own 'option_<count>' column.
accuracy_by_options = gpt4o_df.groupby(['modality', 'options_count'])['is_correct'].mean()
gpt4o_option_performance = accuracy_by_options.unstack().reset_index()
option_labels = [f'option_{count}' for count in gpt4o_option_performance.columns[1:]]
gpt4o_option_performance.columns = ['modality'] + option_labels

# Mean accuracy per (modality, task_mode), in long format.
gpt4o_subtask = (
    gpt4o_df
    .groupby(['modality', 'task_mode'])['is_correct']
    .mean()
    .reset_index(name='accuracy')
)

for heading, table in (
    ("\n=== GPT-4o Performance by number of options ===", gpt4o_option_performance),
    ("\n=== GPT-4o Performance by subtask ===", gpt4o_subtask),
):
    print(heading)
    display(table)


=== GPT-4o Performance by number of options ===


Unnamed: 0,modality,option_2,option_3,option_4
0,image,0.7975,0.695,0.6375
1,json,0.6325,0.46,0.405
2,xml,0.61,0.4425,0.375



=== GPT-4o Performance by subtask ===


Unnamed: 0,modality,task_mode,accuracy
0,image,multiple_stable,0.728333
1,image,multiple_unstable,0.691667
2,json,multiple_stable,0.486667
3,json,multiple_unstable,0.511667
4,xml,multiple_stable,0.47
5,xml,multiple_unstable,0.481667


In [None]:
# GPT-4.1 mini specific analysis: keep the rows whose model label is 'gpt_4-1_mini'.
gpt4_1_mini_df = analysis_df[analysis_df['model'] == 'gpt_4-1_mini']

# GPT-4.1 mini accuracy by number of options (pivoted): one row per modality,
# one 'option_<count>' column per options_count value.
gpt4_1_mini_option_performance = gpt4_1_mini_df.groupby(['modality', 'options_count'])['is_correct'].mean().unstack().reset_index()
gpt4_1_mini_option_performance.columns = ['modality'] + [f'option_{col}' for col in gpt4_1_mini_option_performance.columns[1:]]

# GPT-4.1 mini accuracy by subtask (task_mode), in long format.
gpt4_1_mini_subtask = gpt4_1_mini_df.groupby(['modality', 'task_mode'])['is_correct'].mean().reset_index()
gpt4_1_mini_subtask.rename(columns={'is_correct': 'accuracy'}, inplace=True)

# The headings name the model actually analysed in this cell; they previously
# read "GPT-4o", which mislabelled the displayed tables.
print("\n=== GPT-4.1 mini Performance by number of options ===")
display(gpt4_1_mini_option_performance)

print("\n=== GPT-4.1 mini Performance by subtask ===")
display(gpt4_1_mini_subtask)

In [None]:
# Sheet name -> table, listed in the order the sheets are written.
sheets = {
    'Performance_Options': performance_options,
    'Performance_Category': performance_category,
    'Bias_Options': bias_options,
    'GPT4o_Option_Performance': gpt4o_option_performance,
    'GPT4o_Subtask': gpt4o_subtask,
    'GPT4-1-mini_Option_Performance': gpt4_1_mini_option_performance,
    'GPT4-1-mini_Subtask': gpt4_1_mini_subtask,
}

# Write every table to its own sheet of a single workbook, without the index column.
with pd.ExcelWriter('task_2_analysis.xlsx') as writer:
    for sheet_name, table in sheets.items():
        table.to_excel(writer, sheet_name=sheet_name, index=False)