In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [34]:
def fill_correct(df, half_scores=False):
    """Return a copy of ``df`` with a numeric ``is_correct`` column.

    Each value of the ``score`` column is converted to a string, stripped of
    surrounding whitespace and lower-cased before being graded:

    * ``'right'``          -> 1
    * ``'somewhat right'`` -> 0.5 when ``half_scores`` is truthy, otherwise 0
    * anything else (including missing values) -> 0

    The input frame is not modified.
    """
    # Credit given to a 'somewhat right' verdict; plain 0 when partial credit is off.
    partial_credit = 0.5 if half_scores else 0

    def _grade(raw_score):
        label = str(raw_score).strip().lower()
        if label == 'right':
            return 1
        if label == 'somewhat right':
            return partial_credit
        return 0

    graded = df.copy()
    graded['is_correct'] = graded['score'].apply(_grade)
    return graded

def clean_df(df, subset_cols):
    """Drop incomplete and duplicated rows, judged on ``subset_cols`` only.

    Rows with a missing value in any of ``subset_cols`` are removed first.
    Among the remaining rows that share the same ``subset_cols`` values, only
    the last occurrence is kept. A new frame is returned; ``df`` is left
    untouched.
    """
    complete_rows = df.dropna(subset=subset_cols)
    return complete_rows.drop_duplicates(subset=subset_cols, keep='last')

def prepare_df(df, half_scores=False):
    """Score the results and split them into single-model and ensemble rows.

    The frame is first passed through ``fill_correct`` (which works on a copy
    and adds ``is_correct``). Rows are then separated on the ``ensemble``
    column and each part is cleaned with ``clean_df`` using its own key
    columns, so incomplete keys are dropped and only the last row per key
    is kept.

    Returns a ``(df_singles, df_ensembles)`` tuple.
    """
    single_keys = [
        "problem_id",
        "dataset_name",
        "model_name",
        "strategy_name",
        "version",
    ]
    ensemble_keys = [
        "problem_id",
        "dataset_name",
        "type_name",
        "version",
    ]

    scored = fill_correct(df, half_scores=half_scores)
    ensemble_flag = scored["ensemble"]

    # Explicit comparisons are kept on purpose: a row whose flag is missing
    # equals neither True nor False and therefore lands in neither part.
    singles = clean_df(scored[ensemble_flag == False], subset_cols=single_keys)  # noqa: E712
    ensembles = clean_df(scored[ensemble_flag == True], subset_cols=ensemble_keys)  # noqa: E712
    return singles, ensembles

In [37]:
# Keep the raw CSV under its own name so it stays available for inspection.
raw_results = pd.read_csv('results/all_results_concat.csv')
# prepare_df returns (singles, ensembles); element 0 is the single-model frame,
# which is what `df` refers to from here on.
df = prepare_df(raw_results, half_scores=True)[0]
df.head()

Unnamed: 0,problem_id,answer,confidence,rationale,raw_response,score,key,type_name,ensemble,judge_rationale,reasoning,dataset_name,model_name,strategy_name,version,is_correct
100,0,D,1.0,"In Set D, there's no consistent majority rule ...","{'answer': 'D', 'confidence': 1.0, 'rationale'...",Wrong,C,,False,,,cvr,Qwen/Qwen2.5-VL-7B-Instruct,classification,1,0.0
101,1,C,0.9,Set C has all images with a consistent number ...,"{'answer': 'C', 'confidence': 0.9, 'rationale'...",Wrong,A,,False,,,cvr,Qwen/Qwen2.5-VL-7B-Instruct,classification,1,0.0
102,2,D,0.9,"In Set D, there is no clear majority rule that...","{'answer': 'D', 'confidence': 0.9, 'rationale'...",Right,D,,False,,,cvr,Qwen/Qwen2.5-VL-7B-Instruct,classification,1,1.0
103,3,D,0.9,"In Set D, all visible images share the same nu...","{'answer': 'D', 'confidence': 0.9, 'rationale'...",Wrong,C,,False,,,cvr,Qwen/Qwen2.5-VL-7B-Instruct,classification,1,0.0
104,4,D,1.0,"In set D, all visible images share a similar p...","{'answer': 'D', 'confidence': 1.0, 'rationale'...",Wrong,B,,False,,,cvr,Qwen/Qwen2.5-VL-7B-Instruct,classification,1,0.0


### Top models

In [44]:
def accuracy_stats(
    df,
    group_cols,
    versions=(1, 3),
    top_n=10,
    bottom_n=10,
    return_top_k=5
):
    """Print best/worst groups by accuracy and return the top-k group keys.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``version`` column, an ``is_correct`` column and every
        column named in ``group_cols``. The frame is not modified.
    group_cols : str or iterable of str
        Column(s) to group by. A single column name is accepted as a plain
        string.
    versions : iterable
        Only rows whose ``version`` is in this collection are considered.
    top_n, bottom_n : int
        Number of best / worst groups to print.
    return_top_k : int
        Number of best groups whose keys are returned. Taken from the full
        ranking, so it is not capped by ``top_n``.

    Returns
    -------
    list of list of str
        One inner list per group, best first, holding the ``group_cols``
        values converted with ``str``.

    Notes
    -----
    ``percentage`` is ``sum(is_correct) / count(is_correct) * 100`` per group;
    ``count`` ignores missing ``is_correct`` values.
    """
    # A bare string would otherwise be iterated character by character when
    # the key list is built below; a tuple would be read by groupby as one key.
    if isinstance(group_cols, str):
        group_cols = [group_cols]
    else:
        group_cols = list(group_cols)

    # Boolean indexing already yields a new frame, so no defensive copy is needed.
    selected = df[df["version"].isin(versions)]

    # Group and aggregate
    grouped = selected.groupby(group_cols)["is_correct"]
    score = grouped.sum()
    total = grouped.count()
    percentage = score / total * 100

    results = pd.DataFrame({
        "total_score": score,
        "percentage": percentage
    }).reset_index()

    # Sort
    ranked = results.sort_values("percentage", ascending=False)
    top = ranked.head(top_n)
    bottom = results.sort_values("percentage", ascending=True).head(bottom_n)

    print(f"\nTop {top_n}:")
    print(top)

    print(f"\nBottom {bottom_n}:")
    print(bottom)

    # ---- Top K list in requested format ----
    # itertuples over the key columns only: iterrows would coerce each row to
    # a single dtype, turning an integer key such as version 3 into '3.0'
    # whenever every selected column is numeric.
    top_k_list = [
        [str(value) for value in record]
        for record in ranked[group_cols]
        .head(return_top_k)
        .itertuples(index=False, name=None)
    ]

    print(f"\nTop {return_top_k} {[f'{c}' for c in group_cols]}:")
    print(top_k_list)

    return top_k_list


In [47]:
# Overall ranking across every dataset.
group_cols = ["strategy_name", "model_name", "version"]
accuracy_stats(df, group_cols=group_cols)

# Per-dataset slices. For BP the "classification" strategy is excluded.
bp_only = df[
    (df["dataset_name"] == "bp") & (df["strategy_name"] != "classification")
].copy()
cvr_only = df[df["dataset_name"] == "cvr"].copy()
raven_only = df[df["dataset_name"] == "raven"].copy()
mars_only = df[df["dataset_name"] == "marsvqa"].copy()

# Per-dataset reports additionally carry the dataset name in the group key.
group_cols = group_cols + ["dataset_name"]
for label, subset in (("BP", bp_only), ("CVR", cvr_only), ("RAVEN", raven_only)):
    print(f"\n ------------ {label} -------------\n")
    accuracy_stats(subset, group_cols=group_cols)

# Kept outside the loop as the final expression so its returned top-k list
# remains the cell's displayed output.
print("\n ------------ MARS -------------\n")
accuracy_stats(mars_only, group_cols=group_cols)


Top 10:
     strategy_name                   model_name  version  total_score  \
21          direct  Qwen/Qwen2.5-VL-7B-Instruct        3        196.0   
9      contrastive  Qwen/Qwen2.5-VL-7B-Instruct        3        193.0   
20          direct  Qwen/Qwen2.5-VL-7B-Instruct        1        190.5   
8      contrastive  Qwen/Qwen2.5-VL-7B-Instruct        1        172.5   
18          direct       OpenGVLab/InternVL3-8B        1        126.0   
12     descriptive       OpenGVLab/InternVL3-8B        1        124.0   
2   classification  Qwen/Qwen2.5-VL-7B-Instruct        1        122.0   
3   classification  Qwen/Qwen2.5-VL-7B-Instruct        3        119.0   
19          direct       OpenGVLab/InternVL3-8B        3        117.5   
13     descriptive       OpenGVLab/InternVL3-8B        3        117.0   

    percentage  
21   49.000000  
9    48.250000  
20   47.625000  
8    43.233083  
18   31.500000  
12   31.000000  
2    30.500000  
3    29.750000  
19   29.375000  
13   29.250000  


[['direct', 'Qwen/Qwen2.5-VL-7B-Instruct', '1', 'marsvqa'],
 ['direct', 'Qwen/Qwen2.5-VL-7B-Instruct', '3', 'marsvqa'],
 ['contrastive', 'Qwen/Qwen2.5-VL-7B-Instruct', '1', 'marsvqa'],
 ['contrastive', 'Qwen/Qwen2.5-VL-7B-Instruct', '3', 'marsvqa'],
 ['contrastive', 'OpenGVLab/InternVL3-8B', '3', 'marsvqa']]