# Error Analysis Table

This notebook includes:
1. read results from `result/*`
2. summarize *C1 error analysis*
3. summarize *C2 error analysis*

TODO

In [11]:
from pathlib import Path

c1_res_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/error-analysis-c1")
c2_res_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/error-analysis-c2")

In [12]:
import pandas as pd
import jsonlines

def _read_c1_res(res_dir: Path):
    res = []
    for model_dir in res_dir.iterdir():
        if not model_dir.is_dir():
            continue
        res_file = model_dir / "inference.jsonl"
        with jsonlines.open(res_file) as reader:
            for row in reader:
                res.append(row)
                
    return pd.DataFrame(res)

In [28]:
c1_df = _read_c1_res(c1_res_dir)
c1_df["Task"] = "C1"
c2_df = _read_c1_res(c2_res_dir)
c2_df["Task"] = "C2"

In [29]:
a_big_table = pd.concat([c1_df, c2_df], ignore_index=True)

Try to combinate 2 tables

In [30]:
a_big_table.replace({"gpt-4o": "GPT-4o"}, inplace=True)
a_big_table["vlm_id"] = a_big_table["vlm_id"].apply(lambda id: id.split("/")[-1])
vlm_order = [
    "SpaceQwen2.5-VL-3B-Instruct",
    "Idefics3-8B-Llama3",
    "llava-onevision-qwen2-7b-ov-hf",
    "Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-72B-Instruct", 
    "GPT-4o",
]

a_big_table["vlm_id"] = pd.Categorical(a_big_table["vlm_id"], categories=vlm_order, ordered=True)

a_big_table["prompt_mode"] = a_big_table["prompt_mode"].map({
    0: "zero-shot",
    1: "hint level-1",
    2: "hint level-2",
    3: "hint level-3",
})
prompt_order = ["zero-shot", "hint level-1", "hint level-2", "hint level-3"]
a_big_table["prompt_mode"] = pd.Categorical(a_big_table["prompt_mode"], categories=prompt_order, ordered=True)

a_big_table.rename(columns={"vlm_id": "Model", "prompt_mode": "Strategy"}, inplace=True)

In [31]:
from sklearn.metrics import accuracy_score, f1_score

def acc_f1(group):
    acc = accuracy_score(group["cor_idx"], group["pred"])
    f1 = f1_score(group["cor_idx"], group["pred"], average="weighted")
    return pd.Series({"accuracy": acc, "f1": f1})

In [33]:
nice_table = a_big_table.groupby(["Task", "Model", "Strategy"]).apply(acc_f1).reset_index()
nice_table = nice_table.pivot(index=["Task", "Model"], columns="Strategy", values=["accuracy", "f1"])

  nice_table = a_big_table.groupby(["Task", "Model", "Strategy"]).apply(acc_f1).reset_index()
  nice_table = a_big_table.groupby(["Task", "Model", "Strategy"]).apply(acc_f1).reset_index()


In [34]:
nice_table

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,accuracy,accuracy,accuracy,f1,f1,f1,f1
Unnamed: 0_level_1,Strategy,zero-shot,hint level-1,hint level-2,hint level-3,zero-shot,hint level-1,hint level-2,hint level-3
Task,Model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
C1,SpaceQwen2.5-VL-3B-Instruct,0.259259,0.518519,0.259259,0.185185,0.227053,0.527816,0.249084,0.14751
C1,Idefics3-8B-Llama3,0.444444,0.259259,0.222222,0.148148,0.444444,0.252612,0.237241,0.122299
C1,llava-onevision-qwen2-7b-ov-hf,0.259259,0.555556,0.148148,0.481481,0.351648,0.571723,0.174815,0.496212
C1,Qwen2.5-VL-7B-Instruct,0.62963,0.62963,0.185185,0.592593,0.670996,0.624561,0.205556,0.592593
C1,Qwen2.5-VL-72B-Instruct,0.740741,0.692308,0.259259,1.0,0.74386,0.732441,0.289811,1.0
C1,GPT-4o,0.62963,0.518519,0.444444,1.0,0.624542,0.518519,0.455172,1.0
C2,SpaceQwen2.5-VL-3B-Instruct,0.568966,0.568966,0.5,0.5,0.560658,0.564415,0.499197,0.477621
C2,Idefics3-8B-Llama3,0.5,0.586207,0.551724,0.568966,0.494759,0.586207,0.550653,0.458549
C2,llava-onevision-qwen2-7b-ov-hf,0.465517,0.517241,0.517241,0.603448,0.413117,0.510279,0.516092,0.535888
C2,Qwen2.5-VL-7B-Instruct,0.5,0.517241,0.655172,0.586207,0.477953,0.492054,0.655172,0.576065


## C1

This notebook:
1. overall summary
2. acc divided by dof

In [None]:
# rename
# c1_df.rename(columns={"vlm_id": "model"}, inplace=True)
c1_df.replace({"gpt-4o": "GPT-4o"}, inplace=True)
c1_df["vlm_id"] = c1_df["vlm_id"].apply(lambda id: id.split["/"][-1])
vlm_order = [
    "remyxai/SpaceQwen2.5-VL-3B-Instruct",
    "Idefics3-8B-Llama3",
    "llava-onevision-qwen2-7b-ov-hf",
    "Qwen/Qwen2.5-VL-7B-Instruct", "Qwen/Qwen2.5-VL-72B-Instruct", 
    "GPT-4o",
]
c1_df["vlm_id"] = pd.Categorical(c1_df["vlm_id"], categories=vlm_order, ordered=True)

c1_df["prompt_mode"]= c1_df["prompt_mode"].map({
    0: "zero-shot",
    1: "hint level-1",
    2: "hint level-2",
    3: "hint level-3",
})
prompt_order = ["zero-shot", "hint level-1", "hint level-2", "hint level-3"]
c1_df["prompt_mode"] = pd.Categorical(c1_df["prompt_mode"], categories=prompt_order, ordered=True)

c1_df.rename(columns={"vlm_id": "Model", "prompt_mode": "Strategy"}, inplace=True)

In [5]:
summary_table = c1_df.groupby(["Model", "Strategy"]).agg({"is_correct": "mean"})
summary_table.rename(columns={"is_correct": "Accuracy"}, inplace=True)
summary_table.reset_index(inplace=True)
summary_table = summary_table.pivot(index="Model", columns="Strategy", values="Accuracy")
summary_table

  summary_table = c1_df.groupby(["Model", "Strategy"]).agg({"is_correct": "mean"})


Strategy,zero-shot,hint level-1,hint level-2,hint level-3
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HuggingFaceM4/Idefics3-8B-Llama3,0.444444,0.259259,0.222222,0.148148
llava-hf/llava-onevision-qwen2-7b-ov-hf,0.259259,0.555556,0.148148,0.481481
remyxai/SpaceQwen2.5-VL-3B-Instruct,0.259259,0.518519,0.259259,0.185185
Qwen/Qwen2.5-VL-7B-Instruct,0.62963,0.62963,0.185185,0.592593
Qwen/Qwen2.5-VL-72B-Instruct,0.740741,0.692308,0.259259,1.0
meta-llama/Llama-4-Scout-17B-16E-Instruct,0.62963,0.62963,0.555556,1.0
GPT-4o,0.62963,0.518519,0.444444,1.0


In [6]:
print(summary_table.to_latex(
    float_format="%.2f",
    column_format="lcccc",
    label="tab:error-aly-3-c1",
    caption="xxx",
))

\begin{table}
\caption{xxx}
\label{tab:error-aly-3-c1}
\begin{tabular}{lcccc}
\toprule
Strategy & zero-shot & hint level-1 & hint level-2 & hint level-3 \\
Model &  &  &  &  \\
\midrule
HuggingFaceM4/Idefics3-8B-Llama3 & 0.44 & 0.26 & 0.22 & 0.15 \\
llava-hf/llava-onevision-qwen2-7b-ov-hf & 0.26 & 0.56 & 0.15 & 0.48 \\
remyxai/SpaceQwen2.5-VL-3B-Instruct & 0.26 & 0.52 & 0.26 & 0.19 \\
Qwen/Qwen2.5-VL-7B-Instruct & 0.63 & 0.63 & 0.19 & 0.59 \\
Qwen/Qwen2.5-VL-72B-Instruct & 0.74 & 0.69 & 0.26 & 1.00 \\
meta-llama/Llama-4-Scout-17B-16E-Instruct & 0.63 & 0.63 & 0.56 & 1.00 \\
GPT-4o & 0.63 & 0.52 & 0.44 & 1.00 \\
\bottomrule
\end{tabular}
\end{table}



## C2
1. summary of error analysis c2

In [7]:
c2_df.replace({"gpt-4o": "GPT-4o"}, inplace=True)
c2_df["vlm_id"] = pd.Categorical(c2_df["vlm_id"], categories=vlm_order, ordered=True)

c2_df["prompt_mode"] = c2_df["prompt_mode"].map({
    0: "zero-shot",
    1: "hint level-1",
    2: "hint level-2",
    3: "hint level-3",
})
prompt_order = ["zero-shot", "hint level-1", "hint level-2", "hint level-3"]
c2_df["prompt_mode"] = pd.Categorical(c2_df["prompt_mode"], categories=prompt_order, ordered=True)

c2_df.rename(columns={"vlm_id": "Model", "prompt_mode": "Strategy"}, inplace=True)

In [8]:
summary_table_c2 = c2_df.groupby(["Model", "Strategy"]).agg({"is_correct": "mean"})
summary_table_c2.rename(columns={"is_correct": "Accuracy"}, inplace=True)
summary_table_c2.reset_index(inplace=True)
summary_table_c2 = summary_table_c2.pivot(index="Model", columns="Strategy", values="Accuracy")
summary_table_c2

  summary_table_c2 = c2_df.groupby(["Model", "Strategy"]).agg({"is_correct": "mean"})


Strategy,zero-shot,hint level-1,hint level-2,hint level-3
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HuggingFaceM4/Idefics3-8B-Llama3,0.5,0.586207,0.551724,0.568966
llava-hf/llava-onevision-qwen2-7b-ov-hf,0.465517,0.517241,0.517241,0.603448
remyxai/SpaceQwen2.5-VL-3B-Instruct,0.568966,0.568966,0.5,0.5
Qwen/Qwen2.5-VL-7B-Instruct,0.5,0.517241,0.655172,0.586207
Qwen/Qwen2.5-VL-72B-Instruct,0.517241,0.568966,0.5,0.413793
meta-llama/Llama-4-Scout-17B-16E-Instruct,0.5,0.517241,0.517241,0.517241
GPT-4o,0.431034,0.517241,0.586207,0.413793


In [9]:
print(summary_table_c2.to_latex(
    float_format="%.2f",
    column_format="lcccc",
    label="tab:error-aly-3-c2",
    caption="xxx",
))

\begin{table}
\caption{xxx}
\label{tab:error-aly-3-c2}
\begin{tabular}{lcccc}
\toprule
Strategy & zero-shot & hint level-1 & hint level-2 & hint level-3 \\
Model &  &  &  &  \\
\midrule
HuggingFaceM4/Idefics3-8B-Llama3 & 0.50 & 0.59 & 0.55 & 0.57 \\
llava-hf/llava-onevision-qwen2-7b-ov-hf & 0.47 & 0.52 & 0.52 & 0.60 \\
remyxai/SpaceQwen2.5-VL-3B-Instruct & 0.57 & 0.57 & 0.50 & 0.50 \\
Qwen/Qwen2.5-VL-7B-Instruct & 0.50 & 0.52 & 0.66 & 0.59 \\
Qwen/Qwen2.5-VL-72B-Instruct & 0.52 & 0.57 & 0.50 & 0.41 \\
meta-llama/Llama-4-Scout-17B-16E-Instruct & 0.50 & 0.52 & 0.52 & 0.52 \\
GPT-4o & 0.43 & 0.52 & 0.59 & 0.41 \\
\bottomrule
\end{tabular}
\end{table}

