
# Error Analysis Step2

In this notebook, we build tables showing the results of error-analysis step 2:
1. multiple models
2. 2 prompt types
3. accuracy (acc) and F1 as metrics

In [1]:
from pathlib import Path
import jsonlines

# Root folder holding the step-2 error-analysis results.
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
res_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/error-analysis-step2")

def read_res(dir: Path):
    """Collect the inference records of every model found under ``dir``.

    Each immediate sub-directory of ``dir`` is treated as one model's result
    folder and its ``inference.jsonl`` file is read record by record. Plain
    files directly inside ``dir`` are ignored.

    Args:
        dir: Folder whose sub-directories each contain an ``inference.jsonl``.

    Returns:
        A flat list with the records of all sub-directories, in sorted
        sub-directory order.
    """
    data = []
    # Iterate the directory that was passed in (not a module-level global), and
    # sort so the record order does not depend on filesystem enumeration order.
    for model_dir in sorted(Path(dir).iterdir()):
        if not model_dir.is_dir():
            continue
        with jsonlines.open(model_dir / "inference.jsonl") as reader:
            data.extend(reader)

    return data

# Load the inference records of all model sub-directories into one flat list.
res = read_res(res_dir)

In [2]:
import pandas as pd

# One row per inference record loaded above.
df = pd.DataFrame(res)

In [3]:
# Sanity check: number of records per model id.
df["vlm_id"].value_counts()

vlm_id
Qwen/Qwen2.5-VL-72B-Instruct               54
HuggingFaceM4/Idefics3-8B-Llama3           54
remyxai/SpaceQwen2.5-VL-3B-Instruct        54
llava-hf/llava-onevision-qwen2-7b-ov-hf    54
Qwen/Qwen2.5-VL-7B-Instruct                54
gpt-4o                                     54
Name: count, dtype: int64

In [4]:
# Frequency of each value in the `pred` column.
df["pred"].value_counts()

pred
closer     106
right      103
left        53
farther     24
up          19
error       16
down         3
Name: count, dtype: int64

In [5]:
# List the fields available in each record.
df.columns

Index(['src_img_path', 'tgt_img_path', 'dof', 'sign', 'ref_obj', 'label',
       'vlm_id', 'prompt_mode', 'prompt', 'answer', 'obj_desc', 'pred',
       'is_correct', 'is_parse'],
      dtype='object')

In [6]:
# Work on a copy so `df` keeps the raw inference records.
table = df.copy()

# Category order for the short model names; grouping on the ordered
# categorical below makes the result rows follow this order.
model_order = [
    "SpaceQwen2.5-VL-3B-Instruct",
    "Idefics3-8B-Llama3",
    "llava-onevision-qwen2-7b-ov-hf",
    "Qwen2.5-VL-7B-Instruct",
    "Qwen2.5-VL-72B-Instruct",
    "gpt-4o",
]
model_dtype = pd.CategoricalDtype(categories=model_order, ordered=True)

# Keep only the part after the last "/" (drops the organisation prefix),
# then turn the column into an ordered categorical.
short_ids = table["vlm_id"].map(lambda full_id: full_id.rsplit("/", 1)[-1])
table["vlm_id"] = short_ids.astype(model_dtype)

In [7]:
from sklearn.metrics import f1_score

def _acc_f1(group):
    acc = group["is_correct"].mean()
    return pd.Series({"acc.": acc})

In [8]:
# Accuracy per (model, prompt mode), reshaped to one row per model and one
# column per prompt mode, rounded to two decimals.
# - observed=False is spelled out because "vlm_id" is categorical; relying on
#   the implicit default triggers a pandas FutureWarning.
# - only "is_correct" is handed to the helper so that groupby.apply does not
#   operate on the grouping columns (another pandas FutureWarning).
# NOTE: this rebinds `table`, so the cell cannot be re-run on its own without
# first re-running the cell that builds `table` from `df`.
table = (
    table.groupby(["vlm_id", "prompt_mode"], observed=False)[["is_correct"]]
    .apply(_acc_f1)
    .reset_index()
    .pivot(index=["vlm_id"], columns=["prompt_mode"], values="acc.")
    .round(2)
)

table

  table = table.groupby(["vlm_id", "prompt_mode"]).apply(_acc_f1).reset_index().pivot(index=["vlm_id"], columns=["prompt_mode"], values="acc.")
  table = table.groupby(["vlm_id", "prompt_mode"]).apply(_acc_f1).reset_index().pivot(index=["vlm_id"], columns=["prompt_mode"], values="acc.")


prompt_mode,0,1
vlm_id,Unnamed: 1_level_1,Unnamed: 2_level_1
SpaceQwen2.5-VL-3B-Instruct,0.33,0.37
Idefics3-8B-Llama3,0.3,0.3
llava-onevision-qwen2-7b-ov-hf,0.33,0.33
Qwen2.5-VL-7B-Instruct,0.22,0.37
Qwen2.5-VL-72B-Instruct,0.37,0.33
gpt-4o,0.19,0.22


In [9]:
# post-process: human-readable axis names and column labels for the report.

table.index.name = "Model"
# Set the labels and the axis name in one step: assigning a plain list to
# `table.columns` builds a fresh, unnamed Index and would discard a name set
# beforehand.
# Labels are assigned by position — assumes the pivoted columns are ordered
# prompt_mode 0 then 1 (TODO confirm 0 = without, 1 = with reference object).
table.columns = pd.Index(["w/o Ref. Obj.", "w/ Ref. Obj."], name="Strategy")

In [10]:
# Show the relabelled accuracy table.
table

Unnamed: 0_level_0,w/o Ref. Obj.,w/ Ref. Obj.
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
SpaceQwen2.5-VL-3B-Instruct,0.33,0.37
Idefics3-8B-Llama3,0.3,0.3
llava-onevision-qwen2-7b-ov-hf,0.33,0.33
Qwen2.5-VL-7B-Instruct,0.22,0.37
Qwen2.5-VL-72B-Instruct,0.37,0.33
gpt-4o,0.19,0.22


In [11]:
# Emit the table as LaTeX source, formatting every float with two decimals.
print(table.to_latex(float_format="%.2f"))

\begin{tabular}{lrr}
\toprule
 & w/o Ref. Obj. & w/ Ref. Obj. \\
Model &  &  \\
\midrule
SpaceQwen2.5-VL-3B-Instruct & 0.33 & 0.37 \\
Idefics3-8B-Llama3 & 0.30 & 0.30 \\
llava-onevision-qwen2-7b-ov-hf & 0.33 & 0.33 \\
Qwen2.5-VL-7B-Instruct & 0.22 & 0.37 \\
Qwen2.5-VL-72B-Instruct & 0.37 & 0.33 \\
gpt-4o & 0.19 & 0.22 \\
\bottomrule
\end{tabular}

