In [1]:
import os
import json

from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

In [2]:
# Switch the working directory so that the relative output paths used later
# (e.g. "result/csv-tables/...") resolve against this project folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
os.chdir("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs")

In [3]:
# Raw result-frame column names -> column headers used in the rendered tables.
# "precision" and "recall" are not listed here, so they keep their raw names.
col_name_map = {
    "dataset": "Dataset",
    "model": "Model",
    "tau": "Tau",
    "split": "DoF",

    "accuracy": "Acc.",
    "f1_score": "F1",
}

# Raw cell values -> display labels. `_rename_table` passes this whole mapping
# to `DataFrame.replace`, so it is applied to every column of the frame.
useful_map = {
    # model identifiers
    "gpt-4o": "GPT-4o",
    "Qwen2.5-VL-7B-Instruct": "Qwen-7B",
    "Qwen2.5-VL-32B-Instruct": "Qwen-32B",
    "Qwen2.5-VL-72B-Instruct": "Qwen-72B",

    # computer-vision baselines
    "sift": "SIFT",
    "loftr": "LoFTR",

    # datasets
    "7-scenes": "7 Scenes",
    "scannet": "ScanNet",
    "scannetpp": "ScanNet++",

    # split names -> DoF labels (only these two DoF labels have a raw key here)
    "rotation": "Yaw",
    "translation": "L/R",

    # presumably prompting-strategy names — TODO confirm against the result tree
    "zero-shot": "ZS",
    "dataset-prior-hint": "w/ DP",
    "CoT-hint": "DP+CoT",
    "VoT-hint": "DP+VoT",

    # tau directory names -> angle labels
    "min-angle-15-deg": "15°",
    "min-angle-30-deg": "30°",
    "min-angle-45-deg": "45°",
    "min-angle-60-deg": "60°",

    # placeholder label for missing values
    np.nan: "—", 
    None: "—",
}


In [4]:
# Presentation order of the display labels; handed to
# `_set_factor_as_ordered_cate` as the category order of the matching column.
# The entries are the renamed labels (values of `useful_map`), not raw names.
dataset_order = ["7 Scenes", "ScanNet", "ScanNet++"]
model_order = ["SIFT", "LoFTR", "Qwen-7B", "Qwen-32B", "Qwen-72B", "GPT-4o"]
dof_order = ["Pitch", "Yaw", "Roll", "U/D", "L/R", "F/B"]

In [5]:
def _read_c2_vlm(data_dir: str) -> pd.DataFrame:
    data_dir = Path(data_dir)
    # split_count = {}
    data = []
    for metric in data_dir.iterdir():
        if metric.is_dir():
            # split_count[metric.name] = 0
            for dataset in metric.iterdir():
                if dataset.is_dir():
                    for model in dataset.iterdir():
                        if model.is_dir():
                            for tau in model.iterdir():
                                if tau.is_dir():

                                    try: 
                                        metrics_path = tau / "metrics" / "metrics.json"
                                        with open(metrics_path, "r") as f:
                                            metrics = json.load(f)
                                        
                                        # # count the dataset numbers
                                        # inference_path = tau / "inference.jsonl"
                                        # with open(inference_path, "r") as f:
                                        #     for _ in f:
                                        #         split_count[metric.name] += 1

                                        
                                        data.append({
                                            "dataset": dataset.name,
                                            "model": model.name,
                                            "tau": tau.name,
                                            "split": metric.name,
                                            "accuracy": metrics.get("accuracy", None),
                                            "f1_score": metrics.get("f1_score", None),
                                            "precision": metrics.get("precision", None),
                                            "recall": metrics.get("recall", None)
                                        })
                                    
                                    except Exception as e:
                                        print(f"Error processing data: {e}, dataset: {dataset}, model: {model}, split: {metric}")

    df = pd.DataFrame(data)
    return df


def _rename_table(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a display-ready version of ``df``.

    Column headers are translated through ``col_name_map`` and cell values
    through ``useful_map``; a new frame is returned by each step.
    """
    with_headers = df.rename(columns=col_name_map)
    return with_headers.replace(useful_map)


def _set_factor_as_ordered_cate(df: pd.DataFrame, col: str, order: list) -> pd.DataFrame:
    """
    Set a column as ordered categorical with a specific order.
    """
    df[col] = pd.Categorical(df[col], categories=order, ordered=True)
    return df


def _read_c2_cv(data_dir: str) -> pd.DataFrame:
    data_dir = Path(data_dir)
    data = []
    for dataset in data_dir.iterdir():
        if dataset.is_dir():
            for tau in dataset.iterdir():
                if tau.is_dir():
                    for model in tau.iterdir():
                        if model.is_dir():
                            for split in model.iterdir():
                                if split.is_dir():

                                    try: 
                                        metrics_path = split / "metrics" / "metrics.json"
                                        with open(metrics_path, "r") as f:
                                            metrics = json.load(f)
                                        
                                        data.append({
                                            "dataset": dataset.name,
                                            "model": model.name,
                                            "tau": tau.name,
                                            "split": split.name,
                                            "accuracy": metrics.get("accuracy", None),
                                            "f1_score": metrics.get("f1_score", None),
                                            "precision": metrics.get("precision", None),
                                            "recall": metrics.get("recall", None)
                                        })
                                    
                                    except Exception as e:
                                        print(f"Error processing data: {e}, dataset: {dataset}, model: {model}, split: {split}")
                                
    return pd.DataFrame(data)


def _cal_avg_of_acc_f1(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Calculate the average of accuracy and F1 score for each group.
    """
    count = kwargs.get("count") # dict: col: int

    if count: # count as weigt for each col
        count = {useful_map.get(key, key): value for key, value in count.items()} # rename the count
        acc_cols = [col for col in df.columns if col[1] == 'Acc.']
        f1_cols = [col for col in df.columns if col[1] == 'F1']

        df[('Avg', 'Acc.')] = sum(df[col] * count.get(col[0], 1) for col in acc_cols) / sum(count.get(col[0], 1) for col in acc_cols)
        df[('Avg', 'F1')] = sum(df[col] * count.get(col[0], 1) for col in f1_cols) / sum(count.get(col[0], 1) for col in f1_cols)

    else:
        acc_cols = [col for col in df.columns if col[1] == 'Acc.']
        f1_cols = [col for col in df.columns if col[1] == 'F1']

        df[('Avg', 'Acc.')] = df[acc_cols].mean(axis=1)
        df[('Avg', 'F1')] = df[f1_cols].mean(axis=1)

    return df


def _pivot_c2_vlm(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Pivot a long-format result frame into a wide metrics table.

    Rows are indexed by (Dataset, Model, Tau). Columns form a two-level index
    of (DoF, metric) with the metrics "Acc." and "F1", followed by the
    ("Avg", ...) columns added by `_cal_avg_of_acc_f1`. Values are rounded to
    three decimals.

    Args:
        df: Frame with the columns "Dataset", "Model", "Tau", "DoF", "Acc."
            and "F1".
        **kwargs: Only ``count`` is read; it is forwarded to
            `_cal_avg_of_acc_f1` as per-DoF weights for the averages.

    Returns:
        The pivoted, rounded table.
    """
    count = kwargs.get("count")  # dict: col: int
    pivot = df.pivot_table(
        index=["Dataset", "Model", "Tau"],
        columns="DoF",
        values=["Acc.", "F1"],
        # The grouping keys may be categoricals; relying on the implicit
        # default emits a FutureWarning and the default is announced to
        # change. Pin the current behaviour explicitly.
        observed=False,
    )
    # pivot_table yields (metric, DoF) columns; flip to (DoF, metric) and sort.
    pivot = pivot.swaplevel(axis=1).sort_index(axis=1)
    pivot.columns.names = [None, None]  # Remove the names of the columns
    pivot = _cal_avg_of_acc_f1(pivot, count=count).round(3)
    return pivot

### C2 without trap

In [6]:
# Load the no-trap VLM results and translate them to display labels in one step.
data_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/final-table-wo-trap/obj-centered-cls")
c2_wo_trap = _rename_table(_read_c2_vlm(data_dir))

In [7]:
# Make the label columns ordered categoricals that follow dataset_order /
# model_order / dof_order.
# NOTE(review): labels missing from an order list become NaN — confirm every
# label present in the data is listed.
c2_wo_trap = _set_factor_as_ordered_cate(c2_wo_trap, "Dataset", dataset_order)
c2_wo_trap = _set_factor_as_ordered_cate(c2_wo_trap, "Model", model_order)
c2_wo_trap = _set_factor_as_ordered_cate(c2_wo_trap, "DoF", dof_order)

In [8]:
pivot_c2_wo_trap = _pivot_c2_vlm(c2_wo_trap)
# Label the tau level with LaTeX math, matching the with-trap and CV tables,
# so all exported tables share the same header.
pivot_c2_wo_trap.index.names = ["Dataset", "Model", r"$\tau$"]
pivot_c2_wo_trap

  pivot = df.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Yaw,Yaw,L/R,L/R,Avg,Avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Acc.,F1,Acc.,F1,Acc.,F1
Dataset,Model,Tau,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
7 Scenes,Qwen-7B,15°,0.45,0.322,0.567,0.48,0.508,0.401
7 Scenes,Qwen-7B,30°,0.383,0.264,0.55,0.46,0.467,0.362
7 Scenes,Qwen-7B,45°,0.433,0.305,0.583,0.549,0.508,0.427
7 Scenes,Qwen-7B,60°,0.553,0.435,0.447,0.318,0.5,0.377
7 Scenes,Qwen-32B,15°,0.45,0.437,0.667,0.664,0.558,0.55
7 Scenes,Qwen-32B,30°,0.45,0.455,0.567,0.564,0.508,0.509
7 Scenes,Qwen-32B,45°,0.467,0.469,0.467,0.477,0.467,0.473
7 Scenes,Qwen-32B,60°,0.5,0.506,0.342,0.343,0.421,0.425
7 Scenes,Qwen-72B,15°,0.583,0.571,0.617,0.618,0.6,0.594
7 Scenes,Qwen-72B,30°,0.483,0.481,0.5,0.503,0.492,0.492


In [None]:
# print latex code and save as csv
print(pivot_c2_wo_trap.to_latex(
    position="tb",
    column_format="lll|cc|cc|cc",
    bold_rows=True,
    float_format="%.3f",
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
    escape=False,
    caption="C2 without trap.",
    label="tab:c2-wo-trap",
))

pivot_c2_wo_trap.to_csv("result/csv-tables/c2-wo-trap.csv")

\begin{table}[tb]
\caption{C2 without trap.}
\label{tab:c2-wo-trap}
\begin{tabular}{lll|cc|cc|cc}
\toprule
 &  &  & \multicolumn{2}{c}{Yaw} & \multicolumn{2}{c}{L/R} & \multicolumn{2}{c}{Avg} \\
 &  &  & Acc. & F1 & Acc. & F1 & Acc. & F1 \\
Dataset & Model & Tau &  &  &  &  &  &  \\
\midrule
\multirow[t]{16}{*}{\textbf{7 Scenes}} & \multirow[t]{4}{*}{\textbf{Qwen-7B}} & \textbf{15°} & 0.450 & 0.322 & 0.567 & 0.480 & 0.508 & 0.401 \\
\textbf{} & \textbf{} & \textbf{30°} & 0.383 & 0.264 & 0.550 & 0.460 & 0.467 & 0.362 \\
\textbf{} & \textbf{} & \textbf{45°} & 0.433 & 0.305 & 0.583 & 0.549 & 0.508 & 0.427 \\
\textbf{} & \textbf{} & \textbf{60°} & 0.553 & 0.435 & 0.447 & 0.318 & 0.500 & 0.377 \\
\cline{2-9}
\textbf{} & \multirow[t]{4}{*}{\textbf{Qwen-32B}} & \textbf{15°} & 0.450 & 0.437 & 0.667 & 0.664 & 0.558 & 0.550 \\
\textbf{} & \textbf{} & \textbf{30°} & 0.450 & 0.455 & 0.567 & 0.564 & 0.508 & 0.509 \\
\textbf{} & \textbf{} & \textbf{45°} & 0.467 & 0.469 & 0.467 & 0.477 & 0.467 & 0.473 \

### C2 with trap

In [10]:
# Load the with-trap VLM results and translate them to display labels in one step.
data_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/final-table-w-trap/obj-centered-cls")
c2_w_trap = _rename_table(_read_c2_vlm(data_dir))

In [11]:
# Make the label columns ordered categoricals that follow dataset_order /
# model_order / dof_order.
# NOTE(review): labels missing from an order list become NaN — confirm every
# label present in the data is listed.
c2_w_trap = _set_factor_as_ordered_cate(c2_w_trap, "Dataset", dataset_order)
c2_w_trap = _set_factor_as_ordered_cate(c2_w_trap, "Model", model_order)
c2_w_trap = _set_factor_as_ordered_cate(c2_w_trap, "DoF", dof_order)

In [18]:
# Build the wide table and label the third index level with LaTeX math so the
# exported table header shows the tau symbol.
pivot_c2_w_trap = _pivot_c2_vlm(c2_w_trap).rename_axis(
    index=["Dataset", "Model", r"$\tau$"]
)
pivot_c2_w_trap

  pivot = df.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Yaw,Yaw,L/R,L/R,Avg,Avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Acc.,F1,Acc.,F1,Acc.,F1
Dataset,Model,$\tau$,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
7 Scenes,Qwen-7B,15°,0.217,0.223,0.467,0.556,0.342,0.389
7 Scenes,Qwen-7B,30°,0.25,0.213,0.4,0.394,0.325,0.304
7 Scenes,Qwen-7B,45°,0.317,0.312,0.45,0.477,0.383,0.395
7 Scenes,Qwen-7B,60°,0.553,0.512,0.421,0.396,0.487,0.454
7 Scenes,Qwen-32B,15°,0.25,0.265,0.213,0.284,0.232,0.275
7 Scenes,Qwen-32B,30°,0.417,0.474,0.3,0.376,0.358,0.425
7 Scenes,Qwen-32B,45°,0.333,0.357,0.283,0.336,0.308,0.346
7 Scenes,Qwen-32B,60°,0.474,0.52,0.421,0.457,0.447,0.489
7 Scenes,Qwen-72B,15°,0.483,0.476,0.633,0.641,0.558,0.558
7 Scenes,Qwen-72B,30°,0.467,0.467,0.55,0.545,0.508,0.506


In [None]:
# print latex code and save as csv
print(pivot_c2_w_trap.to_latex(
    column_format="lll|cc|cc|cc",
    float_format="%.3f",
    multicolumn_format="c",
    escape=False,
    caption="C2 with trap option.",
    label="tab:c2-w-trap",
    position="tb",
    multicolumn=True,
    multirow=True,
    # index_names=False,
    bold_rows=True,
    # longtable=True,
))
pivot_c2_w_trap.to_csv("result/csv-tables/c2-w-trap.csv")

\begin{table}[tb]
\caption{C2 with trap option.}
\label{tab:c2-w-trap}
\begin{tabular}{lll|cc|cc|cc}
\toprule
 &  &  & \multicolumn{2}{c}{Yaw} & \multicolumn{2}{c}{L/R} & \multicolumn{2}{c}{Avg} \\
 &  &  & Acc. & F1 & Acc. & F1 & Acc. & F1 \\
Dataset & Model & $\tau$ &  &  &  &  &  &  \\
\midrule
\multirow[t]{16}{*}{\textbf{7 Scenes}} & \multirow[t]{4}{*}{\textbf{Qwen-7B}} & \textbf{15°} & 0.217 & 0.223 & 0.467 & 0.556 & 0.342 & 0.389 \\
\textbf{} & \textbf{} & \textbf{30°} & 0.250 & 0.213 & 0.400 & 0.394 & 0.325 & 0.304 \\
\textbf{} & \textbf{} & \textbf{45°} & 0.317 & 0.312 & 0.450 & 0.477 & 0.383 & 0.395 \\
\textbf{} & \textbf{} & \textbf{60°} & 0.553 & 0.512 & 0.421 & 0.396 & 0.487 & 0.454 \\
\cline{2-9}
\textbf{} & \multirow[t]{4}{*}{\textbf{Qwen-32B}} & \textbf{15°} & 0.250 & 0.265 & 0.213 & 0.284 & 0.232 & 0.275 \\
\textbf{} & \textbf{} & \textbf{30°} & 0.417 & 0.474 & 0.300 & 0.376 & 0.358 & 0.425 \\
\textbf{} & \textbf{} & \textbf{45°} & 0.333 & 0.357 & 0.283 & 0.336 & 0.308 

### C2 CV methods

In [14]:
# Load the CV-method results and translate them to display labels in one step.
data_dir = "/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/final-table-cv-methods/obj-centered-cls"
c2_cv = _rename_table(_read_c2_cv(data_dir))

In [15]:
# Make the label columns ordered categoricals that follow dataset_order /
# model_order / dof_order.
# NOTE(review): labels missing from an order list become NaN — confirm every
# label present in the data is listed.
c2_cv = _set_factor_as_ordered_cate(c2_cv, "Dataset", dataset_order)
c2_cv = _set_factor_as_ordered_cate(c2_cv, "Model", model_order)
c2_cv = _set_factor_as_ordered_cate(c2_cv, "DoF", dof_order)

In [21]:
# Build the wide table (the VLM pivot helper is reused for the CV results) and
# label the third index level with LaTeX math for the exported header.
pivot_c2_cv = _pivot_c2_vlm(c2_cv).rename_axis(
    index=["Dataset", "Model", r"$\tau$"]
)
pivot_c2_cv

  pivot = df.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Yaw,Yaw,L/R,L/R,Avg,Avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Acc.,F1,Acc.,F1,Acc.,F1
Dataset,Model,$\tau$,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
7 Scenes,SIFT,15°,0.883,0.906,0.75,0.766,0.817,0.836
7 Scenes,SIFT,30°,0.667,0.709,0.633,0.671,0.65,0.69
7 Scenes,SIFT,45°,0.517,0.584,0.567,0.643,0.542,0.613
7 Scenes,SIFT,60°,0.447,0.541,0.316,0.38,0.382,0.461
7 Scenes,LoFTR,15°,0.95,0.966,0.95,0.966,0.95,0.966
7 Scenes,LoFTR,30°,0.95,0.958,0.95,0.958,0.95,0.958
7 Scenes,LoFTR,45°,0.917,0.945,0.9,0.927,0.908,0.936
7 Scenes,LoFTR,60°,0.763,0.82,0.789,0.851,0.776,0.836
ScanNet,SIFT,15°,0.55,0.58,0.65,0.678,0.6,0.629
ScanNet,SIFT,30°,0.517,0.547,0.6,0.637,0.558,0.592


In [22]:
# print latex code and save as csv
print(pivot_c2_cv.to_latex(
    buf=None,
    column_format="lll|cc|cc|cc",
    float_format="%.3f",
    multicolumn_format="c",
    escape=False,
    caption="C2 by computer vision methods.",
    label="tab:c2-cv-method",
    position="tb",
    multicolumn=True,
    multirow=True,
    bold_rows=True,
))
pivot_c2_w_trap.to_csv("result/csv-tables/c2-cv-method.csv")

\begin{table}[tb]
\caption{C2 by computer vision methods.}
\label{tab:c2-cv-method}
\begin{tabular}{lll|cc|cc|cc}
\toprule
 &  &  & \multicolumn{2}{c}{Yaw} & \multicolumn{2}{c}{L/R} & \multicolumn{2}{c}{Avg} \\
 &  &  & Acc. & F1 & Acc. & F1 & Acc. & F1 \\
Dataset & Model & $\tau$ &  &  &  &  &  &  \\
\midrule
\multirow[t]{8}{*}{\textbf{7 Scenes}} & \multirow[t]{4}{*}{\textbf{SIFT}} & \textbf{15°} & 0.883 & 0.906 & 0.750 & 0.766 & 0.817 & 0.836 \\
\textbf{} & \textbf{} & \textbf{30°} & 0.667 & 0.709 & 0.633 & 0.671 & 0.650 & 0.690 \\
\textbf{} & \textbf{} & \textbf{45°} & 0.517 & 0.584 & 0.567 & 0.643 & 0.542 & 0.613 \\
\textbf{} & \textbf{} & \textbf{60°} & 0.447 & 0.541 & 0.316 & 0.380 & 0.382 & 0.461 \\
\cline{2-9}
\textbf{} & \multirow[t]{4}{*}{\textbf{LoFTR}} & \textbf{15°} & 0.950 & 0.966 & 0.950 & 0.966 & 0.950 & 0.966 \\
\textbf{} & \textbf{} & \textbf{30°} & 0.950 & 0.958 & 0.950 & 0.958 & 0.950 & 0.958 \\
\textbf{} & \textbf{} & \textbf{45°} & 0.917 & 0.945 & 0.900 & 0.927 & 