In [77]:
import os
import json

from pathlib import Path
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

In [78]:
os.chdir("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs")

[ ] introduce dataset stat table, and calculate weighted average

In [79]:
col_name_map = {
    "dataset": "Dataset",
    "model": "Model",
    "strategy": "Strategy",
    "split": "DoF",

    "accuracy": "Acc.",
    "f1_score": "F1",
}

useful_map = {
    "gpt-4o": "GPT-4o",
    "Qwen2.5-VL-7B-Instruct": "Qwen-7B",
    "Qwen2.5-VL-32B-Instruct": "Qwen-32B",
    "Qwen2.5-VL-72B-Instruct": "Qwen-72B",

    "sift": "SIFT",
    "loftr": "LoFTR",

    "7-scenes": "7 Scenes",
    "scannet": "ScanNet",
    "scannetpp": "ScanNet++",

    "theta": "Pitch",
    "phi": "Yaw",
    "psi": "Roll",
    "tx": "L/R",
    "ty": "U/D",
    "tz": "F/B",

    # "dataset-prior-hint": "w/ dataset prior",
    # "CoT-hint": "w/ dataset prior + CoT",
    # "VoT-hint": "w/ dataset prior + VoT",

    "zero-shot": "ZS",
    "dataset-prior-hint": "w/ DP",
    "CoT-hint": "DP+CoT",
    "VoT-hint": "DP+VoT",

    np.nan: "—", 
    None: "—",
}


In [80]:
dataset_order = ["7 Scenes", "ScanNet", "ScanNet++"]
model_order = ["SIFT", "LoFTR", "Qwen-7B", "Qwen-32B", "Qwen-72B", "GPT-4o"]
dof_order = ["Pitch", "Yaw", "Roll", "U/D", "L/R", "F/B"]
# strategy_order = ["zero-shot", "w/ dataset prior", "w/ dataset prior + CoT", "w/ dataset prior + VoT"]
strategy_order = ["ZS", "w/ DP", "DP+CoT", "DP+VoT"]

### Useful functions for making table

In [81]:
def _read_c1_vlm(data_dir: str) -> pd.DataFrame:
    data_dir = Path(data_dir)
    data = []
    for dataset in data_dir.iterdir():
        if dataset.is_dir():
            for model in dataset.iterdir():
                if model.is_dir():
                    for stratrgy in model.iterdir():
                        if stratrgy.is_dir():
                            for split in stratrgy.iterdir():
                                if split.is_dir():
                                    try: 
                                        metrics_path = split / "metrics" / "metrics.json"
                                        with open(metrics_path, "r") as f:
                                            metrics = json.load(f)
                                        
                                        data.append({
                                            "dataset": dataset.name,
                                            "model": model.name,
                                            "strategy": stratrgy.name,
                                            "split": split.name,
                                            "accuracy": metrics.get("accuracy", None),
                                            "f1_score": metrics.get("f1_score", None),
                                            "precision": metrics.get("precision", None),
                                            "recall": metrics.get("recall", None)
                                        })
                                    
                                    except Exception as e:
                                        print(f"Error processing data: {e}, dataset: {dataset}, model: {model}, strategy: {stratrgy}, split: {split}")

    df = pd.DataFrame(data)
    return df


def _read_c1_cv(data_dir: str) -> pd.DataFrame:
    data_dir = Path(data_dir)
    data = []
    for dataset in data_dir.iterdir():
        if dataset.is_dir():
            for model in dataset.iterdir():
                if model.is_dir():
                    for split in model.iterdir():
                        if split.is_dir():

                            try: 
                                metrics_path = split / "metrics" / "metrics.json"
                                with open(metrics_path, "r") as f:
                                    metrics = json.load(f)
                                
                                data.append({
                                    "dataset": dataset.name,
                                    "model": model.name,
                                    "strategy": None,
                                    "split": split.name,
                                    "accuracy": metrics.get("accuracy", None),
                                    "f1_score": metrics.get("f1_score", None),
                                    "precision": metrics.get("precision", None),
                                    "recall": metrics.get("recall", None)
                                })
                            
                            except Exception as e:
                                print(f"Error processing data: {e}, dataset: {dataset}, model: {model}, split: {split}")
                                
    return pd.DataFrame(data)


def _rename_table(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns=col_name_map)
    df = df.replace(useful_map)
    return df


def _set_factor_as_ordered_cate(df: pd.DataFrame, col: str, order: list) -> pd.DataFrame:
    """
    Set a column as ordered categorical with a specific order.
    """
    df[col] = pd.Categorical(df[col], categories=order, ordered=True)
    return df


def _weighted_avg(row, df: pd.DataFrame, counting_table: pd.DataFrame):
    acc_cols = [col for col in df.columns if col[1] == 'Acc.']
    f1_cols = [col for col in df.columns if col[1] == 'F1']

    # get the row index to see which data
    weights = counting_table.loc[row.name[0], :]

    # calculate the weighted average
    acc_weighted_avg = sum(row[col] * weights.get(col[0], 1) for col in acc_cols) / sum(weights.get(col[0], 1) for col in acc_cols)
    f1_weighted_avg = sum(row[col] * weights.get(col[0], 1) for col in f1_cols) / sum(weights.get(col[0], 1) for col in f1_cols)
    
    return pd.Series({
        ('Avg', 'Acc.'): acc_weighted_avg,
        ('Avg', 'F1'): f1_weighted_avg
    })

    # df[('Avg', 'Acc.')] = sum(df[col] * counting_table.get(col[0], 1) for col in acc_cols) / sum(counting_table.get(col[0], 1) for col in acc_cols)
    # df[('Avg', 'F1')] = sum(df[col] * counting_table.get(col[0], 1) for col in f1_cols) / sum(counting_table.get(col[0], 1) for col in f1_cols)


def _cal_avg_of_acc_f1(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """
    Calculate the average of accuracy and F1 score for each group.
    """
    count = kwargs.get("count")

    if count is not None: # count as weigt for each col
        avg_cols = df.apply(lambda row: _weighted_avg(row, df, count), axis=1)
        df = pd.concat([df, avg_cols], axis=1)

    else:
        acc_cols = [col for col in df.columns if col[1] == 'Acc.']
        f1_cols = [col for col in df.columns if col[1] == 'F1']

        df[('Avg', 'Acc.')] = df[acc_cols].mean(axis=1)
        df[('Avg', 'F1')] = df[f1_cols].mean(axis=1)

    return df

# C1 without trap option

In [82]:
c1_without_trap = _read_c1_vlm("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/final-table-wo-trap/single-dof-cls")
c1_without_trap["is_trap"] = False
c1_without_trap["is_shuffle"] = True
c1_without_trap = _rename_table(c1_without_trap)

c1_without_trap = _set_factor_as_ordered_cate(c1_without_trap, "Dataset", dataset_order)
c1_without_trap = _set_factor_as_ordered_cate(c1_without_trap, "Model", model_order)
c1_without_trap = _set_factor_as_ordered_cate(c1_without_trap, "DoF", dof_order)
c1_without_trap = _set_factor_as_ordered_cate(c1_without_trap, "Strategy", strategy_order)

In [83]:
counting_table = pd.read_csv("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/csv-tables/c1-counting-table.csv")

In [84]:
counting_table.set_index(["Dataset"], inplace=True)
counting_table

Unnamed: 0_level_0,Pitch,Yaw,Roll,U/D,L/R,F/B
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
7 Scenes,58.0,85.0,87.0,32.0,10.0,18.0
ScanNet,122.0,143.0,34.0,48.0,15.0,19.0
ScanNet++,146.0,262.0,2.0,63.0,4.0,16.0


In [85]:
pivot = c1_without_trap.pivot_table(
    index=["Dataset", "Model", "Strategy"],
    columns="DoF",
    values=["Acc.", "F1"],
)
swap = pivot.swaplevel(0, 1, axis=1).sort_index(axis=1)
swap.columns.names = [None, None]
swap = _cal_avg_of_acc_f1(swap, count=counting_table).round(3)
# swap = _cal_avg_of_acc_f1(swap).round(3)

swap.to_csv("result/csv-tables/c1-wo-trap.csv")
swap

  pivot = c1_without_trap.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Pitch,Pitch,Yaw,Yaw,Roll,Roll,U/D,U/D,L/R,L/R,F/B,F/B,Avg,Avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1
Dataset,Model,Strategy,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
7 Scenes,Qwen-7B,ZS,0.655,0.672,0.55,0.498,0.45,0.5,0.7,0.709,0.594,0.49,0.667,0.622,0.566,0.564
7 Scenes,Qwen-7B,w/ DP,0.741,0.659,0.55,0.512,0.567,0.567,0.5,0.475,0.531,0.528,0.778,0.681,0.601,0.565
7 Scenes,Qwen-7B,DP+CoT,0.31,0.168,0.617,0.534,0.4,0.355,1.0,1.0,0.438,0.299,0.5,0.533,0.519,0.45
7 Scenes,Qwen-7B,DP+VoT,0.31,0.168,0.45,0.449,0.583,0.471,0.6,0.6,0.469,0.473,0.444,0.472,0.479,0.418
7 Scenes,Qwen-32B,ZS,0.879,0.878,0.817,0.814,0.45,0.376,0.9,0.903,0.719,0.722,0.444,0.479,0.702,0.681
7 Scenes,Qwen-32B,w/ DP,0.862,0.852,0.783,0.776,0.483,0.473,0.7,0.709,0.812,0.804,0.444,0.491,0.68,0.676
7 Scenes,Qwen-32B,DP+CoT,0.741,0.718,0.833,0.833,0.467,0.485,0.4,0.4,0.719,0.717,0.5,0.519,0.632,0.634
7 Scenes,Qwen-32B,DP+VoT,0.741,0.718,0.767,0.766,0.55,0.549,0.7,0.709,0.688,0.688,0.667,0.622,0.68,0.673
7 Scenes,Qwen-72B,ZS,0.845,0.883,0.817,0.816,0.583,0.637,0.9,0.903,0.875,0.887,0.722,0.652,0.758,0.777
7 Scenes,Qwen-72B,w/ DP,0.793,0.795,0.85,0.85,0.583,0.561,0.7,0.709,0.781,0.782,0.722,0.652,0.732,0.722


In [86]:
# print latex code
print(swap.to_latex(
    column_format="lll|cc|cc|cc|cc|cc|cc|cc",
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
    float_format="%.3f",
    escape=False,
    label="tab:c1-without-trap",
    caption="C1 without trap.",
    position="tb",
    bold_rows=True,
))

\begin{table}[tb]
\caption{C1 without trap.}
\label{tab:c1-without-trap}
\begin{tabular}{lll|cc|cc|cc|cc|cc|cc|cc}
\toprule
 &  &  & \multicolumn{2}{c}{Pitch} & \multicolumn{2}{c}{Yaw} & \multicolumn{2}{c}{Roll} & \multicolumn{2}{c}{U/D} & \multicolumn{2}{c}{L/R} & \multicolumn{2}{c}{F/B} & \multicolumn{2}{c}{Avg} \\
 &  &  & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 \\
Dataset & Model & Strategy &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{16}{*}{\textbf{7 Scenes}} & \multirow[t]{4}{*}{\textbf{Qwen-7B}} & \textbf{ZS} & 0.655 & 0.672 & 0.550 & 0.498 & 0.450 & 0.500 & 0.700 & 0.709 & 0.594 & 0.490 & 0.667 & 0.622 & 0.566 & 0.564 \\
\textbf{} & \textbf{} & \textbf{w/ DP} & 0.741 & 0.659 & 0.550 & 0.512 & 0.567 & 0.567 & 0.500 & 0.475 & 0.531 & 0.528 & 0.778 & 0.681 & 0.601 & 0.565 \\
\textbf{} & \textbf{} & \textbf{DP+CoT} & 0.310 & 0.168 & 0.617 & 0.534 & 0.400 & 0.355 & 1.000 & 1.000 & 0.438 & 0.299 & 0.500 & 0.533 & 0.519 &

# C1 with trap option

In [87]:
data_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/final-table-w-trap/single-dof-cls")
c1_with_trap = _read_c1_vlm(data_dir)
c1_with_trap["is_trap"] = True
c1_with_trap["is_shuffle"] = True
c1_with_trap = _rename_table(c1_with_trap)

c1_with_trap = _set_factor_as_ordered_cate(c1_with_trap, "Dataset", dataset_order)
c1_with_trap = _set_factor_as_ordered_cate(c1_with_trap, "Model", model_order)
c1_with_trap = _set_factor_as_ordered_cate(c1_with_trap, "DoF", dof_order)
c1_with_trap = _set_factor_as_ordered_cate(c1_with_trap, "Strategy", strategy_order)

In [88]:
pivot = c1_with_trap.pivot_table(
    index=["Dataset", "Model", "Strategy"],
    columns="DoF",
    values=["Acc.", "F1"],
)
swap = pivot.swaplevel(0, 1, axis=1).sort_index(axis=1)
swap.columns.names = [None, None]
swap = _cal_avg_of_acc_f1(swap, count=counting_table).round(3)

swap.to_csv("result/csv-tables/c1-w-trap.csv")
swap

  pivot = c1_with_trap.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Pitch,Pitch,Yaw,Yaw,Roll,Roll,U/D,U/D,L/R,L/R,F/B,F/B,Avg,Avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1
Dataset,Model,Strategy,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
7 Scenes,Qwen-7B,ZS,0.052,0.092,0.2,0.285,0.0,0.0,0.1,0.175,0.25,0.317,0.111,0.183,0.096,0.144
7 Scenes,Qwen-7B,w/ DP,0.655,0.566,0.5,0.48,0.333,0.42,0.5,0.475,0.594,0.583,0.556,0.556,0.488,0.487
7 Scenes,Qwen-7B,DP+CoT,0.31,0.218,0.517,0.412,0.417,0.418,0.8,0.8,0.344,0.25,0.556,0.593,0.473,0.423
7 Scenes,Qwen-7B,DP+VoT,0.397,0.342,0.5,0.402,0.483,0.425,0.3,0.323,0.344,0.313,0.611,0.649,0.454,0.401
7 Scenes,Qwen-32B,ZS,0.207,0.325,0.578,0.687,0.1,0.151,0.4,0.467,0.531,0.663,0.278,0.36,0.321,0.409
7 Scenes,Qwen-32B,w/ DP,0.741,0.802,0.717,0.73,0.367,0.489,0.7,0.755,0.562,0.63,0.667,0.733,0.606,0.672
7 Scenes,Qwen-32B,DP+CoT,0.672,0.734,0.6,0.635,0.3,0.398,0.4,0.43,0.531,0.582,0.556,0.594,0.497,0.557
7 Scenes,Qwen-32B,DP+VoT,0.655,0.717,0.483,0.523,0.317,0.422,0.6,0.681,0.562,0.633,0.5,0.582,0.484,0.557
7 Scenes,Qwen-72B,ZS,0.552,0.696,0.85,0.864,0.267,0.344,0.8,0.886,0.812,0.826,0.556,0.672,0.59,0.664
7 Scenes,Qwen-72B,w/ DP,0.81,0.827,0.85,0.85,0.467,0.47,0.8,0.8,0.75,0.765,0.722,0.652,0.71,0.711


In [89]:
# print latex code
print(swap.to_latex(
    column_format="lll|cc|cc|cc|cc|cc|cc|cc",
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
    float_format="%.3f",
    escape=False,
    label="tab:c1-with-trap",
    caption="C1 with trap.",
    bold_rows=True,
    position="tb",
))

\begin{table}[tb]
\caption{C1 with trap.}
\label{tab:c1-with-trap}
\begin{tabular}{lll|cc|cc|cc|cc|cc|cc|cc}
\toprule
 &  &  & \multicolumn{2}{c}{Pitch} & \multicolumn{2}{c}{Yaw} & \multicolumn{2}{c}{Roll} & \multicolumn{2}{c}{U/D} & \multicolumn{2}{c}{L/R} & \multicolumn{2}{c}{F/B} & \multicolumn{2}{c}{Avg} \\
 &  &  & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 \\
Dataset & Model & Strategy &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{16}{*}{\textbf{7 Scenes}} & \multirow[t]{4}{*}{\textbf{Qwen-7B}} & \textbf{ZS} & 0.052 & 0.092 & 0.200 & 0.285 & 0.000 & 0.000 & 0.100 & 0.175 & 0.250 & 0.317 & 0.111 & 0.183 & 0.096 & 0.144 \\
\textbf{} & \textbf{} & \textbf{w/ DP} & 0.655 & 0.566 & 0.500 & 0.480 & 0.333 & 0.420 & 0.500 & 0.475 & 0.594 & 0.583 & 0.556 & 0.556 & 0.488 & 0.487 \\
\textbf{} & \textbf{} & \textbf{DP+CoT} & 0.310 & 0.218 & 0.517 & 0.412 & 0.417 & 0.418 & 0.800 & 0.800 & 0.344 & 0.250 & 0.556 & 0.593 & 0.473 & 0.423

# C1 cv methods

In [90]:
data_dir = Path("/home/u5u/kdeng.u5u/spatial-reasoning-of-LMs/result/final-table-cv-methods/single-dof-cls")
c1_cv_method = _read_c1_cv(data_dir)
c1_cv_method = _rename_table(c1_cv_method)

c1_cv_method = _set_factor_as_ordered_cate(c1_cv_method, "Dataset", dataset_order)
c1_cv_method = _set_factor_as_ordered_cate(c1_cv_method, "Model", model_order)
c1_cv_method = _set_factor_as_ordered_cate(c1_cv_method, "DoF", dof_order)

### Save c1 cv methods results

In [91]:
pivot = c1_cv_method.pivot_table(
    index=["Dataset", "Model"],
    columns="DoF",
    values=["Acc.", "F1"],
)
swap = pivot.swaplevel(0, 1, axis=1).sort_index(axis=1)
swap.columns.names = [None, None] # for latex use
swap = _cal_avg_of_acc_f1(swap, count=counting_table).round(3)

swap.to_csv("result/csv-tables/c1-cv-method.csv")
swap

  pivot = c1_cv_method.pivot_table(


Unnamed: 0_level_0,Unnamed: 1_level_0,Pitch,Pitch,Yaw,Yaw,Roll,Roll,U/D,U/D,L/R,L/R,F/B,F/B,Avg,Avg
Unnamed: 0_level_1,Unnamed: 1_level_1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1,Acc.,F1
Dataset,Model,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
7 Scenes,SIFT,1.0,1.0,1.0,1.0,0.983,0.983,0.8,0.84,0.875,0.873,1.0,1.0,0.969,0.973
7 Scenes,LoFTR,1.0,1.0,0.983,0.983,0.983,0.983,0.8,0.8,0.969,0.969,1.0,1.0,0.967,0.967
ScanNet,SIFT,0.833,0.847,0.9,0.901,0.912,0.94,0.667,0.664,0.688,0.701,1.0,1.0,0.847,0.854
ScanNet,LoFTR,0.967,0.967,1.0,1.0,0.971,0.97,0.667,0.678,0.854,0.865,1.0,1.0,0.939,0.941


In [92]:
# print latex code
print(swap.to_latex(
    column_format="lll|cc|cc|cc|cc|cc|cc|cc",
    multirow=True,
    multicolumn=True,
    multicolumn_format="c",
    float_format="%.3f",
    escape=False,
    label="tab:c1-cv-method",
    caption="C1 by computer vision methods.",
    bold_rows=True,
    position="tb",
))

\begin{table}[tb]
\caption{C1 by computer vision methods.}
\label{tab:c1-cv-method}
\begin{tabular}{lll|cc|cc|cc|cc|cc|cc|cc}
\toprule
 &  & \multicolumn{2}{c}{Pitch} & \multicolumn{2}{c}{Yaw} & \multicolumn{2}{c}{Roll} & \multicolumn{2}{c}{U/D} & \multicolumn{2}{c}{L/R} & \multicolumn{2}{c}{F/B} & \multicolumn{2}{c}{Avg} \\
 &  & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 & Acc. & F1 \\
Dataset & Model &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\midrule
\multirow[t]{2}{*}{\textbf{7 Scenes}} & \textbf{SIFT} & 1.000 & 1.000 & 1.000 & 1.000 & 0.983 & 0.983 & 0.800 & 0.840 & 0.875 & 0.873 & 1.000 & 1.000 & 0.969 & 0.973 \\
\textbf{} & \textbf{LoFTR} & 1.000 & 1.000 & 0.983 & 0.983 & 0.983 & 0.983 & 0.800 & 0.800 & 0.969 & 0.969 & 1.000 & 1.000 & 0.967 & 0.967 \\
\cline{1-16}
\multirow[t]{2}{*}{\textbf{ScanNet}} & \textbf{SIFT} & 0.833 & 0.847 & 0.900 & 0.901 & 0.912 & 0.940 & 0.667 & 0.664 & 0.688 & 0.701 & 1.000 & 1.000 & 0.847 & 0.854 \\
\textbf{} & \textbf{