In [None]:
# IMPORTS
from pathlib import Path
from typing import Sequence

import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.pylabtools import figsize
from IPython.display import display

from src import MODEL_ROOTS
from src.tools import TableFormatter
from src.utils import io

In [None]:
# JUPYTER SETUP
%load_ext autoreload
%autoreload 2

# NOTE: rcParams are optimized for dark mode, change colours to black if using light mode.
%matplotlib inline
figsize(15, 10)
plt.rcParams.update({'font.size': 18, 'text.color': 'w', 'axes.edgecolor': 'w', 'axes.labelcolor': 'w', 'xtick.color': 'w', 'ytick.color': 'w'})

In [None]:
# HELPER FUNCTIONS
def load_dfs(files: dict[str, Sequence[Path]]):
    df = pd.json_normalize([io.load_yaml(f) for fs in files.values() for f in fs])
    df.index = [f'{k}' for k, fs in files.items() for i, _ in enumerate(fs)]
    return df

In [None]:
# EVALUATION CONFIG
# Result files are searched under `root/exp/<model>/**/res_dir/fname`.
root = MODEL_ROOTS[-1]
exp, split = 'benchmark', 'eigen_benchmark'
ckpt, mode = 'best', '*'  # {best, last}, {stereo, mono, *}
res_dir = 'results'
fname = f'kitti_{split}_{ckpt}_{mode}.yaml'  # `mode='*'` acts as a glob wildcard.

# List models explicitly to restrict the evaluation, e.g.
# models = ['garg', 'monodepth2_MS']
# Leave the list empty to auto-discover every model with a matching results file.
models = []
if not models:
    fs = sorted(root.glob(f'{exp}/**/{res_dir}/{fname}'))
    # A model is the first directory below `exp`, which is exactly what later `{exp}/{model}/**` globs expect.
    # Using the path component (rather than `parents[2].stem`) keeps dots in model names intact
    # and does not assume a fixed directory depth between the model and `res_dir`.
    models = sorted({f.relative_to(root / exp).parts[0] for f in fs})

print('Evaluation Models:', models)

In [None]:
# LOAD METRICS
# Each model may have several result files, e.g. one per checkpoint trained with a different random seed.
# All rows of a model share the same index label, so `df.groupby(level=0)` gathers them into one group.
# The mean over seeds is the reported performance; the standard deviation helps to spot
# outlier runs that failed to train for some reason.

eval_files = {
    model: sorted(root.glob(f'{exp}/{model}/**/{res_dir}/{fname}'))
    for model in models
}
df = load_dfs(eval_files)
df_agg = df.groupby(level=0)  # One group per model, holding the metrics from all of its seeds.

# `reindex(models)` restores the order of `models` after the group-by.
df_mean = df_agg.mean().reindex(models)
df_mean.columns.name = 'Mean'

df_std = df_agg.std().reindex(models)
df_std.columns.name = 'StdDev'

In [None]:
# SHOW DATAFRAMES
# The first frame (`df`, one row per results file) might be quite large; drop it from the call if needed.
display(df, df_mean, df_std)

In [None]:
# LATEX TABLES WITH BEST MODEL

# One entry per metric column, passed to `TableFormatter` as `metrics`.
# NOTE(review): presumably +1 marks "higher is better" and -1 "lower is better" - confirm against TableFormatter.
if split == 'eigen':
    metrics = [1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1]
    precision = 4
else:
    metrics = [1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1]
    precision = 2

print(
    TableFormatter.from_df(df_mean, metrics=metrics).to_latex(
        precision=precision,
        caption=f'Kitti {split} performance.',
    )
)