In [None]:
# IMPORTS
from pathlib import Path
from typing import Sequence

import matplotlib.pyplot as plt
import pandas as pd
from IPython.core.pylabtools import figsize
from IPython.display import display

from src import MODEL_ROOTS
from src.tools import TableFormatter
from src.utils import io

In [None]:
# JUPYTER SETUP
%load_ext autoreload
%autoreload 2

# NOTE: rcParams are optimized for dark mode, change colours to black if using light mode.
%matplotlib inline
figsize(15, 10)
plt.rcParams.update({'font.size': 18, 'text.color': 'w', 'axes.edgecolor': 'w', 'axes.labelcolor': 'w', 'xtick.color': 'w', 'ytick.color': 'w'})

In [None]:
# HELPER FUNCTIONS
def load_dfs(files: dict[str, Sequence[Path]]):
    """Load every results file and stack them into a single (Model, Item)-indexed frame.

    Each file is read with `io.load_yaml` and flattened with `pd.json_normalize`
    (presumably each file holds a list of per-item metric records — confirm against the writer).

    :param files: Mapping from model name to the results files for that model (e.g. one per seed).
    :return: DataFrame with one row per (file, item). The index is a MultiIndex with levels
        `Model` (the mapping key, repeated for every row of every file of that model) and
        `Item` (the row's index within its own file).
    :raises ValueError: If `files` contains no paths at all.
    """
    models, dfs = [], []
    for model, paths in files.items():
        for path in paths:
            models.append(f'{model}')
            dfs.append(pd.json_normalize(io.load_yaml(path)))

    if not dfs:
        raise ValueError(f'No results files to load for models: {list(files)}')

    df = pd.concat(dfs)

    # Add multi-index based on model and item number, since we don't have mean metrics.
    # The model label is repeated once per row of its own frame, so files with differing
    # numbers of items are handled (a cartesian product with the first frame's index is not).
    model_per_row = [model for model, frame in zip(models, dfs) for _ in frame.index]
    df.index = pd.MultiIndex.from_arrays([model_per_row, df.index], names=['Model', 'Item'])
    return df

In [None]:
# EVALUATION CONFIG
# Results are searched for under `root/exp/<model>/**/res_dir/fname`.
root = MODEL_ROOTS[-1]
exp, split = 'benchmark', 'val'
ckpt, mode = 'best', '*'  # {best, last}, {stereo, mono, *}
res_dir = 'results'
fname = f'syns_{split}_{ckpt}_{mode}.yaml'  # `mode='*'` leaves a glob wildcard in the filename.

# List models explicitly to restrict the evaluation; leave empty to discover every model with results.
# models = ['garg', 'monodepth2_MS']
models = []
if not models:
    fs = sorted(root.glob(f'{exp}/**/{res_dir}/{fname}'))
    # The model is the first directory below `root/exp`, matching the `{exp}/{model}/**` glob used
    # when loading. Use the full component (not `.stem`) so names containing a dot are not truncated.
    rel_parts = [f.relative_to(root / exp).parts for f in fs]
    # Require at least <model>/<res_dir>/<fname>, so results sitting directly under `exp` are skipped.
    models = sorted({parts[0] for parts in rel_parts if len(parts) > 2})

print('Evaluation Models:', models)

In [None]:
# LOAD METRICS
# We expect each model to have multiple available checkpoints, e.g. trained with different random seeds.
# This is handled by `groupby(level=0)` on the `Model` index level. We report mean performance over all seeds.
# StdDev may also be useful to check for outliers that failed to train for some reason.

eval_files = {model: sorted(root.glob(f'{exp}/{model}/**/{res_dir}/{fname}')) for model in models}
df = load_dfs(eval_files)

# Move the edge metrics (columns suffixed '-Edges') into their own frame.
# The column list is fixed up front instead of popping from `df` while iterating over its columns.
edge_cols = [k for k in df.columns if k.endswith('-Edges')]
df_edges = df[edge_cols]
df = df.drop(columns=edge_cols)

# Aggregate numeric columns only. Non-numeric columns (presumably the 'Cat' / 'Cat-Edges' labels used
# for the per-category grouping later on) cannot be averaged, and pandas >= 2.0 raises a TypeError
# for them rather than silently dropping them.
df_agg = df.select_dtypes(include='number').groupby(level=0)  # Group all metrics from different seeds for each model

df_mean = df_agg.agg('mean').reindex(models)
df_mean.columns.name = 'Mean'

df_std = df_agg.agg('std').reindex(models)
df_std.columns.name = 'StdDev'


df_edges_agg = df_edges.select_dtypes(include='number').groupby(level=0)  # Same grouping for the edge metrics

df_edges_mean = df_edges_agg.agg('mean').reindex(models)
df_edges_mean.columns.name = 'Mean'

df_edges_std = df_edges_agg.agg('std').reindex(models)
df_edges_std.columns.name = 'StdDev'

In [None]:
# SHOW DATAFRAMES
# Per-model means over seeds: full metrics first, then edge metrics.
# Swap in the commented alternatives below to inspect the raw or StdDev frames.
# display(df, df_edges)  # Might be quite large, comment out if needed.
# display(df_std, df_edges_std)
display(df_mean, df_edges_mean)

In [None]:
# LATEX TABLES WITH BEST MODEL
# One flag per column of the table. Presumably +1 marks higher-is-better and -1 lower-is-better
# for `TableFormatter` — TODO confirm against its implementation.
metrics = [
    1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    1, 1, 1, 1,
]
precision = 4
print(
    TableFormatter.from_df(df_mean, metrics=metrics).to_latex(
        precision=precision,
        caption=f'SYNS-Patches {split} performance.',
    )
)

# The edge table has five additional columns, hence five additional flags.
metrics2 = [*metrics, 1, -1, -1, -1, -1]
print(
    TableFormatter.from_df(df_edges_mean, metrics=metrics2).to_latex(
        precision=precision,
        caption=f'SYNS-Patches {split} edges performance.',
    )
)

In [None]:
# GROUP BY MODEL & CATEGORY
# The grouping keys combine the category label ('Cat' / 'Cat-Edges') with the 'Model' index level,
# so metrics from different seeds are pooled for each model AND CATEGORY.
df_agg2 = df.groupby(['Cat', 'Model'])
df_mean2 = df_agg2.agg('mean').rename_axis(columns='Mean')
df_std2 = df_agg2.agg('std').rename_axis(columns='StdDev')


# Same aggregation for the edge metrics.
df_edges_agg2 = df_edges.groupby(['Cat-Edges', 'Model'])
df_edges_mean2 = df_edges_agg2.agg('mean').rename_axis(columns='Mean')
df_edges_std2 = df_edges_agg2.agg('std').rename_axis(columns='StdDev')

In [None]:
# SHOW DATAFRAMES
# Per-(category, model) means: full metrics first, then edge metrics.
# display(df_std2, df_edges_std2)  # Swap in to inspect the spread across seeds.
display(df_mean2, df_edges_mean2)

In [None]:
# LATEX TABLES WITH BEST MODEL
# WARNING: This is quite messy when dealing with all the separate categories.
# One flag per column of the table. Presumably +1 marks higher-is-better and -1 lower-is-better
# for `TableFormatter` — TODO confirm against its implementation.
metrics = [
    1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    1, 1, 1, 1,
]
precision = 4
print(
    TableFormatter.from_df(df_mean2, metrics=metrics).to_latex(
        precision=precision,
        caption=f'SYNS-Patches {split} performance over each category.',
    )
)


# The edge table has five additional columns, hence five additional flags.
metrics2 = [*metrics, 1, -1, -1, -1, -1]
print(
    TableFormatter.from_df(df_edges_mean2, metrics=metrics2).to_latex(
        precision=precision,
        caption=f'SYNS-Patches {split} edge performance over each category.',
    )
)