In [12]:
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [13]:
path = (
    "/home/nils/projects/uq-method-box/experiments/experiments/exp_results/results.csv"
)

df = pd.read_csv(path)

meta_cols = [
    "base_model",
    "loss_fn",
    "ensemble",
    "ensemble_members",
    "conformalized",
    "dataset_name",
    "pred_log_dir",
    "mlp_n_outputs",
    "date",
    "seed",
]

# Preprocessing

In [14]:
# drop duplicate experiments and keep the one with the latest date
df = (
    df.sort_values("date")
    .drop_duplicates(meta_cols, keep="last")
    .reset_index(drop=True)
)
df["ensemble"] = df["ensemble"].fillna("None")  # should be handled when saving the dict
df = df[
    df["loss_fn"] != "quantile"
]  # discovered something wrong with quantile loss with huge nll so exclude at the moment

df["model_plot_name"] = (
    df["base_model"] + "_" + df["loss_fn"] + "_" + df["ensemble"]
)  # TODO come up with a unique model name for us to make easy plotting
df

Unnamed: 0,nll,crps,check,interval,rms_cal,ma_cal,miscal_area,sharp,mae,rmse,...,loss_fn,ensemble,ensemble_members,conformalized,dataset_name,pred_log_dir,mlp_n_outputs,date,seed,model_plot_name
0,14111.062909,0.41838,0.209498,4.112912,0.511126,0.439608,0.444048,0.035804,0.433516,0.728723,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:16:56,0,base_model_mse_None
1,5828.346964,0.413622,0.207087,4.089348,0.525153,0.452157,0.456724,0.03128,0.427487,0.722097,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:16:56,1,base_model_mse_None
2,213948.004062,0.408114,0.204372,3.995762,0.519845,0.449216,0.453753,0.035449,0.424086,0.727336,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:16:56,2,base_model_mse_None
3,4558.9551,0.406427,0.203539,3.97216,0.522163,0.453529,0.458111,0.037625,0.422738,0.716206,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:16:56,3,base_model_mse_None
4,1503.347772,0.419891,0.210262,4.125983,0.497393,0.425686,0.429986,0.036424,0.434988,0.728648,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:16:56,4,base_model_mse_None
5,5145.308471,0.349734,0.175122,3.438729,0.558262,0.482157,0.487027,0.030207,0.362464,0.637048,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:27:21,0,base_model_mse_None
6,3320.570725,0.351711,0.176084,3.481628,0.534173,0.458235,0.462864,0.027361,0.362982,0.639899,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:27:21,1,base_model_mse_None
7,4512.989789,0.367083,0.183779,3.633085,0.515322,0.445686,0.450188,0.02984,0.379427,0.668666,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:27:21,2,base_model_mse_None
8,8090.752776,0.343855,0.172185,3.371665,0.497412,0.430392,0.43474,0.033585,0.357298,0.632937,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:27:21,3,base_model_mse_None
9,9390.255665,0.357495,0.179049,3.479884,0.53352,0.462549,0.467221,0.036214,0.372699,0.623407,...,mse,,1,False,boston,experiments/experiments/test_swag_base_model_0...,1,2023-03-20 13:27:21,4,base_model_mse_None


# Evaluate All Models across seeds on one dataset

This is in line with the plots in the Bayesian Wilson paper that we want to reproduce.

In [4]:
def evaluate_all_models_one_ds(
    dataset_name: str, metric: str, df: pd.DataFrame
) -> None:
    """Evaluate all models on a single dataset.

    Args:
        dataset_name: name of dataset for which to plot results
        metric: name of the metric to visualize

    """
    dataset_df = df[df["dataset_name"] == dataset_name]
    dataset_df = dataset_df.drop_duplicates(
        ["base_model", "loss_fn", "ensemble", "seed"]
    ).reset_index(drop=True)
    dataset_df["model_plot_name"] = (
        dataset_df["base_model"]
        + "_"
        + dataset_df["loss_fn"]
        + "_"
        + dataset_df["ensemble"]
    )

    sns.violinplot(data=dataset_df, x="model_plot_name", y=metric)
    sns.despine(left=True)
    plt.title(f"{metric} for {dataset_name} dataset.")


evaluate_all_models_one_ds("energy", "nll", df)

ValueError: min() arg is an empty sequence

# Evaluate a single model across datasets

This might be useful to check indidivual model performance and see what is going on.

In [5]:
def evaluate_all_models_one_ds(model_name: str, metric: str, df: pd.DataFrame) -> None:
    """Evaluate all models on a single dataset.

    Args:
        dataset_name: name of dataset for which to plot results
        metric: name of the metric to visualize

    """
    dataset_df = df[df["model_plot_name"] == model_name]
    dataset_df = dataset_df.drop_duplicates(
        ["base_model", "loss_fn", "ensemble", "seed", "dataset_name"]
    ).reset_index(drop=True)

    sns.violinplot(data=dataset_df, x="dataset_name", y=metric)
    sns.despine(left=True)
    plt.title(f"{metric} for {model_name} across datasets.")


evaluate_all_models_one_ds("laplace_mse_None", "nll", df)

ValueError: min() arg is an empty sequence

# Big Tables with numbers

Here we can autogenerate summary table for different things and also convert them to latex.

In [48]:
# TODO code here