First, run `python create_results_dfs.py [--ensemble] [--datasets ...]`. <br>
This creates the results dataframes in `OUTPUT_ROOT` (usually `aggregated/...`).

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Locations --------------------------------------------------------------
# Directory containing the aggregated result dataframes.
OUTPUT_ROOT = "/home/space/diverse_priors/results/aggregated/"
# Directory below which figures are stored when SAVE is enabled.
base_storing_path = Path("/home/space/diverse_priors/results/plots")
SAVE = False

# --- Display names ----------------------------------------------------------
# Dataset identifier -> name used in plot titles.
ds_name_mapping = {
    "wds_imagenet-a": "ImageNet-A",
    "wds_imagenet-r": "ImageNet-R",
    "wds_imagenet_sketch": "ImageNet Sketch",
    "wds_imagenetv2": "ImageNet V2",
}
# The datasets handled by this notebook are exactly the keys of the mapping above.
ood_datasets = list(ds_name_mapping)

# Similarity-metric identifier -> name used in plot titles
# (RBF-kernel CKA variants first, in increasing sigma).
sim_metric_name_mapping = {
    **{
        f"cka_kernel_rbf_unbiased_sigma_{sigma}": f"CKA RBF {sigma}"
        for sigma in ("0.2", "0.4", "0.6", "0.8")
    },
    "cka_kernel_linear_unbiased": "CKA linear",
    "rsa_method_correlation_corr_method_pearson": "RSA pearson",
    "rsa_method_correlation_corr_method_spearman": "RSA spearman",
}

# Sampling-method identifier -> legend label.
model_selec_name_mapping = {
    "cluster_best": "Best model of each cluster",
    "cluster_random": "Random model of each cluster",
    "top-k": "Top-k models",
    "random": "Random k models",
}

# --- Column groups ----------------------------------------------------------
# Columns identifying one single-model configuration.
CONFIG_COLS_SINGLE = [
    "task",
    "mode",
    "dataset",
    "feature_normalization",
    "feature_alignment",
    "val_proportion",
    "model_ids",
    "model_source",
    "best_weight_decay",
]

# Multi-model configurations: the single-model columns without the last one
# ('best_weight_decay'), followed by the model-selection columns.
CONFIG_COLS_MULTIPLE = [
    *CONFIG_COLS_SINGLE[:-1],
    "clustering_method",
    "combiner",
    "models",
    "n_models",
    "num_clusters",
    "sampling_method",
    "similarity_method",
]

HYPER_PARAMETER_COLS = ["fewshot_k", "fewshot_lr", "fewshot_epochs", "batch_size"]

METRIC_COLS = ["test_lp_acc1", "test_lp_acc5", "test_lp_mean_per_class_recall"]

# --- Aggregation / display settings ----------------------------------------
cols_to_display = ["model_ids", "mean", "std"]

# Metric that is aggregated (first entry of METRIC_COLS) and the aggregated
# column that is colour-coded and plotted.
agg_metric = METRIC_COLS[0]
metric_col = "mean"

In [None]:
def null_or_eval(x):
    """Turn a serialized cell value back into a Python object.

    Args:
        x: The cell value to parse (presumably a string read from the results
            dataframe; other types are passed through unchanged).

    Returns:
        ``None`` if ``x`` is the string ``null`` or ``"null"``; otherwise the
        result of evaluating ``x`` as a Python expression; if that evaluation
        fails for any ordinary reason, ``x`` itself.
    """
    if x == 'null' or x == '"null"':
        return None
    try:
        # SECURITY NOTE(review): eval() executes arbitrary expressions and also
        # resolves bare identifiers (a cell containing e.g. "max" becomes the
        # builtin). Only use on trusted result files; consider
        # ast.literal_eval if the cells are known to hold plain literals.
        return eval(x)
    except Exception:
        # Not an evaluable expression (or not a string at all): keep the raw
        # value. SystemExit / KeyboardInterrupt deliberately propagate.
        return x


def convert_columns(df):
    """Parse every object-dtype column of ``df`` in place and return ``df``.

    Each cell of an object column is passed through ``null_or_eval``; any cell
    that ends up as a ``list`` is converted to a ``tuple`` so that the value is
    hashable (the columns are later used as group-by keys). Columns of other
    dtypes are left untouched.

    Unlike a check of only the first row, the list-to-tuple conversion is done
    per cell, so empty dataframes and columns whose first entry is missing are
    handled as well.

    Args:
        df: The dataframe to convert; it is modified in place.

    Returns:
        The same dataframe object.
    """

    def _parse_cell(value):
        # Parse the serialized value, then make lists hashable.
        value = null_or_eval(value)
        return tuple(value) if isinstance(value, list) else value

    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = df[col].apply(_parse_cell)
    return df


def get_best_performance_per_config(df, metric, config_cols, ascending=False):
    """Aggregate ``metric`` and keep the best hyper-parameter setting per configuration.

    The rows of ``df`` are grouped by ``config_cols + HYPER_PARAMETER_COLS``
    (missing group keys are kept) and the mean and standard deviation of
    ``metric`` are computed per group. For every distinct combination of
    ``config_cols`` only the hyper-parameter setting with the best mean is kept.

    Args:
        df: Dataframe with one row per run; must contain ``metric``,
            ``config_cols`` and the columns in ``HYPER_PARAMETER_COLS``.
        metric: Name of the column to aggregate.
        config_cols: List of columns that identify a configuration.
        ascending: If False (default), higher is better: the setting with the
            largest mean is selected and the result is sorted from best
            (largest) to worst. If True, lower is better: the smallest mean is
            selected and the result is sorted in ascending order.

    Returns:
        A dataframe with one row per configuration containing the group keys,
        ``mean``, ``std`` and a formatted ``'<metric> mean (std)'`` column,
        sorted by ``mean`` with a fresh integer index.
    """
    stats = (
        df.groupby(config_cols + HYPER_PARAMETER_COLS, dropna=False)[metric]
        .agg(['mean', 'std'])
        .reset_index()
    )

    # Row label of the best hyper-parameter setting within each configuration.
    mean_per_config = stats.groupby(config_cols, dropna=False)['mean']
    best_idx = mean_per_config.idxmin() if ascending else mean_per_config.idxmax()

    res = stats.loc[best_idx].reset_index(drop=True)
    # Built with a comprehension (not a row-wise apply) so that an empty result
    # still yields a valid, empty column.
    res[f'{metric} mean (std)'] = [
        f"{mean:.3f} ({std:.3f})" for mean, std in zip(res['mean'], res['std'])
    ]
    res = res.sort_values('mean', ascending=ascending).reset_index(drop=True)
    return res


def load_data(path):
    """Read the pickled dataframe at ``path`` and return it after ``convert_columns``."""
    return convert_columns(pd.read_pickle(path))


# Function to apply gradient color
def gradient_color(df, metric, vmin=None, vmax=None, dec=3):
    return df.style.background_gradient(axis=0,
                                        subset=[metric],
                                        cmap='coolwarm',
                                        vmin=vmin,
                                        vmax=vmax).format(precision=dec, subset=[metric])


## Fix a dataset

In [None]:
# Position in `ood_datasets` of the dataset analysed by the cells below.
# Change this value to analyse a different dataset.
dataset_idx = 0
dataset = ood_datasets[dataset_idx]
print(dataset)
if SAVE:
    # Figures for this dataset go into their own sub-directory.
    storing_path = base_storing_path / dataset
    storing_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Starts empty; the cells below append the best single model and the
# ensemble / combined-model results, and the bar plots are drawn from it.
plotting_df = pd.DataFrame()

## Single model

In [None]:
# Load the single-model results of the selected dataset.
path = os.path.join(OUTPUT_ROOT, f"{dataset}/single_model/results_imagenet1k.pkl")
df = load_data(path)

# List the available columns.
print(df.columns)

In [None]:
# Aggregate `agg_metric` (mean/std) and keep, per single-model configuration,
# the hyper-parameter setting with the highest mean; rows come back sorted by
# descending mean. NOTE: this overwrites `df` with the aggregated table, so
# re-running the cell without reloading the data is not expected to work.
df = get_best_performance_per_config(df, agg_metric, CONFIG_COLS_SINGLE)

In [None]:
# Show model ids with their mean/std, colour-coded by the mean.
gradient_color(df[cols_to_display], metric_col)

In [None]:
# The first row of `df` is taken as the best single model (assumes `df` is
# still sorted by descending mean, as returned by
# get_best_performance_per_config). Its 'mode' is replaced by a display label
# so it can be told apart from the multi-model rows in `plotting_df`.
best_single_model_name = 'Best single model'
best_single_model = df.iloc[:1].assign(mode=best_single_model_name)
plotting_df = pd.concat([plotting_df, best_single_model])

Single model: Iterating over datasets

In [None]:
# for ds in ood_datasets:
#     path = os.path.join(OUTPUT_ROOT, ds+"/single_model/results_imagenet1k.pkl")
#     df = load_data(path)
#     df = get_best_performance_per_config(df, agg_metric, CONFIG_COLS_SINGLE)
#     to_display=df[cols_to_display]
#     print(ds.upper())
#     display(gradient_color(to_display, metric_col) )
#     print()

## Multiple models: Ensemble & Combined models

In [None]:
# Ensemble results of the selected dataset.
path = os.path.join(OUTPUT_ROOT, f"{dataset}/ensemble/results_imagenet1k.pkl")
df_en = load_data(path)

# Combined-model results of the selected dataset.
path = os.path.join(OUTPUT_ROOT, f"{dataset}/combined_models/results_imagenet1k.pkl")
df_comb = load_data(path)

# Stack both result sets into one table and report its columns and shape.
df = pd.concat([df_en, df_comb])
print(df.columns, df.shape)

In [None]:
# Split by sampling method; the three groups are aggregated over different
# configuration columns in the following cells.
df_TnB = df.loc[df['sampling_method'].isin(['cluster_best', 'top-k'])].copy()  # top-k and best-of-cluster
df_R = df.loc[df['sampling_method'] == 'random'].copy()  # random k models
df_CR = df.loc[df['sampling_method'] == 'cluster_random'].copy()  # random model per cluster

In [None]:
# Top-k / best-of-cluster rows: aggregate over the full multi-model
# configuration (CONFIG_COLS_MULTIPLE) and keep the best hyper-parameters.
df_TnB = get_best_performance_per_config(df_TnB, agg_metric, CONFIG_COLS_MULTIPLE)

In [None]:
# The random selections are aggregated over a reduced set of configuration
# columns that does not include the model columns ('model_ids', 'models').
config_random = [
    'task',
    'mode',
    'dataset',
    'combiner',
    'feature_normalization',
    'val_proportion',
    'sampling_method',
    'n_models',
]
df_R = get_best_performance_per_config(df_R, agg_metric, config_random)

# For the random-model-per-cluster selection the similarity method is part of
# the configuration as well.
config_random.append('similarity_method')
df_CR = get_best_performance_per_config(df_CR, agg_metric, config_random)

In [None]:
# Put the three aggregated tables back together.
df = pd.concat([df_TnB, df_R, df_CR])
# Sanity check: rows per sampling method and number of fully duplicated rows.
df.sampling_method.value_counts(), df.duplicated().sum()

In [None]:
# Range of the displayed metric over all multi-model rows; passed to
# gradient_color so that the tables below share one colour scale.
vmin, vmax = df[metric_col].min(), df[metric_col].max()
vmin, vmax

In [None]:
## Top-k selection: mean/std per mode and number of models
tmp_df = (
    df.loc[df['sampling_method'] == 'top-k', ['mode', 'n_models', *cols_to_display[1:]]]
    .sort_values(by=['mode', 'n_models'])
)
print("TOP-K")
gradient_color(tmp_df, metric_col, vmin=vmin, vmax=vmax)

In [None]:
## Best model of each cluster: mean/std per mode, number of models and similarity metric
tmp_df = (
    df.loc[
        df['sampling_method'] == 'cluster_best',
        ['mode', 'n_models', 'similarity_method', *cols_to_display[1:]],
    ]
    .sort_values(by=['mode', 'n_models', 'similarity_method'])
)
print("Best of each cluster")
gradient_color(tmp_df, metric_col, vmin=vmin, vmax=vmax)

In [None]:
## Random model of each cluster: mean/std per mode, number of models and similarity metric
tmp_df = (
    df.loc[
        df['sampling_method'] == 'cluster_random',
        ['mode', 'n_models', 'similarity_method', *cols_to_display[1:]],
    ]
    .sort_values(by=['mode', 'n_models', 'similarity_method'])
)
print("Random model of each cluster")
gradient_color(tmp_df, metric_col, vmin=vmin, vmax=vmax)

In [None]:
## Random sampling: mean/std per mode and number of models
tmp_df = (
    df.loc[df['sampling_method'] == 'random', ['mode', 'n_models', *cols_to_display[1:]]]
    .sort_values(by=['mode', 'n_models'])
)
print('Random')
gradient_color(tmp_df, metric_col, vmin=vmin, vmax=vmax)

In [None]:
# Copy of the multi-model results with 'mode' replaced by its display label
# (values missing from the mapping become NaN).
ensemble = df.assign(
    mode=df['mode'].map({'ensemble': 'Ensemble', 'combined_models': 'Combined (Concat)'})
)
ensemble['mode'].value_counts()

In [None]:
# Append the relabelled multi-model rows to the rows collected so far.
plotting_df = pd.concat([plotting_df, ensemble])
# Sanity check: number of fully duplicated rows.
plotting_df.duplicated().sum()

Combined models: Iterating over datasets

In [None]:
# for ds in ood_datasets:
#     path = os.path.join(OUTPUT_ROOT, ds+"/combined_models/results_imagenet1k.pkl")
#     df = pd.read_pickle(path)
#     to_display=df[df['test_lp_acc1'] == df['test_lp_acc1'].max()][["model_ids", "sampling_method", "test_lp_acc1"]]
#     display(to_display)
#     print(ds, to_display["model_ids"].iloc[0])

## Bar plot

In [None]:
def create_bar_plot(one_sim_metric, single_row, grouped_data, yvmin, yvmax, metric_col, hue_order):
    """Draw a grouped bar chart of the model-selection methods next to the best single model.

    Args:
        one_sim_metric: Key of ``sim_metric_name_mapping``; used for the title only.
        single_row: One-row dataframe with the columns ``'mode'`` and ``'mean'``
            (``.item()`` is called on ``'mean'``, so exactly one row is required).
        grouped_data: Dataframe with the columns ``'n_models'``, ``metric_col``,
            ``'std'`` and ``'sampling_method_mappend'``.
        yvmin: Lower limit of the y-axis.
        yvmax: Upper limit of the y-axis.
        metric_col: Column of ``grouped_data`` plotted as bar height.
        hue_order: Labels occurring in ``'sampling_method_mappend'``; fixes the
            order and the colour of the bars within each group.

    Returns:
        The created matplotlib figure.

    Note:
        The title additionally reads the module-level names ``dataset``,
        ``ds_name_mapping`` and ``sim_metric_name_mapping``.
    """
    # Assign one tab20 colour to each hue label, in `hue_order` order.
    tab20_colors = sns.color_palette("tab20")
    palette = dict(zip(hue_order, tab20_colors[:len(hue_order)]))

    # Initialize the plot
    fig, ax = plt.subplots(figsize=(15, 6))

    # Reference bar and dotted horizontal line for the best single model
    # (no error bar is drawn for it: no `yerr` is passed).
    single_mean = single_row['mean'].item()
    ax.bar(single_row['mode'], single_row['mean'], label=f'Best Single model ({single_mean:.2f})',
           color='#33cccc', capsize=5, width=0.1)
    ax.axhline(single_mean, ls=":", c='grey', alpha=0.5)

    # Plot grouped data; error bars are added manually below
    sns.barplot(
        x='n_models',
        y=metric_col,
        hue='sampling_method_mappend',
        hue_order=hue_order,
        data=grouped_data,
        ax=ax,
        palette=palette,
        errorbar=None  # Disable the default confidence interval
    )

    n_diff_ks = np.sort(grouped_data.n_models.unique())
    n_grouped_bars = len(n_diff_ks) * len(hue_order)

    # Manually add error bars for grouped data.
    # Patch 0 is the single-model bar drawn above; the following
    # `n_grouped_bars` patches are assumed to be the seaborn bars, ordered hue
    # by hue and, within a hue, by increasing n_models — TODO confirm for the
    # installed seaborn version.
    for i, bar in enumerate(ax.patches):
        if i < 1 or i > n_grouped_bars:
            continue
        n_models = n_diff_ks[(i - 1) % len(n_diff_ks)]
        sampling_method = hue_order[(i - 1) // len(n_diff_ks)]

        # Get the corresponding std value
        subset = grouped_data[(grouped_data['n_models'] == n_models) &
                              (grouped_data['sampling_method_mappend'] == sampling_method)]
        if subset.shape[0] == 0:
            # No data for this combination: report it and stop adding error bars.
            print(n_models, sampling_method)
            break
        std = subset['std'].values[0]

        # Get the bar coordinates
        bar_x = bar.get_x() + bar.get_width() / 2
        bar_y = bar.get_height()

        # Add the error bar
        ax.errorbar(bar_x, bar_y, yerr=std, fmt='none', c='black', capsize=5)

    # Customize the plot
    ax.set_title(f'Performance comparison for {ds_name_mapping[dataset]} ({sim_metric_name_mapping[one_sim_metric]})')
    ax.set_ylabel('Mean Top-1 Test Accuracy')
    ax.set_xlabel('Nr. of models')
    ax.legend(title='Model selection method', framealpha=1, loc='center left', bbox_to_anchor=(1, 0.5))

    # Use the limits passed by the caller rather than module-level globals.
    ax.set_ylim([yvmin, yvmax])
    fig.tight_layout()
    return fig

In [None]:
# hue_order = [
#     'Ensemble: Top-k models',
#     'Ensemble: Best model of each cluster', 
#     'Ensemble: Random model of each cluster',
#     'Ensemble: Random k models', 
#     'Combined (Concat): Top-k models',
#     'Combined (Concat): Best model of each cluster',
#     'Combined (Concat): Random model of each cluster',
#     'Combined (Concat): Random k models',
#     ]

# Legend / bar order: the two modes are interleaved for every selection method.
hue_order = [
    f"{mode}: {selection}"
    for selection in (
        "Top-k models",
        "Best model of each cluster",
        "Random model of each cluster",
        "Random k models",
    )
    for mode in ("Ensemble", "Combined (Concat)")
]

In [None]:
# One figure per similarity metric.
for sim_metric in sim_metric_name_mapping:
    # Keep the rows of the current similarity metric plus the rows that have no
    # similarity metric at all (missing value). `isna()` is used so that both
    # NaN and None count as missing.
    sim_col = plotting_df['similarity_method']
    curr_plotting_df = plotting_df[sim_col.eq(sim_metric) | sim_col.isna()].copy()

    # y-axis range derived from the values shown in this figure.
    ylim_min = curr_plotting_df[metric_col].min() - 0.1
    ylim_max = curr_plotting_df[metric_col].max() + 0.02

    single_row = curr_plotting_df[curr_plotting_df['mode'] == best_single_model_name].copy()
    grouped_data = curr_plotting_df[curr_plotting_df['mode'] != best_single_model_name].copy()
    # Legend label "<mode>: <selection method>"; built with a comprehension so
    # that an empty selection still yields a valid (empty) column.
    grouped_data['sampling_method_mappend'] = [
        f"{mode}: {model_selec_name_mapping[method]}"
        for mode, method in zip(grouped_data['mode'], grouped_data['sampling_method'])
    ]

    # # TODO: Remove if combined models have also cluster_random. 
    # tmp = grouped_data[(grouped_data['sampling_method_mappend']=='Combined (Concat): Random model of each cluster')&\
    # (grouped_data['n_models']==4)].copy()
    # for n in [3, 6, 7]:
    #     tmp.loc[:,'n_models'] = n
    #     grouped_data = pd.concat([grouped_data, tmp.copy()])
    # grouped_data = grouped_data.reset_index()

    fig = create_bar_plot(sim_metric, single_row, grouped_data, ylim_min, ylim_max, metric_col, hue_order)

    # # if SAVE:
    # #     plt.savefig(storing_path / f'comparison_models_{dataset}_{sim_metric}.pdf', bbox_inches='tight')
    # #     plt.savefig(storing_path / f'comparison_models_{dataset}_{sim_metric}.png', bbox_inches='tight')
    plt.show()