In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from constants import BASE_PATH_PROJECT, BASE_PATH_RESULTS, sim_metric_name_mapping, anchor_name_mapping
from helper import load_model_configs_and_allowed_models, save_or_show, pp_storing_path, load_all_datasetnames_n_info, load_similarity_matrices

In [2]:
# Switch read by the cells below: True writes results to disk, False shows them inline.
SAVE = True

# Destination for this experiment's artefacts, resolved by the project helper.
storing_path = pp_storing_path(
    BASE_PATH_RESULTS / 'plots/experiment_bootstrap',
    SAVE,
)
storing_path




PosixPath('/home/space/diverse_priors/results_rebuttal/plots/experiment_bootstrap')

In [3]:
# Number of bootstrap seeds; dataset names are generated for seeds 0 .. max_seed - 1.
max_seed = 500

# Name pattern of the bootstrap subsets. The 10k variant is kept for reference:
# ds_list = ['imagenet-subset-10k-seed-{seed}'.format(seed=i) for i in range(max_seed)]
ds_name_template = 'imagenet-subset-30k-seed-{seed}'
ds_list = [ds_name_template.format(seed=seed_idx) for seed_idx in range(max_seed)]

In [4]:
# Folder handed to the similarity-matrix loader in the next cell.
base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities_bootstrap'

# Similarity metrics to analyse; alternatives stay commented out for quick toggling.
sim_metrics = [
    'cka_kernel_rbf_unbiased_sigma_0.2',
    # 'cka_kernel_rbf_unbiased_sigma_0.4',
    # 'cka_kernel_linear_unbiased',
    # 'rsa_method_correlation_corr_method_spearman',
]

# Restrict the analysis to the models listed in the anchor-model config file.
anchor_models_config_path = "../scripts/configs/models_config_anchor_models.json"
model_configs, allowed_models = load_model_configs_and_allowed_models(
    path=anchor_models_config_path,
    exclude_models=[],
    exclude_alignment=True,
)
allowed_models

Nr. models original=6


['OpenCLIP_RN50_openai',
 'OpenCLIP_ViT-L-14_openai',
 'dinov2-vit-large-p14',
 'simclr-rn50',
 'resnet50',
 'vit_large_patch16_224']

In [5]:
# Load the similarity matrices; later cells index the result as
# sim_mats[sim_metric][dataset_name].
sim_mats = load_similarity_matrices(
    path=base_path_similarity_matrices,
    sim_metrics=sim_metrics,
    ds_list=ds_list,
    allowed_models=allowed_models,
)

In [6]:
# Per metric, stack the per-dataset matrices along a new leading axis.
stackes_sim_mats = {
    metric_name: np.stack(list(per_dataset.values()), axis=0)
    for metric_name, per_dataset in sim_mats.items()
}

In [7]:
def prepare_sim_data(one_metric_data):
    """Flatten square matrices to their strict upper-triangular entries.

    Parameters
    ----------
    one_metric_data : np.ndarray
        Array whose last two axes hold square matrices, e.g. a stack of shape
        ``(n_matrices, n, n)`` or a single ``(n, n)`` matrix.

    Returns
    -------
    np.ndarray
        Array of shape ``(..., n * (n - 1) // 2)`` holding, for every matrix,
        the entries above the main diagonal in row-major order.
    """
    n_items = one_metric_data.shape[-1]
    rows, cols = np.triu_indices(n_items, k=1)
    # Index the trailing two axes via Ellipsis so the indexing agrees with the
    # size taken from shape[-1]; this also accepts a single 2-D matrix.
    return one_metric_data[..., rows, cols]
    

In [8]:
# Per metric, reduce every stacked matrix to its upper-triangular values.
flattened_stacked_sim_mats = {
    metric_name: prepare_sim_data(stacked_mats)
    for metric_name, stacked_mats in stackes_sim_mats.items()
}

In [9]:
# Build one "model A, model B" label per upper-triangular entry, using the
# row/column labels of the first loaded matrix as reference.
iu1 = np.triu_indices(len(allowed_models), k=1)
reference_matrix = sim_mats[sim_metrics[0]][ds_list[0]]
M1 = reference_matrix.index[iu1[0]].tolist()
M2 = reference_matrix.columns[iu1[1]].tolist()
xlbls = [
    f'{anchor_name_mapping[m1]}, {anchor_name_mapping[m2]}'
    for m1, m2 in zip(M1, M2)
]
xlbls

['OpenCLIP RN50, OpenCLIP ViT-L',
 'OpenCLIP RN50, DINOv2 ViT-L',
 'OpenCLIP RN50, SimCLR RN50',
 'OpenCLIP RN50, ResNet-50',
 'OpenCLIP RN50, ViT-L',
 'OpenCLIP ViT-L, DINOv2 ViT-L',
 'OpenCLIP ViT-L, SimCLR RN50',
 'OpenCLIP ViT-L, ResNet-50',
 'OpenCLIP ViT-L, ViT-L',
 'DINOv2 ViT-L, SimCLR RN50',
 'DINOv2 ViT-L, ResNet-50',
 'DINOv2 ViT-L, ViT-L',
 'SimCLR RN50, ResNet-50',
 'SimCLR RN50, ViT-L',
 'ResNet-50, ViT-L']

In [10]:
# Wrap each flattened array in a DataFrame whose columns are the model-pair labels.
df_sim_mats = {
    metric_name: pd.DataFrame(pair_values, columns=xlbls)
    for metric_name, pair_values in flattened_stacked_sim_mats.items()
}

In [11]:
# Number of similarity metrics available for plotting.
n = len(flattened_stacked_sim_mats)

In [12]:
# One box plot panel per similarity metric, showing the spread of each
# model-pair similarity over the bootstrap repetitions.
n = len(flattened_stacked_sim_mats)

fontsize = 14

fig, axes = plt.subplots(nrows=n, ncols=1, figsize=(8, n*7), sharex=True)

# plt.subplots returns a bare Axes for n == 1 and an array otherwise; normalise
# to a 1-D array instead of catching the indexing error with a bare `except:`.
axes_flat = np.atleast_1d(axes)

for ax, (sim_metric, data) in zip(axes_flat, flattened_stacked_sim_mats.items()):
    sns.boxplot(data, ax=ax)
    # Fix the tick positions before labelling them; calling set_xticklabels
    # alone triggers a matplotlib warning.
    ax.set_xticks(range(len(xlbls)))
    ax.set_xticklabels(xlbls, rotation=90, ha='right', fontsize=fontsize)
    ax.set_title(sim_metric_name_mapping[sim_metric], fontsize=fontsize+1)
    # Style the y tick labels on every panel, not only on the current axes.
    ax.tick_params(axis='y', labelsize=fontsize)

# NOTE(review): the original `plt.ylim` call only affected the current (last)
# axes; that behaviour is kept so panels of other metrics are not clipped.
# Confirm whether the limits should apply to all panels.
axes_flat[-1].set_ylim([0.25, 0.93])
fig.tight_layout()

included_sims = "_".join(flattened_stacked_sim_mats.keys())

# NOTE(review): "500_reps" in the file name is hard-coded; keep it in sync
# with the number of bootstrap datasets actually loaded.
save_or_show(fig, storing_path / f'distr_boxplot_cka_values_500_reps_{included_sims}_v3.pdf', SAVE)

  ax.set_xticklabels(xlbls, rotation=90, ha='right',  fontsize=fontsize);


stored img at /home/space/diverse_priors/results_rebuttal/plots/experiment_bootstrap/distr_boxplot_cka_values_500_reps_cka_kernel_rbf_unbiased_sigma_0.2_v3.pdf.


In [None]:
# Summary statistics per model pair, one table per similarity metric.
quantile_levels = [0.025, 0.05, 0.25, 0.5, 0.75, 0.95, 0.975]

for sim_metric, df in df_sim_mats.items():
    df_stats = df.describe(percentiles=quantile_levels).T
    # Width of the central 95% interval (97.5th minus 2.5th percentile).
    df_stats['CI_95'] = df_stats['97.5%'] - df_stats['2.5%']

    if not SAVE:
        print(sim_metric_name_mapping[sim_metric])
        display(df_stats)
        continue

    csv_path = storing_path / f'distr_stats_{sim_metric}.csv'
    df_stats.to_csv(csv_path)
    print(csv_path)