## Notebook for aggregating model similarities across different datasets and similarity metrics.
This notebook gathers the model representational similarities across different datasets and similarity metrics. It stores them into a single CSV file with the following columns:
- Similarity metric
- Dataset (stored under the column name `DS`)
- Model 1
- Model 2
- Similarity value: The representational similarity value between model 1 and model 2.
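- Objective pair: Sorted tuple containing the training objective type of model 1 and model 2.
- Architecture pair: Sorted tuple containing the architecture type of model 1 and model 2.
- Dataset pair: Sorted tuple containing the dataset type of model 1 and model 2.
- Model size pair: Sorted tuple containing the model size type of model 1 and model 2.

Note: because each pair is sorted, the position of an entry inside the tuple does not indicate whether it belongs to model 1 or model 2.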

In [1]:
import numpy as np
import pandas as pd

from constants import (
    BASE_PATH_PROJECT,
    BASE_PATH_RESULTS,
    ds_list_sim_file,
    exclude_models,
    model_config_file,
    sim_metric_name_mapping
)
from helper import (
    load_all_datasetnames_n_info,
    load_model_configs_and_allowed_models,
    load_similarity_matrices,
    pp_storing_path
)
# Echo the dataset-list config path so the notebook output records which file this run used.
print(ds_list_sim_file)

../scripts/configs/webdatasets_w_insub30k.txt


#### Global variables

In [2]:
# --- Alternative configuration (disabled) -----------------------------------
# Kept for reference: the similarity folder and metric set that can be swapped
# in instead of the RBF sigma=0.2 configuration active below.
# base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities'
# sim_metrics = [
#     'cka_kernel_rbf_unbiased_sigma_0.4',
#     'cka_kernel_linear_unbiased',
#     'rsa_method_correlation_corr_method_spearman'
# ]

# --- Active configuration ---------------------------------------------------
# Folder holding the similarity matrices and the metric identifiers to load.
base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities_rbf02'
sim_metrics = ['cka_kernel_rbf_unbiased_sigma_0.2']

# Names the metric identifiers are mapped to via `sim_metric_name_mapping`.
sim_metrics_mapped = [sim_metric_name_mapping[metric] for metric in sim_metrics]

# Dataset names to aggregate over; the second return value is not needed here.
# With verbose=True the loader appears to print the dataset names and their
# count (see the cell output) -- confirm in helper.load_all_datasetnames_n_info.
ds_list, _ = load_all_datasetnames_n_info(ds_list_sim_file, verbose=True)

# Directory the aggregated CSV is written to.
# NOTE(review): the meaning of the positional `True` is defined by
# helper.pp_storing_path -- confirm there.
storing_path = pp_storing_path(BASE_PATH_RESULTS / 'aggregated/model_sims', True)

['wds_fer2013', 'wds_voc2007', 'wds_cars', 'wds_fgvc_aircraft', 'wds_stl10', 'wds_gtsrb', 'wds_country211', 'wds_vtab_caltech101', 'wds_vtab_cifar10', 'wds_vtab_cifar100', 'wds_vtab_diabetic_retinopathy', 'wds_vtab_dmlab', 'wds_vtab_dtd', 'wds_vtab_eurosat', 'wds_vtab_flowers', 'wds_vtab_pets', 'wds_vtab_pcam', 'wds_vtab_resisc45', 'wds_vtab_svhn', 'entity30', 'living17', 'nonliving26', 'imagenet-subset-30k'] 23



#### Load data

In [3]:
# Load the model configuration table together with the list of models that
# remain after applying `exclude_models`; `model_configs` is later indexed by
# model name to look up per-model categories.
model_configs, allowed_models = load_model_configs_and_allowed_models(
    path=model_config_file, exclude_models=exclude_models, exclude_alignment=True
)
# Quick sanity check on the size of the configuration table.
print(model_configs.shape)

Nr. models original=64
(64, 14)


In [4]:
# Load the similarity matrices for every configured metric and dataset,
# restricted to the allowed models. The loader returns a mapping keyed by the
# raw metric identifier.
raw_sim_mats = load_similarity_matrices(
    path=base_path_similarity_matrices,
    ds_list=ds_list,
    sim_metrics=sim_metrics,
    allowed_models=allowed_models,
)

# Re-key the mapping with the names from `sim_metric_name_mapping`; these names
# end up in the 'Similarity metric' column of the aggregated table.
sim_mats = {
    sim_metric_name_mapping[metric_key]: mats_per_ds
    for metric_key, mats_per_ds in raw_sim_mats.items()
}

#### Helper functions

In [5]:
def flatten_sim_values(sim_mat):
    """Convert a square similarity matrix into a long-format table of model pairs.

    Only the strict upper triangle (above the diagonal) is kept, so every
    unordered pair appears exactly once and self-similarities are dropped.

    Parameters
    ----------
    sim_mat : pd.DataFrame
        Similarity matrix whose index and columns hold the model names.

    Returns
    -------
    pd.DataFrame
        One row per pair with columns 'Model 1' (row label), 'Model 2'
        (column label) and 'Similarity value'.
    """
    n_models = sim_mat.shape[0]
    # k=1 starts one step above the diagonal, i.e. excludes the diagonal itself.
    row_idx, col_idx = np.triu_indices(n_models, k=1)
    return pd.DataFrame({
        'Model 1': sim_mat.index.values[row_idx],
        'Model 2': sim_mat.columns.values[col_idx],
        'Similarity value': sim_mat.values[row_idx, col_idx],
    })


def sort_tuple(tup):
    """Return the elements of `tup` as a new tuple in ascending order."""
    ordered = list(tup)
    ordered.sort()
    return tuple(ordered)


def get_cat_pair(row, cat):
    """Look up one category for both models of a pair and return it as a sorted tuple.

    Reads the notebook-level `model_configs` table, which must be indexed by
    model name.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the model names under 'Model 1' and 'Model 2'.
    cat : str
        Column of `model_configs` to read for both models.

    Returns
    -------
    tuple
        The two category values in sorted order, so the result does not depend
        on which model is listed first.
    """
    model_names = (row['Model 1'], row['Model 2'])
    categories = tuple(model_configs.loc[name, cat] for name in model_names)
    return sort_tuple(categories)


def process_sim_mat(sim_mat, metric, ds):
    """Build the long-format, annotated pair table for one similarity matrix.

    The matrix is flattened to one row per model pair, tagged with the metric
    and dataset it belongs to, and extended with one sorted category tuple per
    pair for objective, architecture, dataset and model size.

    Parameters
    ----------
    sim_mat : pd.DataFrame
        Square similarity matrix with model names as index and columns.
    metric : str
        Value written to the 'Similarity metric' column.
    ds : str
        Value written to the 'DS' column.

    Returns
    -------
    pd.DataFrame
        Columns, in order: 'Similarity metric', 'DS', 'Model 1', 'Model 2',
        'Similarity value', 'Objective pair', 'Architecture pair',
        'Dataset pair', 'Model size pair'.
    """
    pairs = flatten_sim_values(sim_mat)
    pairs['Similarity metric'] = metric
    pairs['DS'] = ds

    # Output column -> column of `model_configs` that supplies the category.
    pair_columns = {
        'Objective pair': 'objective',
        'Architecture pair': 'architecture_class',
        'Dataset pair': 'dataset_class',
        'Model size pair': 'size_class',
    }
    for out_col, config_col in pair_columns.items():
        pairs[out_col] = pairs.apply(get_cat_pair, axis=1, cat=config_col)

    # Identifier columns first, then the pair itself, then the annotations.
    column_order = [
        'Similarity metric',
        'DS',
        'Model 1',
        'Model 2',
        'Similarity value',
        *pair_columns,
    ]
    return pairs[column_order]


def get_similarity_dataframe(similarity_matrices):
    """Aggregate all similarity matrices into a single long-format DataFrame.

    Parameters
    ----------
    similarity_matrices : dict
        Nested mapping ``{similarity metric: {dataset: similarity matrix}}``.

    Returns
    -------
    pd.DataFrame
        The row-wise concatenation of `process_sim_mat` over every
        (metric, dataset) combination, with a fresh 0..n-1 row index.

    Raises
    ------
    ValueError
        Raised by `pd.concat` when the mapping contains no matrices at all.
    """
    frames = [
        process_sim_mat(curr_sim_mat, sim_metric, ds)
        for sim_metric, sim_mats_w_metric in similarity_matrices.items()
        for ds, curr_sim_mat in sim_mats_w_metric.items()
    ]
    # Each per-matrix frame carries its own 0..k-1 index; without
    # ignore_index=True the combined frame would have duplicated index labels,
    # making label-based lookups on the result ambiguous.
    return pd.concat(frames, ignore_index=True)

### Aggregate similarities across datasets and similarity metrics

In [6]:
# One row per (similarity metric, dataset, model pair), built from all loaded matrices.
sim_df = get_similarity_dataframe(sim_mats)

In [7]:
# Target file for the aggregated similarities inside `storing_path`.
fn = storing_path / 'all_metric_ds_model_pair_similarity_only_rbf02.csv'
print(f"Storing aggrgated sims at {fn}")
# index=False: the DataFrame's row index is not written to the CSV.
sim_df.to_csv(fn, index=False)

Storing aggrgated sims at /home/space/diverse_priors/results_rebuttal/aggregated/model_sims/all_metric_ds_model_pair_similarity_only_rbf02.csv
