## Aggregating model similarities across different datasets and similarity matrices

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

from constants import exclude_models, model_config_file
from helper import load_model_configs_and_allowed_models, load_similarity_matrices

sys.path.append('..')
from scripts.helper import parse_datasets
from constants import sim_metric_name_mapping

In [2]:
# Directory passed to `load_similarity_matrices` as the location of the stored matrices.
base_path_similarity_matrices = Path('/home/space/diverse_priors/model_similarities')

# Raw identifiers of the similarity metrics to aggregate.
sim_metrics = [
    'cka_kernel_rbf_unbiased_sigma_0.4',
    'cka_kernel_linear_unbiased',
    'rsa_method_correlation_corr_method_spearman'
]
# Names the raw identifiers are mapped to via `sim_metric_name_mapping`.
sim_metrics_mapped = [sim_metric_name_mapping[metric] for metric in sim_metrics]

# Dataset names with every '/' replaced by '_'
# (presumably so they match on-disk folder names -- TODO confirm).
ds_list = [
    ds_name.replace('/', '_')
    for ds_name in parse_datasets('../scripts/webdatasets_w_insub10k.txt')
]
print(ds_list)

# Output directory for the aggregated table; created if it does not exist yet.
storing_path = Path('/home/space/diverse_priors/results/aggregated/model_sims')
storing_path.mkdir(parents=True, exist_ok=True)

['wds_fer2013', 'wds_voc2007', 'wds_cars', 'wds_fgvc_aircraft', 'wds_stl10', 'wds_gtsrb', 'wds_country211', 'wds_vtab_caltech101', 'wds_vtab_cifar10', 'wds_vtab_cifar100', 'wds_vtab_diabetic_retinopathy', 'wds_vtab_dmlab', 'wds_vtab_dtd', 'wds_vtab_eurosat', 'wds_vtab_flowers', 'wds_vtab_pets', 'wds_vtab_pcam', 'wds_vtab_resisc45', 'wds_vtab_svhn', 'entity30', 'living17', 'nonliving26', 'imagenet-subset-10k']


In [3]:
# Load the model configuration table and the list of models to keep.
# `model_config_file` and `exclude_models` are imported from the project's `constants` module.
# NOTE(review): the exact effect of `exclude_alignment=True` is defined in `helper` -- confirm there.
model_configs, allowed_models = load_model_configs_and_allowed_models(
    path=model_config_file,
    exclude_models=exclude_models,
    exclude_alignment=True,
)
# Print the (rows, columns) shape of the loaded configuration table as a quick sanity check.
print(model_configs.shape)

Nr. models original=64
(64, 14)


In [4]:
# Load the similarity matrices for the selected datasets, metrics and models.
sim_mats = load_similarity_matrices(
    path=base_path_similarity_matrices,
    ds_list=ds_list,
    sim_metrics=sim_metrics,
    allowed_models=allowed_models,
)
# Re-key the top-level dict from raw metric identifiers to their mapped names.
sim_mats = {
    sim_metric_name_mapping[raw_metric]: per_metric_mats
    for raw_metric, per_metric_mats in sim_mats.items()
}

In [5]:
def flatten_sim_values(sim_mat):
    """Return the strictly-upper-triangular entries of a similarity matrix as rows.

    Args:
        sim_mat: DataFrame whose index and columns carry the model names.
            The triangle indices are derived from the row count only, so the
            matrix is expected to be square.

    Returns:
        DataFrame with the columns 'Model 1' (row label), 'Model 2'
        (column label) and 'Similarity value', one row per entry above
        the main diagonal.
    """
    n_models = sim_mat.shape[0]
    # k=1 skips the diagonal, so each unordered pair is listed exactly once.
    row_pos, col_pos = np.triu_indices(n_models, k=1)

    row_labels = sim_mat.index.values
    col_labels = sim_mat.columns.values
    values = sim_mat.values

    columns = {
        'Model 1': row_labels[row_pos],
        'Model 2': col_labels[col_pos],
        'Similarity value': values[row_pos, col_pos],
    }
    return pd.DataFrame(columns)


def sort_tuple(tup):
    """Return the elements of ``tup`` as a tuple in ascending order."""
    ordered = list(tup)
    ordered.sort()
    return tuple(ordered)


def get_cat_pair(row, cat):
    """Look up one config attribute for both models of a pair.

    Args:
        row: Mapping-like row providing the keys 'Model 1' and 'Model 2'.
        cat: Column of the module-level ``model_configs`` table to read.

    Returns:
        Tuple with the two looked-up values in sorted order, so the result
        does not depend on which model is listed first.
    """
    model_names = (row['Model 1'], row['Model 2'])
    values = [model_configs.loc[name, cat] for name in model_names]
    return sort_tuple(values)


def process_sim_mat(sim_mat, metric, ds):
    """Turn one similarity matrix into an annotated long-format table.

    Args:
        sim_mat: Similarity matrix handed to ``flatten_sim_values``.
        metric: Value stored in the 'Similarity metric' column of every row.
        ds: Value stored in the 'DS' column of every row.

    Returns:
        DataFrame with the columns 'Similarity metric', 'DS', 'Model 1',
        'Model 2', 'Similarity value', followed by the four pair columns
        ('Objective pair', 'Architecture pair', 'Dataset pair',
        'Model size pair') produced by ``get_cat_pair``.
    """
    pairs = flatten_sim_values(sim_mat)
    pairs['Similarity metric'] = metric
    pairs['DS'] = ds

    # Output column -> attribute of the model config table it is derived from.
    pair_columns = {
        'Objective pair': 'objective',
        'Architecture pair': 'architecture_class',
        'Dataset pair': 'dataset_class',
        'Model size pair': 'size_class',
    }
    for column_name, config_field in pair_columns.items():
        pairs[column_name] = pairs.apply(get_cat_pair, axis=1, cat=config_field)

    # Move the metric and dataset columns to the front; keep the rest in place.
    leading = ['Similarity metric', 'DS']
    remaining = [name for name in pairs.columns.tolist() if name not in leading]
    return pairs[leading + remaining]


def get_similarity_dataframe(similarity_matrices):
    """Concatenate the long-format tables of all similarity matrices.

    Args:
        similarity_matrices: Nested mapping
            ``{metric name: {dataset name: similarity matrix}}``.

    Returns:
        One DataFrame stacking the output of ``process_sim_mat`` for every
        (metric, dataset) combination, in iteration order. The per-table
        row indices are kept as they are (no re-indexing).
    """
    frames = [
        process_sim_mat(matrix, metric_name, dataset_name)
        for metric_name, per_dataset in similarity_matrices.items()
        for dataset_name, matrix in per_dataset.items()
    ]
    return pd.concat(frames)

In [6]:
# Flatten every (metric, dataset) similarity matrix into one long-format table.
sim_df = get_similarity_dataframe(sim_mats)
# Preview the first rows (must stay the last expression so the notebook displays it).
sim_df.head()

Unnamed: 0,Similarity metric,DS,Model 1,Model 2,Similarity value,Objective pair,Architecture pair,Dataset pair,Model size pair
0,CKA RBF 0.4,wds_fer2013,Kakaobrain_Align,OpenCLIP_EVA01-g-14-plus_merged2b_s11b_b114k,0.468318,"(Image-Text, Image-Text)","(Convolutional, Transformer)","(Large DS, XLarge DS)","(small, xlarge)"
1,CKA RBF 0.4,wds_fer2013,Kakaobrain_Align,OpenCLIP_EVA01-g-14_laion400m_s11b_b41k,0.419219,"(Image-Text, Image-Text)","(Convolutional, Transformer)","(Large DS, Large DS)","(small, xlarge)"
2,CKA RBF 0.4,wds_fer2013,Kakaobrain_Align,OpenCLIP_EVA02-B-16_merged2b_s8b_b131k,0.485452,"(Image-Text, Image-Text)","(Convolutional, Transformer)","(Large DS, XLarge DS)","(medium, small)"
3,CKA RBF 0.4,wds_fer2013,Kakaobrain_Align,OpenCLIP_EVA02-L-14_merged2b_s4b_b131k,0.459744,"(Image-Text, Image-Text)","(Convolutional, Transformer)","(Large DS, XLarge DS)","(small, xlarge)"
4,CKA RBF 0.4,wds_fer2013,Kakaobrain_Align,OpenCLIP_RN50_openai,0.518417,"(Image-Text, Image-Text)","(Convolutional, Convolutional)","(Large DS, Large DS)","(medium, small)"


In [7]:
# Write the aggregated table to the output directory; the row index is not written (index=False).
sim_df.to_csv(storing_path / 'all_metric_ds_model_pair_similarity_with_rsa.csv', index=False)