## Notebook for aggregating r-coefficients for model set and anchor model pairs across datasets and similarity metrics
This notebook aggregates the r-coefficients for the similarity values of the anchor models combined with models from different model sets across different categories. The r-coefficients are computed for each pair of datasets, separately for each similarity metric, anchor model, and model category. The aggregated data is stored in a CSV file.

In [None]:
from itertools import combinations, product

import pandas as pd
from scipy.stats import kendalltau, pearsonr, spearmanr
from tqdm import tqdm

from constants import (
    BASE_PATH_PROJECT,
    BASE_PATH_RESULTS,
    anchors,
    ds_list_sim_file,
    exclude_models,
    exclude_models_w_mae,
    model_cat_mapping,
    model_categories,
    model_config_file,
    sim_metric_name_mapping
)
from helper import (
    load_all_datasetnames_n_info,
    load_model_configs_and_allowed_models,
    load_similarity_matrices,
    pp_storing_path
)

In [None]:
# Echo a few of the imported constants so their current values are visible in the cell output.
BASE_PATH_RESULTS, exclude_models_w_mae, anchors

#### Global variables

In [None]:
## Data loading
ds_list, _ = load_all_datasetnames_n_info(ds_list_sim_file, verbose=True)

## Paths: two separate directories holding similarity matrices
base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities'
base_path_similarity_matrices_rbf02 = BASE_PATH_PROJECT / 'model_similarities_rbf02'

# Similarity metrics of interest (commented entries are alternatives that are currently switched off)
sim_metrics = [
    'cka_kernel_rbf_unbiased_sigma_0.2',
    # 'cka_kernel_rbf_unbiased_sigma_0.4',
    'cka_kernel_linear_unbiased',
    # 'rsa_method_correlation_corr_method_spearman',
]

# Correlation methods that can be selected for this run
corr_methods = ['spearmanr', 'pearsonr']

# Every (correlation method, exclusion list) pairing, in the same order as itertools.product:
# the exclusion list varies fastest, the correlation method slowest.
combs = [
    (method, excluded)
    for method in corr_methods
    for excluded in (exclude_models, exclude_models_w_mae)
]
print(f"{len(combs)=}")
# Index 2 is the second correlation method ('pearsonr') paired with `exclude_models`.
corr_method, curr_model_excl = combs[2]
print(f"{corr_method=}, {curr_model_excl=}")

# Suffix for the stored file name; a non-empty exclusion list switches to the '_wo_mae' variant.
# (Plain alternative without the metric tag: '_wo_mae' / '')
if len(curr_model_excl) > 0:
    suffix = '_wo_mae_rbf02_n_linear'
else:
    suffix = '_rbf02_n_linear'

SAVE = True
storing_path = pp_storing_path(BASE_PATH_RESULTS / 'aggregated/r_coeff_dist/with_anchor_models', SAVE)
storing_path

#### Load model configurations and similarity matrices

In [None]:
# Load the model configuration table together with the models that may be used in this run.
# `curr_model_excl` (chosen in the global-variables cell) is passed as the exclusion list.
# NOTE(review): the exact effect of `exclude_alignment=True` is defined in `helper` — confirm there.
model_configs, allowed_models = load_model_configs_and_allowed_models(
    path=model_config_file,
    exclude_models=curr_model_excl,
    exclude_alignment=True,
)

In [None]:
# When the run is tagged as a 'mae' variant (via the file suffix), the MAE ViT-L/16 model
# must not act as an anchor. The list is modified in place, and only if the entry is present.
if 'mae' in suffix:
    if 'mae-vit-large-p16' in anchors:
        anchors.remove('mae-vit-large-p16')

In [None]:
# Show how many loaded model configurations fall under each value of the 'objective' column.
model_configs['objective'].value_counts()

In [None]:
# Linear-kernel CKA similarity matrices, loaded for every dataset in `ds_list`.
sim_mats_lin = load_similarity_matrices(
    path=base_path_similarity_matrices,
    ds_list=ds_list,
    sim_metrics=['cka_kernel_linear_unbiased'],
    allowed_models=allowed_models,
)

# RBF-kernel (sigma 0.2) CKA similarity matrices, loaded from the separate rbf02 directory.
# The last entry of `ds_list` is swapped for 'imagenet-subset-30k'.
# NOTE(review): this presumes the last dataset in `ds_list` is the ImageNet subset stored
# under a different name for the linear metric — verify against `ds_list`.
sim_mats_02 = load_similarity_matrices(
    path=base_path_similarity_matrices_rbf02,
    ds_list=ds_list[:-1] + ['imagenet-subset-30k'],
    sim_metrics=['cka_kernel_rbf_unbiased_sigma_0.2'],
    allowed_models=allowed_models,
)

# Display the top-level keys of both results.
sim_mats_lin.keys(), sim_mats_02.keys()

In [None]:
## HANDLE DIFFERENT NAMING OF IMAGENET SUBSETS
def _unify_imagenet_subset_key(metric, mats, subset_key):
    """Rename `subset_key` to 'imagenet-subset' within `mats[metric]`, printing the keys before and after."""
    print(metric)
    ds_mats = mats[metric]
    print(ds_mats.keys())

    if subset_key not in ds_mats:
        return

    # Move the entry under the common name so both metrics share the same dataset keys.
    ds_mats['imagenet-subset'] = ds_mats.pop(subset_key)

    print(ds_mats.keys())
    print()
    print()


_unify_imagenet_subset_key('cka_kernel_linear_unbiased', sim_mats_lin, 'imagenet-subset-10k')
_unify_imagenet_subset_key('cka_kernel_rbf_unbiased_sigma_0.2', sim_mats_02, 'imagenet-subset-30k')


In [None]:
# Merge both per-metric dictionaries into one; on a key clash the RBF entry wins.
sim_mats = {**sim_mats_lin, **sim_mats_02}

### Aggregate r-coefficients for model set and anchor model pairs across datasets and similarity metrics

In [None]:
## Define column names
anchor_col = 'Anchor Model'  # anchor model of a pair
other_col = 'Other Model'  # the model compared against the anchor
other_ds_col = 'Dataset'  # dataset a similarity value belongs to (melt variable column)
sim_metric_col = 'Similarity metric'  # display name of the similarity metric
sim_ds_col = 'Similarity value DS'  # similarity value on that dataset (melt value column)
info_orig_cols = model_categories  # original category names from the constants module
info_cols = list(model_cat_mapping.values())  # mapped category names, used as stratification columns
id_cols = [anchor_col, other_col, sim_metric_col] + info_cols  # columns kept fixed when melting
comp_cat_col = 'Comparison category'  # which category a result row was stratified by
comp_cat_orig_col = 'Comparison category (orig. name)'  # the same category under its original name
comp_val_col = 'Comparison values'  # value of that category ('All' for the unstratified rows)
r_col = 'r coeff'  # correlation coefficient column


## Helper functions
def get_other_model_info(mid):
    """Return (objective, architecture_class, dataset_class, size_class) for model `mid`.

    The values are read from the row of the global `model_configs` table indexed by `mid`.
    """
    cfg = model_configs.loc[mid]
    fields = ('objective', 'architecture_class', 'dataset_class', 'size_class')
    return tuple(cfg[field] for field in fields)


def get_melted_sim_values_metric_anchor(anch, met, met_ds_mats):
    """Build a long-format table of the anchor's similarity to all other allowed models.

    Args:
        anch: Name of the anchor model; must be a row and a column of every matrix.
        met: Similarity metric key, translated via `sim_metric_name_mapping`.
        met_ds_mats: Mapping from dataset name to that dataset's similarity matrix.

    Returns:
        DataFrame with one row per (other model, dataset), carrying the id columns
        (`id_cols`), the dataset name and the similarity value.

    Raises:
        ValueError: If `anch` is not among the columns of a matrix.
    """
    per_dataset = []
    for dataset_name, matrix in met_ds_mats.items():
        # All columns except the anchor itself, restricted to the allowed models.
        candidates = matrix.columns.tolist()
        candidates.remove(anch)
        others = sorted(set(candidates).intersection(allowed_models))

        anchor_row = matrix.loc[anch, others]
        anchor_row.name = dataset_name
        per_dataset.append(anchor_row)

    # One column per dataset, one row per other model.
    wide = pd.concat(per_dataset, axis=1).reset_index(names=[other_col])

    # Attach the category information of every other model.
    model_info = pd.DataFrame(
        wide[other_col].apply(get_other_model_info).tolist(),
        columns=info_cols,
    )
    wide = pd.concat([wide, model_info], axis=1)
    wide[sim_metric_col] = sim_metric_name_mapping[met]
    wide[anchor_col] = anch

    # Long format: one row per dataset and anchor & other model pair.
    return pd.melt(
        wide,
        id_vars=id_cols,
        var_name=other_ds_col,
        value_name=sim_ds_col,
    )

#### Summarize similarity values for all anchor models
This cell only keeps similarity values where the anchor model is one of the models in each model pair. Additionally, it flattens the dataframe to have one row per dataset and anchor & other model pair.

In [None]:
# Long-format similarity values for every (anchor model, similarity metric) combination,
# iterated anchor-first, then stacked into a single frame with a fresh index.
dfs = [
    get_melted_sim_values_metric_anchor(anchor_name, metric_key, metric_ds_mats)
    for anchor_name, (metric_key, metric_ds_mats) in product(anchors, sim_mats.items())
]

all_sims = pd.concat(dfs, axis=0).reset_index(drop=True)

In [None]:
# Preview the first rows of the combined long-format similarity table.
all_sims.head()

In [None]:
# Sanity check: which similarity metrics are present and how many distinct datasets were loaded.
# Uses the shared column-name constants instead of repeating the literal column names.
all_sims[sim_metric_col].unique(), all_sims[other_ds_col].nunique()

#### Compute the r-coefficients for each dataset pair, per similarity metric and anchor model.

In [None]:
# get all unique datasets and create all possible pairs
# (unordered pairs: each combination of two distinct datasets appears exactly once)
all_datasets = all_sims[other_ds_col].unique()
ds_pairs = list(combinations(all_datasets, 2))
print('Nr. dataset pairs: ', len(ds_pairs))

In [None]:
def correlation(x, y, method=None):
    """Return the correlation coefficient between `x` and `y`.

    Args:
        x: First sequence of values.
        y: Second sequence of values, same length as `x`.
        method: One of 'spearmanr', 'kendalltau' or 'pearsonr'. When omitted,
            the notebook-level `corr_method` is used.

    Returns:
        The correlation statistic (the p-value is discarded).

    Raises:
        ValueError: If the method is not one of the supported names. Previously an
            unknown method silently produced `None` for every coefficient.
    """
    if method is None:
        method = corr_method

    corr_funcs = {
        'spearmanr': spearmanr,
        'kendalltau': kendalltau,
        'pearsonr': pearsonr,
    }
    if method not in corr_funcs:
        raise ValueError(
            f"Unknown correlation method {method!r}; expected one of {sorted(corr_funcs)}"
        )

    corr, _ = corr_funcs[method](x, y)
    return corr


def compute_corr(data):
    """Correlate the similarity values of every dataset pair within `data`.

    Args:
        data: Rows of the long-format similarity table (typically one group of a groupby),
            containing the dataset column and the similarity-value column.

    Returns:
        List of `(dataset_1, dataset_2, coefficient)` tuples, one per entry of `ds_pairs`.
    """
    def values_for(dataset):
        # Similarity values of this group measured on the given dataset.
        return data[data[other_ds_col] == dataset][sim_ds_col]

    return [
        (first_ds, second_ds, correlation(values_for(first_ds), values_for(second_ds)))
        for first_ds, second_ds in ds_pairs
    ]


def post_process_group_op(df):
    """Expand the per-group lists of correlation tuples into flat rows and columns.

    Each row of `df` holds, in the `r_col` column, a list of `(ds1, ds2, coefficient)` tuples.
    The result has one row per tuple, with the tuple spread over the columns
    'DS 1', 'DS 2' and `r_col`; all other columns are repeated unchanged.
    """
    # One row per tuple.
    exploded = df.explode(r_col).reset_index(drop=True)

    # Spread each tuple over three columns.
    tuple_parts = exploded[r_col].apply(pd.Series)
    tuple_parts.columns = ['DS 1', 'DS 2', r_col]

    remaining = exploded.drop(columns=[r_col])
    return pd.concat([remaining, tuple_parts], axis=1)


# The "All" correlations (grouped by metric and anchor only) do not depend on the
# stratification column, so they are computed once here instead of once per stratum.
overall_group_cols = [sim_metric_col, anchor_col]
overall_rs = (
    all_sims.groupby(overall_group_cols, dropna=False)
    .apply(compute_corr, include_groups=False)
    .reset_index()
)
overall_rs.columns = overall_group_cols + [r_col]
overall_rs = post_process_group_op(overall_rs)

r_dfs = []
for strata in tqdm(info_cols):
    grouping_cols = [sim_metric_col, anchor_col, strata]
    print(grouping_cols)

    # Correlations within each value of the current stratification column.
    strata_rs = all_sims.groupby(grouping_cols, dropna=False).apply(compute_corr, include_groups=False).reset_index()
    strata_rs.columns = grouping_cols + [r_col]
    strata_rs = post_process_group_op(strata_rs)

    # Reuse the unstratified correlations, labelled 'All' in the current stratum column.
    # A copy keeps the shared frame free of stratum-specific columns.
    all_rs = overall_rs.copy()
    all_rs[strata] = 'All'

    rs = pd.concat([all_rs, strata_rs], axis=0).reset_index(drop=True)
    rs = rs.sort_values([sim_metric_col, anchor_col]).reset_index(drop=True)
    r_dfs.append(rs)

In [None]:
# Tag every per-stratum result with the category it was stratified by (mapped and original
# name) and move the category values into one shared column so the frames can be stacked.
for idx, strata_df in enumerate(r_dfs):
    category = info_cols[idx]
    strata_df[comp_cat_col] = category
    strata_df[comp_cat_orig_col] = info_orig_cols[idx]
    strata_df.rename(columns={category: comp_val_col}, inplace=True)

In [None]:
# Stack the per-category result frames into one table and preview the first rows.
r_df = pd.concat(r_dfs, axis=0)
r_df.head()

In [None]:
if SAVE:
    # The file name encodes the correlation method and the run suffix.
    out_file = storing_path / f'agg_{corr_method}_all_ds{suffix}.csv'
    r_df.to_csv(out_file, index=False)
    print(f"Stored aggregated data at: {out_file}")