## Notebook for aggregating r-coefficients for model set pairs across datasets and similarity metrics
This notebook aggregates correlation coefficients for pairs of model sets across datasets and similarity metrics. For each pair of datasets and each pair of model subcategories, the similarity values of the corresponding model pairs on the two datasets are correlated, using either Pearson's r or Spearman's rank correlation coefficient (ρ). The aggregated data is stored in one CSV file per correlation type.

In [None]:
from itertools import combinations, product

import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr

from constants import (
    exclude_models,
    exclude_models_w_mae,
    cat_name_mapping,
    model_config_file,
    model_categories,
    sim_metric_name_mapping,
    model_cat_mapping,
    BASE_PATH_PROJECT,
    BASE_PATH_RESULTS,
    ds_list_sim_file
)
from helper import (
    load_model_configs_and_allowed_models,
    load_similarity_matrices,
    pp_storing_path,
    load_all_datasetnames_n_info
)

#### Global variables

In [None]:
# Exactly one (base path, metric list) configuration must be active; the
# commented-out variants below are alternative runs kept for reference.

# Variant: default similarity matrices.
# base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities'
# sim_metrics = [
#     'cka_kernel_rbf_unbiased_sigma_0.4',
#     'cka_kernel_linear_unbiased',
#     'rsa_method_correlation_corr_method_spearman'
# ]

# Variant: CKA RBF with sigma 0.2 only.
# base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities_rbf02'
# sim_metrics = [
#     'cka_kernel_rbf_unbiased_sigma_0.2'
# ]

# Active variant: matrices stored under 'model_similarities_intermediate'.
base_path_similarity_matrices = BASE_PATH_PROJECT / 'model_similarities_intermediate'
sim_metrics = [
    'cka_kernel_rbf_unbiased_sigma_0.4',
    'cka_kernel_linear_unbiased',
    'rsa_method_correlation_corr_method_spearman'
]


# Display names of the selected metrics (same order as `sim_metrics`).
sim_metrics_mapped = [sim_metric_name_mapping[k] for k in sim_metrics]

# Only the first return value (the dataset list) is used in this notebook.
ds_list, _ = load_all_datasetnames_n_info(ds_list_sim_file, verbose=True)

# `suffix` is appended to the name of the stored CSV files. The next cell also
# uses it to pick the model exclusion list: a suffix containing 'mae' selects
# `exclude_models_w_mae`, one containing 'siglip' selects a hard-coded list,
# anything else selects `exclude_models`.
# Other values used so far: '', '_wo_mae', '_intermediate'.
suffix = '_intermediate_wo_siglip_n_dino_xcit'
# suffix = '_intermediate'

# When True, the aggregation cell at the end writes its results to `storing_path`.
SAVE = True
# Plain string literal: the path has no placeholders, so no f-string is needed.
storing_path = pp_storing_path(BASE_PATH_RESULTS / 'aggregated/r_coeff_dist/with_cats_as_anchors', SAVE)

#### Load model configurations and similarity matrices

In [None]:
# Models dropped for runs whose suffix mentions SigLIP.
siglip_n_dino_xcit_models = [
    'OpenCLIP_ViT-B-16-SigLIP_webli',
    'dino-xcit-medium-24-p16',
    'dino-xcit-small-12-p16',
]

# Choose the exclusion list from the markers contained in `suffix`.
# NOTE: the 'mae' check is case-sensitive, the 'siglip' check is not.
if 'mae' in suffix:
    curr_excl_models = exclude_models_w_mae
elif 'siglip' in suffix.lower():
    curr_excl_models = siglip_n_dino_xcit_models
else:
    curr_excl_models = exclude_models

# The config path is hard-coded for this run instead of using `model_config_file`
# (path=model_config_file).
model_configs, allowed_models = load_model_configs_and_allowed_models(
    path="../scripts/configs/models_config_wo_alignment_vits_intermediate_allfixed_ocplusother.json",
    exclude_models=curr_excl_models,
    exclude_alignment=True,
)

In [None]:
# Load the matrices for every selected metric, restricted to the allowed models.
raw_sim_mats = load_similarity_matrices(
    path=base_path_similarity_matrices,
    ds_list=ds_list,
    sim_metrics=sim_metrics,
    allowed_models=allowed_models,
)
# Re-key the result by the display name of each similarity metric.
sim_mats = {
    sim_metric_name_mapping[metric_key]: metric_mats
    for metric_key, metric_mats in raw_sim_mats.items()
}
# Last expression of the cell: shows the available metric names in the notebook.
sim_mats.keys()

#### Helper functions

In [None]:
def get_sim_data(category, sim_metric):
    """Return the model groups of a category and the matrices of one metric.

    Reads the notebook-level globals ``model_configs``, ``cat_name_mapping``
    and ``sim_mats``.

    Args:
        category: Column of ``model_configs`` by which the models are grouped.
        sim_metric: Key into ``sim_mats``.

    Returns:
        Tuple ``(cat_groups, sim_metric_mats)``. ``cat_groups`` is a Series
        holding, per category value, the unique index values of
        ``model_configs`` (exposed as column 'mid') in that group; its index is
        translated through ``cat_name_mapping``. ``sim_metric_mats`` is
        ``sim_mats[sim_metric]``.
    """
    configs_with_ids = model_configs.reset_index(names=['mid'])
    models_per_group = configs_with_ids.groupby(category)['mid'].unique()
    # Swap the raw category values in the index for their mapped names.
    models_per_group.index = [cat_name_mapping[raw_name] for raw_name in models_per_group.index]
    return models_per_group, sim_mats[sim_metric]


def process_sim_mat(ds, sim_mat, cat):
    """Flatten the strict upper triangle of a similarity matrix into long format.

    Reads the notebook-level globals ``model_configs`` and ``cat_name_mapping``.

    Args:
        ds: Dataset label written to the 'DS' column of every row.
        sim_mat: Square DataFrame of similarities; row and column labels must be
            valid index labels of ``model_configs``.
        cat: Column of ``model_configs`` used to label both models of a pair.

    Returns:
        DataFrame with one row per entry above the diagonal and the columns
        'Similarity value', 'Model 1', 'Model 2', 'M1 obj.', 'M2 obj.', 'DS'.
    """

    def mapped_category(model_id):
        # Category value of the model in `cat`, translated via cat_name_mapping.
        return cat_name_mapping[model_configs.loc[model_id, cat]]

    # k=1 skips the diagonal, so each unordered pair shows up exactly once.
    rows, cols = np.triu_indices(n=len(sim_mat), k=1)
    long_df = pd.DataFrame({
        'Similarity value': sim_mat.values[rows, cols],
        'Model 1': sim_mat.index.values[rows],
        'Model 2': sim_mat.columns.values[cols],
    })
    for model_col, label_col in (('Model 1', 'M1 obj.'), ('Model 2', 'M2 obj.')):
        long_df[label_col] = long_df[model_col].apply(mapped_category)
    long_df['DS'] = ds
    return long_df


def r_coeff(df, ds1, ds2, corr_type):
    """Correlate the similarity values of two datasets.

    The values are paired by position, so the rows of ``ds1`` and ``ds2`` must
    appear in the same order and in equal number in ``df``.

    Args:
        df: DataFrame with the columns 'DS' and 'Similarity value'.
        ds1: Value of 'DS' selecting the first sample.
        ds2: Value of 'DS' selecting the second sample.
        corr_type: 'pearsonr' or 'spearmanr'.

    Returns:
        The correlation statistic (the p-value is discarded).

    Raises:
        ValueError: If ``corr_type`` is neither 'pearsonr' nor 'spearmanr'.
    """
    first_sample = df.loc[df['DS'] == ds1, 'Similarity value']
    second_sample = df.loc[df['DS'] == ds2, 'Similarity value']
    if corr_type not in ('pearsonr', 'spearmanr'):
        raise ValueError(f'Unknown {corr_type=}! Need to select pearsonr or spearmanr r_corr in corr_type.')
    corr_fn = pearsonr if corr_type == 'pearsonr' else spearmanr
    statistic, _ = corr_fn(first_sample, second_sample)
    return statistic


def get_r_coeff_df(sim_metric_mats, cat_groups, corr_type, category=None):
    """Correlate model-pair similarities between all dataset pairs, per category pair.

    Every similarity matrix is flattened with ``process_sim_mat``. For each
    combination of two datasets and each anchor category in ``cat_groups.index``,
    the model pairs that involve the anchor category are grouped by the category
    on the other side of the pair, and ``r_coeff`` is computed for each group.

    Args:
        sim_metric_mats: Mapping from dataset name to its similarity matrix.
        cat_groups: Object whose ``index`` lists the anchor category names (as
            they appear in the 'M1 obj.' / 'M2 obj.' columns).
        corr_type: 'pearsonr' or 'spearmanr'; forwarded to ``r_coeff``.
        category: Column of ``model_configs`` used to label the models. When
            ``None`` (default), the notebook-level variable ``desired_cat`` is
            used, which is what this function always did before.

    Returns:
        DataFrame with the columns 'ds1', 'ds2', 'anchor_cat', 'other_cat',
        'r coeff' and 'cat_pair' (sorted tuple of both categories). The columns
        are present even when no coefficient could be computed.
    """
    result_columns = ['ds1', 'ds2', 'anchor_cat', 'other_cat', 'r coeff', 'cat_pair']

    if category is None:
        # Backward-compatible fallback: existing callers rely on the loop
        # variable `desired_cat` of the aggregation cell being set.
        category = desired_cat

    # Flatten each similarity matrix into long format.
    flat_frames = [
        process_sim_mat(ds, sim_mat, category)
        for ds, sim_mat in sim_metric_mats.items()
    ]
    if not flat_frames:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        return pd.DataFrame(columns=result_columns)
    pp_mats = pd.concat(flat_frames)

    anchor_cats = cat_groups.index.tolist()
    r_coeffs = []
    # Iterate over each unordered pair of datasets.
    for ds1, ds2 in combinations(list(pp_mats['DS'].unique()), 2):
        subset_ds = pp_mats[pp_mats['DS'].isin([ds1, ds2])]
        for cat in anchor_cats:
            involves_anchor = (subset_ds['M1 obj.'] == cat) | (subset_ds['M2 obj.'] == cat)
            subset_data = subset_ds[involves_anchor].copy()
            # Category on the non-anchor side of each pair (vectorized instead
            # of a row-wise apply; yields the same values).
            subset_data['other cat'] = np.where(
                subset_data['M1 obj.'] == cat,
                subset_data['M2 obj.'],
                subset_data['M1 obj.'],
            )
            for other_cat, other_cat_data in subset_data.groupby('other cat'):
                # The group holds the rows of both datasets, so <= 2 rows means
                # at most one model pair per dataset: no correlation possible.
                if len(other_cat_data) <= 2:
                    print(f'Cannot compute correlation for {(ds1, ds2)} and {(cat, other_cat)}, too few model pairs')
                    continue
                r_coeffs.append(
                    {
                        'ds1': ds1,
                        'ds2': ds2,
                        'anchor_cat': cat,
                        'other_cat': other_cat,
                        'r coeff': r_coeff(other_cat_data, ds1, ds2, corr_type),
                        'cat_pair': tuple(sorted([cat, other_cat]))
                    }
                )
    # Explicit columns keep the schema stable when `r_coeffs` is empty.
    return pd.DataFrame(r_coeffs, columns=result_columns)


### Aggregating r-coefficients for model set pairs across datasets and similarity metrics

In [None]:
# Columns that identify a result row; 'anchor_cat' / 'other_cat' are left out on
# purpose so that the (A, B) and (B, A) anchor views of one pair collapse.
dedup_cols = ['ds1', 'ds2', 'r coeff', 'cat_pair', 'Comparison category', 'Similarity metric']

for corr_type in ('pearsonr', 'spearmanr'):

    # One result frame per (model category, similarity metric) combination.
    corr_data = {}

    # NOTE: the loop variable must keep the name `desired_cat`, because
    # get_r_coeff_df reads it as a global.
    for desired_cat, sim_metric in product(sorted(model_categories), list(sim_mats)):
        print(desired_cat, sim_metric)
        cat_groups, sim_metric_mats = get_sim_data(desired_cat, sim_metric)
        df_r_coeffs = get_r_coeff_df(sim_metric_mats, cat_groups, corr_type)
        df_r_coeffs['Comparison category'] = model_cat_mapping[desired_cat]
        df_r_coeffs['Similarity metric'] = sim_metric
        corr_data[f"{desired_cat}_{sim_metric}"] = df_r_coeffs

    r_df = pd.concat([*corr_data.values()], axis=0)

    # Keep the first occurrence of every row that agrees on `dedup_cols`.
    is_repeated = r_df[dedup_cols].duplicated()
    r_df = r_df[~is_repeated].reset_index(drop=True)
    print(f"Nr. of entries {len(r_df)}")

    if not SAVE:
        continue
    fn = storing_path / f'agg_{corr_type}_all_ds{suffix}.csv'
    r_df.to_csv(fn, index=False)
    print(f"Stored aggregated data at: {fn}")