In [None]:
import sys
from itertools import combinations, product
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
import json

from constants import (
    anchor_name_mapping, 
    available_data, 
    exclude_models, 
    exclude_models_w_mae, 
    cat_name_mapping, 
    model_config_file, 
    model_categories, 
    model_cat_mapping,
    fontsizes
)
from helper import load_model_configs_and_allowed_models, load_similarity_matrices, save_or_show

sys.path.append('..')
from scripts.helper import parse_datasets
from constants import sim_metric_name_mapping

In [None]:
# --- Configuration -----------------------------------------------------------
# Root directory from which the similarity matrices are loaded.
base_path_similarity_matrices = Path('/home/space/diverse_priors/model_similarities')

# Raw similarity-metric identifiers and their display names.
sim_metrics = ['cka_kernel_rbf_unbiased_sigma_0.4', 'cka_kernel_linear_unbiased']
sim_metrics_mapped = [sim_metric_name_mapping[metric] for metric in sim_metrics]

# Dataset names with '/' replaced by '_'.
ds_list = [
    ds_name.replace('/', '_')
    for ds_name in parse_datasets('../scripts/webdatasets_w_insub10k.txt')
]

# Correlation used by `r_coeff`; one of 'pearsonr', 'spearmanr'.
corr_type = 'pearsonr'
# Selects the model exclusion list and is appended to the output file name;
# one of '', '_wo_mae'.
suffix = '_wo_mae'

# When SAVE is true the output directory is created here and the aggregated
# table is written to it at the end of the notebook.
SAVE = True
storing_path = Path(
    '/home/space/diverse_priors/results/aggregated/r_coeff_dist/with_cats_as_anchors'
)
if SAVE:
    storing_path.mkdir(parents=True, exist_ok=True)

In [None]:
# Pick the exclusion list that matches the configured suffix: any suffix that
# contains 'mae' selects `exclude_models_w_mae`, otherwise `exclude_models`.
if 'mae' in suffix:
    curr_excl_models = exclude_models_w_mae
else:
    curr_excl_models = exclude_models

model_configs, allowed_models = load_model_configs_and_allowed_models(
    path=model_config_file,
    exclude_models=curr_excl_models,
    exclude_alignment=True,
)
# Display the available model-config columns as the cell output.
model_configs.columns

In [None]:
# Load the similarity matrices for every configured dataset and metric,
# restricted to the allowed models.
sim_mats = load_similarity_matrices(
    path=base_path_similarity_matrices,
    ds_list=ds_list,
    sim_metrics=sim_metrics,
    allowed_models=allowed_models,
)
# Re-key the result by the display name of each similarity metric.
sim_mats = dict(
    (sim_metric_name_mapping[raw_name], mats) for raw_name, mats in sim_mats.items()
)
sim_mats.keys()

In [None]:
def get_sim_data(category, sim_metric):
    """Collect the model groups of a category and the matrices of a metric.

    Reads the notebook globals `model_configs`, `cat_name_mapping` and
    `sim_mats`.

    Args:
        category: Column of `model_configs` to group the models by.
        sim_metric: Key into `sim_mats`.

    Returns:
        Tuple `(cat_groups, sim_metric_mats)`: a Series holding the unique
        model ids per category value (index relabelled through
        `cat_name_mapping`), and the `sim_mats` entry for `sim_metric`.
    """
    # Expose the model id (the index of `model_configs`) as a regular column.
    configs_with_ids = model_configs.reset_index(names=['mid'])
    ids_per_category = configs_with_ids.groupby(category)['mid'].unique()
    # Swap the raw category values for their display names.
    ids_per_category.index = [cat_name_mapping[raw] for raw in ids_per_category.index]
    return ids_per_category, sim_mats[sim_metric]
    

def process_sim_mat(ds, sim_mat, cat):
    """Flatten the strict upper triangle of a similarity matrix into long format.

    Reads the notebook globals `model_configs` and `cat_name_mapping`.

    Args:
        ds: Dataset label written to the 'DS' column of every row.
        sim_mat: Similarity matrix as a DataFrame whose index and columns hold
            model ids (assumed square with matching order -- TODO confirm).
        cat: Column of `model_configs` used to label both models of a pair.

    Returns:
        DataFrame with one row per model pair above the diagonal and the
        columns 'Similarity value', 'Model 1', 'Model 2', 'M1 obj.',
        'M2 obj.' and 'DS'.
    """
    def _category_label(model_id):
        # Display name of the model's value in the requested category column.
        return cat_name_mapping[model_configs.loc[model_id, cat]]

    # k=1 skips the diagonal, so every unordered pair appears exactly once.
    rows, cols = np.triu_indices(n=len(sim_mat), k=1)
    flat_sim_mat = pd.DataFrame(
        {
            'Similarity value': sim_mat.values[rows, cols],
            'Model 1': sim_mat.index.values[rows],
            'Model 2': sim_mat.columns.values[cols],
        }
    )
    flat_sim_mat['M1 obj.'] = flat_sim_mat['Model 1'].apply(_category_label)
    flat_sim_mat['M2 obj.'] = flat_sim_mat['Model 2'].apply(_category_label)
    flat_sim_mat['DS'] = ds
    return flat_sim_mat


def r_coeff(df, ds1, ds2):
    """Correlate the similarity values of two datasets.

    The correlation type is taken from the notebook global `corr_type`
    ('pearsonr' or 'spearmanr').

    Args:
        df: Long-format frame with at least the columns 'DS' and
            'Similarity value'.
        ds1: Label in 'DS' selecting the first sample.
        ds2: Label in 'DS' selecting the second sample.

    Returns:
        The correlation coefficient, or NaN when each dataset contributes
        fewer than two values (a correlation is undefined in that case).

    Raises:
        ValueError: If `corr_type` is neither 'pearsonr' nor 'spearmanr', or
            if the two datasets contribute a different number of values.
    """
    # Validate the configuration first so a typo fails regardless of the data.
    if 'pearsonr' == corr_type:
        corr_fn = pearsonr
    elif 'spearmanr' == corr_type:
        corr_fn = spearmanr
    else:
        raise ValueError(f'Unknown {corr_type=}! Need to select pearsonr or spearmanr r_corr in corr_type.')

    # NOTE: the samples are paired purely by row order; this relies on both
    # datasets listing the model pairs in the same order.
    x = df[df['DS'] == ds1]['Similarity value']
    y = df[df['DS'] == ds2]['Similarity value']

    if len(x) != len(y):
        raise ValueError(
            f'Cannot correlate {ds1!r} and {ds2!r}: '
            f'{len(x)} vs. {len(y)} similarity values.'
        )
    if len(x) < 2:
        # pearsonr raises for fewer than two samples; report an undefined
        # coefficient instead of aborting the whole aggregation.
        return float('nan')

    corr, _ = corr_fn(x, y)
    return corr


def get_r_coeff_df(sim_metric_mats, cat_groups, category=None):
    """Correlate similarity values across dataset pairs per category pairing.

    Every similarity matrix is flattened with `process_sim_mat`. For each
    pair of datasets and each anchor category, the model pairs that involve
    the anchor category are grouped by the category of the other model, and
    `r_coeff` is computed for every such group.

    Args:
        sim_metric_mats: Mapping from dataset label to similarity matrix.
        cat_groups: Object whose index lists the anchor categories (display
            names, matching the 'M1 obj.' / 'M2 obj.' labels).
        category: Column of `model_configs` forwarded to `process_sim_mat`.
            When omitted, the notebook global `desired_cat` is used, which
            keeps existing two-argument calls working.

    Returns:
        DataFrame with the columns 'ds1', 'ds2', 'anchor_cat', 'other_cat'
        and 'r coeff', one row per (dataset pair, anchor, other) group.
    """
    if category is None:
        # Backward-compatible fallback to the loop variable of the calling cell.
        category = desired_cat

    # Flatten each similarity matrix and stack them into one long frame.
    pp_mats = pd.concat(
        [process_sim_mat(ds, sim_mat, category) for ds, sim_mat in sim_metric_mats.items()]
    )

    anchor_cats = cat_groups.index.tolist()
    r_coeffs = []
    # Visit every unordered pair of datasets.
    for ds1, ds2 in combinations(list(pp_mats['DS'].unique()), 2):
        subset_ds = pp_mats[pp_mats['DS'].isin([ds1, ds2])]
        for cat in anchor_cats:
            involves_anchor = (subset_ds['M1 obj.'] == cat) | (subset_ds['M2 obj.'] == cat)
            subset_data = subset_ds[involves_anchor].copy()
            # Category of the non-anchor model of each pair. Done on plain
            # arrays: vectorised, unaffected by the repeated row labels left
            # by the concat above, and well-defined for an empty selection.
            m1_cat = subset_data['M1 obj.'].to_numpy()
            m2_cat = subset_data['M2 obj.'].to_numpy()
            subset_data['other cat'] = np.where(m1_cat == cat, m2_cat, m1_cat)
            for other_cat, other_cat_data in subset_data.groupby('other cat'):
                r_coeffs.append(
                    {
                        'ds1': ds1,
                        'ds2': ds2,
                        'anchor_cat': cat,
                        'other_cat': other_cat,
                        'r coeff': r_coeff(other_cat_data, ds1, ds2),
                    }
                )
    return pd.DataFrame(r_coeffs)


In [None]:
# Every (model category, similarity metric) combination to evaluate.
sim_cat = product(sorted(model_categories), list(sim_mats))
info_cols = model_cat_mapping

corr_data = {}
# NOTE: the loop variable must stay named `desired_cat`; `get_r_coeff_df`
# reads it from the global scope.
for desired_cat, sim_metric in sim_cat:
    print(desired_cat, sim_metric)
    cat_groups, sim_metric_mats = get_sim_data(desired_cat, sim_metric)
    # Tag each result table with the category and metric it belongs to.
    df_r_coeffs = get_r_coeff_df(sim_metric_mats, cat_groups).assign(
        **{
            'Comparison category': model_cat_mapping[desired_cat],
            'Similarity metric': sim_metric,
        }
    )
    corr_data[f"{desired_cat}_{sim_metric}"] = df_r_coeffs

In [None]:
# Stack the per-(category, metric) tables row-wise into a single frame.
r_df = pd.concat([*corr_data.values()], axis='index')

In [None]:
# Preview the first rows of the combined correlation table.
r_df.head()

In [None]:
# Write the aggregated correlations to disk when saving is enabled; the file
# name encodes the correlation type and the model-exclusion suffix.
if SAVE:
    fn = storing_path.joinpath(f'agg_{corr_type}_all_ds{suffix}.csv')
    r_df.to_csv(path_or_buf=fn, index=False)
    print(f"Stored aggregated data at: {fn}")