In [None]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from constants import sim_metric_name_mapping, similarity_metrics
from helper import get_model_ids, plot_r_coeff_distribution, save_or_show, plot_scatter

sys.path.append('..')
from scripts.helper import load_models

#### Global variables

In [None]:
## DATASET AND MODEL CONFIG
# NOTE(review): `datasets` and `combiner` are not referenced in the cells shown in this notebook.
datasets = "../scripts/webdatasets_wo_imagenet.txt"
# Model configuration file passed to `load_models`; its keys define which models are kept
# in the similarity matrices.
model_config = "../scripts/filtered_models_config.json"
# The anchor model selects the aggregated result file (`anchor_<anchor_model>.pkl`), the
# similarity-matrix column that is read, and the output sub-folder name.
# anchor_model = "OpenCLIP_ViT-L-14_openai"  # ANCHOR MODEL 1
anchor_model = "resnet50"  # ANCHOR MODEL 2
combiner = 'concat'

## REGULARIZATION
# Value matched against the 'regularization' column of the results; also used in the
# output folder name and in the figure titles. (The variable name is misspelled, but it
# is referenced under this name throughout the notebook.)
regularitzation = 'L2'  # one of: 'L1', 'L2', 'weight_decay'

### IMAGENET SUBSET SIMILARITIES
# Layout read by the loading cell:
#   <base> / <dataset> / <similarity metric> / {model_ids.txt, similarity_matrix.pt}
model_similarities_base_path = Path('/home/space/diverse_priors/model_similarities')

### AGGREGATED RESULTS --> produced with gather_anchor_exp_results.ipynb
base_path_aggregated_results = Path('/home/space/diverse_priors/results/aggregated')

### SINGLE MODEL BEST PERFORMANCES --> structure: path / [L1, L2, weight_decay] / [DATASET].json
# NOTE(review): not referenced in the cells shown in this notebook.
single_model_best_perf_path = Path('/home/space/diverse_priors/results/aggregated/max_performance_per_model_n_ds')

#### Storing information

In [None]:
# Toggle passed to `save_or_show` for every figure produced below.
SAVE = True

# All figures of one run live in a sub-folder named "<anchor model>__<regularization>".
storing_path = (
    Path('/home/space/diverse_priors/results/plots/performance_gap_ds__ds_sim')
    / f"{anchor_model}__{regularitzation}"
)

if SAVE:
    # Only touch the file system when figures are actually going to be written.
    storing_path.mkdir(parents=True, exist_ok=True)

#### Load experiment results

In [None]:
# Load the aggregated experiment results stored for the selected anchor model
# (presumably written by gather_anchor_exp_results.ipynb, per the config comment).
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted location.
df = pd.read_pickle(base_path_aggregated_results / f'anchor_{anchor_model}.pkl')

In [None]:
# Columns that together identify one experiment configuration. They are used as the
# group-by key when `test_lp_acc1` is averaged.
HYPER_PARAM_COLS = ['task', 'mode', 'combiner', 'dataset', 'model_ids', 'fewshot_k', 'fewshot_epochs', 'batch_size',
                    'regularization']

In [None]:
# Turn `model_ids` into tuples so the column is hashable (it is part of the group-by key).
# The isinstance guard keeps this cell re-runnable: after the first run the values are
# already tuples, and eval() only accepts strings, bytes or code objects.
# NOTE(review): eval() executes arbitrary expressions found in the loaded file. If the
# stored strings are plain list/tuple literals, ast.literal_eval would be the safer
# parser -- confirm the stored format before switching.
df['model_ids'] = df['model_ids'].apply(
    lambda ids: tuple(eval(ids) if isinstance(ids, (str, bytes)) else ids)
)
# Dataset names are used as folder names when the similarity matrices are loaded,
# so '/' is replaced by '_'.
df['dataset'] = df['dataset'].apply(lambda x: x.replace('/', '_'))

In [None]:
# Average `test_lp_acc1` over all rows that share the same hyper-parameter configuration.
# dropna=False keeps configurations whose key columns contain missing values.
mean_df = (
    df.groupby(HYPER_PARAM_COLS, dropna=False)['test_lp_acc1']
    .mean()
    .reset_index()
)

In [None]:
# Keep only the rows trained with the regularization selected in the config cell.
has_selected_regularization = mean_df['regularization'] == regularitzation
mean_df = mean_df.loc[has_selected_regularization].copy().reset_index(drop=True)

# Fail early instead of producing empty plots further down.
assert len(mean_df) > 0, f"No values found for {regularitzation=}."

In [None]:
# Sanity check: list the datasets that remain after filtering. One set of similarity
# matrices is loaded for each of these.
mean_df['dataset'].unique()

#### Load similarity values

In [None]:
# sim_mats[dataset][similarity metric] -> square DataFrame indexed by model id on both axes.
sim_mats = {}

# Only models listed in the model config are kept in the matrices.
models, nmodels = load_models(model_config)
allowed_models = sorted(models.keys())

for ds in mean_df['dataset'].unique():
    sim_mats[ds] = {}
    for sim_metric in similarity_metrics:
        # Folder layout: <base> / <dataset> / <metric> / {model_ids.txt, similarity_matrix.pt}
        model_similarities_path = model_similarities_base_path / ds / sim_metric

        model_ids = get_model_ids(model_similarities_path / 'model_ids.txt')
        sim_mat = pd.DataFrame(
            torch.load(model_similarities_path / 'similarity_matrix.pt'),
            index=model_ids,
            columns=model_ids,
        )

        # Restrict rows and columns to the allowed models, in sorted order.
        avail_models = sorted(set(allowed_models) & set(model_ids))
        sim_mats[ds][sim_metric] = sim_mat.loc[avail_models, avail_models]

#### Prepare data for plotting
Steps:
1. Compute the performance gap between each combined model (concat or ensemble) and the single (anchor) model for each dataset.
2. Add the similarity value between the anchor model and the other model of each pair.

In [None]:
# One independent frame per value of the 'mode' column, each with a fresh 0..n-1 index.
single_performance, concat_performance, ensemble_performance = (
    mean_df[mean_df['mode'] == mode_name].copy().reset_index(drop=True)
    for mode_name in ('single_model', 'combined_models', 'ensemble')
)
print(f"{single_performance.shape=}, {concat_performance.shape=}, {ensemble_performance.shape=}")

In [None]:
# For every model pair, record the member that is paired with the anchor: the first
# entry when the anchor is the second one, otherwise the second entry.
for perf_df in (concat_performance, ensemble_performance):
    perf_df['other_model'] = perf_df['model_ids'].apply(
        lambda pair: pair[0] if pair[1] == anchor_model else pair[1])

In [None]:
## THESE ARE THE ANCHOR MODEL PERFORMANCES FOR DIFFERENT REGULARIZATIONS
single_performance_pivot = pd.pivot_table(
    single_performance,
    index='dataset',
    columns='regularization',
    values='test_lp_acc1'
)

In [None]:
def get_performance_gap_n_sim_metric(row):
    """Compute the performance gap and look up the similarity values for one result row.

    Reads the notebook-level objects `single_performance_pivot`, `sim_mats`,
    `sim_metric_name_mapping` and `anchor_model`.

    Parameters
    ----------
    row : pd.Series
        One row of a performance frame. It must provide the entries
        'other_model', 'test_lp_acc1', 'dataset' and 'regularization'.

    Returns
    -------
    pd.Series
        'gap' (the row's accuracy minus the single-model accuracy stored for the
        same dataset and regularization), followed by one entry per similarity
        metric available for the dataset, labelled with the display name from
        `sim_metric_name_mapping`. A metric whose matrix does not contain the
        (other model, anchor model) pair is reported and set to NaN.

    Raises
    ------
    KeyError
        If the dataset/regularization is missing from `single_performance_pivot`,
        the dataset is missing from `sim_mats`, or a metric has no display name
        in `sim_metric_name_mapping`.
    """
    dataset = row['dataset']
    other_model = row['other_model']

    single_perf = single_performance_pivot.loc[dataset, row['regularization']]
    res = {'gap': row['test_lp_acc1'] - single_perf}

    for key, curr_sim_mat in sim_mats[dataset].items():
        # Resolve the display name outside the try block: a missing name is a
        # configuration error and must not be mistaken for a missing model pair.
        display_name = sim_metric_name_mapping[key]
        try:
            sim_value = curr_sim_mat.loc[other_model, anchor_model]
        except KeyError:
            # The pair is not part of this similarity matrix: report it and keep going.
            print(dataset, other_model, anchor_model, key)
            sim_value = np.nan
        res[display_name] = sim_value
    return pd.Series(res)

In [None]:
# Remember how many columns exist before the gap and similarity columns are appended.
# `pp_df` uses this count to decide which leading columns stay as identifier columns.
n_cols_before = concat_performance.shape[1]
# Both frames must have the same width, since `pp_df` applies the same count to both.
assert n_cols_before == ensemble_performance.shape[1]

In [None]:
# Compute the 'gap' and per-metric similarity columns row by row, then append them
# column-wise to the frame they were derived from.
concat_gap_and_sims = concat_performance.apply(get_performance_gap_n_sim_metric, axis=1)
concat_performance = pd.concat([concat_performance, concat_gap_and_sims], axis=1)

ensemble_gap_and_sims = ensemble_performance.apply(get_performance_gap_n_sim_metric, axis=1)
ensemble_performance = pd.concat([ensemble_performance, ensemble_gap_and_sims], axis=1)

In [None]:
# Column names of the long-format frames built by `pp_df`; the same names are passed
# to the plotting helpers.
sim_val_col = 'Similarity value'
sim_met_col = 'Similarity metric'


def pp_df(cur_df):
    """Reshape a wide performance frame into long format, one row per similarity metric.

    The first `n_cols_before + 1` columns are kept as identifier columns. Every
    remaining column is unpivoted: its name goes into the `sim_met_col` column
    and its values go into the `sim_val_col` column.

    Parameters
    ----------
    cur_df : pd.DataFrame
        Wide frame whose trailing columns hold one similarity metric each.

    Returns
    -------
    pd.DataFrame
        The melted (long-format) frame.
    """
    identifier_cols = list(cur_df.columns[:n_cols_before + 1])
    return cur_df.melt(
        id_vars=identifier_cols,
        var_name=sim_met_col,
        value_name=sim_val_col,
    )


# Convert both frames to long format (one row per similarity metric).
# NOTE(review): this overwrites the wide frames, so re-running this cell without
# re-running the cells above would melt the already-melted frames again.
concat_performance = pp_df(concat_performance)
ensemble_performance = pp_df(ensemble_performance)

#### Plot scatter plots and add correlation coefficients

In [None]:
# Scatter plot for the concatenated ("combined") models.
# Arguments after the frame: the figure title, a label string, and the names of the
# similarity-metric and similarity-value columns (see helper.plot_scatter for how they are used).
# The title spells "regularization" correctly; only the config variable keeps its
# misspelled name `regularitzation`.
fig = plot_scatter(
    concat_performance,
    f"Combined models (Concat) with anchor {anchor_model} and {regularitzation} regularization.",
    "Downstream Dataset",
    sim_met_col,
    sim_val_col,
)

# SAVE is forwarded to the helper (presumably: write the PDF when true, otherwise only display).
save_or_show(fig, storing_path / 'combined_concat.pdf', SAVE)

In [None]:
# Scatter plot for the ensembles.
# Arguments after the frame: the figure title, a label string, and the names of the
# similarity-metric and similarity-value columns (see helper.plot_scatter for how they are used).
# The title spells "regularization" correctly; only the config variable keeps its
# misspelled name `regularitzation`.
fig = plot_scatter(
    ensemble_performance,
    f"Ensemble with anchor {anchor_model} and {regularitzation} regularization.",
    "Downstream Dataset",
    sim_met_col,
    sim_val_col,
)

# SAVE is forwarded to the helper (presumably: write the PDF when true, otherwise only display).
save_or_show(fig, storing_path / 'ensemble.pdf', SAVE)

In [None]:
# Distribution of correlation coefficients for the concatenated models, built from the
# long-format frame and its metric/value columns (see helper.plot_r_coeff_distribution).
fig = plot_r_coeff_distribution(concat_performance, sim_met_col, sim_val_col)
save_or_show(fig, storing_path / 'combined_concat_dist_r.pdf', SAVE)

In [None]:
# Distribution of correlation coefficients for the ensembles, built from the
# long-format frame and its metric/value columns (see helper.plot_r_coeff_distribution).
fig = plot_r_coeff_distribution(ensemble_performance, sim_met_col, sim_val_col)
save_or_show(fig, storing_path / 'ensemble_dist_r.pdf', SAVE)