# Task-Specific Performance Gains Table Generation

This notebook generatestables showing task-specific performance gains across different representation methods.

- **Data**
    - **Input**: `complete_set_of_run.pkl` - 9 vision transformers, 20 datasets
    - **Methods**: 6 representation approaches (CLS baseline, attentive probes, linear/attentive multi-layer)
- **Statistical Analysis**
    - **Wilcoxon signed-rank tests**: Compare each method against the best-performing method per dataset
    - **Significance threshold**: p < 0.05
    - **Metrics**: Train and test balanced accuracy gains

- **Table Generation**: Creates two LaTeX table versions:
    1. **Statistical significance formatting**: Bold for statistically significant improvements over best method
    2. **Ranking-based formatting**: Bold for best performance, underline for second-best performance

- **Output**: LaTeX tables showing mean ± standard deviation performance gains across models, with datasets sorted by domain category and statistical formatting applied.

In [None]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
import matplotlib.gridspec as gridspec

sys.path.append('..')
sys.path.append('../..')

from constants import base_model_name_mapping, BASE_PATH_PROJECT, FOLDER_SUBSTRING, experiment_with_probe_type_order_list, experiment_order_list
from helper import style_multimodel_heatmap, init_plotting_params

In [2]:
init_plotting_params()

{
  "agg.path.chunksize": 0,
  "axes.labelsize": 13.0,
  "axes.titlesize": 14.0,
  "axes3d.trackballsize": 0.667,
  "boxplot.flierprops.markersize": 6.0,
  "boxplot.meanprops.markersize": 6.0,
  "errorbar.capsize": 0.0,
  "figure.figsize": [
    6.4,
    4.8
  ],
  "figure.labelsize": "large",
  "figure.titlesize": "large",
  "font.cursive": [
    "Apple Chancery",
    "Textile",
    "Zapf Chancery",
    "Sand",
    "Script MT",
    "Felipa",
    "Comic Neue",
    "Comic Sans MS",
    "cursive"
  ],
  "font.family": [
    "sans-serif"
  ],
  "font.fantasy": [
    "Chicago",
    "Charcoal",
    "Impact",
    "Western",
    "xkcd script",
    "fantasy"
  ],
  "font.monospace": [
    "DejaVu Sans Mono",
    "Bitstream Vera Sans Mono",
    "Computer Modern Typewriter",
    "Andale Mono",
    "Nimbus Mono L",
    "Courier New",
    "Courier",
    "Fixed",
    "Terminal",
    "monospace"
  ],
  "font.sans-serif": [
    "DejaVu Sans",
    "Bitstream Vera Sans",
    "Computer Modern Sans Serif

In [None]:
SAVE = 'both'

base_storing_path = BASE_PATH_PROJECT / f'results_{FOLDER_SUBSTRING}_rebuttal/plots' 

if SAVE:
    base_storing_path.mkdir(parents=True, exist_ok=True)

In [None]:
all_runs= pd.read_pickle(BASE_PATH_PROJECT / f'results_{FOLDER_SUBSTRING}_rebuttal/aggregated/complete_set_of_run.pkl')

In [5]:
metrics_cols = [
    'abs_perf_gain_train_lp_bal_acc1',
    'abs_perf_gain_test_lp_bal_acc1',
    'test_lp_bal_acc1'
]

In [6]:
all_runs[metrics_cols] = all_runs[metrics_cols].astype(float)
all_runs[metrics_cols] *= 100

In [7]:
allowed_experiments = [
'CLS last layer',
'All tokens last layer (attentive)',
'CLS+AP last layer (linear)',
'CLS+AP layers from all blocks (linear)',
'CLS+AP last layer (attentive)',
'CLS+AP layers from all blocks (attentive)', 
]
not_allowed_models = ['mae-vit-base-p16', 'mae-vit-large-p16']

all_runs = all_runs[all_runs['Experiment'].isin(allowed_experiments) & (~all_runs['base_model'].isin(not_allowed_models))].copy().reset_index(drop=True)

In [8]:
all_runs = all_runs.drop(index=all_runs[(all_runs['nr_layers'] == 1) & all_runs['contains_intermediate']].index).copy().reset_index(drop=True)
all_runs = all_runs[all_runs['probe_type'].isin(['cae', 'linear'])].copy().reset_index(drop=True)

In [9]:
all_runs[["dataset", "Experiment"]].value_counts().sort_index().loc[('wds/imagenet1k',)]

Experiment
All tokens last layer (attentive)            9
CLS last layer                               9
CLS+AP last layer (attentive)                9
CLS+AP last layer (linear)                   9
CLS+AP layers from all blocks (attentive)    9
CLS+AP layers from all blocks (linear)       9
Name: count, dtype: int64

In [10]:
# ## temporarily filter wds/imagenet1k with All tokens last layer (attentive) 
# idx_to_drop = all_runs[(all_runs['dataset']=='wds/imagenet1k') & \
#                        (all_runs['Experiment']=='All tokens last layer (attentive)')].index
# all_runs = all_runs.drop(index = idx_to_drop).reset_index(drop=True)

### Table version with statistical tests

In [11]:
def format_if_is_bold(mean_val, std_val, is_significant):
    formatted = f"{mean_val:.2f} ± {std_val:.2f}"
    if is_significant:
        return formatted
    else:
        return f"\\textbf{{{formatted}}}"

In [12]:
metrics_cols = [
    'abs_perf_gain_train_lp_bal_acc1_mod',
    'abs_perf_gain_test_lp_bal_acc1_mod',
]

all_runs['abs_perf_gain_train_lp_bal_acc1_mod'] = all_runs['abs_perf_gain_train_lp_bal_acc1'].copy().astype(float)
all_runs['abs_perf_gain_test_lp_bal_acc1_mod'] = all_runs['abs_perf_gain_test_lp_bal_acc1'].copy().astype(float)

all_runs.loc[all_runs['Experiment'] == 'CLS last layer', 'abs_perf_gain_train_lp_bal_acc1_mod'] = all_runs.loc[all_runs['Experiment'] == 'CLS last layer', 'train_lp_bal_acc1'].astype(float)
all_runs.loc[all_runs['Experiment'] == 'CLS last layer', 'abs_perf_gain_test_lp_bal_acc1_mod'] = all_runs.loc[all_runs['Experiment'] == 'CLS last layer', 'test_lp_bal_acc1'].astype(float)

In [13]:
curr_order = [col for col in experiment_with_probe_type_order_list if ("middle & last" not in col) and ("quarterly" not in col) and ("AP last layer"!=col)]
curr_order

['CLS last layer',
 'All tokens last layer (attentive)',
 'CLS+AP last layer (linear)',
 'CLS+AP layers from all blocks (linear)',
 'CLS+AP last layer (attentive)',
 'CLS+AP layers from all blocks (attentive)']

In [14]:
subset_runs = all_runs[all_runs['Experiment'].isin(curr_order)].copy().reset_index(drop=True)

In [15]:
ds_mapping = subset_runs[['dataset', 'dataset_fmt', 'dataset_domain']].value_counts().reset_index().set_index('dataset')

In [16]:
def get_p_values_wrt_to_highest_mean(ds_data, alpha=0.05, metric_col = 'test_lp_bal_acc1'):
    best_mean_exp_per_ds = ds_data.groupby("Experiment")[metric_col].mean().idxmax()

    pvalues = {best_mean_exp_per_ds: np.nan}
    exp1_data = pd.to_numeric(ds_data[ds_data['Experiment'] == best_mean_exp_per_ds][metric_col]).values
    assert len(exp1_data)==9
    for exp in sorted(ds_data["Experiment"].unique()):
        if exp == best_mean_exp_per_ds:
            continue
        
        exp2_data = pd.to_numeric(ds_data[ds_data['Experiment'] == exp][metric_col]).values
        if len(exp1_data) != len(exp2_data):
            print(f"\n!!!Could not compute the statistical test between performances of {best_mean_exp_per_ds} and {exp} for dataset {ds_data.name}, because of mismatching run counts!!!\n")
            continue

        assert len(exp1_data) == len(exp2_data)
        
        statistic, pval = wilcoxon(exp1_data, exp2_data, alternative='greater')
        pvalues[exp] = pval

    rejected = {exp: pval < alpha for exp, pval in pvalues.items()}
    return dict(pvalues=pvalues, rejected=rejected)

In [17]:
res = subset_runs.groupby('dataset').apply(get_p_values_wrt_to_highest_mean)
res_df = pd.DataFrame(res.tolist(), index=res.index)

  res = subset_runs.groupby('dataset').apply(get_p_values_wrt_to_highest_mean)


In [18]:
grouped_data = subset_runs.groupby(["dataset", "Experiment"])

In [19]:
for metric_col in metrics_cols:
    aggr_data = grouped_data[metric_col].agg(["mean", "std"]).reset_index()
    
    mean_pivot = pd.pivot(
        aggr_data,
        index='dataset',
        columns="Experiment", 
        values="mean"
    ).loc[:, curr_order]
    
    formatted_data = []
    for idx, row in aggr_data.iterrows():
        dataset = row['dataset']
        exp = row['Experiment']
        if dataset in mean_pivot.index and exp in mean_pivot.columns:
            try:
                is_significant = res_df.loc[dataset, 'rejected'][exp]
            except:
                is_significant = False
            formatted = format_if_is_bold(row['mean'], row['std'], is_significant)
            formatted_data.append({
                'dataset': dataset,
                'Experiment': exp,
                'mean_std': formatted
            }) 
    
    formatted_df = pd.DataFrame(formatted_data)
    pivoted_aggr_data = pd.pivot(
        formatted_df,
        index='dataset',
        columns="Experiment",
        values="mean_std"
    )
    
    pivoted_aggr_data = pivoted_aggr_data.loc[:, curr_order]
    display(pivoted_aggr_data)
    pivoted_aggr_data = pivoted_aggr_data.sort_values('CLS+AP layers from all blocks (attentive)', 
                                                      key=lambda x: x.apply(lambda val: float(val.split('±')[0].split('{')[-1].strip())))
    
    pivoted_aggr_data.index.name = None
    pivoted_aggr_data.columns.name = None
    pivoted_aggr_data = pivoted_aggr_data.reset_index(names='Dataset')
    pivoted_aggr_data.insert(0, 'Category', ds_mapping.loc[pivoted_aggr_data["Dataset"].tolist(), 'dataset_domain'].reset_index(drop=True))
    pivoted_aggr_data['Dataset'] = ds_mapping.loc[pivoted_aggr_data["Dataset"].tolist(), 'dataset_fmt'].reset_index(drop=True)

    pivoted_aggr_data = pivoted_aggr_data.sort_values('Category', kind='stable')

    latex_version_pivoted_aggr_data = pivoted_aggr_data.to_latex(escape=False, index=False)

    if SAVE:
        filename = base_storing_path / 'tables_for_per_model_size_n_dataset_perf_gain' / f"{metric_col}_aggr_over_models_table_with_test_v2.tex"  # Creates filename based on metric_col
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(latex_version_pivoted_aggr_data)
        print(f"Stored latex table at {filename.parts[-4:]=}")

    else:
        print(metric_col)
        print()
        print(latex_version_pivoted_aggr_data)
        print()
        print()

Experiment,CLS last layer,All tokens last layer (attentive),CLS+AP last layer (linear),CLS+AP layers from all blocks (linear),CLS+AP last layer (attentive),CLS+AP layers from all blocks (attentive)
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
wds/cars,0.86 ± 0.05,\textbf{13.65 ± 5.44},4.03 ± 2.22,13.41 ± 5.37,2.12 ± 2.82,12.62 ± 5.27
wds/country211,0.31 ± 0.05,66.31 ± 9.81,5.59 ± 2.81,24.84 ± 8.62,8.62 ± 6.98,\textbf{48.02 ± 6.04}
wds/fer2013,0.56 ± 0.03,23.30 ± 6.85,3.40 ± 1.12,11.37 ± 1.59,3.74 ± 1.07,\textbf{16.47 ± 2.07}
wds/fgvc_aircraft,0.74 ± 0.09,\textbf{25.42 ± 8.32},3.75 ± 3.04,23.58 ± 7.18,3.42 ± 3.98,21.81 ± 8.30
wds/gtsrb,0.84 ± 0.03,\textbf{15.76 ± 3.13},5.74 ± 2.03,15.20 ± 2.97,5.83 ± 2.20,15.10 ± 2.87
wds/imagenet1k,0.83 ± 0.08,6.41 ± 3.85,\textbf{1.20 ± 0.78},7.13 ± 4.23,0.30 ± 0.91,\textbf{3.89 ± 3.89}
wds/stl10,\textbf{0.99 ± 0.00},\textbf{0.57 ± 0.45},\textbf{0.11 ± 0.34},\textbf{0.56 ± 0.42},\textbf{0.21 ± 0.22},\textbf{0.52 ± 0.44}
wds/voc2007,0.90 ± 0.04,9.40 ± 4.06,\textbf{2.93 ± 1.51},\textbf{7.21 ± 4.17},\textbf{3.55 ± 0.96},\textbf{9.03 ± 3.76}
wds/vtab/caltech101,0.99 ± 0.01,1.18 ± 0.97,0.72 ± 0.72,1.18 ± 0.97,0.30 ± 0.50,\textbf{1.18 ± 0.97}
wds/vtab/cifar10,0.96 ± 0.03,3.88 ± 2.81,0.73 ± 0.64,2.29 ± 1.70,0.71 ± 0.76,\textbf{2.67 ± 1.81}


abs_perf_gain_train_lp_bal_acc1_mod

\begin{tabular}{llllllll}
\toprule
Category & Dataset & CLS last layer & All tokens last layer (attentive) & CLS+AP last layer (linear) & CLS+AP layers from all blocks (linear) & CLS+AP last layer (attentive) & CLS+AP layers from all blocks (attentive) \\
\midrule
Natural (multi-domain) & STL-10 & \textbf{0.99 ± 0.00} & \textbf{0.57 ± 0.45} & \textbf{0.11 ± 0.34} & \textbf{0.56 ± 0.42} & \textbf{0.21 ± 0.22} & \textbf{0.52 ± 0.44} \\
Natural (multi-domain) & Caltech-101 & 0.99 ± 0.01 & 1.18 ± 0.97 & 0.72 ± 0.72 & 1.18 ± 0.97 & 0.30 ± 0.50 & \textbf{1.18 ± 0.97} \\
Natural (multi-domain) & CIFAR-10 & 0.96 ± 0.03 & 3.88 ± 2.81 & 0.73 ± 0.64 & 2.29 ± 1.70 & 0.71 ± 0.76 & \textbf{2.67 ± 1.81} \\
Natural (multi-domain) & ImageNet-1k & 0.83 ± 0.08 & 6.41 ± 3.85 & \textbf{1.20 ± 0.78} & 7.13 ± 4.23 & 0.30 ± 0.91 & \textbf{3.89 ± 3.89} \\
Natural (multi-domain) & PASCAL VOC 2007 & 0.90 ± 0.04 & 9.40 ± 4.06 & \textbf{2.93 ± 1.51} & \textbf{7.21 ± 4.17} & \te

Experiment,CLS last layer,All tokens last layer (attentive),CLS+AP last layer (linear),CLS+AP layers from all blocks (linear),CLS+AP last layer (attentive),CLS+AP layers from all blocks (attentive)
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
wds/cars,77.81 ± 10.65,\textbf{8.97 ± 5.22},0.50 ± 1.07,-0.86 ± 3.76,1.97 ± 1.95,6.35 ± 3.71
wds/country211,21.48 ± 6.35,-0.83 ± 1.66,1.18 ± 0.54,3.26 ± 1.05,1.35 ± 0.65,\textbf{4.96 ± 1.37}
wds/fer2013,59.08 ± 4.61,7.74 ± 2.15,2.18 ± 1.05,6.25 ± 1.19,3.61 ± 1.13,\textbf{10.05 ± 1.76}
wds/fgvc_aircraft,55.69 ± 12.18,\textbf{9.27 ± 4.37},-0.96 ± 2.22,-1.62 ± 5.01,1.84 ± 2.09,6.43 ± 3.25
wds/gtsrb,71.51 ± 7.46,\textbf{18.02 ± 6.37},4.23 ± 2.60,8.76 ± 4.20,4.69 ± 2.41,13.47 ± 4.92
wds/imagenet1k,81.40 ± 4.49,0.85 ± 1.43,\textbf{0.33 ± 0.46},0.99 ± 1.75,0.15 ± 0.62,\textbf{1.24 ± 1.62}
wds/stl10,\textbf{99.29 ± 0.51},\textbf{0.01 ± 0.16},\textbf{-0.01 ± 0.12},\textbf{0.03 ± 0.10},\textbf{0.03 ± 0.08},\textbf{0.04 ± 0.17}
wds/voc2007,87.82 ± 2.31,-0.22 ± 1.24,\textbf{1.38 ± 0.49},\textbf{1.46 ± 0.99},\textbf{1.19 ± 0.88},\textbf{1.24 ± 0.89}
wds/vtab/caltech101,95.57 ± 1.40,0.23 ± 0.52,0.43 ± 0.41,0.36 ± 0.63,0.09 ± 0.42,\textbf{0.88 ± 0.77}
wds/vtab/cifar10,96.91 ± 1.93,0.42 ± 0.58,0.08 ± 0.11,0.61 ± 0.71,0.19 ± 0.29,\textbf{0.77 ± 0.79}


abs_perf_gain_test_lp_bal_acc1_mod

\begin{tabular}{llllllll}
\toprule
Category & Dataset & CLS last layer & All tokens last layer (attentive) & CLS+AP last layer (linear) & CLS+AP layers from all blocks (linear) & CLS+AP last layer (attentive) & CLS+AP layers from all blocks (attentive) \\
\midrule
Natural (multi-domain) & STL-10 & \textbf{99.29 ± 0.51} & \textbf{0.01 ± 0.16} & \textbf{-0.01 ± 0.12} & \textbf{0.03 ± 0.10} & \textbf{0.03 ± 0.08} & \textbf{0.04 ± 0.17} \\
Natural (multi-domain) & CIFAR-10 & 96.91 ± 1.93 & 0.42 ± 0.58 & 0.08 ± 0.11 & 0.61 ± 0.71 & 0.19 ± 0.29 & \textbf{0.77 ± 0.79} \\
Natural (multi-domain) & Caltech-101 & 95.57 ± 1.40 & 0.23 ± 0.52 & 0.43 ± 0.41 & 0.36 ± 0.63 & 0.09 ± 0.42 & \textbf{0.88 ± 0.77} \\
Natural (multi-domain) & PASCAL VOC 2007 & 87.82 ± 2.31 & -0.22 ± 1.24 & \textbf{1.38 ± 0.49} & \textbf{1.46 ± 0.99} & \textbf{1.19 ± 0.88} & \textbf{1.24 ± 0.89} \\
Natural (multi-domain) & ImageNet-1k & 81.40 ± 4.49 & 0.85 ± 1.43 & \textbf{0.33 ± 0.46} & 0.

### Table version with highest median bold, second highest underline

In [20]:
def format_for_latex(mean_val, std_val, row_means):
    formatted = f"{mean_val:.2f} ± {std_val:.2f}"
    
    sorted_means =sorted(row_means, reverse=True)
    if mean_val not in sorted_means:
        return formatted
        
    rank = sorted_means.index(mean_val) + 1

    if rank == 1:
        return f"\\textbf{{{formatted}}}"
    elif rank == 2:
        return f"\\underline{{{formatted}}}"
    else:
        return formatted

In [21]:
for metric_col in metrics_cols:
    aggr_data = grouped_data[metric_col].agg(["mean", "std"]).reset_index()
    
    # Get means for ranking
    mean_pivot = pd.pivot(
        aggr_data,
        index='dataset',
        columns="Experiment", 
        values="mean"
    ).loc[:, curr_order]
    ranks = mean_pivot.iloc[:, 1:].rank(axis=1, ascending=False).mean()
    
    formatted_data = []
    for idx, row in aggr_data.iterrows():
        model = row['dataset']
        exp = row['Experiment']
        if model in mean_pivot.index and exp in mean_pivot.columns:
            row_means = mean_pivot.loc[model].values
            formatted = format_for_latex(row['mean'], row['std'], row_means[1:])
            formatted_data.append({
                'dataset': model,
                'Experiment': exp,
                'mean_std': formatted
            })
    
    formatted_df = pd.DataFrame(formatted_data)
    pivoted_aggr_data = pd.pivot(
        formatted_df,
        index='dataset',
        columns="Experiment",
        values="mean_std"
    )
    pivoted_aggr_data = pivoted_aggr_data.loc[:, curr_order]
    pivoted_aggr_data = pivoted_aggr_data.sort_values(curr_order[-1], key=lambda x: x.apply(lambda val: float(val.split('±')[0].split('{')[-1].strip())))
    
    pivoted_aggr_data.index.name = None
    pivoted_aggr_data.columns.name = None
    pivoted_aggr_data = pivoted_aggr_data.reset_index(names='Dataset')
    pivoted_aggr_data.insert(0, 'Category', ds_mapping.loc[pivoted_aggr_data["Dataset"].tolist(), 'dataset_domain'].reset_index(drop=True))
    pivoted_aggr_data['Dataset'] = ds_mapping.loc[pivoted_aggr_data["Dataset"].tolist(), 'dataset_fmt'].reset_index(drop=True)

    pivoted_aggr_data = pivoted_aggr_data.sort_values('Category', kind='stable')
    tmp = pd.concat([pivoted_aggr_data, ranks.to_frame().T])
    # latex_version_pivoted_aggr_data = pivoted_aggr_data.to_latex(escape=False, index=False)
    latex_version_pivoted_aggr_data = tmp.to_latex(escape=False, index=False)

    if SAVE:
        filename = base_storing_path / 'tables_for_per_model_size_n_dataset_perf_gain' / f"{metric_col}_aggr_over_models_table_v2.tex"  # Creates filename based on metric_col
        filename.parent.mkdir(parents=True, exist_ok=True)
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(latex_version_pivoted_aggr_data)
        print(f"Stored latex table at {filename.parts[-4:]=}")

    else:
        print(metric_col)
        print()
        print(latex_version_pivoted_aggr_data)
        print()
        print()

abs_perf_gain_train_lp_bal_acc1_mod

\begin{tabular}{llllllll}
\toprule
Category & Dataset & CLS last layer & All tokens last layer (attentive) & CLS+AP last layer (linear) & CLS+AP layers from all blocks (linear) & CLS+AP last layer (attentive) & CLS+AP layers from all blocks (attentive) \\
\midrule
Natural (multi-domain) & STL-10 & 0.99 ± 0.00 & \textbf{0.57 ± 0.45} & 0.11 ± 0.34 & \underline{0.56 ± 0.42} & 0.21 ± 0.22 & 0.52 ± 0.44 \\
Natural (multi-domain) & Caltech-101 & 0.99 ± 0.01 & \textbf{1.18 ± 0.97} & 0.72 ± 0.72 & \textbf{1.18 ± 0.97} & 0.30 ± 0.50 & \textbf{1.18 ± 0.97} \\
Natural (multi-domain) & CIFAR-10 & 0.96 ± 0.03 & \textbf{3.88 ± 2.81} & 0.73 ± 0.64 & 2.29 ± 1.70 & 0.71 ± 0.76 & \underline{2.67 ± 1.81} \\
Natural (multi-domain) & ImageNet-1k & 0.83 ± 0.08 & \underline{6.41 ± 3.85} & 1.20 ± 0.78 & \textbf{7.13 ± 4.23} & 0.30 ± 0.91 & 3.89 ± 3.89 \\
Natural (multi-domain) & PASCAL VOC 2007 & 0.90 ± 0.04 & \textbf{9.40 ± 4.06} & 2.93 ± 1.51 & 7.21 ± 4.17 & 3.55 ± 0.96 