## Notebook to aggregate single experiment results

The first part loads all results (stored in `results.json` files) from all experiments into a single dataframe.

The second part filters the dataframe to keep only the best run per experiment/dataset/model combination (based on validation balanced accuracy).

Finally, different aggregated results dataframes are merged together, additional information columns are added, and absolute/relative performance columns are computed.


In [None]:
%load_ext autoreload
%autoreload 2
    
import pandas as pd
import numpy as np
from helper import add_additional_info, filter_df_for_best_runs, get_abs_rel_performance
from constants import BASE_PATH_PROJECT, FOLDER_SUBSTRING

### Global variables


In [None]:
# Result folders that are searched recursively for `seed_0/results.json` files
# when gathering the runs. Commented-out entries are alternative result folders
# that are currently not scanned; (un)comment them to change the selection.
project_paths = [
    # BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_exp",
    # BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_exp_wd0.1",
    # BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_exp_new_pcam",
    BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_rebuttal",
    BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_end2end_finetuning",
]

# If True, the gathered runs are passed through `filter_df_for_best_runs`.
FILTER_FOR_BEST_MODEL = True
# If True, `get_abs_rel_performance` is applied per (dataset, base_model) group
# before the gathered runs are stored.
COMPUTE_RELATIVE_PERFORMANCES = False

In [None]:
# Folder that receives the aggregated pickle files; it is created if missing.
# The commented-out assignments are alternative target folders.
# NOTE(review): the merge cell further down reads `all_runs_rebuttal.pkl` from the
# `_rebuttal` and `_end2end_finetuning` aggregated folders, not from this folder —
# confirm the active target matches the run that is being aggregated.
storing_path = BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_exp/aggregated"
# storing_path = BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_rebuttal/aggregated"
# storing_path = BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_end2end_finetuning/aggregated"
storing_path.mkdir(parents=True, exist_ok=True)

# If False, the storing cells raise FileExistsError instead of replacing an
# already existing pickle file.
OVERWRITE = True

### Gather all results


In [None]:
# Maps the probe identifier found in a result path to the label stored in the
# `probe_type` column.
# NOTE(review): the key "v-jepra" presumably mirrors the on-disk folder name —
# confirm it is not a typo of "v-jepa" before changing it.
probe_type_mapping = {
    "ep": "Efficient Probe",
    "aim": "AIM",
    "v-jepra": "V-JEPA",
    "linear": "linear",
    "cae": "cae",
    "cae_nowq": "Ours-Wq",
}


def get_probe_type(res_path):
    """Return the display label of the probe type encoded in a result path.

    The identifier is the text between the last ``probe_type_`` marker in the
    absolute POSIX path and the next ``/``. If the path has no such marker
    (or the identifier is empty), the probe type falls back to ``"linear"``
    when ``linear_probe`` occurs anywhere in the path and to ``"cae"``
    otherwise.

    Args:
        res_path: Path-like object providing ``absolute()`` and ``as_posix()``.

    Returns:
        The label from ``probe_type_mapping`` for the detected identifier.

    Raises:
        KeyError: If the detected identifier is not in ``probe_type_mapping``.
    """
    posix_path = res_path.absolute().as_posix()
    # Detect the marker explicitly instead of relying on the leading "/" of an
    # absolute POSIX path producing an empty first segment; that implicit check
    # breaks for paths that start with a drive letter (e.g. "C:/...").
    _, marker, tail = posix_path.rpartition("probe_type_")
    probe_type = tail.split("/", 1)[0] if marker else ""
    if not probe_type:
        probe_type = "linear" if "linear_probe" in posix_path else "cae"
    return probe_type_mapping[probe_type]

In [None]:
# Collect one dataframe per `seed_0/results.json` found below the configured
# project folders and stack them into a single dataframe.
res = []
for project_path in project_paths:
    for res_path in project_path.rglob("seed_0/results.json"):
        # The pipeline type is derived from substrings of the result path.
        res_path_str = str(res_path)
        if "_unfrozen_premodel" in res_path_str:
            pipeline_type = "end2end_finetuning"
        elif "_frozen_premodel" in res_path_str:
            pipeline_type = "end2end_probe"
        else:
            pipeline_type = "feature_probe"

        df = pd.read_json(res_path)
        df["pipeline_type"] = pipeline_type
        df = add_additional_info(df)

        # Bookkeeping columns describing where the run was loaded from.
        extra_columns = {
            # Up to ten path components directly above `results.json`.
            "model_id_n_hopt_slug": "/".join(res_path.parts[-11:-1]),
            "res_folder": project_path.name,
            "res_path": res_path,
            "probe_type": get_probe_type(res_path),
        }
        for column_name, column_value in extra_columns.items():
            df[column_name] = column_value
        res.append(df)

stacked_results = pd.concat(res)
all_results = stacked_results.reset_index(drop=True)
all_results.shape

In [None]:
# Drop the runs whose `attention_dropout` entry is one of the excluded settings.
# Without an `attention_dropout` column there is nothing to drop.
excluded_attention_dropouts = ["[0.1, 0.0]", "[0.3, 0.0]"]
if "attention_dropout" in all_results.columns:
    is_excluded = all_results["attention_dropout"].isin(excluded_attention_dropouts)
    all_results = all_results[~is_excluded].reset_index(drop=True)
else:
    print("No attentive probes included in all_results")
all_results.shape

### Post process the results

- For each combination, select the best run based on the validation balanced accuracy
- Compute the relative performance gain compared to one layer


In [None]:
# Keep an independent snapshot of the gathered runs so the post-processing
# cells below can be re-run without gathering all results again.
bak = all_results.copy(deep=True)

In [None]:
# Restore the gathered runs from the snapshot (the snapshot itself stays untouched).
all_results = bak.copy(deep=True)

In [None]:
# Optionally reduce the runs via `filter_df_for_best_runs`, using the validation
# balanced accuracy column as metric and one group per
# task/probe type/experiment/dataset/model combination.
if FILTER_FOR_BEST_MODEL:
    print("Filtering all runs ...")
    best_run_group_cols = ["task", "probe_type", "experiment", "dataset", "model_ids"]
    all_results = filter_df_for_best_runs(
        df=all_results,
        metric_col="best_val_bal_acc1",
        group_cols=best_run_group_cols,
    ).reset_index(drop=True)

In [None]:
# Optionally apply `get_abs_rel_performance` to every (dataset, base_model) group
# and report the dataframe shape before and after.
if COMPUTE_RELATIVE_PERFORMANCES:
    print(f"{all_results.shape=} before computing relative performances")
    runs_per_dataset_and_model = all_results.groupby(["dataset", "base_model"])
    with_performances = runs_per_dataset_and_model.apply(
        get_abs_rel_performance, include_groups=False
    )
    all_results = with_performances.reset_index()
    print(f"{all_results.shape=} after computing relative performances")

### Store the aggregated data


In [None]:
# Pickle the aggregated runs; refuse to replace an existing file unless
# OVERWRITE is set.
fn = storing_path / "all_runs_rebuttal.pkl"
if fn.exists() and not OVERWRITE:
    raise FileExistsError(f"File {fn} already exists. No overwriting!!")

all_results.to_pickle(fn)
print(f"Stored all aggregated results at {fn=}")

#### Merge aggregated results

Multiple aggregated results dataframes are merged together to form a single final dataframe of all experiments.


In [None]:
def _read_runs(pkl_path):
    """Print the pickle path, load it, print the loaded shape and return the dataframe."""
    print(pkl_path)
    runs = pd.read_pickle(pkl_path)
    print(runs.shape)
    return runs


# New runs
all_runs_path = (
    BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_rebuttal/aggregated/all_runs_rebuttal.pkl"
)
all_runs = _read_runs(all_runs_path)

# Old runs
prev_runs_path = BASE_PATH_PROJECT / f"results_{FOLDER_SUBSTRING}_exp/aggregated/all_runs_v11.pkl"
prev_runs = _read_runs(prev_runs_path)

# End-to-end runs
runs_fine_tuning_path = (
    BASE_PATH_PROJECT
    / f"results_{FOLDER_SUBSTRING}_end2end_finetuning/aggregated/all_runs_rebuttal.pkl"
)
runs_fine_tuning = _read_runs(runs_fine_tuning_path)

# Stack the three sets (old, end-to-end, new) and drop the excluded dataset.
combined_runs = pd.concat([prev_runs, runs_fine_tuning, all_runs])
is_excluded_dataset = combined_runs["dataset"].isin(["imagenet-subset-50k"])
all_runs = combined_runs[~is_excluded_dataset].reset_index(drop=True)

print("Concatenated:", all_runs.shape)

In [None]:
# Store runs recorded under the dataset name "imagenet_torchvision" under
# "wds/imagenet1k" instead.
is_torchvision_imagenet = all_runs["dataset"] == "imagenet_torchvision"
all_runs.loc[is_torchvision_imagenet, "dataset"] = "wds/imagenet1k"

In [None]:
# Apply `get_abs_rel_performance` separately to every (dataset, base_model) group.
result = all_runs.groupby(["dataset", "base_model"]).apply(
    get_abs_rel_performance, include_groups=False
)
# Move the two group keys from the index back into regular columns.
all_runs = result.reset_index(level=[0, 1])

# Drop the rows of the excluded dataset and renumber the remaining rows.
all_runs = all_runs[~all_runs["dataset"].isin(["imagenet-subset-50k"])].reset_index(
    drop=True
)

# Flag runs that contain intermediate layers, i.e. at least one entry of
# `model_ids` whose part after the last "@" is neither "norm" nor "visual".
# NOTE(review): `eval` executes whatever expression the `model_ids` string holds.
# If the column always stores plain list literals, `ast.literal_eval` would be the
# safer choice — confirm the stored format before switching.
all_runs["contains_intermediate"] = all_runs["model_ids"].apply(
    lambda x: any(elem.split("@")[-1] not in ("norm", "visual") for elem in eval(x))
)

# Fill missing probe types from the task name: tasks containing "linear" become
# "linear", all others "cae".
missing_probe_type = all_runs["probe_type"].isna()
all_runs.loc[missing_probe_type, "probe_type"] = np.where(
    all_runs.loc[missing_probe_type, "task"].str.contains("linear"),
    "linear",
    "cae",
)
# Rows without a pipeline type are labelled "feature_probe".
all_runs.loc[all_runs["pipeline_type"].isna(), "pipeline_type"] = "feature_probe"

# Columns that are ignored when looking for duplicated runs. Names that are not
# present in the dataframe have no effect.
# NOTE(review): "test_data_inference__time_hr" contains a double underscore —
# confirm it matches the actual column name.
non_distinguishing_cols = {
    "hopt_time_hr",
    "hopt_time_s",
    "level_2",
    "res_folder",
    "res_path",
    "test_data_inference__time_hr",
    "test_data_inference_time",
    "train_data_inference_time",
    "train_data_inference_time_hr",
    "training_time",
    "training_time_hr",
}
# Keep the dataframe's column order so the selection is deterministic.
cols_distinct = [col for col in all_runs.columns if col not in non_distinguishing_cols]

print(all_runs.shape)
# Compute the duplicate mask once; the first occurrence of each run is kept.
is_duplicate = all_runs[cols_distinct].duplicated(keep="first")
duplicated_tuns = all_runs[is_duplicate].copy()
all_runs = all_runs[~is_duplicate]
print(all_runs.shape)

In [None]:
# Pickle the merged and de-duplicated runs; refuse to replace an existing file
# unless OVERWRITE is set.
fn = storing_path / "complete_set_of_run.pkl"
if fn.exists() and not OVERWRITE:
    raise FileExistsError(f"File {fn} already exists. No overwriting!!")

all_runs.to_pickle(fn)
print(f"Stored all aggregated results at {fn=}")