In [1]:
import os
import pandas as pd
import yaml

In [2]:
import warnings

warnings.filterwarnings("ignore", category=pd.core.common.SettingWithCopyWarning)

In [3]:
with open("../../enriched_experiments.yml", "r") as f:
    exps = yaml.safe_load(f)

In [4]:
experiments = exps["experiments"]

In [5]:
# Get all runs that produced some accuracy results
unprocessed_count = 0
processed_count = 0
failed_cnt = 0
structured_results = []
pretrain_results = []
for experiment in experiments:
    if experiment.get("failed"):
        failed_cnt += 1
        continue
    accuracy_cols = [key for key in experiment.keys() if "accuracy" in key]
    mse_cols = [key for key in experiment.keys() if "mse" in key]
    loss_cols = [
        key
        for key in experiment.keys()
        if "loss" in key and key != "model/auxiliary_loss_ratio"
    ]

    if len(accuracy_cols) == 0:
        # TODO: later collect STSN runs
        if len(mse_cols) == 0:
            # pretraining - no loss named accuracy or mse (/loss is used)
            pretrain_results.append(
                {col: experiment.get(col, {}).get("best_loss") for col in loss_cols}
                | {
                    "loss/best_ckpt": experiment.get("loss/best_ckpt"),
                    "wandb_urls": ",".join(experiment.get("wandb_urls", [])),
                    "experiment_nm": experiment.get("experiment_nm"),
                    "test_nm": experiment.get("test_nm"),
                    "based_on/slurm_id": experiment.get("based_on", {}).get("slurm_id"),
                    "max_epoch": experiment.get("max_epoch"),
                    "lr": experiment.get("lr"),
                    "img_size": experiment.get("img_size"),
                    "batch_size": experiment.get("batch_size"),
                    "model/auxiliary_loss_ratio": experiment.get(
                        "model/auxiliary_loss_ratio"
                    ),
                    "model/num_slots": experiment.get("model/num_slots"),
                    "model/num_iterations": experiment.get("model/num_iterations"),
                    "model/hid_dim": experiment.get("model/hid_dim"),
                    "model/class": experiment.get("model/class"),
                    "slurm_ids": ",".join(
                        [str(_x) for _x in experiment.get("slurm_id", [])]
                    ),
                }
            )
            processed_count += 1
        else:
            unprocessed_count += 1
        continue

    structured_results.append(
        {col: experiment[col]["best_loss"] for col in accuracy_cols}
        | {col: experiment[col]["best_loss"] for col in mse_cols}
        | {
            "loss/best_ckpt": experiment.get("loss/best_ckpt"),
            "wandb_urls": ",".join(experiment.get("wandb_urls", [])),
            "experiment_nm": experiment.get("experiment_nm"),
            "test_nm": experiment.get("test_nm"),
            "based_on/slurm_id": experiment.get("based_on", {}).get("slurm_id"),
            "max_epoch": experiment.get("max_epoch"),
            "finetuned_from_slurm_id": experiment.get("additional_inforamations", {})
            .get("finetuned_from", {})
            .get("slurm_id"),
            "finetuned_from_wandb_id": experiment.get("additional_inforamations", {})
            .get("finetuned_from", {})
            .get("wandb_id"),
            "lr": experiment.get("lr"),
            "img_size": experiment.get("img_size"),
            "batch_size": experiment.get("batch_size"),
            "model/auxiliary_loss_ratio": experiment.get("model/auxiliary_loss_ratio"),
            "model/class": experiment.get("model/class"),
            "relation-model/class": experiment.get("relation-model/class"),
            "val/loss": experiment.get("val/loss", {}).get("best_loss"),
            "slurm_ids": ",".join([str(_x) for _x in experiment.get("slurm_id", [])]),
        }
    )
    # TODO: enrich with information from "additional_inforamations" (TODO: add relevant fields in enrich_experiments.py)
    processed_count += 1
print(f"Processed {processed_count} experiments")
print(f"Unprocessed {unprocessed_count} experiments")
print(f"Failed {failed_cnt} experiments")

Processed 345 experiments
Unprocessed 0 experiments
Failed 2 experiments


In [6]:
# TODO: split by dataset (not mutually exclusive -- lets duplicate the information)
# VAEC/VASR/HOI/LOGO/LOGO+HOI/VAEC+VASR/LOGO+VAEC/HOI+VASR

# might be nice to add identifiers different than slurm_ids
# (autoincrement value would work but could be hard to
# automatically assign it so that it makes sense in paper)

In [7]:
pretrain_df = pd.DataFrame(pretrain_results)

In [8]:
df = pd.DataFrame(structured_results)

In [9]:
full_df = df.copy()

### normalize columns


In [10]:
cols = df.columns

In [11]:
pretrain_cols = pretrain_df.columns

In [12]:
train_acc = [col for col in cols if "train" in col and "accuracy" in col]
val_acc = [col for col in cols if "val" in col and "accuracy" in col]

In [13]:
train_loss = [col for col in pretrain_cols if "train" in col and "loss" in col]
val_loss = [col for col in pretrain_cols if "val" in col and "loss" in col]

In [14]:
datasets = ["bongard_logo", "bongard_hoi", "vaec", "vasr"]

In [15]:
for dataset in datasets:
    _cols = [col for col in train_acc if dataset in col]
    df[f"train/{dataset}/accuracy"] = df[_cols].bfill(axis=1).iloc[:, 0]

    _cols = [col for col in val_acc if dataset in col]
    df[f"val/{dataset}/accuracy"] = df[_cols].bfill(axis=1).iloc[:, 0]

    _cols = [col for col in train_loss if dataset in col]
    pretrain_df[f"train/{dataset}/mse"] = pretrain_df[_cols].bfill(axis=1).iloc[:, 0]

    _cols = [col for col in val_loss if dataset in col]
    pretrain_df[f"val/{dataset}/mse"] = pretrain_df[_cols].bfill(axis=1).iloc[:, 0]

In [16]:
df = df.drop(
    columns=set(train_acc) - {f"train/{dataset}/accuracy" for dataset in datasets}
)
df = df.drop(columns=set(val_acc) - {f"val/{dataset}/accuracy" for dataset in datasets})

In [17]:
pretrain_df = pretrain_df.drop(columns=[*train_loss, *val_loss])

In [18]:
test_acc = [col for col in cols if "test" in col and "accuracy" in col]

In [19]:
test_types = [
    ("bongard_logo", "test_bd"),
    ("bongard_logo", "test_ff"),
    ("bongard_logo", "test_hd_comb"),
    ("bongard_logo", "test_hd_novel"),
    ("vaec", "test1"),
    ("vaec", "test2"),
    ("vaec", "test3"),
    ("vaec", "test4"),
    ("vaec", "test5"),
    ("bongard_hoi", "seen-seen"),
    ("bongard_hoi", "seen-unseen"),
    ("bongard_hoi", "unseen-seen"),
    ("bongard_hoi", "unseen-unseen"),
    ("vasr", "vasr"),
]

In [20]:
for dataset, test_type in test_types:
    _cols = [col for col in test_acc if test_type in col]
    df[f"test/{dataset}/{test_type}"] = df[_cols].bfill(axis=1).iloc[:, 0]

In [21]:
df = df.drop(columns=test_acc)

### enrich data


In [22]:
from itertools import combinations

In [23]:
# datasets_short = ["logo", "hoi", "vaec", "vasr"]

# is_double = None
# for _a, _b in list(combinations(datasets_short, 2)):
#     if is_double is not None:
#         is_double |= (df.experiment_nm.str.contains(_a)) & (df.experiment_nm.str.contains(_b))
#     else:
#         is_double = (df.experiment_nm.str.contains(_a)) & (df.experiment_nm.str.contains(_b))

In [24]:
is_double = None
for _a, _b in list(combinations(datasets, 2)):
    if is_double is not None:
        is_double |= (~pretrain_df[f"train/{_a}/mse"].isna()) & (
            ~pretrain_df[f"train/{_b}/mse"].isna()
        )
    else:
        is_double = (~pretrain_df[f"train/{_a}/mse"].isna()) & (
            ~pretrain_df[f"train/{_b}/mse"].isna()
        )
pretrain_df.loc[:, "is_double"] = is_double

In [25]:
is_double = None
for _a, _b in list(combinations(datasets, 2)):
    if is_double is not None:
        is_double |= (~df[f"train/{_a}/accuracy"].isna()) & (
            ~df[f"train/{_b}/accuracy"].isna()
        )
    else:
        is_double = (~df[f"train/{_a}/accuracy"].isna()) & (
            ~df[f"train/{_b}/accuracy"].isna()
        )
df.loc[:, "is_double"] = is_double

In [26]:
pretrain_df.loc[:, "id"] = "0." + (pretrain_df.index + 1).astype(str)

### Split into groups


In [27]:
# manaully append finetune? (or base it on finetune from)

In [28]:
sdatasets = ["bongard_hoi", "vasr", "bongard_logo", "vaec"]

In [29]:
for dataset in sdatasets:
    cols = [
        # "wandb_urls",
        "id",
        "experiment_nm",
        "test_nm",
        # "based_on/slurm_id",
        "model/class",
        "img_size",
        "batch_size",
        # "model/auxiliary_loss_ratio",
        "model/num_slots",
        "model/num_iterations",
        "model/hid_dim",
        "max_epoch",
        f"train/{dataset}/mse",
        f"val/{dataset}/mse",
        "is_double",
    ]
    _df = pretrain_df[cols]
    # _df["model/auxiliary_loss_ratio"].fillna(0, inplace=True)
    _df = _df.dropna()
    _df.dataset = dataset
    _df["model/class"] = _df["model/class"].str.split(".").str[2]
    _df.max_epoch = _df.max_epoch.astype("Int32")
    display(_df)

Unnamed: 0,id,experiment_nm,test_nm,model/class,img_size,batch_size,model/num_slots,model/num_iterations,model/hid_dim,max_epoch,train/bongard_hoi/mse,val/bongard_hoi/mse,is_double
0,0.1,bongard_hoi_vasr_images,lr_check,STSN,256,16,20,3,64,2,0.0,0.025924,True
1,0.2,bongard_hoi_vasr_images,lr_check,STSN,128,32,20,3,64,5,0.0,0.020044,True
2,0.3,bongard_hoi_vasr_images,lr_check,STSN,256,16,20,3,64,7,0.0,0.023244,True
3,0.4,bongard_hoi_vasr_images,lr_check,STSNv3,256,8,40,3,64,1,0.0,0.017197,True
4,0.5,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,64,20,3,64,18,0.0,0.014909,True
5,0.6,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,128,10,3,64,17,0.0,0.018479,True
6,0.7,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,32,30,3,64,19,0.0,0.011832,True
7,0.8,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,32,40,3,64,24,0.0,0.010122,True
8,0.9,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,32,50,3,64,15,0.0,0.009866,True
9,0.1,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,16,60,3,64,18,0.0,0.008151,True


Unnamed: 0,id,experiment_nm,test_nm,model/class,img_size,batch_size,model/num_slots,model/num_iterations,model/hid_dim,max_epoch,train/vasr/mse,val/vasr/mse,is_double
0,0.1,bongard_hoi_vasr_images,lr_check,STSN,256,16,20,3,64,2,0.00765,0.024717,True
1,0.2,bongard_hoi_vasr_images,lr_check,STSN,128,32,20,3,64,5,0.005892,0.019478,True
2,0.3,bongard_hoi_vasr_images,lr_check,STSN,256,16,20,3,64,7,0.00671,0.021864,True
3,0.4,bongard_hoi_vasr_images,lr_check,STSNv3,256,8,40,3,64,1,0.004818,0.014318,True
4,0.5,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,64,20,3,64,18,0.004018,0.012699,True
5,0.6,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,128,10,3,64,17,0.004935,0.016218,True
6,0.7,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,32,30,3,64,19,0.003141,0.009839,True
7,0.8,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,32,40,3,64,24,0.002693,0.008314,True
8,0.9,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,32,50,3,64,15,0.002618,0.008135,True
9,0.1,bongard_hoi_vasr_images,number_of_slots,STSNv3,128,16,60,3,64,18,0.002202,0.006632,True


Unnamed: 0,id,experiment_nm,test_nm,model/class,img_size,batch_size,model/num_slots,model/num_iterations,model/hid_dim,max_epoch,train/bongard_logo/mse,val/bongard_logo/mse,is_double
25,0.26,bongard_logo_vaec_images,number_of_slots,STSNv3,80,128,20,3,64,70,0.000256,0.000834,True
26,0.27,bongard_logo_vaec_images,number_of_slots,STSNv3,80,256,10,3,64,60,0.000696,0.002539,True
27,0.28,bongard_logo_images,number_of_slots,STSNv3,80,256,10,3,64,165,0.001149,0.001245,False
28,0.29,bongard_logo_images,number_of_slots,STSNv3,80,128,20,3,64,77,0.001243,0.001227,False


Unnamed: 0,id,experiment_nm,test_nm,model/class,img_size,batch_size,model/num_slots,model/num_iterations,model/hid_dim,max_epoch,train/vaec/mse,val/vaec/mse,is_double
25,0.26,bongard_logo_vaec_images,number_of_slots,STSNv3,80,128,20,3,64,70,0.000259,0.00092,True
26,0.27,bongard_logo_vaec_images,number_of_slots,STSNv3,80,256,10,3,64,60,0.000702,0.001655,True
29,0.3,vaec_images,number_of_slots,STSNv3,80,256,10,3,64,47,7e-06,0.001268,False
30,0.31,vaec_images,number_of_slots,STSNv3,80,128,20,3,64,34,4e-06,0.001885,False


In [None]:
for dataset in sdatasets:
    cols = [
        # "wandb_urls",
        "id",
        # "experiment_nm",
        "test_nm",
        "is_double",
        # "based_on/slurm_id",
        "lr",
        "model/class",
        "img_size",
        "batch_size",
        "max_epoch",
        # "model/auxiliary_loss_ratio",
        "model/num_slots",
        "model/num_iterations",
        "model/hid_dim",
        f"train/{dataset}/mse",
        f"val/{dataset}/mse",
    ]
    _df = pretrain_df[cols]
    # _df["model/auxiliary_loss_ratio"].fillna(0, inplace=True)
    _df = _df.dropna(subset=[f"train/{dataset}/mse", f"val/{dataset}/mse"])
    _df.dataset = dataset
    _df.max_epoch = _df.max_epoch.astype("Int32")
    _df["model/class"] = _df["model/class"].str.split(".").str[2]
    _df.max_epoch = _df.max_epoch.astype("Int32")
    # _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    out = []
    for column in _df.columns:
        splt = column.split("/")
        if len(splt) == 3:
            out.append((splt[0], splt[1], splt[2]))
        elif len(splt) == 2:
            if splt[0] in ["train", "val", "test"]:
                out.append((splt[0], "", splt[1]))
                continue
            if column == "model/auxilary_loss_ratio":
                out.append(("auxiliary", "loss", "ratio"))
                continue
            out.append(("", splt[0], splt[1]))
        else:
            if column == "finetune_from":
                out.append(("", "finetune", "from"))
                continue
            out.append(("", "", splt[0]))
    _df.columns = pd.MultiIndex.from_tuples(out)

    print(f"\n%{'-'*20} {dataset} {'-'*20}\n")

    lr_prec = 5 if _df[("", "", "lr")].min() < 1e-5 else 4
    _tbl = (
        _df.style.highlight_min(
            subset=[
                ("train", dataset, "mse"),
                ("val", dataset, "mse"),
            ],
            axis=0,
            props="textbf:--rwrap;",
        )
        .format(
            subset=[
                ("train", dataset, "mse"),
                ("val", dataset, "mse"),
            ],
            precision=5,
        )
        .format(
            subset=[("", "", "lr")],
            precision=lr_prec,
        )
        .hide()
        .to_latex(
            caption=f"Pretraining results for {dataset}",
            hrules=True,
            label=f"tab:pretrain-{dataset.replace('_', '-')}",
            # siunitx=True,
        )
        .replace("number_of", "no")
        .replace("no_iterations", "no_iter")
        .replace("_", "\\_")
    )
    # add \resizebox{\textwidth}{!}{% ... } between tabular and end tabular
    _tbl = _tbl.replace(
        r"\begin{tabular}",
        r"""\resizebox{\textwidth}{!}{%
\begin{tabular}""",
    )
    _tbl = _tbl.replace(
        r"\end{tabular}",
        r"""\end{tabular}
}""",
    )

    print(_tbl)

In [31]:
_mapper_df = pd.concat(
    (pretrain_df["id"], pretrain_df["slurm_ids"].str.split(",").explode()), axis=1
)

slurm_id_to_expr_map = _mapper_df.set_index("slurm_ids").to_dict()

In [32]:
# STSN - 1.1 - 1.N

ix = (
    (~df.experiment_nm.str.contains("combined"))
    & (~df.experiment_nm.str.contains("esnb"))
    & (~df.experiment_nm.str.contains("finetune"))
    & (
        df["model/class"]
        != "model.models.baseline_scoring_modules_v2.BaselineScoringModel"
    )
)
ids_connected = df.loc[ix, :].slurm_ids.str.split(",").explode().astype("Int32")
slurm_id_to_idx_map = (
    ids_connected.reset_index().set_index("slurm_ids").to_dict()["index"]
)

ix2 = (df.experiment_nm == "finetune") & (
    df["finetuned_from_slurm_id"].isin(ids_connected)
)
df_stsn = df[(ix) | (ix2)]
df_stsn.loc[:, "id"] = [f"1.{_x}" for _x in range(1, len(df_stsn) + 1)]
# based on mapping
df_stsn.loc[:, "based_on"] = (
    df_stsn["based_on/slurm_id"]
    .astype("Int32")
    .astype(str)
    .apply(lambda x: slurm_id_to_expr_map["id"].get(x, None))
)
finetune_ix = ~pd.isna(df_stsn.finetuned_from_slurm_id)
df_stsn.loc[finetune_ix, "finetune_from"] = (
    df_stsn.loc[finetune_ix, :]
    .finetuned_from_slurm_id.astype("Int32")
    .apply(lambda x: df_stsn.loc[slurm_id_to_idx_map.get(x, None), "id"])
)

In [None]:
for dataset in sdatasets:
    cols = [
        # "wandb_urls",
        "id",
        # "experiment_nm",
        "test_nm",
        # "based_on/slurm_id",
        "based_on",
        "finetune_from",
        "is_double",
        # "model/class",
        "img_size",
        "batch_size",
        "model/auxiliary_loss_ratio",
        # "model/num_slots",
        # "model/num_iterations",
        # "model/hid_dim",
        "max_epoch",
        "val/loss",
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        # TODO: Add Test results for each regime)
    ]
    test_cols = [
        f"test/{dataset}/{test_type}"
        for test_type in map(
            lambda x: x[1], filter(lambda x: x[0] == dataset, test_types)
        )
    ]
    _df = df_stsn[[*cols, *test_cols]]
    _df["model/auxiliary_loss_ratio"] = (
        _df["model/auxiliary_loss_ratio"].fillna(0).astype("int")
    )
    _df = _df.dropna(subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy"])
    _df.dataset = dataset
    # _df["model/class"] = _df["model/class"].str.split(".").str[3]
    _df.max_epoch = _df.max_epoch.astype("Int32")
    _df.finetune_from = _df.finetune_from.combine_first(_df.based_on)
    _df.drop(columns=["based_on"], inplace=True)

    out = []
    for column in _df.columns:
        splt = column.split("/")
        if len(splt) == 3:
            out.append((splt[0], splt[1], splt[2]))
        elif len(splt) == 2:
            if splt[0] in ["train", "val", "test"]:
                out.append((splt[0], "", splt[1]))
                continue
            if column == "model/auxilary_loss_ratio":
                out.append(("auxiliary", "loss", "ratio"))
                continue
            out.append(("", splt[0], splt[1]))
        else:
            if column == "finetune_from":
                out.append(("", "finetune", "from"))
                continue
            out.append(("", "", splt[0]))
    _df.columns = pd.MultiIndex.from_tuples(out)
    # _df.loc[:,"test"] = _df.apply(
    #     lambda x: f'{", ".join([f"{x[col]:.2%}" for col in test_cols if not pd.isna(x[col])])}',
    #     axis=1,
    # )
    # _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    print(f"\n%{'-'*20} {dataset} {'-'*20}\n")

    _highlight_max = [
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        *test_cols,
    ]
    _highlight_max = [col.split("/") for col in _highlight_max]
    _tbl = (
        _df.style.highlight_max(
            # subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy", *test_cols],
            subset=_highlight_max,
            axis=0,
            props="textbf:--rwrap;",
        )
        .highlight_min(
            subset=[("val", "", "loss")],
            axis=0,
            props="textbf:--rwrap;",
        )
        .hide()
        .format(
            formatter="{:.2%}".format,
            subset=[
                *_highlight_max,
                ("train", dataset, "accuracy"),
                ("val", dataset, "accuracy"),
            ],
        )
        .format(subset=[("val", "", "loss")], precision=3)
        .to_latex(
            caption=f"Classification results for {dataset}",
            hrules=True,
            label=f"tab:stsn-{dataset.replace('_', '-')}",
            # siunitx=True,
            # column_format="lp{0.5cm}p{1.8cm}p{1.5cm}p{1.0cm}p{0.7cm}p{0.7cm}p{0.7cm}p{0.6cm}p{0.6cm}p{0.4cm}p{0.4cm}p{0.5cm}"
            # + len(test_cols) * "p{0.4cm}",
            multicol_align="c",
        )
        .replace("train_", "")
        .replace("trained_", "")
        .replace("_test", "")
        .replace("finetune_on_single_task_based_on_dual", "finetune")
        .replace("_", "\\_")
        .replace("nan%", "---")
        .replace("nan", "---")
        .replace("None", "---")
        .replace("%", "\%")
    )
    # add \resizebox{\textwidth}{!}{% ... } between tabular and end tabular
    _tbl = _tbl.replace(
        r"\begin{tabular}",
        r"""\resizebox{\textwidth}{!}{%
\begin{tabular}""",
    )

    _tbl = _tbl.replace(
        r"\end{tabular}",
        r"""\end{tabular}
}""",
    )

    # display(_df.shape)
    # display(_df)
    print(_tbl)

In [None]:
for dataset in sdatasets:
    cols = [
        "wandb_urls",
        "id",
        "experiment_nm",
        "test_nm",
        # "based_on/slurm_id",
        "model/class",
        "img_size",
        "batch_size",
        "model/auxiliary_loss_ratio",
        # "model/num_slots",
        # "model/num_iterations",
        # "model/hid_dim",
        "max_epoch",
        "val/loss",
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        # TODO: Add Test results for each regime)
        "based_on",
        "finetune_from",
        "is_double",
    ]
    test_cols = [
        f"test/{dataset}/{test_type}"
        for test_type in map(
            lambda x: x[1], filter(lambda x: x[0] == dataset, test_types)
        )
    ]
    _df = df_stsn[[*cols, *test_cols]]
    _df["model/auxiliary_loss_ratio"] = (
        _df["model/auxiliary_loss_ratio"].fillna(0).astype("int")
    )
    _df = _df.dropna(subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy"])
    _df.dataset = dataset
    _df["model/class"] = _df["model/class"].str.split(".").str[3]
    _df.max_epoch = _df.max_epoch.astype("Int32")
    _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    out = []
    for column in _df.columns:
        splt = column.split("/")
        if len(splt) == 3:
            out.append((splt[0], splt[1], splt[2]))
        elif len(splt) == 2:
            if splt[0] in ["train", "val", "test"]:
                out.append((splt[0], "", splt[1]))
                continue
            if column == "model/auxilary_loss_ratio":
                out.append(("auxiliary", "loss", "ratio"))
                continue
            out.append(("", splt[0], splt[1]))
        else:
            out.append(("", "", splt[0]))
    _df.columns = pd.MultiIndex.from_tuples(out)
    # _df.loc[:,"test"] = _df.apply(
    #     lambda x: f'{", ".join([f"{x[col]:.2%}" for col in test_cols if not pd.isna(x[col])])}',
    #     axis=1,
    # )
    # _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    print(f"\n%{'-'*20} {dataset} {'-'*20}\n")

    _highlight_max = [
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        *test_cols,
    ]
    _highlight_max = [col.split("/") for col in _highlight_max]
    _tbl = (
        _df.style.highlight_max(
            # subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy", *test_cols],
            subset=_highlight_max,
            axis=0,
            props="textbf:--rwrap;",
        )
        .highlight_min(
            subset=[("val", "", "loss")],
            axis=0,
            props="textbf:--rwrap;",
        )
        .hide()
        .format(
            formatter="{:.2%}".format,
            subset=[
                *_highlight_max,
                ("train", dataset, "accuracy"),
                ("val", dataset, "accuracy"),
            ],
        )
        .to_latex(
            caption=f"Classification results for {dataset}",
            hrules=True,
            label=f"tab:pretrain-{dataset.replace('_', '-')}",
            # siunitx=True,
            # column_format="lp{0.5cm}p{1.8cm}p{1.5cm}p{1.0cm}p{0.7cm}p{0.7cm}p{0.7cm}p{0.6cm}p{0.6cm}p{0.4cm}p{0.4cm}p{0.5cm}"
            # + len(test_cols) * "p{0.4cm}",
            multicol_align="c",
        )
        .replace("_", "\\_")
        .replace("nan%", "---")
        .replace("nan", "---")
        .replace("%", "\%")
    )
    # add \resizebox{\textwidth}{!}{% ... } between tabular and end tabular
    _tbl = _tbl.replace(
        r"\begin{tabular}",
        r"""\resizebox{\textwidth}{!}{%
\begin{tabular}""",
    )

    _tbl = _tbl.replace(
        r"\end{tabular}",
        r"""\end{tabular}
}""",
    )

    display(_df.shape)
    display(_df)
    # print(_tbl)

In [35]:
# ESNB - 2.1 - 2.N

ix = (
    # (~df.experiment_nm.str.contains("combined"))
    df.experiment_nm.str.contains("esnb")
    # & (~df.experiment_nm.str.contains("finetune"))
    # & (
    #     df["model/class"]
    #     != "model.models.baseline_scoring_modules_v2.BaselineScoringModel"
    # )
)
ids_connected = df.loc[ix, :].slurm_ids.str.split(",").explode().astype("Int32")
slurm_id_to_idx_map = (
    ids_connected.reset_index().set_index("slurm_ids").to_dict()["index"]
)

ix2 = (df.experiment_nm == "finetune") & (
    df["finetuned_from_slurm_id"].isin(ids_connected)
)
df_esnb = df[(ix) | (ix2)]
df_esnb.loc[:, "id"] = [f"2.{_x}" for _x in range(1, len(df_esnb) + 1)]
# based on mapping
df_esnb.loc[:, "based_on"] = (
    df_esnb["based_on/slurm_id"]
    .astype("Int32")
    .astype(str)
    .apply(lambda x: slurm_id_to_expr_map["id"].get(x, None))
)
finetune_ix = ~pd.isna(df_esnb.finetuned_from_slurm_id)
df_esnb.loc[finetune_ix, "finetune_from"] = (
    df_esnb.loc[finetune_ix, :]
    .finetuned_from_slurm_id.astype("Int32")
    .apply(lambda x: df_esnb.loc[slurm_id_to_idx_map.get(x, None), "id"])
)

In [36]:
for dataset in sdatasets:
    cols = [
        # "wandb_urls",
        "id",
        # "experiment_nm",
        "test_nm",
        # "based_on/slurm_id",
        "based_on",
        "finetune_from",
        "is_double",
        "relation-model/class",
        # "img_size",
        # "batch_size",
        # "model/auxiliary_loss_ratio",
        # "model/num_slots",
        # "model/num_iterations",
        # "model/hid_dim",
        "max_epoch",
        "val/loss",
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        # TODO: Add Test results for each regime)
    ]
    test_cols = [
        f"test/{dataset}/{test_type}"
        for test_type in map(
            lambda x: x[1], filter(lambda x: x[0] == dataset, test_types)
        )
    ]
    _df = df_esnb[[*cols, *test_cols]]
    # _df["model/auxiliary_loss_ratio"] = (
    #     _df["model/auxiliary_loss_ratio"].fillna(0).astype("int")
    # )
    _df = _df.dropna(subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy"])
    _df.dataset = dataset
    _df["relation-model/class"] = _df["relation-model/class"].str.split(".").str[2]
    _df.max_epoch = _df.max_epoch.astype("Int32")
    _df.finetune_from = _df.finetune_from.combine_first(_df.based_on)
    _df.drop(columns=["based_on"], inplace=True)
    out = []
    for column in _df.columns:
        splt = column.split("/")
        if len(splt) == 3:
            out.append((splt[0], splt[1], splt[2]))
        elif len(splt) == 2:
            if splt[0] in ["train", "val", "test"]:
                out.append((splt[0], "", splt[1]))
                continue
            if column == "model/auxilary_loss_ratio":
                out.append(("auxiliary", "loss", "ratio"))
                continue
            out.append(("", splt[0], splt[1]))
        else:
            if column == "finetune_from":
                out.append(("", "finetune", "from"))
                continue
            out.append(("", "", splt[0]))
    _df.columns = pd.MultiIndex.from_tuples(out)
    # _df.loc[:,"test"] = _df.apply(
    #     lambda x: f'{", ".join([f"{x[col]:.2%}" for col in test_cols if not pd.isna(x[col])])}',
    #     axis=1,
    # )
    # _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    print(f"\n%{'-'*20} {dataset} {'-'*20}\n")

    _highlight_max = [
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        *test_cols,
    ]
    _highlight_max = [col.split("/") for col in _highlight_max]
    _tbl = (
        _df.style.highlight_max(
            # subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy", *test_cols],
            subset=_highlight_max,
            axis=0,
            props="textbf:--rwrap;",
        )
        .highlight_min(
            subset=[("val", "", "loss")],
            axis=0,
            props="textbf:--rwrap;",
        )
        .hide()
        .format(
            formatter="{:.2%}".format,
            subset=[
                *_highlight_max,
                ("train", dataset, "accuracy"),
                ("val", dataset, "accuracy"),
            ],
        )
        .format(subset=[("val", "", "loss")], precision=3)
        .to_latex(
            caption=f"Classification results with relation module for {dataset}. is_double columns informs if training was done on {dataset} and bongard logo dataset. Finetune from column informs if experiment was based on existing on, if it is 0.N - then it means and encoder was pretrained, if it is 1.N - then it means whole STSN was trained in 1.N experiment. Bold text mean best result in column.",
            hrules=True,
            label=f"tab:esbn-{dataset.replace('_', '-')}",
            # siunitx=True,
            # column_format="lp{0.5cm}p{1.8cm}p{1.5cm}p{1.0cm}p{0.7cm}p{0.7cm}p{0.7cm}p{0.6cm}p{0.6cm}p{0.4cm}p{0.4cm}p{0.5cm}"
            # + len(test_cols) * "p{0.4cm}",
            multicol_align="c",
        )
        .replace("finetune_on_single_task_based_on_dual", "finetune")
        .replace("relation_", "")
        .replace("_", "\\_")
        .replace("nan%", "---")
        .replace("nan", "---")
        .replace("None", "---")
        .replace("_scoring", "")
        .replace("%", "\%")
        .replace("esnb", "esbn")
    )
    # add \resizebox{\textwidth}{!}{% ... } between tabular and end tabular
    _tbl = _tbl.replace(
        r"\begin{tabular}",
        r"""\resizebox{\textwidth}{!}{%
\begin{tabular}""",
    )

    _tbl = _tbl.replace(
        r"\end{tabular}",
        r"""\end{tabular}
}""",
    )

    # display(_df.shape)
    # display(_df)
    print(_tbl)


%-------------------- bongard_hoi --------------------

\begin{table}
\caption{Classification results with relation module for bongard\_hoi. is\_double columns informs if training was done on bongard\_hoi and bongard logo dataset. Finetune from column informs if experiment was based on existing on, if it is 0.N - then it means and encoder was pretrained, if it is 1.N - then it means whole STSN was trained in 1.N experiment. Bold text mean best result in column.}
\label{tab:esbn-bongard-hoi}
\resizebox{\textwidth}{!}{%
\begin{tabular}{lllrlrrrrrrrr}
\toprule
\multicolumn{6}{c}{} & val & train & val & \multicolumn{4}{c}{test} \\
\multicolumn{2}{c}{} & finetune &  & relation-model &  &  & bongard\_hoi & bongard\_hoi & \multicolumn{4}{c}{bongard\_hoi} \\
id & test\_nm & from & is\_double & class & max\_epoch & loss & accuracy & accuracy & seen-seen & seen-unseen & unseen-seen & unseen-unseen \\
\midrule
2.3 & esbn\_single & --- & False & ESNB & 216 & \textbf{0.471} & 86.08\% & \textbf{78.

In [37]:
for dataset in sdatasets:
    cols = [
        "wandb_urls",
        "id",
        "experiment_nm",
        "test_nm",
        # "based_on/slurm_id",
        "relation-model/class",
        "img_size",
        "batch_size",
        # "model/auxiliary_loss_ratio",
        # "model/num_slots",
        # "model/num_iterations",
        # "model/hid_dim",
        "max_epoch",
        "val/loss",
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        # TODO: Add Test results for each regime)
        "based_on",
        "finetune_from",
        "is_double",
    ]
    test_cols = [
        f"test/{dataset}/{test_type}"
        for test_type in map(
            lambda x: x[1], filter(lambda x: x[0] == dataset, test_types)
        )
    ]
    _df = df_esnb[[*cols, *test_cols]]
    # _df["model/auxiliary_loss_ratio"] = (
    #     _df["model/auxiliary_loss_ratio"].fillna(0).astype("int")
    # )
    _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    _df = _df.dropna(subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy"])
    _df.dataset = dataset
    _df["relation-model/class"] = _df["relation-model/class"].str.split(".").str[2]
    _df.max_epoch = _df.max_epoch.astype("Int32")

    out = []
    for column in _df.columns:
        splt = column.split("/")
        if len(splt) == 3:
            out.append((splt[0], splt[1], splt[2]))
        elif len(splt) == 2:
            if splt[0] in ["train", "val", "test"]:
                out.append((splt[0], "", splt[1]))
                continue
            if column == "model/auxilary_loss_ratio":
                out.append(("auxiliary", "loss", "ratio"))
                continue
            out.append(("", splt[0], splt[1]))
        else:
            out.append(("", "", splt[0]))
    _df.columns = pd.MultiIndex.from_tuples(out)
    # _df.loc[:,"test"] = _df.apply(
    #     lambda x: f'{", ".join([f"{x[col]:.2%}" for col in test_cols if not pd.isna(x[col])])}',
    #     axis=1,
    # )
    # _df.wandb_urls = _df.wandb_urls.str.split("/").str[-1]
    print(f"\n%{'-'*20} {dataset} {'-'*20}\n")

    _highlight_max = [
        f"train/{dataset}/accuracy",
        f"val/{dataset}/accuracy",
        *test_cols,
    ]
    _highlight_max = [col.split("/") for col in _highlight_max]
    _tbl = (
        _df.style.highlight_max(
            # subset=[f"train/{dataset}/accuracy", f"val/{dataset}/accuracy", *test_cols],
            subset=_highlight_max,
            axis=0,
            props="textbf:--rwrap;",
        )
        .highlight_min(
            subset=[("val", "", "loss")],
            axis=0,
            props="textbf:--rwrap;",
        )
        .hide()
        .format(
            formatter="{:.2%}".format,
            subset=[
                *_highlight_max,
                ("train", dataset, "accuracy"),
                ("val", dataset, "accuracy"),
            ],
        )
        .to_latex(
            caption=f"Classification results for {dataset}",
            hrules=True,
            label=f"tab:pretrain-{dataset.replace('_', '-')}",
            # siunitx=True,
            # column_format="lp{0.5cm}p{1.8cm}p{1.5cm}p{1.0cm}p{0.7cm}p{0.7cm}p{0.7cm}p{0.6cm}p{0.6cm}p{0.4cm}p{0.4cm}p{0.5cm}"
            # + len(test_cols) * "p{0.4cm}",
            multicol_align="c",
        )
        .replace("finetune_on_single_task_based_on_dual", "finetune")
        .replace("_", "\\_")
        .replace("nan%", "---")
        .replace("nan", "---")
        .replace("%", "\%")
    )
    # add \resizebox{\textwidth}{!}{% ... } between tabular and end tabular
    _tbl = _tbl.replace(
        r"\begin{tabular}",
        r"""\resizebox{\textwidth}{!}{%
\begin{tabular}""",
    )

    _tbl = _tbl.replace(
        r"\end{tabular}",
        r"""\end{tabular}
}""",
    )

    display(_df.shape)
    display(_df)
    # print(_tbl)


%-------------------- bongard_hoi --------------------



(19, 18)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,val,train,val,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,test,test,test,test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,relation-model,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,bongard_hoi,bongard_hoi,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,bongard_hoi,bongard_hoi,bongard_hoi,bongard_hoi
Unnamed: 0_level_2,wandb_urls,id,experiment_nm,test_nm,class,img_size,batch_size,max_epoch,loss,accuracy,accuracy,based_on,finetune_from,is_double,seen-seen,seen-unseen,unseen-seen,unseen-unseen
72,cmyrffip,2.3,bongard_hoi_scoring_esnb,relation_esnb_single,ESNB,80,16,216,0.471449,0.860813,0.782775,,,False,0.612154,0.665012,0.633494,0.665012
74,o0oz8bt5,2.5,bongard_logo_hoi_scoring_esnb,relation_esnb_dual,ESNB,80,8,17,0.692726,0.507617,0.517343,0.28,,True,0.503575,0.498759,0.503507,0.498759
75,h78u70gc,2.6,bongard_logo_hoi_scoring_esnb,relation_esnb_dual,ESNB,80,8,11,0.692724,0.507617,0.517343,0.29,,True,0.503575,0.498759,0.503507,0.498759
76,tiji5vvm,2.7,bongard_logo_hoi_scoring_esnb_vit,relation_esnb_dual,ESNB,80,4,17,0.632098,0.663079,0.669606,,,True,0.614388,0.624069,0.613766,0.624069
85,zasa73hq,2.16,bongard_hoi_scoring_esnb-2,relation_esnb_single_2,ESNB,80,256,499,0.628341,1.0,0.663727,,,False,0.619465,0.640819,0.619465,0.640819
89,vrdj235f,2.2,bongard_hoi_scoring_esnb-2_larger,relation_esnb_single_2,ESNB,80,128,499,0.610504,1.0,0.665197,,,False,0.610478,0.650124,0.610478,0.650124
92,7byrc875,2.23,bongard_logo_hoi_scoring_esnb_vit-2,relation_esnb_dual_2,ESNB,80,256,499,0.652056,1.0,0.644327,,,True,0.614643,0.638337,0.614643,0.638337
94,8cjb18ld,2.25,bongard_logo_hoi_scoring_esnb_vit-2_larger,relation_esnb_dual_2,ESNB,80,128,499,0.644687,1.0,0.64903,,,True,0.601052,0.639578,0.601052,0.639578
97,9btos9yk,2.28,bongard_hoi_scoring_esnb-2,relation_esnbv2_single,ESNBv2,80,256,499,0.624779,0.999132,0.666961,,,False,0.613985,0.635236,0.613985,0.635236
100,bzd3toty,2.31,bongard_logo_hoi_scoring_esnb_vit-2,relation_esnbv2_dual,ESNBv2,80,256,499,0.653035,0.998606,0.651675,,,True,0.611355,0.629032,0.611355,0.629032



%-------------------- vasr --------------------



(19, 15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,val,train,val,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,relation-model,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,vasr,vasr,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,vasr
Unnamed: 0_level_2,wandb_urls,id,experiment_nm,test_nm,class,img_size,batch_size,max_epoch,loss,accuracy,accuracy,based_on,finetune_from,is_double,vasr
79,fe8alk5s,2.1,vasr_scoring_esnb,relation_esnb_single,ESNB,80,32,50,1.273663,0.560199,0.392536,0.31,,False,0.388606
81,a1djj3cf,2.12,vaec_vasr_scoring_esnb,relation_esnb_dual,ESNB,80,8,2,1.386482,0.249298,0.251667,0.3,,True,0.251667
82,hxi02x0k,2.13,vaec_vasr_scoring_esnb,relation_esnb_dual,ESNB,80,8,1,1.386459,0.249298,0.251667,0.31,,True,0.251667
83,czzd7aig,2.14,vaec_vasr_scoring_esnb_vit,relation_esnb_dual,ESNB,80,8,4,1.296681,0.311873,0.303448,,,True,0.308448
87,31davcan,2.18,vasr_scoring_esnb-2,relation_esnb_single_2,ESNB,80,256,499,1.242051,0.881502,0.402501,,,False,0.393305
91,zcs5hpdx,2.22,vasr_scoring_esnb-2_larger,relation_esnb_single_2,ESNB,80,128,466,1.251527,0.995999,0.400776,,,False,0.385082
93,otzj3e5r,2.24,vaec_vasr_scoring_esnb_vit-2,relation_esnb_dual_2,ESNB,80,256,499,1.359995,0.613656,0.363405,,,True,0.314032
95,8rnyacfx,2.26,vaec_vasr_scoring_esnb_vit-2_larger,relation_esnb_dual_2,ESNB,80,128,247,1.3645,0.685975,0.342121,,,True,0.310219
99,7qqlk8cf,2.3,vasr_scoring_esnb-2,relation_esnbv2_single,ESNBv2,80,256,216,1.253275,0.471844,0.398088,,,False,0.380388
101,zbtfqra0,2.32,vaec_vasr_scoring_esnb_vit-2,relation_esnbv2_dual,ESNBv2,80,256,112,1.262369,0.404882,0.404754,,,True,0.353573



%-------------------- bongard_logo --------------------



(21, 18)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,val,train,val,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,test,test,test,test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,relation-model,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,bongard_logo,bongard_logo,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,bongard_logo,bongard_logo,bongard_logo,bongard_logo
Unnamed: 0_level_2,wandb_urls,id,experiment_nm,test_nm,class,img_size,batch_size,max_epoch,loss,accuracy,accuracy,based_on,finetune_from,is_double,test_bd,test_ff,test_hd_comb,test_hd_novel
70,sww31yqe,2.1,bongard_logo_scoring_esnb,relation_esnb_single,ESNB,80,64,242,0.691456,0.514409,0.547778,0.28,,False,0.458333,0.485,0.455,0.453125
71,vlann6y8,2.2,bongard_logo_scoring_esnb,relation_esnb_single,ESNB,80,16,67,1.382411,0.515161,0.524444,0.29,,False,0.53125,0.533333,0.5425,0.565625
73,oxnqsd2g,2.4,bongard_logo_scoring_esnb_vit,relation_esnb_single,ESNB,80,16,292,0.662764,0.60043,0.616667,,,False,0.585417,0.556667,,
74,o0oz8bt5,2.5,bongard_logo_hoi_scoring_esnb,relation_esnb_dual,ESNB,80,8,17,0.692726,0.507335,0.525556,0.28,,True,0.483333,0.49,0.5,0.4875
75,h78u70gc,2.6,bongard_logo_hoi_scoring_esnb,relation_esnb_dual,ESNB,80,8,11,0.692724,0.507335,0.525556,0.29,,True,0.483333,0.49,0.5,0.4875
76,tiji5vvm,2.7,bongard_logo_hoi_scoring_esnb_vit,relation_esnb_dual,ESNB,80,4,17,0.632098,0.579153,0.587778,,,True,0.583333,0.528333,,
84,pr40wcrl,2.15,bongard_logo_scoring_esnb_vit-2,relation_esnb_single_2,ESNB,80,128,499,0.670173,1.0,0.567778,,,False,0.554167,0.525,0.5275,0.509375
88,uzfrln08,2.19,bongard_logo_scoring_esnb_vit-2_larger,relation_esnb_single_2,ESNB,80,128,499,0.670486,1.0,0.562222,,,False,0.525,0.505,0.5325,0.5375
92,7byrc875,2.23,bongard_logo_hoi_scoring_esnb_vit-2,relation_esnb_dual_2,ESNB,80,256,499,0.652056,1.0,0.576667,,,True,0.541667,0.518333,0.5075,0.53125
94,8cjb18ld,2.25,bongard_logo_hoi_scoring_esnb_vit-2_larger,relation_esnb_dual_2,ESNB,80,128,499,0.644687,1.0,0.573333,,,True,0.547917,0.495,0.5025,0.5125



%-------------------- vaec --------------------



(21, 19)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,val,train,val,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,test,test,test,test,test
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,relation-model,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,vaec,vaec,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,vaec,vaec,vaec,vaec,vaec
Unnamed: 0_level_2,wandb_urls,id,experiment_nm,test_nm,class,img_size,batch_size,max_epoch,loss,accuracy,accuracy,based_on,finetune_from,is_double,test1,test2,test3,test4,test5
77,4n0uo41y,2.8,vaec_scoring_esnb,relation_esnb_single,ESNB,80,128,241,1.386155,0.25,0.25,0.3,,False,0.25,0.25,0.25,0.25,0.25
78,6dn8ptmm,2.9,vaec_scoring_esnb,relation_esnb_single,ESNB,80,64,114,1.386141,0.25,0.25,0.31,,False,0.25,0.25,0.25,0.25,0.25
80,0nro4dv2,2.11,vaec_scoring_esnb_vit,relation_esnb_single,ESNB,80,32,184,1.325842,0.409716,0.355444,,,False,0.349079,0.36399,0.372575,0.327783,0.350652
81,a1djj3cf,2.12,vaec_vasr_scoring_esnb,relation_esnb_dual,ESNB,80,8,2,1.386482,0.250961,0.251436,0.3,,True,0.248004,0.248004,0.248004,0.248004,0.248004
82,hxi02x0k,2.13,vaec_vasr_scoring_esnb,relation_esnb_dual,ESNB,80,8,1,1.386459,0.250099,0.249055,0.31,,True,0.248004,0.248004,0.248004,0.248004,0.248004
83,czzd7aig,2.14,vaec_vasr_scoring_esnb_vit,relation_esnb_dual,ESNB,80,8,4,1.296681,0.358044,0.317806,,,True,0.327078,0.345233,0.35319,0.295858,0.332302
86,1wkm9t3h,2.17,vaec_scoring_esnb_vit-2,relation_esnb_single_2,ESNB,80,256,499,1.351307,0.486506,0.338746,,,False,0.329561,0.350889,0.354054,0.313285,0.327514
90,jv838pmq,2.21,vaec_scoring_esnb_vit-2_larger,relation_esnb_single_2,ESNB,80,128,499,1.345836,0.515943,0.340341,,,False,0.334213,0.346016,0.35291,0.317908,0.33338
93,otzj3e5r,2.24,vaec_vasr_scoring_esnb_vit-2,relation_esnb_dual_2,ESNB,80,256,499,1.359995,0.995102,0.354361,,,True,0.323634,0.340901,0.349347,0.305115,0.325914
95,8rnyacfx,2.26,vaec_vasr_scoring_esnb_vit-2_larger,relation_esnb_dual_2,ESNB,80,128,247,1.3645,0.967145,0.356744,,,True,0.323791,0.345167,0.341343,0.307002,0.329916


### Show top results

In [38]:
for dataset, test_type in test_types:
    _dataset_cols = [col for col in df.columns if dataset in col and "test" not in col]
    cols = [
        "wandb_urls",
        "experiment_nm",
        "test_nm",
        "based_on/slurm_id",
        "max_epoch",
        "finetuned_from_slurm_id",
        *_dataset_cols,
    ]
    display(
        df[[f"test/{dataset}/{test_type}"] + cols]
        .sort_values(f"test/{dataset}/{test_type}", ascending=False)
        .head(5)
    )

Unnamed: 0,test/bongard_logo/test_bd,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_logo/accuracy,val/bongard_logo/mse_loss,train/bongard_logo/mse_loss_epoch,train/bongard_logo/accuracy
290,0.95,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,140,883839.0,0.865556,0.007597,0.007367,0.86871
282,0.95,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,252,883823.0,0.887778,0.007941,0.007768,0.893548
284,0.941667,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,251,883827.0,0.846667,0.00151,0.001455,0.860108
25,0.88125,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_scoring,frozen_slot,871045.0,413,,0.807778,,,0.822796
54,0.864583,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_scoring,trained_slot_aux_test,871045.0,174,,0.773333,0.000941,0.000931,0.764194


Unnamed: 0,test/bongard_logo/test_ff,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_logo/accuracy,val/bongard_logo/mse_loss,train/bongard_logo/mse_loss_epoch,train/bongard_logo/accuracy
282,0.915,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,252,883823.0,0.887778,0.007941,0.007768,0.893548
290,0.873333,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,140,883839.0,0.865556,0.007597,0.007367,0.86871
284,0.836667,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,251,883827.0,0.846667,0.00151,0.001455,0.860108
286,0.771667,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,253,883831.0,0.811111,0.000931,0.000945,0.813118
25,0.765,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_scoring,frozen_slot,871045.0,413,,0.807778,,,0.822796


Unnamed: 0,test/bongard_logo/test_hd_comb,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_logo/accuracy,val/bongard_logo/mse_loss,train/bongard_logo/mse_loss_epoch,train/bongard_logo/accuracy
286,0.7425,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,253,883831.0,0.811111,0.000931,0.000945,0.813118
199,0.725,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_transformer_v4,transformer_scoring,,102,,0.777778,,,0.979677
282,0.7175,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,252,883823.0,0.887778,0.007941,0.007768,0.893548
25,0.71,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_scoring,frozen_slot,871045.0,413,,0.807778,,,0.822796
181,0.71,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_v4_agg,no_context_parameter_search,,146,,0.784444,,,0.968817


Unnamed: 0,test/bongard_logo/test_hd_novel,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_logo/accuracy,val/bongard_logo/mse_loss,train/bongard_logo/mse_loss_epoch,train/bongard_logo/accuracy
195,0.765625,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_transformer_v4,small_transformer_scoring,,101,,0.763333,,,0.987097
199,0.765625,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_transformer_v4,transformer_scoring,,102,,0.777778,,,0.979677
165,0.753125,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_v2,asymmetric_only_parameter_search,,116,,0.778889,,,0.976774
181,0.74375,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_v4_agg,no_context_parameter_search,,146,,0.784444,,,0.968817
155,0.74375,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_logo_combined_v4_agg,baseline,,231,,0.797778,,,0.916344


Unnamed: 0,test/vaec/test1,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/vaec/accuracy,val/vaec/mse_loss,train/vaec/mse_loss_epoch,train/vaec/accuracy
61,0.996722,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,trained_slot_aux_test,871052.0,114,,0.997057,0.001574,1.5e-05,1.0
33,0.994131,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,train_slot_no_aux,871037.0,73,,0.993889,,,1.0
285,0.994089,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,151,883827.0,0.995994,0.000328,5e-06,1.0
37,0.988054,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,train_slot_no_aux,871052.0,115,,0.988237,,,1.0
46,0.986088,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,trained_slot_aux_test,871039.0,208,,0.987817,0.002804,0.000661,1.0


Unnamed: 0,test/vaec/test2,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/vaec/accuracy,val/vaec/mse_loss,train/vaec/mse_loss_epoch,train/vaec/accuracy
33,0.868881,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,train_slot_no_aux,871037.0,73,,0.993889,,,1.0
46,0.864647,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,trained_slot_aux_test,871039.0,208,,0.987817,0.002804,0.000661,1.0
285,0.831056,https://wandb.ai/avr_universal/AVR_universal/r...,finetune,finetune_on_single_task_based_on_dual,,151,883827.0,0.995994,0.000328,5e-06,1.0
61,0.813528,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,trained_slot_aux_test,871052.0,114,,0.997057,0.001574,1.5e-05,1.0
47,0.782212,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,trained_slot_aux_test,871039.0,37,,0.959347,0.002243,3.4e-05,0.997632


Unnamed: 0,test/vaec/test3,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/vaec/accuracy,val/vaec/mse_loss,train/vaec/mse_loss_epoch,train/vaec/accuracy
33,0.799392,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_scoring,train_slot_no_aux,871037.0,73,,0.993889,,,1.0
163,0.67833,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v2,symmetric_only_parameter_search,,137,,0.639953,,,0.807199
246,0.673718,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v4,hoi_module_substitute,,161,,0.63146,,,0.735934
249,0.661304,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v4,vasr_module_substitute,,179,,0.619735,,,0.736673
263,0.649691,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v2,combined_bongard_hoi_bongard_logo_fine_tune_no...,,261,,0.615746,,,0.818657


Unnamed: 0,test/vaec/test4,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/vaec/accuracy,val/vaec/mse_loss,train/vaec/mse_loss_epoch,train/vaec/accuracy
246,0.613046,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v4,hoi_module_substitute,,161,,0.63146,,,0.735934
163,0.611865,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v2,symmetric_only_parameter_search,,137,,0.639953,,,0.807199
249,0.606161,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v4,vasr_module_substitute,,179,,0.619735,,,0.736673
224,0.583775,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_bongard_logo_combined_v4,common_relationals_common_scoring,,106,,0.669087,,,0.951678
227,0.583268,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_vaec_combined_v5,common_relationals_separate_scoring,,103,,0.661649,,,0.991683


Unnamed: 0,test/vaec/test5,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/vaec/accuracy,val/vaec/mse_loss,train/vaec/mse_loss_epoch,train/vaec/accuracy
163,0.661281,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v2,symmetric_only_parameter_search,,137,,0.639953,,,0.807199
249,0.660795,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v4,vasr_module_substitute,,179,,0.619735,,,0.736673
246,0.659285,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v4,hoi_module_substitute,,161,,0.63146,,,0.735934
263,0.628278,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v2,combined_bongard_hoi_bongard_logo_fine_tune_no...,,261,,0.615746,,,0.818657
271,0.626817,https://wandb.ai/avr_universal/AVR_universal/r...,vaec_combined_v2,combined_vasr_vaec_fine_tune_no_asymetric,,214,,0.605286,,,0.793207


Unnamed: 0,test/bongard_hoi/seen-seen,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_hoi/accuracy,val/bongard_hoi/mse_loss,train/bongard_hoi/mse_loss_epoch,train/bongard_hoi/accuracy
198,0.761727,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_transformer_v4,transformer_scoring,,54,,0.835097,,,0.997917
187,0.760631,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128_64,,101,,0.837155,,,0.987891
186,0.754494,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128,,101,,0.827454,,,0.977909
154,0.751644,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,baseline,,101,,0.832452,,,0.979124
234,0.749452,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_bongard_logo_combined_v4_agg,partial_freezing_hoi_0_1,,157,,0.829512,,,0.964107


Unnamed: 0,test/bongard_hoi/seen-unseen,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_hoi/accuracy,val/bongard_hoi/mse_loss,train/bongard_hoi/mse_loss_epoch,train/bongard_hoi/accuracy
186,0.85749,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128,,101,,0.827454,,,0.977909
184,0.856882,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256,,100,,0.840388,,,0.982423
187,0.854148,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128_64,,101,,0.837155,,,0.987891
188,0.85354,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_,,101,,0.82863,,,0.989627
226,0.852628,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_bongard_logo_combined_v5,common_relationals_separate_scoring,,101,,0.834803,,,0.988542


Unnamed: 0,test/bongard_hoi/unseen-seen,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_hoi/accuracy,val/bongard_hoi/mse_loss,train/bongard_hoi/mse_loss_epoch,train/bongard_hoi/accuracy
198,0.761727,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_transformer_v4,transformer_scoring,,54,,0.835097,,,0.997917
187,0.760631,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128_64,,101,,0.837155,,,0.987891
186,0.754494,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128,,101,,0.827454,,,0.977909
154,0.751644,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,baseline,,101,,0.832452,,,0.979124
234,0.749452,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_bongard_logo_combined_v4_agg,partial_freezing_hoi_0_1,,157,,0.829512,,,0.964107


Unnamed: 0,test/bongard_hoi/unseen-unseen,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/bongard_hoi/accuracy,val/bongard_hoi/mse_loss,train/bongard_hoi/mse_loss_epoch,train/bongard_hoi/accuracy
160,0.818859,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v2,symmetric_only_parameter_search,,119,,0.830394,,,0.920793
185,0.815757,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_64,,100,,0.820988,,,0.986589
187,0.815136,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_combined_v4_agg,mlp_hidden_dim_256_128_64,,101,,0.837155,,,0.987891
227,0.813275,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_vaec_combined_v5,common_relationals_separate_scoring,,103,,0.831276,,,0.985982
234,0.809553,https://wandb.ai/avr_universal/AVR_universal/r...,bongard_hoi_bongard_logo_combined_v4_agg,partial_freezing_hoi_0_1,,157,,0.829512,,,0.964107


Unnamed: 0,test/vasr/vasr,wandb_urls,experiment_nm,test_nm,based_on/slurm_id,max_epoch,finetuned_from_slurm_id,val/vasr/accuracy,val/vasr/mse_loss,train/vasr/mse_loss_epoch,train/vasr/accuracy
182,0.595039,https://wandb.ai/avr_universal/AVR_universal/r...,vasr_combined_v4,no_context_parameter_search,,100,,0.596076,,,0.956527
156,0.563144,https://wandb.ai/avr_universal/AVR_universal/r...,vasr_combined_v4,baseline,,81,,0.561884,,,0.931823
212,0.562642,https://wandb.ai/avr_universal/AVR_universal/r...,vasr_combined_v4,vasr_vaec_fine_tune,,61,,0.563879,,,0.913625
191,0.561662,https://wandb.ai/avr_universal/AVR_universal/r...,vasr_combined_v4,mlp_hidden_dim_256_128,,75,,0.561408,,,0.977587
243,0.553994,https://wandb.ai/avr_universal/AVR_universal/r...,vasr_vaec_combined_v4,partial_freezing_vaec_0_1,,64,,0.569186,,,0.911837


In [39]:
full_df.to_csv("../../structured_results.csv", index=False)

In [40]:
# split by dataset