In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import balanced_accuracy_score
from scipy.stats import bootstrap
from pathlib import Path
from collections import defaultdict
from pingouin import wilcoxon


def read_res(
    data,
    model,
    trainer,
    data_path,
    base_path=Path("/data/home/meiri.yoav/Cognitive-State-Decoding"),
    base_res_path="cross_validation_runs",
    wandb_job_type="cv",
    template="{}/+data={},+data_path={},+model={},+trainer={},trainer.wandb_job_type={}",
    on_error="raise",
) -> pd.DataFrame | None:
    file_path = (
        base_path
        / template.format(
            base_res_path, data, data_path, model, trainer, wandb_job_type
        )
        / "trial_level_test_results.csv"
    )
    try:
        res = pd.read_csv(
            file_path,
            # index_col=0,
            usecols=[
                "eval_regime",
                "eval_type",
                # "subjects",
                # "items",
                "binary_prediction",
                "binary_label",
                "prediction_prob",
                "fold_index",
            ],
        )
    except FileNotFoundError as e:
        print(f"File not found: {file_path}")
        if on_error == "raise":
            raise e
        else:
            return None
    return res

In [2]:
def balanced_accuracy_diff(y_true1, y_pred1, y_true2, y_pred2):
    ba1 = balanced_accuracy_score(y_true1, y_pred1)
    ba2 = balanced_accuracy_score(y_true2, y_pred2)
    return ba2 - ba1


def compare_models(df1, df2, eval_regime=None, eval_type=None, n_samples=9999):
    df1 = df1.copy()
    df2 = df2.copy()

    if eval_regime:
        df1 = df1[df1["eval_regime"] == eval_regime]
        df2 = df2[df2["eval_regime"] == eval_regime]
    if eval_type:
        df1 = df1[df1["eval_type"] == eval_type]
        df2 = df2[df2["eval_type"] == eval_type]

    # Prepare the data from df1
    labels1 = df1["binary_label"].values
    predictions1 = df1["binary_prediction"].values

    # Prepare the data from df2
    labels2 = df2["binary_label"].values
    predictions2 = df2["binary_prediction"].values

    data = (labels1, predictions1, labels2, predictions2)

    # Perform bootstrap resampling to compute the difference in balanced accuracy
    res = bootstrap(
        data,
        balanced_accuracy_diff,
        paired=True,
        confidence_level=0.95,
        n_resamples=n_samples,
        random_state=42,
        alternative="greater",
        method="basic",
    )

    # Extract the results
    # diff_observed = balanced_accuracy_diff(labels1, predictions1, labels2, predictions2)
    bootstrap_diff_distribution = res.bootstrap_distribution
    # confidence_interval = res.confidence_interval
    # standard_error = res.standard_error

    # Compute p-value for the difference
    p_value = np.mean(bootstrap_diff_distribution <= 0)

    # Print the results
    # print(f"Observed Difference in Balanced Accuracy: {diff_observed}")
    # print(f"Bootstrap Confidence Interval: {confidence_interval}")
    # print(f"Bootstrap Standard Error: {standard_error}")
    # print(f"Bootstrap p-value: {p_value}")
    return p_value

In [5]:
models = {
    "MAG": "MAG",
    # "MAGBase": "MAGBase",
    # "MAGFreeze": "MAGFreeze",
    # "MAGRace": "MAGRace",
    "MAGWords": "MAGWords",
    "MAGEyes": "MAGEyes",
    # "PostFusion": "PostFusion",
    # "PostFusionFreeze": "PostFusionFreeze",
    # "Baseline - Always A": "Roberta",
    "Baseline - RoBERTaNoEyes": "Roberta",
    "RobertaSelectedAnswersMultiClass": "RobertaSelectedAnswersMultiClass",
    "RoberteyeWord": "RoberteyeWord",
    "RoberteyeWordEyes": "RoberteyeWordEyes",
    # "RoBERTeyeFixation": "RoberteyeFixation",
    # "BEyeLSTM": "BEyeLSTMArgs",
    # "AhnCNN": "AhnCNN",
    # "LR": "LR",
    # "KNN": "KNN",
    # "SVM": "SVM",
    # "Eyettention": "Eyettention",
    # "PostFusionAnswers": "PostFusionAnswers",
    # "PostFusionMultiClass": "PostFusionMultiClass",
    # "PostFusionAnswersMultiClass": "PostFusionAnswersMultiClass",
    "PostFusionSelectedAnswersMultiClass": "PostFusionSelectedAnswersMultiClass",
    "RoberteyeWordSelectedAnswersMultiClass": "RoberteyeWordSelectedAnswersMultiClass",
    # "RoberteyeWordLingSelectedAnswersMultiClass": "RoberteyeWordLingSelectedAnswersMultiClass",
    "RoberteyeFixationSelectedAnswersMultiClass": "RoberteyeFixationSelectedAnswersMultiClass",
    "MAGSelectedAnswersMultiClass": "MAGSelectedAnswersMultiClass",
    # "MAGSelectedAnswersMultiClassLing": "MAGSelectedAnswersMultiClassLing",
    "RoberteyeWordLing": "RoberteyeWordLing",
}

all_res = defaultdict(dict)
for model, model_name in models.items():
    trainer = "IsCorrectSampling"
    if model == "BEyeLSTM":
        trainer = "BEyeLSTM"
    if model == "AhnCNN":
        trainer = "Ahn"
    if model in ["LR", "KNN", "SVM"]:
        trainer = "ML"
    if model in ["Eyettention"]:
        trainer = "Eyettention"

    for data_ in ["Hunting", "Gathering"]:
        res = read_res(
            data=data_,
            model=model_name,
            trainer=trainer,
            data_path="may05",
            wandb_job_type=f"hyperparameter_sweep_{model_name}",
            base_res_path="emnlp24_results",
            on_error="raise",
            base_path=Path(".."),
        )
        all_res[data_][model] = res

In [5]:
# to_compare_modes = [
#     # ("Baseline - RoBERTaNoEyes", "RoberteyeWord"),
#     # ("Baseline - RoBERTaNoEyes", "RoBERTeyeFixation"),
#     # ("Baseline - RoBERTaNoEyes", "PostFusion"),
#     # ("Baseline - RoBERTaNoEyes", "BEyeLSTM"),
#     # ("Baseline - RoBERTaNoEyes", "AhnCNN"),
#     # ("Baseline - RoBERTaNoEyes", "LR"),
#     # ("Baseline - RoBERTaNoEyes", "Eyettention"),
#     # ("Baseline - RoBERTaNoEyes", "MAG"),
#     # ("MAG", "MAGFreeze"),
#     # ("MAG", "MAGRace"),
#     # ("MAG", "MAGBase"),
#     # ("PostFusion", "PostFusionFreeze"),
# ]
# type_ = "test"
# for model1, model2 in to_compare_modes:
#     for eval_regime in ["new_item", "new_subject", "new_item_and_subject", "all"]:
#         query = (
#             f"eval_regime == '{eval_regime}'"
#             if eval_regime != "all"
#             else "index == index or index != index"
#         )
#         g_m1 = all_res["Gathering"][model1].query(query)
#         g_m2 = all_res["Gathering"][model2].query(query)
#         h_m1 = all_res["Hunting"][model1].query(query)
#         h_m2 = all_res["Hunting"][model2].query(query)

#         def balanced_accuracies_per_fold(df):
#             return [
#                 balanced_accuracy_score(
#                     df[df["fold_index"] == i]["binary_label"],
#                     df[df["fold_index"] == i]["binary_prediction"],
#                 )
#                 for i in range(10)
#             ]

#         # Compute balanced accuracy for each for each model
#         acc_g_m1 = balanced_accuracies_per_fold(g_m1)
#         acc_g_m2 = balanced_accuracies_per_fold(g_m2)
#         acc_h_m1 = balanced_accuracies_per_fold(h_m1)
#         acc_h_m2 = balanced_accuracies_per_fold(h_m2)

#         assert (
#             len(acc_g_m1) == len(acc_g_m2) == len(acc_h_m1) == len(acc_h_m2)
#             and len(acc_g_m1) > 0
#         )
#         # Perform Wilcoxon signed-rank test for each h/g and model pair
#         wilcoxon_g = wilcoxon(
#             [acc_g_m1[i] - acc_g_m2[i] for i in range(len(acc_g_m2))],
#             alternative="greater",
#         )
#         wilcoxon_h = wilcoxon(
#             [acc_h_m1[i] - acc_h_m2[i] for i in range(len(acc_h_m2))],
#             alternative="greater",
#         )

#         print(
#             f"{model1} vs {model2} - {eval_regime} - Gathering: {wilcoxon_g['p-val'].values[0]} - Hunting: {wilcoxon_h['p-val'].values[0]}"
#         )
#     print()

## Main Results


In [None]:
to_compare_modes = [
    ("Baseline - RoBERTaNoEyes", "RoberteyeWord"),
    ("Baseline - RoBERTaNoEyes", "RoBERTeyeFixation"),
    ("Baseline - RoBERTaNoEyes", "PostFusion"),
    ("Baseline - RoBERTaNoEyes", "BEyeLSTM"),
    ("Baseline - RoBERTaNoEyes", "AhnCNN"),
    ("Baseline - RoBERTaNoEyes", "LR"),
    ("Baseline - RoBERTaNoEyes", "Eyettention"),
    ("Baseline - RoBERTaNoEyes", "MAG"),
]
n_samples = 10000
for model1, model2 in to_compare_modes:
    for eval_regime in ["new_item", "new_subject", "new_item_and_subject", None]:
        p_value_g = compare_models(
            all_res["Gathering"][model1],
            all_res["Gathering"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        p_value_h = compare_models(
            all_res["Hunting"][model1],
            all_res["Hunting"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        print(
            f"{model1} vs {model2} - {eval_regime} - Gathering: {p_value_g} - Hunting: {p_value_h}"
        )

In [6]:
to_compare_modes = [
    ("RobertaSelectedAnswersMultiClass", "RoberteyeWordSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "RoberteyeFixationSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "MAGSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "PostFusionSelectedAnswersMultiClass"),
]
n_samples = 10000
for model1, model2 in to_compare_modes:
    for eval_regime in ["new_item", "new_subject", "new_item_and_subject", None]:
        p_value_g = compare_models(
            all_res["Gathering"][model1],
            all_res["Gathering"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        p_value_h = compare_models(
            all_res["Hunting"][model1],
            all_res["Hunting"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        print(
            f"{model1} vs {model2} - {eval_regime} - Gathering: {p_value_g} - Hunting: {p_value_h}"
        )

RobertaSelectedAnswersMultiClass vs RoberteyeWordSelectedAnswersMultiClass - new_item - Gathering: 0.0357 - Hunting: 0.0002
RobertaSelectedAnswersMultiClass vs RoberteyeWordSelectedAnswersMultiClass - new_subject - Gathering: 0.5706 - Hunting: 0.9593
RobertaSelectedAnswersMultiClass vs RoberteyeWordSelectedAnswersMultiClass - new_item_and_subject - Gathering: 0.0079 - Hunting: 0.1931
RobertaSelectedAnswersMultiClass vs RoberteyeWordSelectedAnswersMultiClass - None - Gathering: 0.0721 - Hunting: 0.1358
RobertaSelectedAnswersMultiClass vs RoberteyeFixationSelectedAnswersMultiClass - new_item - Gathering: 0.0283 - Hunting: 0.0
RobertaSelectedAnswersMultiClass vs RoberteyeFixationSelectedAnswersMultiClass - new_subject - Gathering: 0.5237 - Hunting: 0.4157
RobertaSelectedAnswersMultiClass vs RoberteyeFixationSelectedAnswersMultiClass - new_item_and_subject - Gathering: 0.5379 - Hunting: 0.2019
RobertaSelectedAnswersMultiClass vs RoberteyeFixationSelectedAnswersMultiClass - None - Gathering

In [None]:
# to_compare_modes = [
#     ("Baseline - RoBERTaNoEyes", "RoberteyeWord"),
#     ("Baseline - RoBERTaNoEyes", "RoBERTeyeFixation"),
#     ("Baseline - RoBERTaNoEyes", "PostFusion"),
#     ("Baseline - RoBERTaNoEyes", "BEyeLSTM"),
#     ("Baseline - RoBERTaNoEyes", "AhnCNN"),
#     ("Baseline - RoBERTaNoEyes", "LR"),
#     ("Baseline - RoBERTaNoEyes", "Eyettention"),
#     ("Baseline - RoBERTaNoEyes", "MAG"),
# ]
# type_ = "test"
# for model1, model2 in to_compare_modes:
#     for eval_regime in ["new_item", "new_subject", "new_item_and_subject", None]:
#         g_m1 = all_res["Gathering"][model1]
#         g_m2 = all_res["Gathering"][model2]
#         h_m1 = all_res["Hunting"][model1]
#         h_m2 = all_res["Hunting"][model2]

#         def balanced_accuracies_per_fold(df, eval_regime=None):
#             df = df.copy()
#             if eval_regime:
#                 df = df[df["eval_regime"] == eval_regime]
#             return [
#                 balanced_accuracy_score(
#                     df[df["fold_index"] == i]["binary_label"],
#                     df[df["fold_index"] == i]["binary_prediction"],
#                 )
#                 for i in range(10)
#             ]

#         # Compute balanced accuracy for each for each model
#         acc_g_m1 = balanced_accuracies_per_fold(g_m1, eval_regime=eval_regime)
#         acc_g_m2 = balanced_accuracies_per_fold(g_m2, eval_regime=eval_regime)
#         acc_h_m1 = balanced_accuracies_per_fold(h_m1, eval_regime=eval_regime)
#         acc_h_m2 = balanced_accuracies_per_fold(h_m2, eval_regime=eval_regime)

#         # Perform Wilcoxon signed-rank test for each h/g and model pair
#         wilcoxon_g = wilcoxon(acc_g_m1, acc_g_m2, alternative="less")
#         wilcoxon_h = wilcoxon(acc_h_m1, acc_h_m2, alternative="less")

#         print(
#             f"{model1} vs {model2} - {eval_regime} - Gathering: {round(wilcoxon_g['p-val'].values[0],3)} - Hunting: {round(wilcoxon_h['p-val'].values[0],3)}"
#         )

## Ablations


In [4]:
to_compare_modes = [
    ("MAG", "MAGFreeze"),
    ("MAG", "MAGRace"),
    ("MAG", "MAGBase"),
    ("PostFusion", "PostFusionFreeze"),
]
n_samples = 10000
for model1, model2 in to_compare_modes:
    for eval_regime in ["new_item", "new_subject", "new_item_and_subject", None]:
        p_value_g = compare_models(
            all_res["Gathering"][model1],
            all_res["Gathering"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        p_value_h = compare_models(
            all_res["Hunting"][model1],
            all_res["Hunting"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        print(
            f"{model1} vs {model2} - {eval_regime} - Gathering: {p_value_g} - Hunting: {p_value_h}"
        )

MAG vs MAGFreeze - new_item - Gathering: 0.7265 - Hunting: 0.7048
MAG vs MAGFreeze - new_subject - Gathering: 1.0 - Hunting: 0.9961
MAG vs MAGFreeze - new_item_and_subject - Gathering: 0.8092 - Hunting: 0.275
MAG vs MAGFreeze - None - Gathering: 0.998 - Hunting: 0.9703
MAG vs MAGRace - new_item - Gathering: 0.5481 - Hunting: 1.0
MAG vs MAGRace - new_subject - Gathering: 0.1679 - Hunting: 0.2875
MAG vs MAGRace - new_item_and_subject - Gathering: 0.6826 - Hunting: 0.9876
MAG vs MAGRace - None - Gathering: 0.3809 - Hunting: 0.9999
MAG vs MAGBase - new_item - Gathering: 0.9941 - Hunting: 0.9634
MAG vs MAGBase - new_subject - Gathering: 0.5377 - Hunting: 0.0336
MAG vs MAGBase - new_item_and_subject - Gathering: 0.1076 - Hunting: 0.4772
MAG vs MAGBase - None - Gathering: 0.9593 - Hunting: 0.6329
PostFusion vs PostFusionFreeze - new_item - Gathering: 0.1493 - Hunting: 0.9846
PostFusion vs PostFusionFreeze - new_subject - Gathering: 0.9994 - Hunting: 0.9997
PostFusion vs PostFusionFreeze - new

In [7]:
to_compare_modes = [
    # ----------------------------------------------
    # Table 7: Input ablation - binary
    ("Baseline - RoBERTaNoEyes", "RoberteyeWord"),
    ("Baseline - RoBERTaNoEyes", "RoberteyeWordLing"),
    ("Baseline - RoBERTaNoEyes", "RoberteyeWordEyes"),
    ("Baseline - RoBERTaNoEyes", "MAG"),
    ("Baseline - RoBERTaNoEyes", "MAGWords"),
    ("Baseline - RoBERTaNoEyes", "MAGEyes"),
    # ----------------------------------------------
    # # Table 8: Input ablation - multiclass
    ("RobertaSelectedAnswersMultiClass", "MAGSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "MAGSelectedAnswersMultiClassLing"),
    ("RobertaSelectedAnswersMultiClass", "RoberteyeWordSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "RoberteyeWordLingSelectedAnswersMultiClass"),
]
n_samples = 10000
for model1, model2 in to_compare_modes:
    for eval_regime in ["new_item", "new_subject", "new_item_and_subject", None]:
        p_value_g = compare_models(
            all_res["Gathering"][model1],
            all_res["Gathering"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        p_value_h = compare_models(
            all_res["Hunting"][model1],
            all_res["Hunting"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        print(
            f"{model1} vs {model2} - {eval_regime} - Gathering: {p_value_g} - Hunting: {p_value_h}"
        )


Baseline - RoBERTaNoEyes vs RoberteyeWord - new_item - Gathering: 0.1628 - Hunting: 0.9133
Baseline - RoBERTaNoEyes vs RoberteyeWord - new_subject - Gathering: 0.2574 - Hunting: 0.144
Baseline - RoBERTaNoEyes vs RoberteyeWord - new_item_and_subject - Gathering: 0.9349 - Hunting: 0.4356
Baseline - RoBERTaNoEyes vs RoberteyeWord - None - Gathering: 0.2164 - Hunting: 0.6512
Baseline - RoBERTaNoEyes vs RoberteyeWordLing - new_item - Gathering: 0.0054 - Hunting: 0.9911
Baseline - RoBERTaNoEyes vs RoberteyeWordLing - new_subject - Gathering: 0.1646 - Hunting: 0.3996
Baseline - RoBERTaNoEyes vs RoberteyeWordLing - new_item_and_subject - Gathering: 0.1497 - Hunting: 0.4226
Baseline - RoBERTaNoEyes vs RoberteyeWordLing - None - Gathering: 0.0024 - Hunting: 0.9605
Baseline - RoBERTaNoEyes vs RoberteyeWordEyes - new_item - Gathering: 0.1953 - Hunting: 0.781
Baseline - RoBERTaNoEyes vs RoberteyeWordEyes - new_subject - Gathering: 0.3578 - Hunting: 0.6706
Baseline - RoBERTaNoEyes vs RoberteyeWordEy

KeyError: 'MAGSelectedAnswersMultiClassLing'

In [15]:
to_compare_modes = [
    # ----------------------------------------------
    # Table 8: Input ablation - multiclass
    ("RobertaSelectedAnswersMultiClass", "MAGSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "MAGSelectedAnswersMultiClassLing"),
    ("RobertaSelectedAnswersMultiClass", "RoberteyeWordSelectedAnswersMultiClass"),
    ("RobertaSelectedAnswersMultiClass", "RoberteyeWordLingSelectedAnswersMultiClass"),
]
n_samples = 10000
for model1, model2 in to_compare_modes:
    for eval_regime in ["new_item", "new_subject", "new_item_and_subject", None]:
        p_value_g = compare_models(
            all_res["Gathering"][model1],
            all_res["Gathering"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        p_value_h = compare_models(
            all_res["Hunting"][model1],
            all_res["Hunting"][model2],
            eval_regime=eval_regime,
            eval_type="test",
            n_samples=n_samples,
        )
        print(
            f"{model1} vs {model2} - {eval_regime} - Gathering: {p_value_g} - Hunting: {p_value_h}"
        )


RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClass - new_item - Gathering: 0.0009 - Hunting: 0.1509
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClass - new_subject - Gathering: 0.4282 - Hunting: 0.8401
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClass - new_item_and_subject - Gathering: 0.0007 - Hunting: 0.6726
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClass - None - Gathering: 0.0028 - Hunting: 0.567
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClassLing - new_item - Gathering: 0.0145 - Hunting: 0.0084
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClassLing - new_subject - Gathering: 0.6887 - Hunting: 0.8197
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClassLing - new_item_and_subject - Gathering: 0.0626 - Hunting: 0.0149
RobertaSelectedAnswersMultiClass vs MAGSelectedAnswersMultiClassLing - None - Gathering: 0.0743 - Hunting: 0.0952
RobertaSelectedAnswersMultiClass vs RoberteyeWordSe