# Results of Experimental Evaluation
In this notebook, we load the experimental results and summarize them in tabular form.

In [None]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("../")
import pandas as pd

pd.set_option("display.max_columns", None)
import numpy as np

from copy import deepcopy

from IPython.display import display, HTML
from evaluation.config_utils import config_query, gen_results_table
from evaluation.run_experiment import RESULT_PATH

# Output switch for the result tables in this notebook: if True, each table is
# printed as LaTeX source (via `DataFrame.to_latex`); if False, it is rendered
# as an HTML table inside the notebook.
PRINT_LATEX = False

### Default Configurations of Multi-annotator Supervised Learning Techniques
The following dictionary defines the default hyperparameters/settings of each multi-annotator supervised learning technique for the simulated annotator set `independent` and for the two data sets with real-world annotators. These configurations can be interpreted as a kind of query asking for certain results. One can alter individual parameters to ablate their effect. Of course, in this case, the parameters also need to be adapted in the corresponding experiments.

In [None]:
# Default query configuration per technique. Each value is a dictionary of
# hyperparameters/settings that is later merged with a data set name and a data
# type to look up the matching experimental results.
default_params = {
    "GT": dict(
        model_name="gt",
        embed_x="learned",
        confusion_matrix="full",
        embed_size=16,
        ap_use_residual=True,
        ap_use_outer_product=True,
        use_annotator_features=False,
    ),
    "MR": dict(
        model_name="mr",
        embed_x="learned",
        confusion_matrix="full",
        embed_size=16,
        ap_use_residual=True,
        ap_use_outer_product=True,
        use_annotator_features=False,
    ),
    "CL": dict(model_name="cl"),
    "REAC": dict(model_name="reac", lmbda=0.01),
    "UNION": dict(model_name="union", epsilon=1e-5),
    "LIA": dict(
        model_name="lia",
        ap_latent_dim=16,
        n_em_steps=7,
        n_fine_tune_epochs=25,
        warm_start=True,
        use_annotator_features=False,
    ),
    "CoNAL": dict(
        model_name="conal",
        embed_size=20,
        lmbda=1e-5,
        use_annotator_features=False,
    ),
    # The six MaDL variants differ only in `embed_x`, `confusion_matrix`, and the
    # two `ap_use_*` switches (both off for the "not X" variants, both on for
    # the "X" variants); all remaining settings are shared.
    **{
        variant_name: dict(
            model_name="madl",
            embed_x=embed_x,
            confusion_matrix=confusion_matrix,
            embed_size=16,
            lmbda=0,
            eta=0.8,
            alpha=1.25,
            beta=0.25,
            ap_use_residual=uses_x,
            ap_use_outer_product=uses_x,
            use_annotator_features=False,
        )
        for variant_name, embed_x, confusion_matrix, uses_x in (
            ("MaDL(not X, I)", "none", "isotropic", False),
            ("MaDL(not X, P)", "none", "diagonal", False),
            ("MaDL(not X, F)", "none", "full", False),
            ("MaDL(X, I)", "learned", "isotropic", True),
            ("MaDL(X, P)", "learned", "diagonal", True),
            ("MaDL(X, F)", "learned", "full", True),
        )
    },
}

# Maps the `data_type` value used in the result queries to the annotator set
# name that is printed above each table.
data_type_dict = dict(
    [
        ("none", "independent"),
        ("correlated", "interdependent"),
        ("rand-dep_10_100", "random-interdependent"),
        ("inductive_25", "inductive"),
    ]
)

### Results for Default Parameters
The following tables present the results with the default hyperparameters for different data and annotator sets.

In [None]:
# Build one results table per (data set, annotator set) pair using the default
# configuration of every technique.
data_set_list = ["letter", "fmnist", "cifar10", "svhn", "music", "label-me"]
pm_symbol = "$\\pm$" if PRINT_LATEX else "+-"
# Columns of every table, in display order.
columns = [
    "model",
    "test-micro-accuracy-gt",
    "test-cross-entropy-gt",
    "test-brier-score-gt",
    "test-micro-accuracy-ap",
    "test-cross-entropy-ap",
    "test-brier-score-ap",
    "test-macro-accuracy-ap",
]
for data_type, annotator_set in data_type_dict.items():
    # Work on a copy because the set of compared models depends on the data type.
    params_dict = deepcopy(default_params)
    # NOTE(review): `data_type_dict` spells the random-interdependent key with a
    # hyphen ("rand-dep_10_100"), whereas this check originally used an
    # underscore ("rand_dep_10_100") and therefore never matched. Both spellings
    # are accepted here -- confirm which one the experiments actually use.
    if data_type in ("correlated", "rand-dep_10_100", "rand_dep_10_100"):
        # Compare MaDL with its default `alpha`/`beta` against a variant where
        # both are set to None (presumably disabling annotator weights -- confirm).
        params_dict["MaDL(W)"] = params_dict.pop("MaDL(X, F)")
        params_dict["MaDL(not W)"] = deepcopy(params_dict["MaDL(W)"])
        params_dict["MaDL(not W)"]["alpha"] = None
        params_dict["MaDL(not W)"]["beta"] = None
    elif data_type == "inductive_25":
        # Compare variants with and without annotator features.
        params_dict["GT"]["use_annotator_features"] = True
        params_dict["MR"]["use_annotator_features"] = True
        params_dict["LIA(not A)"] = params_dict.pop("LIA")
        params_dict["LIA(A)"] = deepcopy(params_dict["LIA(not A)"])
        params_dict["LIA(A)"]["use_annotator_features"] = True
        params_dict["CoNAL(not A)"] = params_dict.pop("CoNAL")
        params_dict["CoNAL(A)"] = deepcopy(params_dict["CoNAL(not A)"])
        params_dict["CoNAL(A)"]["use_annotator_features"] = True
        params_dict["MaDL(not A)"] = params_dict.pop("MaDL(X, F)")
        params_dict["MaDL(A)"] = deepcopy(params_dict["MaDL(not A)"])
        params_dict["MaDL(A)"]["use_annotator_features"] = True
    for data_set in data_set_list:
        res_dict = {}
        for model, model_dict in params_dict.items():
            config_dict = {
                "data_set_name": data_set,
                "data_type": data_type,
                **model_dict,
            }
            try:
                model_res_dict = config_query(
                    path=RESULT_PATH,
                    config_dict=config_dict,
                )
            except Exception:
                # Best effort: a configuration whose results cannot be queried
                # is left out of the table instead of aborting the notebook.
                continue
            if len(model_res_dict) > 0:
                res_dict.setdefault("model", []).append(model)
                # For every position of the validation-accuracy array
                # (presumably one entry per experiment repetition -- confirm),
                # pick the run with the highest validation accuracy.
                best_run = None
                best_val_acc = None
                for run, res in model_res_dict.items():
                    val_acc = np.array(res["valid-micro-accuracy-gt"])
                    if best_val_acc is None:
                        best_val_acc = val_acc
                        best_run = np.array([run] * len(val_acc), dtype=object)
                    is_better = val_acc > best_val_acc
                    best_val_acc[is_better] = val_acc[is_better]
                    best_run[is_better] = run
                # Gather the test scores of the selected run at each position ...
                test_scores = {}
                for idx, run in enumerate(best_run):
                    for perf, scores in model_res_dict[run].items():
                        if "test" in perf:
                            test_scores.setdefault(perf, []).append(scores[idx])
                # ... and summarize them as "mean <pm> std" with three decimals.
                for perf, values in test_scores.items():
                    values = np.array(values)
                    res_dict.setdefault(perf, []).append(
                        f"{np.round(values.mean(), 3):.3f} {pm_symbol} {np.round(values.std(), 3):.3f}"
                    )
        if len(res_dict) > 0:
            print(f"Data Set: {data_set}; Annotator Set: {annotator_set}")
            res_df = pd.DataFrame(res_dict)
            res_df = res_df[columns]
            if PRINT_LATEX:
                latex = res_df.to_latex()
                # Undo the LaTeX escaping of the "$\pm$" symbol, then strip the
                # spaces around it BEFORE replacing NaN entries: the cells are
                # formatted as "nan $\pm$ nan", so the "nan$\pm$nan" pattern only
                # matches once the spaces have been removed.
                latex = (
                    latex.replace("\\$", "$")
                    .replace("\\textbackslash pm", "\\pm")
                    .replace(" $\\pm$ ", "$\\pm$")
                    .replace("nan$\\pm$nan", "--")
                )
                print(latex)
            else:
                display(HTML(res_df.to_html()))

### Results for One-factor-at-a-time Ablation Study
The following tables present the results of the ablation study for MaDL. Each table shows the results for one pair of data set and annotator set. Since it is a one-factor-at-a-time study, only the parameter named in the corresponding row is changed from the default configuration of MaDL.

In [None]:
# One-factor-at-a-time ablation: for every listed hyperparameter, drop it from
# the default configuration so that the query matches the results of all
# values tested for that hyperparameter.
hyper_param_dict = {
    "MaDL(X, F)": ["embed_size", "eta", "ap_use_outer_product", "ap_use_residual", "alpha"],
}
data_set_list = ["letter", "music", "label-me"]
# Columns of every table, in display order.
columns = [
    "model",
    "param",
    "value",
    "test-micro-accuracy-gt",
    "test-cross-entropy-gt",
    "test-brier-score-gt",
    "test-micro-accuracy-ap",
    "test-cross-entropy-ap",
    "test-brier-score-ap",
    "test-macro-accuracy-ap",
]
for data_type, annotator_set in data_type_dict.items():
    for data_set in data_set_list:
        # Collect the per-hyperparameter tables and concatenate them once;
        # `DataFrame.append` was removed in pandas 2.0.
        result_frames = []
        for model, hyper_param_list in hyper_param_dict.items():
            for hyper_param in hyper_param_list:
                model_dict = deepcopy(default_params[model])
                if data_type == "inductive_25":
                    model_dict["use_annotator_features"] = True
                config_dict = {
                    "data_set_name": data_set,
                    "data_type": data_type,
                    **model_dict,
                }
                # Removing a key from the query leaves that parameter
                # unconstrained; `alpha` and `beta` are always released together.
                if hyper_param == "alpha":
                    config_dict.pop("alpha")
                    config_dict.pop("beta")
                else:
                    config_dict.pop(hyper_param)

                try:
                    model_res_dict = config_query(
                        path=RESULT_PATH,
                        config_dict=config_dict,
                    )
                except Exception:
                    # Best effort: skip ablations whose results cannot be queried.
                    continue

                if len(model_res_dict) > 0:
                    # The returned table must provide every entry of `columns`
                    # except "model", which is added here.
                    model_res_df = gen_results_table(model_res_dict, param=hyper_param, decimals=7)
                    model_res_df["model"] = model
                    result_frames.append(model_res_df[columns])
        res_df = pd.concat(result_frames) if result_frames else pd.DataFrame(columns=columns)
        if len(res_df) > 0:
            print(f"Data Set: {data_set}; Annotator Set: {annotator_set}")
            if PRINT_LATEX:
                latex = res_df.to_latex()
                latex = latex.replace(" +- ", "$\\pm$").replace("nan$\\pm$nan", "--")
                print(latex)
            else:
                display(HTML(res_df.to_html(index=False)))