# Results of Experimental Evaluation
In this notebook, we load the experimental results and summarize them in tabular form.

In [None]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append("../")

from copy import deepcopy

import matplotlib.pyplot as plt

import numpy as np

import pandas as pd
pd.set_option("display.max_columns", None)

from IPython.display import display, HTML
from evaluation.config_utils import config_query, gen_results_table
from evaluation.run_experiment import RESULT_PATH
from matplotlib.ticker import FormatStrFormatter
# Let matplotlib typeset all figure text with LaTeX. This requires a working LaTeX
# installation on the machine executing the notebook.
plt.rc( 'text', usetex=True)

# Flag indicating whether the results should be printed as LaTeX code.
# If it is False, the result tables are displayed as HTML instead.
PRINT_LATEX = True

### Default Configurations of Multi-annotator Supervised Learning Techniques
The following dictionary defines the default hyperparameters/settings of each multi-annotator supervised learning technique for the simulated independent annotator set and for the two data sets with real-world annotators. These configurations can be interpreted as queries asking for certain results. One can alter individual parameters to ablate their effect. Of course, in this case, the parameters also need to be adapted in the corresponding experiments.

In [None]:
def _reference_config(model_name):
    """Return the query settings shared by the "gt" and "mr" entries.

    Parameters
    ----------
    model_name : str
        Value stored under the key "model_name".

    Returns
    -------
    dict
        A fresh dictionary, so that the entries can be altered independently.
    """
    return {
        "model_name": model_name,
        "embed_x": "learned",
        "confusion_matrix": "full",
        "embed_size": 16,
        "ap_use_residual": True,
        "ap_use_outer_product": True,
        "use_annotator_features": False,
    }


def _madl_config(embed_x, confusion_matrix, extra=None):
    """Return the query settings of one MaDL variant.

    Parameters
    ----------
    embed_x : str
        Value stored under "embed_x". The residual and outer-product flags are
        enabled for every value except "none".
    confusion_matrix : str
        Value stored under "confusion_matrix".
    extra : dict or None, default=None
        Additional entries that are inserted directly after "embed_size".

    Returns
    -------
    dict
        A fresh dictionary with the variant's settings.
    """
    uses_instance_embedding = embed_x != "none"
    config = {
        "model_name": "madl",
        "embed_x": embed_x,
        "confusion_matrix": confusion_matrix,
        "embed_size": 16,
    }
    # Extra entries keep their position between "embed_size" and "eta".
    config.update(extra or {})
    config.update(
        {
            "eta": 0.8,
            "alpha": 1.25,
            "beta": 0.25,
            "ap_use_residual": uses_instance_embedding,
            "ap_use_outer_product": uses_instance_embedding,
            "use_annotator_features": False,
        }
    )
    return config


# Default query configuration of each technique, keyed by the name shown in the result tables.
default_params = {
    "GT": _reference_config("gt"),
    "MR": _reference_config("mr"),
    "CL": {"model_name": "cl"},
    "REAC": {"model_name": "reac", "lmbda": 0.01},
    "UNION": {"model_name": "union", "epsilon": 1e-5},
    "LIA": {
        "model_name": "lia",
        "ap_latent_dim": 16,
        "n_em_steps": 7,
        "n_fine_tune_epochs": 25,
        "warm_start": True,
        "use_annotator_features": False,
    },
    "CoNAL": {
        "model_name": "conal",
        "embed_size": 20,
        "lmbda": 1e-5,
        "use_annotator_features": False,
    },
    "MaDL(not X, I)": _madl_config("none", "isotropic"),
    "MaDL(not X, P)": _madl_config("none", "diagonal"),
    "MaDL(not X, F)": _madl_config("none", "full"),
    "MaDL(X, I)": _madl_config("learned", "isotropic"),
    # Only this variant additionally fixes "lmbda".
    "MaDL(X, P)": _madl_config("learned", "diagonal", extra={"lmbda": 0}),
    "MaDL(X, F)": _madl_config("learned", "full"),
}

# Maps the `data_type` value used in the queries to the annotator set name printed with the results.
data_type_dict = dict(
    [
        ("none", "independent"),
        ("correlated", "interdependent"),
        ("rand-dep_10_100", "random-interdependent"),
        ("inductive_25", "inductive"),
    ]
)

### Results for Default Parameters
The following code cells produce the tables presenting the results with the default hyperparameters for different data and annotator sets.

In [None]:
# For every pair of annotator set and data set, query the stored results of each technique,
# select per repetition the run with the best validation accuracy, and print one table with
# the mean and standard deviation of the corresponding test performances.
data_set_list = ["letter", "fmnist", "cifar10", "svhn", "music", "label-me"]
pm_symbol = "$\\pm$" if PRINT_LATEX else "+-"
for data_type, annotator_set in data_type_dict.items():
    params_dict = deepcopy(default_params)
    # The spellings tested here must equal the keys of `data_type_dict`, since `data_type` only
    # takes these values. The previous spelling "rand_dep_10_100" never matched, so the
    # MaDL(W)/MaDL(not W) comparison was silently skipped for this annotator set.
    # NOTE(review): confirm that "rand-dep_10_100" is also the spelling used in the stored results.
    if data_type in ["correlated", "rand-dep_10_100"]:
        # Compare MaDL with its default `alpha`/`beta` against a variant with both set to None.
        params_dict["MaDL(W)"] = params_dict.pop("MaDL(X, F)")
        params_dict["MaDL(not W)"] = deepcopy(params_dict["MaDL(W)"])
        params_dict["MaDL(not W)"]["alpha"] = None
        params_dict["MaDL(not W)"]["beta"] = None
    elif data_type == "inductive_25":
        # Compare each technique with and without annotator features.
        params_dict["GT"]["use_annotator_features"] = True
        params_dict["MR"]["use_annotator_features"] = True
        params_dict["LIA(not A)"] = params_dict.pop("LIA")
        params_dict["LIA(A)"] = deepcopy(params_dict["LIA(not A)"])
        params_dict["LIA(A)"]["use_annotator_features"] = True
        params_dict["CoNAL(not A)"] = params_dict.pop("CoNAL")
        params_dict["CoNAL(A)"] = deepcopy(params_dict["CoNAL(not A)"])
        params_dict["CoNAL(A)"]["use_annotator_features"] = True
        params_dict["MaDL(not A)"] = params_dict.pop("MaDL(X, F)")
        params_dict["MaDL(A)"] = deepcopy(params_dict["MaDL(not A)"])
        params_dict["MaDL(A)"]["use_annotator_features"] = True
    for data_set in data_set_list:
        res_dict = {}
        for model, model_dict in params_dict.items():
            config_dict = {
                "data_set_name": data_set,
                "data_type": data_type,
                **model_dict,
            }
            try:
                model_res_dict = config_query(
                    path=RESULT_PATH,
                    config_dict=config_dict,
                )
            except Exception:
                # Best-effort query: a failing query is treated like a configuration without results.
                continue
            if len(model_res_dict) > 0:
                res_dict.setdefault("model", []).append(model)

                # Determine, separately for each position of the validation accuracy arrays,
                # the run with the highest validation accuracy.
                best_run = None
                best_val_acc = None
                for run, res in model_res_dict.items():
                    val_acc = np.array(res["valid-micro-accuracy-gt"])
                    if best_val_acc is None:
                        best_val_acc = val_acc
                        best_run = np.array([run] * len(val_acc), dtype=object)
                    is_better = val_acc > best_val_acc
                    best_val_acc[is_better] = val_acc[is_better]
                    best_run[is_better] = run

                # Collect the test scores of the selected runs and replace the collected scores
                # by a "mean +- std" string once the last position has been processed.
                for idx, run in enumerate(best_run):
                    for perf, scores in model_res_dict[run].items():
                        if "test" in perf:
                            res_dict.setdefault(perf, [])
                            if idx == 0:
                                res_dict[perf].append([])
                            res_dict[perf][-1].append(scores[idx])
                            if idx == len(best_run) - 1:
                                res_dict[perf][-1] = np.array(res_dict[perf][-1])
                                res_dict[perf][
                                    -1
                                ] = f"{np.round(res_dict[perf][-1].mean(), 3):.3f} {pm_symbol} {np.round(res_dict[perf][-1].std(), 3):.3f}"
        if len(res_dict) > 0:
            print(f"Data Set: {data_set}; Annotator Set: {annotator_set}")
            res_df = pd.DataFrame(res_dict)
            # The same column names are used for all annotator sets.
            columns = [
                "model",
                "test-micro-accuracy-gt",
                "test-cross-entropy-gt",
                "test-brier-score-gt",
                "test-micro-accuracy-ap",
                "test-cross-entropy-ap",
                "test-brier-score-ap",
                "test-macro-accuracy-ap",
            ]
            res_df = res_df[columns]
            if PRINT_LATEX:
                latex = res_df.to_latex()
                # Undo the escaping of the math-mode symbols and compact the "mean +- std" entries.
                latex = latex.replace("\\$", "$").replace("\\textbackslash pm", "\\pm").replace("nan$\\pm$nan", "--").replace(" $\\pm$ ", "$\\pm$")
                print(latex)
            else:
                display(HTML(res_df.to_html()))

### Results for One-factor-at-a-time Ablation Study
The following code cells produce tables presenting the results of the ablation study for MaDL. Each table shows the results for one pair of data set and annotator set. Since it is a one-factor-at-a-time study, only the parameter named in the corresponding row is changed from the default configuration of MaDL.

In [None]:
# Hyperparameters that are varied one at a time for each listed model.
hyper_param_dict = {
    "MaDL(X, F)": ["embed_size", "eta", "ap_use_outer_product", "ap_use_residual", "alpha"],
}
data_set_list = ["letter", "music", "label-me"]
# Columns of the ablation tables; "param" and "value" identify the altered hyperparameter.
columns = [
    "model",
    "param",
    "value",
    "test-micro-accuracy-gt",
    "test-cross-entropy-gt",
    "test-brier-score-gt",
    "test-micro-accuracy-ap",
    "test-cross-entropy-ap",
    "test-brier-score-ap",
    "test-macro-accuracy-ap",
]
for data_type, annotator_set in data_type_dict.items():
    for data_set in data_set_list:
        # Tables of all ablated hyperparameters for the current data and annotator set.
        model_res_df_list = []
        for model, hyper_param_list in hyper_param_dict.items():
            for hyper_param in hyper_param_list:
                model_dict = deepcopy(default_params[model])
                if data_type == "inductive_25":
                    model_dict["use_annotator_features"] = True
                config_dict = {
                    "data_set_name": data_set,
                    "data_type": data_type,
                    **model_dict,
                }
                # Drop the ablated hyperparameter from the query so that it is no longer fixed to
                # its default value; "alpha" and "beta" are always dropped together.
                if hyper_param == "alpha":
                    config_dict.pop("alpha")
                    config_dict.pop("beta")
                else:
                    config_dict.pop(hyper_param)

                try:
                    model_res_dict = config_query(
                        path=RESULT_PATH,
                        config_dict=config_dict,
                    )
                except Exception:
                    # Best-effort query: a failing query is treated like a configuration without results.
                    continue

                if len(model_res_dict) > 0:
                    model_res_df = gen_results_table(model_res_dict, param=hyper_param, decimals=7)
                    model_res_df["model"] = model
                    model_res_df_list.append(model_res_df[columns])

        # `DataFrame.append` has been removed in pandas 2.0; the tables are therefore collected
        # in a list and concatenated once, which keeps their row indices as before.
        if len(model_res_df_list) > 0:
            res_df = pd.concat(model_res_df_list)
        else:
            res_df = pd.DataFrame(columns=columns)
        if len(res_df) > 0:
            print(f"Data Set: {data_set}; Annotator Set: {annotator_set}")
            if PRINT_LATEX:
                latex = res_df.to_latex()
                latex = latex.replace(" +- ", "$\\pm$").replace("nan$\\pm$nan", "--")
                print(latex)
            else:
                display(HTML(res_df.to_html(index=False)))

###  Results for Varying Annotation Ratios on CIFAR10
The following code cells produce the results for the CIFAR10 dataset and varying annotation ratios, i.e., $\{0.2, 0.4, 0.6, 0.8\}$.

In [None]:
# For each missing label ratio, query the stored results of every technique, select per
# repetition the run with the best validation accuracy, and store the rounded means and standard
# deviations of the test performances in `res_df_list` and `res_df_list_std`, respectively.
pm_symbol = "$\\pm$" if PRINT_LATEX else "+-"
params_dict = deepcopy(default_params)
mlr_array = np.array([0.8, 0.6, 0.4, 0.2])
data_type = "none"
# This section evaluates CIFAR10; the previous value "cifar100" did not match any other data set
# name used in this notebook.
data_set = "cifar10"
res_df_list = []
res_df_list_std = []
columns = [
    "model",
    "test-micro-accuracy-gt",
    "test-cross-entropy-gt",
    "test-brier-score-gt",
    "test-micro-accuracy-ap",
    "test-cross-entropy-ap",
    "test-brier-score-ap",
    "test-macro-accuracy-ap",
]
for mlr in mlr_array:
    res_dict = {}
    res_dict_std = {}
    for model, model_dict in params_dict.items():
        # Of the MaDL variants, only "MaDL(X, F)" is part of this comparison.
        if model in ["MaDL(not X, I)", "MaDL(not X, P)", "MaDL(X, I)", "MaDL(X, P)", "MaDL(not X, F)"]:
            continue
        config_dict = {
            "data_set_name": data_set,
            "data_type": data_type,
            "missing_label_ratio": mlr,
            **model_dict,
        }
        try:
            model_res_dict = config_query(
                path=RESULT_PATH,
                config_dict=config_dict,
            )
        except Exception:
            # Best-effort query: a failing query is treated like a configuration without results.
            continue
        if len(model_res_dict) > 0:
            res_dict.setdefault("model", []).append(model)
            res_dict_std.setdefault("model", []).append(model)

            # Determine, separately for each position of the validation accuracy arrays,
            # the run with the highest validation accuracy.
            best_run = None
            best_val_acc = None
            for run, res in model_res_dict.items():
                val_acc = np.array(res["valid-micro-accuracy-gt"])
                if best_val_acc is None:
                    best_val_acc = val_acc
                    best_run = np.array([run] * len(val_acc), dtype=object)
                is_better = val_acc > best_val_acc
                best_val_acc[is_better] = val_acc[is_better]
                best_run[is_better] = run

            # Collect the test scores of the selected runs and replace them by their rounded
            # mean (`res_dict`) and standard deviation (`res_dict_std`) at the last position.
            for idx, run in enumerate(best_run):
                for perf, scores in model_res_dict[run].items():
                    if "test" in perf:
                        res_dict.setdefault(perf, [])
                        res_dict_std.setdefault(perf, [])
                        if idx == 0:
                            res_dict[perf].append([])
                            res_dict_std[perf].append([])
                        res_dict[perf][-1].append(scores[idx])
                        if idx == len(best_run) - 1:
                            res_dict[perf][-1] = np.array(res_dict[perf][-1])
                            res_dict_std[perf][-1] = np.round(res_dict[perf][-1].std(), 3)
                            res_dict[perf][-1] = np.round(res_dict[perf][-1].mean(), 3)
    if len(res_dict) > 0:
        print(f"Data Set: {data_set}; Annotator Set: independent")
        res_df = pd.DataFrame(res_dict)
        res_df_std = pd.DataFrame(res_dict_std)
        res_df = res_df[columns]
        res_df_std = res_df_std[columns]
        res_df.set_index("model", inplace=True)
        res_df_std.set_index("model", inplace=True)
        res_df_list.append(res_df)
        res_df_list_std.append(res_df_std)
        if PRINT_LATEX:
            latex = res_df.to_latex()
            latex = latex.replace("\\$", "$").replace("\\textbackslash pm", "\\pm").replace("nan$\\pm$nan", "--")
            print(latex)
        else:
            display(HTML(res_df.to_html()))

In [None]:
# Plot, for each performance measure, the mean test performance of every technique as a function
# of the annotation ratio; the error bars show the standard deviations.
cm = plt.get_cmap('rainbow')
annotation_ratios = 1 - mlr_array
# NOTE(review): `color_dict` is not referenced by the plotting code below; the line colors are
# taken from the 'rainbow' color map instead.
color_dict = {
    "GT": ["k", "-"],
    "MR": ["k", "--"],
    "LIA": ["r", "-"],
    "REAC": ["g", "-"],
    "CL": ["orange", "-"],
    "UNION": ["b", "-"],
    "CoNAL": ["y", "-"],
    "MaDL(not X, F)": ["b", "--"],
    "MaDL(X, F)": ["b", "-"],
}
# The former debugging output `print(len(res))` has been removed: `res` was only defined as a
# leaked loop variable of an earlier cell and is unrelated to the figures created here.
for col in columns[1:]:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    # Use one evenly spaced color of the color map per technique.
    ax.set_prop_cycle(color=[cm(1.*i/len(res_df_list[0])) for i in range(len(res_df_list[0]))])
    # Rows correspond to techniques and columns to the annotation ratios.
    perfs = pd.concat([res[col].T for res in res_df_list], axis=1)
    perfs_std = pd.concat([res[col].T for res in res_df_list_std], axis=1)
    for i in perfs.index:
        plt.errorbar(x=annotation_ratios, y=perfs.loc[i].values.T, yerr=perfs_std.loc[i].values.T, label=i)
    ax.set_xticks(annotation_ratios)
    ax.yaxis.set_major_locator(plt.MaxNLocator(5))
    ax.yaxis.set_major_formatter(FormatStrFormatter('$%.2f$'))
    ax.tick_params(axis='both', which='major', labelsize=16)
    ax.tick_params(axis='both', which='minor', labelsize=16)
    # NOTE(review): the figure is saved before the legend is added, so the PDF contains no
    # legend -- confirm that this is intended.
    plt.savefig(f"../figures/{data_set}-annotation_ratios-{col}.pdf", bbox_inches="tight", pad_inches=0)
    plt.tight_layout()
    plt.legend()
    plt.show()

In [None]:
from evaluation.data_utils import load_data, DATA_PATH
from evaluation.experiment_utils import aggregate_labels
from lfma.utils import introduce_missing_annotations

# Load the data set; the train, validation, and test entries are iterated in parallel below.
seed = 0
data_type = "none"
data_set = "cifar10"
ds = load_data(
    data_path=DATA_PATH,
    data_set_name=data_set,
    data_type=data_type,
    n_repeats=5,
    valid_size=0.05,
    test_size=0.2,
)

# Missing label ratios to be tested and the complementary annotation ratios.
mlr_array = np.array([0.8, 0.6, 0.4, 0.2])
annotation_ratios = 1 - mlr_array

# For each ratio (outer list) and each split (inner list), compute the accuracy of the
# aggregated "mr" labels and the fraction of correct annotations.
mr_accuracies = []
correct_annotation_proportion = []
for missing_label_ratio in mlr_array:
    accuracies_of_ratio = []
    proportions_of_ratio = []
    for n_iter, (tr, val, te) in enumerate(zip(ds["train"], ds["valid"], ds["test"]), start=1):
        # True labels of the training samples.
        y_true_train = ds["y_true"][tr]

        # Randomly replace annotations by the missing label -1.
        y_partial = introduce_missing_annotations(
            y=ds["y"][tr],
            missing_label=-1,
            percentage=missing_label_ratio,
            random_state=seed + n_iter,
        )

        # Accuracy of the aggregated labels, restricted to samples with an aggregated label.
        y_mr = aggregate_labels(y=y_partial, y_true=y_true_train, aggregation_method="mr")
        has_aggregated_label = y_mr != -1
        accuracies_of_ratio.append(
            np.mean(y_mr[has_aggregated_label] == y_true_train[has_aggregated_label])
        )

        # Fraction of the remaining annotations that equal the true label.
        is_annotated = y_partial != -1
        is_correct = y_partial == y_true_train[:, None]
        proportions_of_ratio.append(np.sum(is_correct * is_annotated) / np.sum(is_annotated))
    mr_accuracies.append(accuracies_of_ratio)
    correct_annotation_proportion.append(proportions_of_ratio)

# Plot means and standard deviations over the splits against the annotation ratios.
fig, ax = plt.subplots(figsize=(6.4, 4.9))
ax.errorbar(
    x=annotation_ratios,
    y=np.mean(mr_accuracies, axis=1),
    yerr=np.std(mr_accuracies, axis=1),
    label="majority rule",
    c="k",
)
ax.errorbar(
    x=annotation_ratios,
    y=np.mean(correct_annotation_proportion, axis=1),
    yerr=np.std(correct_annotation_proportion, axis=1),
    label="correct annotation proportion",
    c="k",
    ls=":",
)
ax.set_xticks(annotation_ratios)
ax.yaxis.set_major_locator(plt.MaxNLocator(5))
ax.yaxis.set_major_formatter(FormatStrFormatter('$%.2f$'))
for tick_kind in ("major", "minor"):
    ax.tick_params(axis='both', which=tick_kind, labelsize=16)
plt.savefig(f"../figures/{data_set}-annotation-accuracies.pdf", bbox_inches="tight", pad_inches=0)
plt.tight_layout()
plt.legend()
plt.show()

### Results for Varying Numbers of Annotated Instances on LETTER

The following code cells produce the results for the LETTER data set for varying numbers of training instances.

In [None]:
# For each training size, query the stored results of every technique, select per repetition
# the run with the best validation accuracy, and keep the rounded means (`res_df_list`) and
# standard deviations (`res_df_list_std`) of the test performances.
pm_symbol = "$\\pm$" if PRINT_LATEX else "+-"
params_dict = deepcopy(default_params)
tr_array = np.array([512, 1024, 1536, 2048, 2560, 3072, 3584, 4096, 4608, 5120])
data_type = "none"
data_set = "letter"
res_df_list = []
res_df_list_std = []
columns = [
    "model",
    "test-micro-accuracy-gt", "test-cross-entropy-gt", "test-brier-score-gt",
    "test-micro-accuracy-ap", "test-cross-entropy-ap", "test-brier-score-ap",
    "test-macro-accuracy-ap",
]
for tr in tr_array:
    res_dict, res_dict_std = {}, {}
    for model, model_dict in params_dict.items():
        config_dict = {
            "data_set_name": data_set,
            "data_type": data_type,
            "training_size": tr,
            **model_dict,
        }
        try:
            model_res_dict = config_query(path=RESULT_PATH, config_dict=config_dict)
        except Exception as e:
            continue
        if len(model_res_dict) == 0:
            continue

        for summary in (res_dict, res_dict_std):
            summary.setdefault("model", []).append(model)

        # Track, separately for each position of the validation accuracy arrays, the best
        # accuracy seen so far and the run it belongs to.
        best_run = None
        best_val_acc = None
        for run, res in model_res_dict.items():
            val_acc = np.array(res["valid-micro-accuracy-gt"])
            if best_val_acc is None:
                # The first run initializes both arrays; it cannot improve upon itself.
                best_val_acc = val_acc
                best_run = np.array([run] * len(val_acc), dtype=object)
                continue
            is_better = val_acc > best_val_acc
            best_val_acc[is_better] = val_acc[is_better]
            best_run[is_better] = run

        # Gather the test scores of the selected runs; after the last position, the gathered
        # scores are replaced by their rounded mean and standard deviation.
        last_idx = len(best_run) - 1
        for idx, run in enumerate(best_run):
            for perf, scores in model_res_dict[run].items():
                if "test" not in perf:
                    continue
                means = res_dict.setdefault(perf, [])
                stds = res_dict_std.setdefault(perf, [])
                if idx == 0:
                    means.append([])
                    stds.append([])
                means[-1].append(scores[idx])
                if idx == last_idx:
                    selected_scores = np.array(means[-1])
                    stds[-1] = np.round(selected_scores.std(), 3)
                    means[-1] = np.round(selected_scores.mean(), 3)
    if len(res_dict) > 0:
        print(f"Data Set: {data_set}; Annotator Set: independent")
        res_df = pd.DataFrame(res_dict)[columns].set_index("model")
        res_df_std = pd.DataFrame(res_dict_std)[columns].set_index("model")
        res_df_list.append(res_df)
        res_df_list_std.append(res_df_std)
        if not PRINT_LATEX:
            display(HTML(res_df.to_html()))
        else:
            latex = res_df.to_latex()
            for escaped, plain in [("\\$", "$"), ("\\textbackslash pm", "\\pm"), ("nan$\\pm$nan", "--")]:
                latex = latex.replace(escaped, plain)
            print(latex)

In [None]:
# Plot the mean of one test performance measure of every technique as a function of the
# training size; the error bars show the standard deviations.
cm = plt.get_cmap('rainbow')
color_dict = {
    "GT": ["k", "-"],
    "MR": ["k", "--"],
    "LIA": ["r", "-"],
    "REAC": ["g", "-"],
    "CL": ["orange", "-"],
    "UNION": ["b", "-"],
    "CoNAL": ["y", "-"],
    "MaDL(not X, F)": ["b", "--"],
    "MaDL(X, F)": ["b", "-"],
}
col = "test-micro-accuracy-ap"
fig, ax = plt.subplots()

# One evenly spaced color of the color map per row of the first result table.
n_models = len(res_df_list[0])
ax.set_prop_cycle(color=[cm(1.*i/n_models) for i in range(n_models)])

# Rows correspond to techniques and columns to the training sizes.
perfs = pd.concat([mean_df[col].T for mean_df in res_df_list], axis=1)
perfs_std = pd.concat([std_df[col].T for std_df in res_df_list_std], axis=1)
for i in perfs.index:
    ax.errorbar(
        x=tr_array,
        y=perfs.loc[i].values.T,
        yerr=perfs_std.loc[i].values.T,
        label=i,
    )
ax.set_xticks(tr_array)
ax.set_yticks(np.array([0.5, 0.6, 0.7]))
for tick_kind in ("major", "minor"):
    ax.tick_params(axis='both', which=tick_kind, labelsize=16)
plt.savefig(f"../figures/{data_set}-training_size-{col}.pdf", bbox_inches="tight", pad_inches=0)
plt.tight_layout()
plt.legend()
plt.show()