# Tabular Results

This notebook allows us to query tabular results via [`mlflow`](https://mlflow.org/). As a prerequisite, the experiments must have been run beforehand. Once this is done, we can load the results for the different tables in the accompanying article.

Update the `mlruns_path` to the path used in your config file [`experiment.yaml`](../conf/experiment.yaml).

In [None]:
import pandas as pd
import numpy as np

from hydra.utils import to_absolute_path
from mlflow import set_tracking_uri, get_experiment_by_name, search_runs

def evaluate(mlruns_path="/mnt/work/anonymous/maml/mlruns", experiment_name="cifar_10_h", update_columns=None, perf_type="gt", version="test", metric="acc", epoch="best"):
    """
    Queries the evaluation results via mlflow and aggregates them over the seeds.

    All finished runs of the experiment (except those whose classifier is named 'aggregate') are grouped by
    dataset, classifier, aggregation method, and any additional columns passed via `update_columns`. For each
    group, the mean and standard deviation of the selected metric (in percent, rounded to three decimals) and
    the number of runs are reported.

    Parameters
    ----------
    mlruns_path : str
        Path to the results saved via mlflow.
    experiment_name : str
        Name of the mlflow experiment.
    update_columns : dict
        Optional dictionary mapping additional mlflow column names to the names used in the outputted table of
        results. These columns are also used as grouping keys.
    perf_type : str
        Either 'gt' representing the classification model's estimates of the ground truth (gt) class labels or
        'ap' representing the annotator model's estimates of the annotators' performances.
    version : str
        Either 'train', 'valid', or 'test' representing the results for different subsets.
    metric : str
        Either 'acc' or 'auroc' as the two used performance metrics.
    epoch : str
        Show the results either after the 'last' or 'best' epoch.

    Returns
    -------
    runs : pd.DataFrame or None
        Table of results, or `None` if the experiment does not exist or has no matching finished runs.
    """
    set_tracking_uri(uri=f"file://{to_absolute_path(mlruns_path)}")
    exp = get_experiment_by_name(experiment_name)
    if exp is None:
        return None
    query = "status = 'FINISHED' and params.classifier.name != 'aggregate'"
    runs = search_runs(experiment_ids=[exp.experiment_id], filter_string=query, output_format="pandas")
    if len(runs) == 0:
        return None

    # Mapping from mlflow column names to the column names of the outputted table.
    columns = {
        "params.data.class_definition._target_": "data",
        "params.classifier.name": "clf",
        "params.classifier.aggregation_method": "agg",
    }
    if update_columns is not None:
        columns.update(update_columns)
    key_columns = list(columns)
    seed_column = "params.seed"
    metric_column = f"metrics.{perf_type}_{version}_{metric}_{epoch}"
    result_prefix = f"{version}-{perf_type}-{metric}-{epoch}"
    mean_column, std_column = f"{result_prefix}-mean", f"{result_prefix}-std"

    # Keep only one run per configuration and seed.
    runs = runs.drop_duplicates(key_columns + [seed_column])
    # `groupby` drops rows with missing keys, so missing parameters get a placeholder. Only the key and seed
    # columns are filled: a placeholder string in the metric column would make it non-numeric.
    runs = runs.fillna({column: "--" for column in key_columns + [seed_column]})

    # Named aggregation yields flat columns; the number of runs is a count of the seeds, not their sum.
    runs = runs.groupby(key_columns, as_index=False).agg(
        **{
            mean_column: (metric_column, "mean"),
            std_column: (metric_column, "std"),
            "n_runs": (seed_column, "count"),
        }
    )

    # Report the metric in percent.
    runs[[mean_column, std_column]] = (runs[[mean_column, std_column]] * 100).round(3)
    return runs.rename(columns=columns)

# Allow up to 1000 rows to be shown when a table is displayed.
pd.options.display.max_rows = 1000

# Names of all datasets; an experiment suffix is appended to each name when querying results.
datasets = [
    "music_genres", "label_me",
    "cifar_10_h", "cifar_10_n", "cifar_100_n",
    "letter_sim", "flowers_102_sim", "trec_6_sim",
    "aloi_sim", "dtd_sim", "ag_news_sim",
]

## Benchmark Study

Load the table with the results of the benchmark study for the classification models.

In [None]:
# Print one table per epoch type with the test accuracies of the classification models.
for e in ["last", "best"]:
    print(f"\n\n{e}")
    # Collect the results separately for each epoch type; otherwise the 'best' table would also contain the
    # rows of the 'last' table.
    all_runs = []
    for d in datasets:
        runs = evaluate(experiment_name=d+"_benchmark", perf_type="gt", version="test", metric="acc", epoch=e)
        if runs is None:
            # Experiment is missing or has no finished runs.
            continue
        all_runs.append(runs.drop_duplicates(subset=["data", "clf", "agg"], keep="first"))
    if not all_runs:
        # Nothing to concatenate; avoid printing an undefined or stale table.
        print(f"No finished runs found for epoch '{e}'.")
        continue
    runs_df = pd.concat(all_runs)
    print(runs_df.to_markdown())

Load the table with the results of the benchmark study for the annotator models.

In [None]:
# Print one table per epoch type with the test AUROC values of the annotator models.
for e in ["last", "best"]:
    print(f"\n\n{e}")
    # Collect the results separately for each epoch type; otherwise the 'best' table would also contain the
    # rows of the 'last' table.
    all_runs = []
    for d in datasets:
        runs = evaluate(experiment_name=d+"_benchmark", perf_type="ap", version="test", metric="auroc", epoch=e)
        if runs is None:
            # Experiment is missing or has no finished runs.
            continue
        all_runs.append(runs.drop_duplicates(subset=["data", "clf", "agg"], keep="first"))
    if not all_runs:
        # Nothing to concatenate; avoid printing an undefined or stale table.
        print(f"No finished runs found for epoch '{e}'.")
        continue
    runs_df = pd.concat(all_runs)
    print(runs_df.to_markdown())

## Ablation Study

Load the table with the results of the ablation study for the classification model of annot-mix.

In [None]:
# Additional mlflow column that distinguishes the ablation variants.
alpha_column = {"params.classifier.params.alpha": "alpha"}

# Print one table per epoch type with the test accuracies of the ablation study.
for e in ["last", "best"]:
    print(f"\n\n{e}")
    # Collect the results separately for each epoch type; otherwise the 'best' table would also contain the
    # rows of the 'last' table.
    all_runs = []
    for d in datasets:
        runs = evaluate(experiment_name=d+"_ablation", perf_type="gt", version="test", metric="acc", update_columns=alpha_column, epoch=e)
        if runs is None:
            # Experiment is missing or has no finished runs.
            continue
        all_runs.append(runs.drop_duplicates(subset=["data", "clf", "agg"] + list(alpha_column.values()), keep="first"))
    if not all_runs:
        # Nothing to concatenate; avoid printing an undefined or stale table.
        print(f"No finished runs found for epoch '{e}'.")
        continue
    runs_df = pd.concat(all_runs)
    print(runs_df.to_markdown())