# Tabular Results

This notebook allows us to query tabular results via [`mlflow`](https://mlflow.org/). As a prerequisite, the experiments must have been run beforehand. Once they have finished, we can load the results for the different tables in the accompanying article. Update `MLRUNS_PATH` in the code cell below to the path used in your config file [`experiment.yaml`](../conf/experiment.yaml).

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from hydra.utils import to_absolute_path
from mlflow import set_tracking_uri, get_experiment_by_name, search_runs
from matplotlib.colors import LinearSegmentedColormap

# TODO: Adjust `MLRUNS_PATH`.
MLRUNS_PATH = "."

# Flag for showing results in LaTeX format.
SHOW_LATEX = False

# Multi-annotator learning approaches and variants of dopanim.
APPROACHES = [
    "ground-truth",
    "majority-vote",
    "crowd_layer",
    "trace_reg",
    "conal",
    "union_net",
    "madl",
    "geo_reg_w",
    "geo_reg_f",
    "crowdar",
    "annot_mix",
]
VARIANTS = [
    "worst-1",
    "worst-2",
    "worst-var",
    "rand-1",
    "rand-2",
    "rand-var",
    "full",
]

# Increase maximum number of displayed rows.
pd.set_option("display.max_rows", 1000)

def evaluate(mlruns_path: str, experiment_name: str, update_columns: dict = None, perf_type: str = "gt", version: str = "test", metric: str = "acc", epoch: str = "best"):
    """
    Queries the evaluation results via mlflow and aggregates them per configuration.

    Only runs with status 'FINISHED' are considered. Runs are de-duplicated per configuration and seed, grouped by
    the configuration columns, and the metric `metrics.{perf_type}_{version}_{metric}_{epoch}` is summarized by its
    mean and standard deviation per group. Metric columns whose name contains 'acc' are multiplied by 100 and
    rounded to three decimals.

    Parameters
    ----------
    mlruns_path : str
        Path to the results saved via mlflow.
    experiment_name : str
        Name of the mlflow experiment.
    update_columns : dict, default=None
        Optional dictionary mapping additional mlflow column names to the short names used in the outputted table
        of results. These columns are also used as grouping keys.
    perf_type : str, default="gt"
        Either 'gt' representing the classification models' estimates of the ground truth (gt) class labels or 'ap'
        representing the annotator models' estimates of the annotators' performances.
    version : str, default="test"
        Either 'train', 'valid', or 'test' representing the results for different subsets.
    metric : str, default="acc"
        Either 'acc', 'brier_score', or 'tce' as the three used performance metrics.
    epoch : str, default="best"
        Show the results either after the 'last' or 'best' epoch.

    Returns
    -------
    runs : pd.DataFrame or None
        Table of results with one row per configuration, or None if the experiment does not exist or has no
        finished runs. Besides the configuration columns, it contains `{version}-{perf_type}-{metric}-{epoch}-mean`,
        `{version}-{perf_type}-{metric}-{epoch}-std`, and `n_runs`.
    """
    set_tracking_uri(uri=f"file://{to_absolute_path(mlruns_path)}")
    exp = get_experiment_by_name(experiment_name)
    if exp is None:
        return None
    query = "status = 'FINISHED'"
    runs = search_runs(experiment_ids=exp.experiment_id, filter_string=query, output_format="pandas")
    if len(runs) == 0:
        return None

    # Mapping from mlflow column names to the short names of the outputted table.
    columns = {
        "params.data.class_definition._target_": "data",
        "params.classifier.name": "clf",
        "params.classifier.aggregation_method": "agg",
    }
    if update_columns is not None:
        columns.update(update_columns)
    group_keys = list(columns.keys())
    key_columns = group_keys + ["params.seed"]
    metric_key = f"metrics.{perf_type}_{version}_{metric}_{epoch}"

    runs = runs.drop_duplicates(subset=key_columns)
    # Replace missing values only in the key columns: `groupby` would otherwise drop groups with missing keys.
    # Metric columns are deliberately left untouched, because a "--" placeholder would turn them into string
    # columns and break the numeric mean/std aggregation below.
    runs = runs.fillna({key: "--" for key in key_columns})
    runs = runs.sort_values(by="params.seed")
    runs = runs.groupby(group_keys, as_index=False).agg({metric_key: ["mean", "std"], "params.seed": ["sum"]})

    # Flatten the two-level column index, e.g., (`metric_key`, "mean") -> `metric_key` + "mean".
    runs.columns = [c1 + c2 for c1, c2 in runs.columns]
    for c in runs.columns:
        if c.startswith("metrics") and "acc" in c:
            runs[c] = np.round(runs[c].values * 100, 3)

    # NOTE(review): `n_runs` is the `sum` aggregate of `params.seed`, not a count. If the seeds are stored as
    # strings, this presumably yields the concatenated (sorted) seeds of the group -- confirm the intent.
    columns["params.seedsum"] = "n_runs"
    columns[f"{metric_key}mean"] = f"{version}-{perf_type}-{metric}-{epoch}-mean"
    columns[f"{metric_key}std"] = f"{version}-{perf_type}-{metric}-{epoch}-std"
    runs = runs.rename(columns=columns)
    return runs

## Hyperparameter Study

Load the table with the results of the hyperparameter study with the ground truth model.

In [None]:
# Hyperparameters varied in the search, mapped to the short column names of the result table.
update_columns = {"params.data.optimizer.gt_params.lr": "lr", "params.data.train_batch_size": "bs", "params.data.optimizer.gt_params.weight_decay": "wd"}
runs_df = evaluate(mlruns_path=MLRUNS_PATH, experiment_name="hyperparameter_search", update_columns=update_columns, perf_type="gt", version="valid", metric="acc", epoch="best")
if runs_df is not None:
    # Cast the hyperparameters to numbers *before* sorting; sorting the raw string values would order them
    # lexicographically (e.g., "128" before "16").
    runs_df["lr"] = runs_df["lr"].astype(float)
    runs_df["bs"] = runs_df["bs"].astype(int)
    runs_df["wd"] = runs_df["wd"].astype(float)
    runs_df = runs_df.sort_values(by=["bs", "wd", "lr"])

    # One row per learning rate and one column per (batch size, weight decay) combination.
    runs_df_compressed = runs_df.drop(columns=["data", "clf", "agg"])
    means_df = runs_df_compressed.pivot(index="lr", columns=["bs", "wd"], values="valid-gt-acc-best-mean")
    std_df = runs_df_compressed.pivot(index="lr", columns=["bs", "wd"], values="valid-gt-acc-best-std")

    def format_cell(mean, std):
        """Formats a single 'mean +- std' table entry as LaTeX or plain text, depending on `SHOW_LATEX`."""
        if SHOW_LATEX:
            return f"${mean:.1f}_{{\\pm {std:.1f}}}$"
        return f"{mean:.1f} +- {std:.1f}"

    # Table of formatted entries with the same structure as `means_df`.
    merged_df = pd.DataFrame(
        [[format_cell(means_df.at[idx, col], std_df.at[idx, col]) for col in means_df.columns] for idx in means_df.index],
        index=means_df.index,
        columns=means_df.columns,
    )
    if SHOW_LATEX:
        # Join the entries of each row with the LaTeX column separator.
        latex = merged_df.apply(lambda row: ' & '.join(row.values.astype(str)), axis=1)
        print(latex.to_markdown())
    else:
        print(merged_df.to_markdown())

List the best hyperparameters.

In [None]:
if runs_df is not None:
    # Mean validation accuracy of every hyperparameter configuration.
    mean_valid_acc = np.mean([runs_df["valid-gt-acc-best-mean"].values], axis=0)
    # Select the row (by position) of the configuration with the highest validation accuracy.
    best_configuration = runs_df.iloc[np.argmax(mean_valid_acc)]
    print(best_configuration)

## Benchmark and Case Studies

Load the table with the results of the benchmark or case studies.

In [None]:
# TODO: Define the experiment name. Possible experiments are: "benchmark", "beyond_hard_labels", or "annotator_metadata".
experiment_name = "benchmark"


def _format_latex_cell(mean, std, highlight=None):
    """Formats a 'mean_{pm std}' LaTeX entry; `highlight` is 'ground-truth', 'best', 'second-best', or None."""
    if highlight == "ground-truth":
        return f"${{\\color{{partialcolor}}{mean:.1f}_{{\\pm {std:.1f}}}}}$"
    if highlight in ("best", "second-best"):
        color = "color_annot_green" if highlight == "best" else "color_annot_violet"
        return f"\\textBF{{${{\\color{{{color}}} \\text{{{mean:.1f}}}_{{\\pm {std:.1f}}}}}$}}"
    return f"${mean:.1f}_{{\\pm {std:.1f}}}$"


def _format_plain_cell(mean, std, decimals):
    """Formats a plain-text 'mean +- std' entry with the given number of decimals."""
    return f"{mean:.{decimals}f} +- {std:.{decimals}f}"


# Load empirical results.
ranks_dict = {}
perf_type = "gt"
version = "test"
ranks_df = None
# `ascending=True` marks metrics for which lower values are better.
for metric, ascending in zip(["acc", "brier_score", "tce"], [False, True, True]):
    all_runs = []
    full_metric_name = f"{version}-{perf_type}-{metric}-last"
    for v in VARIANTS:
        runs = evaluate(mlruns_path=MLRUNS_PATH, experiment_name=f"{experiment_name}_{v}", update_columns={"params.data.class_definition.variant": "variant"}, perf_type=perf_type, version=version, metric=metric, epoch="last")
        if runs is None:
            continue
        all_runs.append(runs.drop_duplicates(subset=["data", "variant", "clf", "agg"], keep="first"))
    if not all_runs:
        continue

    runs_df = pd.concat(all_runs)
    # Derive the approach name from classifier and aggregation method, e.g., "aggregate majority-vote" ->
    # "majority-vote" and "<clf> None" -> "<clf>", such that it matches the entries of `APPROACHES`.
    runs_df["approach"] = runs_df[["clf", "agg"]].agg(" ".join, axis=1)
    runs_df["approach"] = [approach.replace("aggregate ", "").replace(" None", "") for approach in runs_df["approach"].values]

    # Rank the approaches (without the ground-truth baseline) per variant and average the ranks per approach.
    # Grouping by the approach name (instead of `clf`) keeps the index consistent with `APPROACHES[1:]`.
    ranks_df = runs_df[runs_df["agg"] != "ground-truth"].copy()
    ranks_df["rank"] = ranks_df.groupby("variant")[f"{full_metric_name}-mean"].rank(ascending=ascending, method="min")
    ranks_df = pd.DataFrame(ranks_df.groupby("approach")["rank"].mean())
    ranks_df = ranks_df.reindex(APPROACHES[1:])
    ranks_dict[metric] = ranks_df["rank"].values
    print(f"\n############################ {version.upper()} {metric.upper()} ############################")

    # Tables of means and standard deviations: one row per variant and one column per approach.
    means_df = runs_df.pivot(index="variant", columns="approach", values=f"{full_metric_name}-mean")
    means_df = means_df.reindex(columns=APPROACHES, index=VARIANTS)
    std_df = runs_df.pivot(index="variant", columns="approach", values=f"{full_metric_name}-std")
    std_df = std_df.reindex(columns=APPROACHES, index=VARIANTS)

    def best_and_second_best(row):
        # Respect the direction of the metric: the best value is the largest one for accuracy but the smallest
        # one for Brier score and TCE.
        sorted_cols = row.sort_values(ascending=ascending).index
        return pd.Series([sorted_cols[0], sorted_cols[1]], index=["best_col", "second_best_col"])

    best_ids = means_df[APPROACHES[1:]].apply(best_and_second_best, axis=1)

    # Accuracies are shown with one decimal and the other metrics with two decimals in the plain-text table.
    decimals = 1 if metric == "acc" else 2
    table_rows = []
    for idx in means_df.index:
        row_cells = []
        for col in means_df.columns:
            mean, std = means_df.at[idx, col], std_df.at[idx, col]
            if not SHOW_LATEX:
                row_cells.append(_format_plain_cell(mean, std, decimals))
                continue
            if col == "ground-truth":
                highlight = "ground-truth"
            elif col == best_ids.at[idx, "best_col"]:
                highlight = "best"
            elif col == best_ids.at[idx, "second_best_col"]:
                highlight = "second-best"
            else:
                highlight = None
            row_cells.append(_format_latex_cell(mean, std, highlight))
        table_rows.append(row_cells)
    merged_df = pd.DataFrame(table_rows, index=means_df.index, columns=means_df.columns)

    if SHOW_LATEX:
        # Join the entries of each row with the LaTeX column separator.
        latex = merged_df.apply(lambda row: ' & '.join(row.values.astype(str)), axis=1)
        print(latex.to_markdown())
    else:
        print(merged_df.to_markdown())

# The rank heatmap needs the ranks of all three metrics.
if experiment_name == "benchmark" and all(m in ranks_dict for m in ("acc", "brier_score", "tce")):
    # Define the colors
    colors = ['#007d7d99', '#7f007fff']

    # Create a custom colormap
    cmap = LinearSegmentedColormap.from_list('custom_diverging', colors, N=256)

    ranks = pd.DataFrame(np.column_stack((ranks_dict["acc"], ranks_dict["brier_score"], ranks_dict["tce"])), index=APPROACHES[1:], columns=["acc", "brier", "tce"])
    fig = plt.figure(figsize=(3, 7), dpi=100)
    ax = fig.add_axes(
        [0.1, 0.1, 0.8, 0.8]
    )
    sns.heatmap(ranks, annot=True, cmap=cmap, cbar=False, ax=ax, annot_kws={"size": 12}, fmt=".3g", alpha=.6)
    plt.savefig("ranks.pdf")
    plt.show()