# Results of Online Bayesian Inference

In [18]:
import os
import json

import numpy as np
import pylab as plt
import pandas as pd

from omegaconf import OmegaConf
from pathlib import Path
from scipy.stats import ttest_rel


def load_results(exp_path: str):
    """Read one experiment directory and return ``(results, config)``.

    Args:
        exp_path: Directory containing ``results.json`` and
            ``.hydra/config.yaml``.

    Returns:
        A tuple of the parsed JSON content of ``results.json`` and the
        object returned by ``OmegaConf.load`` for ``.hydra/config.yaml``.
    """
    run_dir = Path(exp_path)

    with open(run_dir / 'results.json', 'r') as results_fh:
        data = json.load(results_fh)

    with open(run_dir / '.hydra' / 'config.yaml', 'r') as config_fh:
        args = OmegaConf.load(config_fh)

    return data, args


def moving_average(a, n=3):
    """Return the sliding-window mean of ``a`` over windows of length ``n``.

    Only fully overlapping windows are kept ('valid' convolution), so the
    result has ``len(a) - n + 1`` entries.
    """
    window = np.ones(n)
    window_sums = np.convolve(a, window, mode='valid')
    return window_sums / n


def pad_array(arr1, arr2):
    """Right-pad ``arr1`` with NaN until it is as long as ``arr2``.

    Args:
        arr1: Sequence or array to pad.
        arr2: Sequence or array whose length is the target length.

    Returns:
        A NumPy array holding the values of ``arr1`` followed by
        ``len(arr2) - len(arr1)`` NaN entries.

    Raises:
        ValueError: If ``arr1`` is longer than ``arr2``.
    """
    values = np.asarray(arr1)
    # Integer and boolean arrays cannot represent NaN, so promote them to
    # float first; floating (and other) dtypes are left untouched.
    if values.dtype.kind in 'biu':
        values = values.astype(float)

    n_missing = len(arr2) - len(values)
    if n_missing < 0:
        raise ValueError(
            f"Cannot pad an array of length {len(values)} to the shorter length {len(arr2)}."
        )
    return np.pad(values, (0, n_missing), constant_values=float('nan'))


# Directory that figures are written to; created up front if it is missing.
plot_path = Path('./plots/')
plot_path.mkdir(parents=True, exist_ok=True)

In [19]:
def get_metrics(experiments, metric_name, reweight_samples=(8,)):
    """Collect one metric across experiments for the baseline and reweighted models.

    Args:
        experiments: Mapping from experiment name to a dict with a
            ``'results'`` list. Every entry of that list must provide
            ``'n_train_samples'``, ``'baseline_test_stats'`` and one
            ``'reweighted{i}_test_stats'`` dict per ``i`` in
            ``reweight_samples``.
        metric_name: Key looked up inside each ``*_test_stats`` dict.
        reweight_samples: Values ``i`` selecting the ``reweighted{i}`` entries.

    Returns:
        A dict with
        ``'n_train_samples'`` (array taken from the last experiment iterated),
        ``'baseline_{metric_name}'`` (array with one row per experiment) and
        ``'reweighted_{metric_name}'`` (dict mapping ``'reweighted{i}'`` to an
        array with one row per experiment).
        Values of ``"test_acc1"`` are divided by 100; other metrics are
        returned unscaled.

    Raises:
        ValueError: If ``experiments`` is empty.
    """
    if not experiments:
        # Without this guard the function would fail further down with an
        # opaque "referenced before assignment" error.
        raise ValueError("get_metrics() needs at least one experiment, got none.")

    baseline_runs = []
    reweighted_runs = {f'reweighted{i}': [] for i in reweight_samples}
    res = None
    for exp_data in experiments.values():
        res = exp_data['results']
        baseline_runs.append([d['baseline_test_stats'][metric_name] for d in res])
        for key, runs in reweighted_runs.items():
            runs.append([d[f'{key}_test_stats'][metric_name] for d in res])

    # The sample counts are read from the last experiment only.
    # NOTE(review): presumably all experiments share the same schedule - confirm.
    n_train_samples = np.array([d['n_train_samples'] for d in res])

    divisor = 100 if metric_name == "test_acc1" else 1
    return {
        'n_train_samples': n_train_samples,
        f'baseline_{metric_name}': np.array(baseline_runs) / divisor,
        f'reweighted_{metric_name}': {
            key: np.array(runs) / divisor for key, runs in reweighted_runs.items()
        },
    }

def retrieve_results(metric, reweight_samples, experiments_dict, test_alternative="greater", pvalue=0.01):
    """Aggregate one metric per hyperparameter value and build LaTeX summary cells.

    Args:
        metric: Metric name forwarded to ``get_metrics``.
        reweight_samples: Values selecting the ``reweighted{i}`` results; the
            ratio of their maximum to their minimum also determines how many
            trailing points are dropped from every curve.
        experiments_dict: Nested mapping ``param -> param value -> experiments``
            where the innermost value is what ``get_metrics`` expects.
        test_alternative: ``alternative`` argument of the paired t-test.
        pvalue: Significance threshold; p-values at or below it get a star.

    Returns:
        ``(results_dict, auc_results_dict)``. Both are keyed by ``param`` and
        then by the value wrapped in dollar signs (``"$value$"``).
        ``results_dict`` holds curves averaged over experiments;
        ``auc_results_dict`` holds strings: the rounded mean of the baseline
        curve and, for the other entries, the rounded difference of the mean
        curve to the baseline mean followed by a significance marker.
    """
    max_reweight_samples = int(np.max(reweight_samples))
    min_reweight_samples = int(np.min(reweight_samples))
    exclude_samples = int(max_reweight_samples / min_reweight_samples)
    n_shifts = max_reweight_samples // min_reweight_samples

    # Invisible placeholder used when a difference is not significant, so
    # that starred and unstarred cells have the same width.
    not_significant = "\\phantom{${^\\ast}$}"

    results_dict = {}
    auc_results_dict = {}
    for param in experiments_dict:
        results_dict[param] = {}
        auc_results_dict[param] = {}
        for param_val in experiments_dict[param]:
            # Load results.
            d = get_metrics(experiments_dict[param][param_val], metric, reweight_samples=reweight_samples)

            # Create storages, keyed by the value in LaTeX math mode.
            label = f"${param_val}$"
            curves = results_dict[param][label] = {}
            summary = auc_results_dict[param][label] = {}

            # Load specific data.
            n_train_samples = d['n_train_samples']
            all_baseline_metrics = d[f'baseline_{metric}']
            all_reweighted_metrics = d[f'reweighted_{metric}']
            # "Retrained" curves are the baseline curve shifted left by i
            # steps, NaN-padded at the end to keep the original length.
            all_retrained_metrics = {
                f'retrained{(i*min_reweight_samples)}': np.array(
                    [pad_array(run[i:], n_train_samples) for run in all_baseline_metrics]
                )
                for i in range(1, 1 + n_shifts)
            }

            # Exclude trailing samples (this also removes the NaN padding).
            n_train_samples = n_train_samples[:-exclude_samples]
            all_baseline_metrics = all_baseline_metrics[:, :-exclude_samples]
            all_reweighted_metrics = {key: val[:, :-exclude_samples] for key, val in all_reweighted_metrics.items()}
            all_retrained_metrics = {key: val[:, :-exclude_samples] for key, val in all_retrained_metrics.items()}

            # Store results in dictionaries.
            baseline_curve = np.mean(all_baseline_metrics, 0)
            baseline_per_run = np.mean(all_baseline_metrics, 1)
            baseline_mean = np.mean(baseline_curve)
            curves['n_train_samples'] = n_train_samples
            curves['metric_baseline'] = baseline_curve
            summary['metric_baseline'] = f"${np.round(baseline_mean, 3)}$"

            # NOTE(review): the two dicts are paired by position, not by
            # sample count. With e.g. reweight_samples=[16, 32, 64] this pairs
            # 'reweighted64' with 'retrained48' and never stores
            # 'retrained64' - confirm this is intended.
            for metric_1, metric_2 in zip(all_reweighted_metrics, all_retrained_metrics):
                reweighted = all_reweighted_metrics[metric_1]
                retrained = all_retrained_metrics[metric_2]
                curves[metric_1] = np.mean(reweighted, 0)
                curves[metric_2] = np.mean(retrained, 0)

                # Paired t-test on the per-experiment means against the baseline.
                reweight_significant = ttest_rel(
                    np.mean(reweighted, 1), baseline_per_run, alternative=test_alternative
                ).pvalue <= pvalue
                t_reweight = "^{\\ast}" if reweight_significant else not_significant
                retrain_significant = ttest_rel(
                    np.mean(retrained, 1), baseline_per_run, alternative=test_alternative
                ).pvalue <= pvalue
                t_retrain = "{^\\ast}" if retrain_significant else not_significant

                summary[metric_1] = f"${np.round(np.mean(curves[metric_1]) - baseline_mean, 3)}{t_reweight}$"
                summary[metric_2] = f"${np.round(np.mean(curves[metric_2]) - baseline_mean, 3)}{t_retrain}$"

    return results_dict, auc_results_dict

def main_table(metric_list, reweight_samples, experiments_dict, batch_size=((32, 1),), test_alternative_list="greater", pvalue=0.01):
    """Build the averaged curves and the formatted cells of the main results table.

    Args:
        metric_list: Metric names to evaluate, each forwarded to ``get_metrics``.
        reweight_samples: Values selecting the ``reweighted{i}`` results; the
            ratio of their maximum to their minimum determines how many
            trailing points are dropped from every curve.
        experiments_dict: Nested mapping ``param -> param value -> experiments``
            where the innermost value is what ``get_metrics`` expects.
        batch_size: Iterable of ``(b, i)`` pairs. ``b`` selects the
            ``reweighted{b}`` results (stored as ``cupd{b}``); ``i`` is the
            number of steps the baseline curve is shifted to obtain the
            ``retr{b}`` comparison curve.
        test_alternative_list: One ``alternative`` for the paired t-test per
            metric. A single string is applied to every metric.
        pvalue: Significance threshold; p-values at or below it get a star.

    Returns:
        ``(results_dict, auc_results_dict)``, both keyed by ``param`` and
        ``param value``. ``results_dict`` holds curves averaged over
        experiments; ``auc_results_dict`` holds the formatted table cells.

    Any exception raised while processing a metric is printed and that
    metric is skipped for the current parameter value.
    """
    metric_list = list(metric_list)
    if isinstance(test_alternative_list, str):
        # A bare string would otherwise be zipped character by character.
        test_alternative_list = [test_alternative_list] * len(metric_list)

    max_reweight_samples = int(np.max(reweight_samples))
    min_reweight_samples = int(np.min(reweight_samples))
    exclude_samples = int(max_reweight_samples / min_reweight_samples)

    # Invisible placeholder used when a difference is not significant, so
    # that starred and unstarred cells have the same width.
    not_significant = "\\phantom{${^\\ast}$}"

    results_dict = {}
    auc_results_dict = {}
    for param in experiments_dict:
        results_dict[param] = {}
        auc_results_dict[param] = {}
        for param_val in experiments_dict[param]:
            # Create storages.
            curves = results_dict[param][param_val] = {}
            cells = auc_results_dict[param][param_val] = {}
            for metric, test_alternative in zip(metric_list, test_alternative_list):
                try:
                    # Load results.
                    d = get_metrics(experiments_dict[param][param_val], metric, reweight_samples=reweight_samples)

                    # Load specific data.
                    n_train_samples = d['n_train_samples']
                    all_baseline_metrics = d[f'baseline_{metric}']
                    all_reweighted_metrics = d[f'reweighted_{metric}']
                    all_reweighted_metrics = {f"cupd{b}": all_reweighted_metrics[f"reweighted{b}"] for (b, _) in batch_size}
                    # "Retrained" curves are the baseline curve shifted left
                    # by i steps, NaN-padded to keep the original length.
                    all_retrained_metrics = {
                        f'retr{b}': np.array([pad_array(run[i:], n_train_samples) for run in all_baseline_metrics])
                        for (b, i) in batch_size
                    }

                    # Exclude trailing samples.
                    n_train_samples = n_train_samples[:-exclude_samples]
                    all_baseline_metrics = all_baseline_metrics[:, :-exclude_samples]
                    all_reweighted_metrics = {key: val[:, :-exclude_samples] for key, val in all_reweighted_metrics.items()}
                    all_retrained_metrics = {key: val[:, :-exclude_samples] for key, val in all_retrained_metrics.items()}

                    # Store results in dictionaries.
                    curves['n_train_samples'] = n_train_samples
                    curves[metric + 'basl'] = np.mean(all_baseline_metrics, 0)
                    if metric == "training_time":
                        # Report the smallest per-experiment mean time. The
                        # 'z' prefixes only affect the key names.
                        for metric_1, metric_2 in zip(all_reweighted_metrics, all_retrained_metrics):
                            tr_reweighted = np.mean(all_reweighted_metrics[metric_1], -1).min()
                            tr_retrained = np.mean(all_retrained_metrics[metric_2], -1).min()
                            print(all_retrained_metrics[metric_2].mean(-1))
                            cells['z' + metric + metric_1] = f"{tr_reweighted:.03f}"
                            cells['zz' + metric + metric_2] = f"{tr_retrained:.03f}"
                    elif metric == "test_prediction_time":
                        for metric_1 in all_reweighted_metrics:
                            prediction_reweighted = np.mean(all_reweighted_metrics[metric_1], -1).min()
                            print(all_reweighted_metrics[metric_1].mean(-1))
                            cells['zzz' + metric + metric_1] = f"{prediction_reweighted:.03f}"
                    else:
                        baseline_mean = np.mean(curves[metric + 'basl'])
                        baseline_per_run = np.mean(all_baseline_metrics, 1)
                        cells[metric + 'basl'] = f"{baseline_mean:.03f}"
                        for metric_1, metric_2 in zip(all_reweighted_metrics, all_retrained_metrics):
                            reweighted = all_reweighted_metrics[metric_1]
                            retrained = all_retrained_metrics[metric_2]
                            curves[metric + metric_1] = np.mean(reweighted, 0)
                            curves[metric + metric_2] = np.mean(retrained, 0)

                            # Paired t-test on the per-experiment means against the baseline.
                            reweight_significant = ttest_rel(
                                np.mean(reweighted, 1), baseline_per_run, alternative=test_alternative
                            ).pvalue <= pvalue
                            t_reweight = "$^{\\ast}$" if reweight_significant else not_significant
                            retrain_significant = ttest_rel(
                                np.mean(retrained, 1), baseline_per_run, alternative=test_alternative
                            ).pvalue <= pvalue
                            t_retrain = "${^\\ast}$" if retrain_significant else not_significant

                            # A negative difference is rendered with a doubled
                            # '-' (explicit sign plus the sign of the number).
                            # NOTE(review): presumably relied on for LaTeX
                            # typesetting of the minus sign - confirm.
                            diff = np.mean(curves[metric + metric_1]) - baseline_mean
                            cells[metric + metric_1] = f"{'+' if diff > 0 else '-'}{diff:.03f}{t_reweight}"
                            diff = np.mean(curves[metric + metric_2]) - baseline_mean
                            cells[metric + metric_2] = f"{'+' if diff > 0 else '-'}{diff:.03f}{t_retrain}"
                except Exception as e:
                    # Best effort: report the problem and move on to the next metric.
                    print(e)
                    continue

    return results_dict, auc_results_dict

## Load Tabular Results

In [12]:
models = ['ensemble', 'mc_dropout', 'sngp', 'sngp_sampling']
reweight_samples = [16, 32, 64]
datasets = ['LETTER', "PENDIGITS", "MNIST", "FashionMNIST"]
ood_datasets = ["PENDIGITS", "LETTER", "FashionMNIST", "MNIST"]
total_results_dict = {}
# When True, only the first hyperparameter at its default value is evaluated.
evaluate_only_default = True


def _parameter_grid(model, dataset):
    """Return ``(default_parameters, possible_parameters)`` for a model on a dataset.

    The key order matters: it is the order in which the hyperparameters
    appear in the experiment directory names. Unknown models yield two
    empty dicts.
    """
    is_mnist = "MNIST" in dataset  # true for both MNIST and FashionMNIST
    n_residual_layers = 6 if is_mnist else 2
    sngp_defaults = {
        'norm_bound': 6 if is_mnist else 1,
        'num_inducing': 1024,
        'kernel_scale': 1 if is_mnist else 8,
        'n_residual_layers': n_residual_layers,
    }
    sngp_possible = {
        'norm_bound': [1, 6, 12] if is_mnist else [0.5, 1, 2],
        'num_inducing': [256, 1024, 2048],
        'kernel_scale': [1, 8, 256],
        'n_residual_layers': [n_residual_layers],
    }
    if model == 'sngp':
        return {'lmb': 1, **sngp_defaults}, {'lmb': [1, 2, 5], **sngp_possible}
    if model == 'sngp_sampling':
        return {'draws': 20000, **sngp_defaults}, {'draws': [1000, 10000, 20000], **sngp_possible}
    if model == 'ensemble':
        return (
            {'draws': 10, 'n_residual_layers': n_residual_layers},
            {'draws': [5, 10, 20], 'n_residual_layers': [n_residual_layers]},
        )
    if model == 'mc_dropout':
        return (
            {
                'draws': 1000,
                'dropout_rate': 0.2 if is_mnist else 0.5,
                'n_residual_layers': n_residual_layers,
            },
            {
                'draws': [100, 500, 1000],
                'dropout_rate': [0.1, 0.2, 0.5] if is_mnist else [0.25, 0.5, 0.75],
                'n_residual_layers': [n_residual_layers],
            },
        )
    return {}, {}


for dataset, ood_dataset in zip(datasets, ood_datasets):
    total_results_dict[dataset] = {}
    print(dataset)
    for model in models:
        print(model)

        # TODO: point this at the directory holding the experiment folders.
        root_path = Path(f'/path/to/model/{model}')

        # Get experiment paths. Directory names follow the pattern
        # "<dataset>-<model>-<param><value>-...-seed*".
        default_parameters, possible_parameters = _parameter_grid(model, dataset)
        default_query = (
            f'{dataset}-{model}-'
            + ''.join(f'{name}{value}-' for name, value in default_parameters.items())
            + 'seed*'
        )
        all_experiments = {}
        for param in default_parameters:
            all_experiments[param] = {}
            param_value_list = [default_parameters[param]] if evaluate_only_default else possible_parameters[param]
            for param_value in param_value_list:
                # Vary one hyperparameter while keeping the others at their defaults.
                query_string = default_query.replace(param + str(default_parameters[param]), param + str(param_value))
                all_experiments[param][param_value] = sorted(root_path.glob(query_string))
            if evaluate_only_default:
                break

        # Load results of experiments.
        experiments = {}
        for param in all_experiments:
            experiments[param] = {}
            for param_val in all_experiments[param]:
                experiments[param][param_val] = {}
                for exp_path in all_experiments[param][param_val]:
                    try:
                        results, args = load_results(exp_path)
                        experiments[param][param_val][exp_path] = {'results': results, 'args': args}
                    except Exception as e:
                        # Best effort: report unreadable experiments and keep the rest.
                        print(e)
                print(f'Found {len(experiments[param][param_val])} experiments for {param}={param_val}')

        metric_list = ["test_acc1", f"test_auroc_{ood_dataset}_cat_entropy", f"test_auroc_{ood_dataset}_dir_variance", "test_nll", "training_time", "test_prediction_time"]
        # One t-test alternative per metric; the two timing metrics are not tested.
        alternative_list = ['greater', 'greater', 'greater', 'less', '', '']
        results, auc_results = main_table(metric_list=metric_list, test_alternative_list=alternative_list, reweight_samples=reweight_samples, experiments_dict=experiments, pvalue=0.01)
        total_results_dict[dataset][model] = results
        for param in auc_results:
            auc_results_param = pd.DataFrame(auc_results[param])
            auc_results_param.index.name = param
            # Undo the escaping applied by the LaTeX export and drop leading zeros.
            print(
                auc_results_param.T.style.to_latex()
                .replace("\\{", "^{").replace("\\}", "}")
                .replace("\\$", "$").replace("\\textasciicircum", "")
                .replace("\\textbackslash ast", "\\ast")
                .replace(" ^", "^").replace("{ ", "{")
                .replace("metric\\_baseline", "0")
                .replace(" 0.", " .")
                .replace("+0.", "+.")
                .replace("-0.", "-.")
            )

LETTER
ensemble
Found 10 experiments for draws=10
[57.50945981 57.41137831 59.98422821 59.2546725  59.464089   60.0743186
 57.19795027 58.67310795 60.02163472 59.95361576]
[0.06173066 0.064171   0.06431121 0.06220998 0.06383737 0.06428764
 0.06391139 0.06291374 0.0639794  0.06338143]
\begin{tabular}{llllllllllllllll}
draws & test_acc1basl & test_acc1cupd32 & test_acc1retr32 & test_auroc_PENDIGITS_cat_entropybasl & test_auroc_PENDIGITS_cat_entropycupd32 & test_auroc_PENDIGITS_cat_entropyretr32 & test_auroc_PENDIGITS_dir_variancebasl & test_auroc_PENDIGITS_dir_variancecupd32 & test_auroc_PENDIGITS_dir_varianceretr32 & test_nllbasl & test_nllcupd32 & test_nllretr32 & ztraining_timecupd32 & zztraining_timeretr32 & zzztest_prediction_timecupd32 \\
10 & .542 & --.004\phantom{${^\ast}$} & +.028${^\ast}$ & .675 & --.041\phantom{${^\ast}$} & +.003${^\ast}$ & .762 & --.046\phantom{${^\ast}$} & +.009${^\ast}$ & 2.928 & +.148\phantom{${^\ast}$} & --.416${^\ast}$ & .004 & 57.198 & .062 \\
\end{tabu

In [26]:
# Shared plot settings: font size, display names, line colours and the
# out-of-distribution dataset paired with each dataset.
fontsize = 12
model_name_dict = dict(
    ensemble="Ensemble",
    mc_dropout="Dropout",
    sngp="SNGP-LA",
    sngp_sampling="SNGP-MC",
)
dataset_name_dict = dict(
    LETTER="LETTER",
    PENDIGITS="PDIGITS",
    MNIST="MNIST",
    FashionMNIST="FMNIST",
)
# One matplotlib colour code per model, in the order of model_name_dict.
style_dict = {model_key: {"color": colour} for model_key, colour in zip(model_name_dict, "rgbm")}
ood_dataset_dict = {
    "LETTER": "PENDIGITS",
    "PENDIGITS": "LETTER",
    "MNIST": "FashionMNIST",
    "FashionMNIST": "MNIST",
}
def get_metric_name(metric):
    """Return the axis label used for a metric key.

    Args:
        metric: Metric key such as ``"test_acc1"``, ``"test_nll"`` or a key
            containing ``"variance"`` or ``"entropy"``.

    Returns:
        ``"ACC"``, ``"NLL"``, ``"AUROC (variance)"`` or ``"AUROC (entropy)"``.

    Raises:
        ValueError: If the metric matches none of the known patterns.
    """
    if metric == "test_acc1":
        return "ACC"
    if metric == "test_nll":
        return "NLL"
    if "variance" in metric:
        return "AUROC (variance)"
    if "entropy" in metric:
        return "AUROC (entropy)"
    # Raise instead of returning the exception object, which would end up
    # formatted into a label unnoticed.
    raise ValueError("Invalid metric!")
# Absolute learning curves: one figure per (dataset, model, metric) showing
# the baseline, the updated ("cupd32") and the retrained ("retr32") curve.
for dataset, dataset_dict in total_results_dict.items():
    for model in ["ensemble", "mc_dropout", "sngp", "sngp_sampling"]:
        for metric in ["test_acc1", "test_nll", f"test_auroc_{ood_dataset_dict[dataset]}_dir_variance", f"test_auroc_{ood_dataset_dict[dataset]}_cat_entropy"]:
            print(metric)
            try:
                plt.figure(figsize=(4.5, 2.8))
                plt.title(f"{dataset_name_dict[dataset]}: {model_name_dict[model]}")
                for param, param_dict in dataset_dict[model].items():
                    for value, value_dict in param_dict.items():
                        # Shade the gap between the baseline and the retrained curve.
                        plt.fill_between(x=value_dict["n_train_samples"], y1=value_dict[f"{metric}basl"],
                                         y2=value_dict[f"{metric}retr32"], alpha=0.1, zorder=1, color="k")
                        plt.plot(value_dict["n_train_samples"], value_dict[f"{metric}basl"], ls="--",
                                 **style_dict[model], label=f"{model_name_dict[model]} (baseline)", alpha=0.5)
                        plt.plot(value_dict["n_train_samples"], value_dict[f"{metric}cupd32"], zorder=2, ls="-", marker=".", **style_dict[model],
                                 label=f"{model_name_dict[model]} (update)", lw=1.5)
                        plt.plot(value_dict["n_train_samples"], value_dict[f"{metric}retr32"], ls="-", **style_dict[model],
                                 label=f"{model_name_dict[model]} (retrain)", alpha=0.5)
                plt.xlabel("# samples in $\\mathcal{D}$", fontsize=fontsize)
                plt.ylabel(f"{get_metric_name(metric)}", fontsize=fontsize)
                plt.legend()
                plt.tight_layout()

                # Both AUROC metrics share the second name component
                # ("auroc"), so the entropy variant gets its own file name.
                if "entropy" in metric:
                    plt.savefig(f"plots/absolute_auroc_entropy_{dataset_name_dict[dataset]}_{model_name_dict[model]}.pdf")
                else:
                    plt.savefig(f"plots/absolute_{metric.split('_')[1]}_{dataset_name_dict[dataset]}_{model_name_dict[model]}.pdf")
            except Exception as e:
                # Best effort: report the figure that could not be drawn and
                # continue, without swallowing KeyboardInterrupt/SystemExit.
                print(f"Skipping {dataset}/{model}/{metric}: {e!r}")
            finally:
                plt.close()

# Difference curves: one figure per (dataset, metric) showing, for every
# model, the updated and the retrained curve relative to the baseline.
for dataset, dataset_dict in total_results_dict.items():
    for metric in ["test_acc1", "test_nll", f"test_auroc_{ood_dataset_dict[dataset]}_dir_variance", f"test_auroc_{ood_dataset_dict[dataset]}_cat_entropy"]:
        try:
            plt.figure(figsize=(4.5, 4.2))
            plt.title(f"{dataset_name_dict[dataset]}")
            for model in ["ensemble", "mc_dropout", "sngp", "sngp_sampling"]:
                for param, param_dict in dataset_dict[model].items():
                    for value, value_dict in param_dict.items():
                        plt.plot(value_dict["n_train_samples"], value_dict[f"{metric}cupd32"]-value_dict[f"{metric}basl"],
                                 zorder=2, marker=".", ls="-", **style_dict[model], lw=1.5)
                        plt.plot(value_dict["n_train_samples"], value_dict[f"{metric}retr32"]-value_dict[f"{metric}basl"], ls="-", **style_dict[model],
                                 alpha=0.5, label=f"{model_name_dict[model]}", zorder=1)
            # Zero line: no change relative to the baseline.
            plt.axhline(y=0, lw=0.8, color="k", zorder=0, ls="--")
            plt.xlabel("# samples in $\\mathcal{D}$", fontsize=fontsize)
            plt.ylabel(f"$\\Delta${get_metric_name(metric)}", fontsize=fontsize)
            plt.tight_layout()
            print(metric)
            # Both AUROC metrics share the second name component ("auroc"),
            # so the entropy variant gets its own file name.
            if "entropy" in metric:
                plt.savefig(f"plots/diff_auroc_entropy_{dataset_name_dict[dataset]}.pdf")
            else:
                plt.savefig(f"plots/diff_{metric.split('_')[1]}_{dataset_name_dict[dataset]}.pdf")
        except Exception as e:
            # Best effort: report the figure that could not be drawn and
            # continue, without swallowing KeyboardInterrupt/SystemExit.
            print(f"Skipping {dataset}/{metric}: {e!r}")
        finally:
            plt.close()

test_acc1
test_nll
test_auroc_PENDIGITS_dir_variance
test_auroc_PENDIGITS_cat_entropy
test_acc1
test_nll
test_auroc_PENDIGITS_dir_variance
test_auroc_PENDIGITS_cat_entropy
test_acc1
test_nll
test_auroc_PENDIGITS_dir_variance
test_auroc_PENDIGITS_cat_entropy
test_acc1
test_nll
test_auroc_PENDIGITS_dir_variance
test_auroc_PENDIGITS_cat_entropy
test_acc1
test_nll
test_auroc_LETTER_dir_variance
test_auroc_LETTER_cat_entropy
test_acc1
test_nll
test_auroc_LETTER_dir_variance
test_auroc_LETTER_cat_entropy
test_acc1
test_nll
test_auroc_LETTER_dir_variance
test_auroc_LETTER_cat_entropy
test_acc1
test_nll
test_auroc_LETTER_dir_variance
test_auroc_LETTER_cat_entropy
test_acc1
test_nll
test_auroc_FashionMNIST_dir_variance
test_auroc_FashionMNIST_cat_entropy
test_acc1
test_nll
test_auroc_FashionMNIST_dir_variance
test_auroc_FashionMNIST_cat_entropy
test_acc1
test_nll
test_auroc_FashionMNIST_dir_variance
test_auroc_FashionMNIST_cat_entropy
test_acc1
test_nll
test_auroc_FashionMNIST_dir_variance
test