In [1]:
from __future__ import print_function, division

import numpy as np
import gzip
from sklearn.preprocessing import LabelBinarizer

# Validation labels, binarized with LabelBinarizer (one indicator column per
# class when there are more than two classes). Each line of the gzipped file
# is tab-separated and the label is parsed as a float from the second column.
# The file is opened in a `with` block so the handle is closed once the labels
# have been read (the bare gzip.open(...) in a comprehension never closed it).
with gzip.open("valid_labels.txt.gz", 'rb') as labels_fh:
    valid_labels = LabelBinarizer().fit_transform(
        np.array([float(line.decode("utf-8").split("\t")[1])
                  for line in labels_fh]))

In [2]:
#augmenting the dataset with flips and rotations, for more robustness
parent_folders = ["flip-False_rotamt-0",
                  "flip-True_rotamt-0",
                  "flip-False_rotamt-90",
                  "flip-True_rotamt-90",
                  "flip-False_rotamt-180",
                  "flip-True_rotamt-180",]

# Number of nondeterministic prediction files read per folder
# (nondeterministic_preds_0.txt.gz ... nondeterministic_preds_99.txt.gz).
NUM_NONDET_SAMPLES = 100


def _load_preds(path):
    """Read a gzipped, tab-separated prediction file into a 2D float array.

    The first tab-separated field of every line is skipped; all remaining
    fields are parsed as floats. The file handle is closed before returning.

    path: location of the .txt.gz file.
    Returns an array with one row per line of the file.
    """
    with gzip.open(path, 'rb') as preds_fh:
        return np.array([
            [float(y) for y in x.decode("utf-8").split("\t")[1:]]
            for x in preds_fh])


# folder name -> deterministic predictions, one row per line of the file
parent_folder_to_det_pred = {}
for parent_folder in parent_folders:
    det_preds = _load_preds(parent_folder+"/deterministic_preds.txt.gz")
    parent_folder_to_det_pred[parent_folder] = det_preds

# folder name -> all nondeterministic samples stacked on a new leading axis,
# and folder name -> their mean over that leading (sample) axis
parent_folder_to_nondet_pred = {}
parent_folder_to_mean_nondet_pred = {}
for parent_folder in parent_folders:
    nondet_preds = np.array([
        _load_preds(
            parent_folder+"/nondeterministic_preds_"+str(i)+".txt.gz")
        for i in range(NUM_NONDET_SAMPLES)])
    parent_folder_to_nondet_pred[parent_folder] = nondet_preds
    parent_folder_to_mean_nondet_pred[parent_folder] = np.mean(nondet_preds,axis=0)

In [3]:
# Print the auROC for every augmentation folder: once for the deterministic
# predictions and once for the mean of the nondeterministic samples.
from sklearn.metrics import roc_auc_score

for parent_folder in parent_folders:
    print(parent_folder)
    det_preds = parent_folder_to_det_pred[parent_folder]
    mean_nondet_preds = parent_folder_to_mean_nondet_pred[parent_folder]
    # Both the labels and the scores use (1 - column 0), i.e. the task scored
    # here is "anything other than the first label column".
    for description, preds in (
            ("deterministic pred auROC", det_preds),
            ("nondeterministic pred auROC", mean_nondet_preds)):
        print(description,
              roc_auc_score(y_true=1-valid_labels[:,0],
                            y_score=1-preds[:,0]))

flip-False_rotamt-0
deterministic pred auROC 0.9118638796723656
nondeterministic pred auROC 0.9129881925522253
flip-True_rotamt-0
deterministic pred auROC 0.9136845292158645
nondeterministic pred auROC 0.9141930341618936
flip-False_rotamt-90
deterministic pred auROC 0.9077797755493358
nondeterministic pred auROC 0.9079080860318696
flip-True_rotamt-90
deterministic pred auROC 0.9072243126739039
nondeterministic pred auROC 0.9084814670645733
flip-False_rotamt-180
deterministic pred auROC 0.916166708887612
nondeterministic pred auROC 0.9166587373671843
flip-True_rotamt-180
deterministic pred auROC 0.9131712872857287
nondeterministic pred auROC 0.9138490879246036


In [5]:
import abstention
from abstention.calibration import compute_ece, TempScaling
reload(abstention.abstention)
from abstention.abstention import (weighted_kappa_metric,
                                   WeightedKappa, DistMaxClassProbFromOne,
                                   Entropy, Uncertainty)
from collections import defaultdict, namedtuple
import numpy as np
import random

def inverse_softmax(preds):
    """Convert rows of probabilities into logits with zero mean per row.

    Takes the elementwise natural log of `preds` and subtracts each row's
    mean log value. Applying a softmax to the result gives back `preds`
    whenever every row of `preds` sums to one.

    preds: 2D array; the mean is taken along axis 1.
    Returns an array of the same shape as `preds`.
    """
    log_preds = np.log(preds)
    row_means = np.mean(log_preds, axis=1)
    return log_preds - row_means[:, None]

# Quadratic disagreement weights over 5 classes: entry [j, i] is (i-j)**2.
quadratic_weights = np.subtract.outer(np.arange(5), np.arange(5))**2

# One experimental configuration: which point predictions and which
# prediction samples to look up per folder, which abstention criteria to
# evaluate, and whether to temperature-scale before abstaining.
AbstainerSettings = namedtuple(
    "AbstainerSettings",
    "name abstainer_factories preds_lookup predsamples_lookup use_calib")

# (name, factory) pairs; each factory is later called with the pseudo-validation
# labels and posteriors to produce the abstainer that scores the pseudo-test set.
abstainer_factories = [
    ("expected_delta_weighted_kappa",
     WeightedKappa(weights=quadratic_weights, verbose=False)),
    ("expected_delta_weighted_kappa_imbalance_from_valid",
     WeightedKappa(weights=quadratic_weights,
                   estimate_class_imbalance_from_valid=True,
                   verbose=False)),
    ("dist_maxclass_prob_from_one", DistMaxClassProbFromOne()),
    ("entropy", Entropy()),
    ("variance", Uncertainty()),
]
# Fractions of the pseudo-test examples to abstain on.
abstention_fractions = [0.05, 0.1, 0.15, 0.2]


def _make_settings(name, preds_lookup, use_calib):
    """Build an AbstainerSettings sharing the common factories and samples."""
    return AbstainerSettings(
        name=name,
        abstainer_factories=abstainer_factories,
        preds_lookup=preds_lookup,
        predsamples_lookup=parent_folder_to_nondet_pred,
        use_calib=use_calib)


# "weightrescalepreds" settings use the deterministic predictions;
# "mcdrpreds" settings use the mean of the nondeterministic samples.
# Each is run with and without calibration.
abstainer_settings_list = [
    _make_settings("calib_weightrescalepreds",
                   parent_folder_to_det_pred, True),
    _make_settings("uncalib_weightrescalepreds",
                   parent_folder_to_det_pred, False),
    _make_settings("calib_mcdrpreds",
                   parent_folder_to_mean_nondet_pred, True),
    _make_settings("uncalib_mcdrpreds",
                   parent_folder_to_mean_nondet_pred, False),
]

# Number of random pseudo-validation / pseudo-test splits per setting.
num_folds = 50

# Result containers, filled in place by the loop below:
#   settings name -> metric ("delta_weighted_kappa"/"delta_accuracy")
#                 -> abstention fraction -> abstainer name -> list of per-fold deltas
#   settings name -> metric ("weighted_kappa"/"accuracy") -> list of per-fold baselines
settingsname_to_metric_to_fraction_to_method_to_perfs = {}
settingsname_to_metric_to_baselineperfs = {}

# For every settings entry, repeat num_folds times: split the validation
# patients into a pseudo-validation and a pseudo-test half, optionally
# temperature-scale using the pseudo-validation half, then measure how much
# weighted kappa and accuracy on the pseudo-test half change when a fraction
# of examples is dropped according to each abstention criterion.
for abstainer_settings in abstainer_settings_list:

    settings_name = abstainer_settings.name
    print("abstainer settings", settings_name)
    # NOTE: rebinds the module-level name `abstainer_factories` to this
    # settings entry's list.
    abstainer_factories = abstainer_settings.abstainer_factories
    preds_lookup = abstainer_settings.preds_lookup
    predsamples_lookup = abstainer_settings.predsamples_lookup
    use_calib = abstainer_settings.use_calib

    # The per-settings dicts are registered in the outer dicts before being
    # filled, so partial results remain reachable if the run is interrupted.
    metric_to_fraction_to_method_to_perfs =\
        defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    settingsname_to_metric_to_fraction_to_method_to_perfs[settings_name] =\
        metric_to_fraction_to_method_to_perfs
    metric_to_baselineperfs = defaultdict(list)
    settingsname_to_metric_to_baselineperfs[settings_name] =\
        metric_to_baselineperfs

    for fold_number in range(num_folds):
        print("on fold",fold_number)

        # Seed both RNGs from the fold number, so a given fold number produces
        # the same patient shuffle under every settings entry.
        np.random.seed(fold_number*1000)
        random.seed(fold_number*1000)
        #the data is in pairs of (left eye, right eye) per patient (entry for
        # the right eye comes after the entry for the left eye); hence, the number of
        # unique patients is 0.5*len(valid_labels)
        patient_id_ordering = list(range(int(0.5*len(valid_labels))))
        np.random.shuffle(patient_id_ordering)

        pseudovalid_uncalib_preds = []
        pseudotest_uncalib_preds = []
        pseudovalid_uncalib_predsamples = []
        pseudotest_uncalib_predsamples = []
        pseudovalid_labels = []
        pseudotest_labels = []
        # Running per-class label totals of each split (5 classes).
        pseudovalid_label_counts = np.zeros(5)
        pseudotest_label_counts = np.zeros(5)
        for i in patient_id_ordering:
            left_eye_label = valid_labels[2*i]
            right_eye_label = valid_labels[(2*i)+1]
            # The higher class index of the two eyes decides which class count
            # is compared when choosing the split for this patient.
            most_diseased_label = max(np.argmax(left_eye_label),
                                      np.argmax(right_eye_label))
            # Send the patient to pseudo-validation only when that split
            # currently holds strictly fewer labels of this class; ties go to
            # pseudo-test.
            if (pseudovalid_label_counts[most_diseased_label] <
                pseudotest_label_counts[most_diseased_label]):
                append_to_uncalib_preds = pseudovalid_uncalib_preds
                append_to_uncalib_predsamples = pseudovalid_uncalib_predsamples
                append_to_labels = pseudovalid_labels
                append_to_label_counts = pseudovalid_label_counts
            else:
                append_to_uncalib_preds = pseudotest_uncalib_preds
                append_to_uncalib_predsamples = pseudotest_uncalib_predsamples
                append_to_labels = pseudotest_labels
                append_to_label_counts = pseudotest_label_counts

            # Both eyes of the patient, for every augmentation folder, go to
            # the chosen split, so a patient never straddles the two splits.
            # The in-place += updates the chosen split's count array.
            for parent_folder in parent_folders:
                append_to_labels.append(valid_labels[2*i])
                append_to_labels.append(valid_labels[(2*i)+1])
                append_to_label_counts += valid_labels[2*i]
                append_to_label_counts += valid_labels[(2*i)+1]
                append_to_uncalib_preds.append(
                        preds_lookup[parent_folder][2*i])
                append_to_uncalib_preds.append(
                        preds_lookup[parent_folder][(2*i)+1])
                append_to_uncalib_predsamples.append(
                        predsamples_lookup[parent_folder][:,(2*i)])
                append_to_uncalib_predsamples.append(
                        predsamples_lookup[parent_folder][:,(2*i)+1])

        pseudovalid_uncalib_preds = np.array(pseudovalid_uncalib_preds)
        pseudotest_uncalib_preds = np.array(pseudotest_uncalib_preds)
        pseudovalid_uncalib_pred_logits = inverse_softmax(pseudovalid_uncalib_preds)
        pseudotest_uncalib_pred_logits = inverse_softmax(pseudotest_uncalib_preds)
        # Each appended item is (sample, class); stacking gives
        # (example, sample, class) and the transpose makes it
        # (sample, example, class).
        pseudovalid_uncalib_predsamples = np.array(pseudovalid_uncalib_predsamples).transpose((1,0,2))
        pseudotest_uncalib_predsamples = np.array(pseudotest_uncalib_predsamples).transpose((1,0,2))
        # inverse_softmax works on 2D input, so it is applied one sample at a time.
        pseudovalid_uncalib_predsamples_logits = np.array([
                inverse_softmax(x) for x in pseudovalid_uncalib_predsamples])
        pseudotest_uncalib_predsamples_logits = np.array([
                inverse_softmax(x) for x in pseudotest_uncalib_predsamples])
        pseudovalid_labels = np.array(pseudovalid_labels)
        pseudotest_labels = np.array(pseudotest_labels)
        #print("valid vs. test distribution shift",
        #      np.abs(pseudovalid_label_counts-pseudotest_label_counts)/
        #            (pseudovalid_label_counts+pseudotest_label_counts))

        # Per-example uncertainty: variance across samples for each class,
        # summed over classes.
        pseudovalid_uncalib_variance = np.sum(np.var(pseudovalid_uncalib_predsamples,axis=0),axis=-1)
        pseudotest_uncalib_variance = np.sum(np.var(pseudotest_uncalib_predsamples,axis=0),axis=-1)

        if (use_calib):
            #print("ece before temp scale - valid",
            #  compute_ece(softmax_out=pseudovalid_uncalib_preds,
            #              labels=pseudovalid_labels,
            #              bins=15))
            #print("ece before temp scale - test",
            #      compute_ece(softmax_out=pseudotest_uncalib_preds,
            #                  labels=pseudotest_labels,
            #                  bins=15))
            # The scaler is built from the pseudo-validation logits and labels
            # only, then applied to the logits of both splits and to every
            # individual sample.
            temp_scaler = TempScaling(ece_bins=15, verbose=False)(
                                valid_preacts=pseudovalid_uncalib_pred_logits,
                                valid_labels=pseudovalid_labels)
            pseudovalid_calib_preds = temp_scaler(pseudovalid_uncalib_pred_logits)
            pseudotest_calib_preds = temp_scaler(pseudotest_uncalib_pred_logits)
            pseudovalid_calib_predsamples = np.array(
                [temp_scaler(x) for x in pseudovalid_uncalib_predsamples_logits])
            pseudotest_calib_predsamples = np.array(
                [temp_scaler(x) for x in pseudotest_uncalib_predsamples_logits])

            # Same uncertainty measure as above, on the calibrated samples.
            pseudovalid_calib_variance = np.sum(np.var(pseudovalid_calib_predsamples, axis=0),axis=-1)
            pseudotest_calib_variance = np.sum(np.var(pseudotest_calib_predsamples, axis=0),axis=-1)

            #print("ece after temp scale - valid",
            #      compute_ece(softmax_out=pseudovalid_calib_preds,
            #            labels=pseudovalid_labels,
            #            bins=15))
            #print("ece after temp scale - test",
            #      compute_ece(softmax_out=pseudotest_calib_preds,
            #            labels=pseudotest_labels,
            #            bins=15))

        # Pick the calibrated or uncalibrated quantities for everything below.
        if (use_calib):
            pseudotest_preds_to_use=pseudotest_calib_preds
            pseudovalid_preds_to_use=pseudovalid_calib_preds
            pseudotest_variance_to_use=pseudotest_calib_variance
            pseudovalid_variance_to_use=pseudovalid_calib_variance
        else:
            pseudotest_preds_to_use=pseudotest_uncalib_preds
            pseudovalid_preds_to_use=pseudovalid_uncalib_preds
            pseudotest_variance_to_use=pseudotest_uncalib_variance
            pseudovalid_variance_to_use=pseudovalid_uncalib_variance

        # Baselines on the full pseudo-test split (no abstention).
        original_weighted_kappa_perf = weighted_kappa_metric(
            predprobs=pseudotest_preds_to_use,
            true_labels=pseudotest_labels,
            weights=quadratic_weights)

        #print("\nPseudotest set weighted kappa",
        #      original_weighted_kappa_perf)
        metric_to_baselineperfs["weighted_kappa"].append(
            original_weighted_kappa_perf)
        original_accuracy_perf = np.mean(
            np.argmax(pseudotest_preds_to_use,axis=-1)
            ==np.argmax(pseudotest_labels,axis=-1))
        #print("Pseudotest set accuracy",original_accuracy_perf)
        metric_to_baselineperfs["accuracy"].append(original_accuracy_perf)

        for abstention_fraction in abstention_fractions:
            #print("\nabstention fraction:",abstention_fraction)
            for abstainer_name, abstainer_factory in abstainer_factories:
                # NOTE(review): none of the arguments below depend on
                # abstention_fraction, so the abstainer and its priorities are
                # rebuilt for every fraction. Hoisting is only safe if the
                # abstainers are stateless and deterministic; confirm first.
                abstainer = abstainer_factory(
                    valid_labels=pseudovalid_labels,
                    valid_posterior=pseudovalid_preds_to_use)
                abstainer_priorities = abstainer(
                    posterior_probs=pseudotest_preds_to_use,
                    uncertainties=pseudotest_variance_to_use)
                # Sort example indices by ascending priority and keep the first
                # (1 - abstention_fraction) share, so the examples with the
                # highest priority values are the ones abstained on.
                indices_to_retain = (
                    [y[0] for y in sorted(enumerate(abstainer_priorities),
                        key=lambda x: x[1])][:int(len(abstainer_priorities)*
                                                     (1-abstention_fraction))])
                retained_pseudotest_preds = np.array(
                    [pseudotest_preds_to_use[i] for i in indices_to_retain])
                retained_pseudotest_labels = np.array(
                    [pseudotest_labels[i] for i in indices_to_retain])
                #print("\nAbstention criterion:",abstainer_name)
                weighted_kappa_perf = weighted_kappa_metric(
                    predprobs=retained_pseudotest_preds,
                    true_labels=retained_pseudotest_labels,
                    weights=quadratic_weights)
                #print("weighted kappa", weighted_kappa_perf)
                accuracy_perf = (np.mean(np.argmax(
                    retained_pseudotest_preds,axis=-1)
                    ==np.argmax(retained_pseudotest_labels,axis=-1)))
                #print("accuracy", accuracy_perf)

                # Store the change relative to the no-abstention baseline of
                # this fold.
                metric_to_fraction_to_method_to_perfs["delta_weighted_kappa"][
                    abstention_fraction][abstainer_name].append(
                        weighted_kappa_perf-original_weighted_kappa_perf)
                metric_to_fraction_to_method_to_perfs["delta_accuracy"][
                    abstention_fraction][abstainer_name].append(
                        accuracy_perf-original_accuracy_perf)

abstainer settings calib_weightrescalepreds
on fold 0
on fold 1
on fold 2
on fold 3
on fold 4
on fold 5
on fold 6
on fold 7
on fold 8
on fold 9
on fold 10
on fold 11
on fold 12
on fold 13
on fold 14
on fold 15
on fold 16
on fold 17
on fold 18
on fold 19
on fold 20
on fold 21
on fold 22
on fold 23
on fold 24
on fold 25
on fold 26
on fold 27
on fold 28
on fold 29
on fold 30
on fold 31
on fold 32
on fold 33
on fold 34
on fold 35
on fold 36
on fold 37
on fold 38
on fold 39
on fold 40
on fold 41
on fold 42
on fold 43
on fold 44
on fold 45
on fold 46
on fold 47
on fold 48
on fold 49
abstainer settings uncalib_weightrescalepreds
on fold 0
on fold 1
on fold 2
on fold 3
on fold 4
on fold 5
on fold 6
on fold 7
on fold 8
on fold 9
on fold 10
on fold 11
on fold 12
on fold 13
on fold 14
on fold 15
on fold 16
on fold 17
on fold 18
on fold 19
on fold 20
on fold 21
on fold 22
on fold 23
on fold 24
on fold 25
on fold 26
on fold 27
on fold 28
on fold 29
on fold 30
on fold 31
on fold 32
on fold 33
on fol

In [6]:
import json
# Serialize first, then open the output file: if json.dumps raises, no
# truncated results file is left behind. The `with` block guarantees the
# handle is closed even if the write fails.
# Note: JSON object keys are always strings, so the float abstention-fraction
# keys are written as "0.05", "0.1", ...
results_json = json.dumps({
            "settingsname_to_metric_to_fraction_to_method_to_perfs":
              settingsname_to_metric_to_fraction_to_method_to_perfs,
            "settingsname_to_metric_to_baselineperfs":
              settingsname_to_metric_to_baselineperfs},
             sort_keys=True,
             indent=4,
             separators=(',', ': '))
with open("balanced_abstention_results.json", 'w') as fh:
    fh.write(results_json)

In [8]:
import json
# Reload the saved results so the table cells can run without recomputing the
# folds. The file is read inside a `with` block so the handle is closed
# (open(...).read() left it to the garbage collector).
with open("balanced_abstention_results.json") as results_fh:
    loaded_data = json.load(results_fh)
# After the JSON round trip every dict key is a string, including the
# abstention fractions.
settingsname_to_metric_to_fraction_to_method_to_perfs =\
    loaded_data["settingsname_to_metric_to_fraction_to_method_to_perfs"]
settingsname_to_metric_to_baselineperfs =\
    loaded_data["settingsname_to_metric_to_baselineperfs"]

In [90]:
reload(abstention.figure_making_utils)
from abstention.figure_making_utils import (
    wilcox_srs, get_ustats_mat,
    get_tied_top_and_worst_methods)
from collections import OrderedDict

# Abstention criteria shown in each table, in row order.
_compared_methods = ['expected_delta_weighted_kappa',
                     'dist_maxclass_prob_from_one',
                     'entropy',
                     'variance']


def _calib_uncalib_pairs(settings_suffix):
    """List (settings name, method name) rows for one comparison group.

    For every compared method, the calibrated settings entry comes first and
    the uncalibrated one second.

    settings_suffix: the settings name without its "calib_"/"uncalib_" prefix.
    """
    pairs = []
    for methodname in _compared_methods:
        pairs.append(('calib_'+settings_suffix, methodname))
        pairs.append(('uncalib_'+settings_suffix, methodname))
    return pairs


# Table title -> ordered list of (settings name, method name) rows.
comparison_groups = OrderedDict([
        ('With weight rescaling',
         _calib_uncalib_pairs('weightrescalepreds')),
        ('With MC dropout',
         _calib_uncalib_pairs('mcdrpreds'))
    ])

# Method name -> label used in the LaTeX table. The backslash is escaped
# explicitly: a bare "\D" is an invalid escape sequence that newer Python
# versions warn about.
friendly_method_names = {
    'expected_delta_weighted_kappa': 'E[$\\Delta$Kappa]',
    'dist_maxclass_prob_from_one': 'Max Class Prob.',
    'entropy': 'Entropy',
    'variance': 'MC Dropout Var.'
}
# Strings rather than floats: these are used as keys into the results loaded
# from JSON, where all keys are strings.
# NOTE(review): this rebinds `abstention_fractions`, which the fold-computation
# cell defines as a list of floats; re-running that cell after this one would
# pick up the strings.
abstention_fractions = ['0.05', '0.1', '0.15', '0.2']

# Print one LaTeX table per (comparison group, metric). Each row is a
# (settings, abstention method) pair and each column an abstention fraction,
# showing mean +/- standard error over folds of the change relative to the
# no-abstention baseline. Entries that get_tied_top_and_worst_methods reports
# as tied for top are set in bold.
for comparison_group_name in comparison_groups:

    print("On comparison group", comparison_group_name)

    for metric in ["weighted_kappa",
                   "accuracy"]:
        print("On metric", metric)

        #gather all the necessary data
        settingnmethod_to_baselineperfs = OrderedDict()
        settingnmethod_to_abstentionfraction_to_perfs = OrderedDict()
        for (settingsname, methodname) in comparison_groups[comparison_group_name]:
            # rows are keyed "<settingsname>-<methodname>"; neither part
            # contains "-", so split("-") below recovers both
            settingnmethod = settingsname+"-"+methodname
            settingnmethod_to_baselineperfs[settingnmethod] =\
                settingsname_to_metric_to_baselineperfs[settingsname][metric]

            abstentionfraction_to_perfs = OrderedDict()
            settingnmethod_to_abstentionfraction_to_perfs[settingnmethod] =\
                abstentionfraction_to_perfs
            for abstention_fraction in abstention_fractions:
                abstentionfraction_to_perfs[abstention_fraction] = (
                    settingsname_to_metric_to_fraction_to_method_to_perfs[
                        settingsname]["delta_"+metric][abstention_fraction][
                        methodname])

        #prepare the table contents

        settingnmethod_to_tablecontents = OrderedDict()
        for settingnmethod in settingnmethod_to_baselineperfs:
            tablerow = {}
            settingnmethod_to_tablecontents[settingnmethod] = tablerow
            # standard error of the mean across folds (sample std, ddof=1)
            tablerow['baseline'] = {
                'mean': np.mean(settingnmethod_to_baselineperfs[settingnmethod]),
                'stderr': np.std(settingnmethod_to_baselineperfs[settingnmethod],
                                 ddof=1)/np.sqrt(num_folds)}
            tablerow['method'] = settingnmethod.split("-")[1]
            tablerow['mcdr'] = "mcdr" in settingnmethod.split("-")[0]
            tablerow['calib'] = "uncalib" not in settingnmethod.split("-")[0]

        for abstention_fraction in abstention_fractions:
            method_to_perfs = OrderedDict()
            for settingnmethod in settingnmethod_to_tablecontents:
                perfsdelta = settingnmethod_to_abstentionfraction_to_perfs[
                    settingnmethod][abstention_fraction]
                method_to_perfs[settingnmethod] = perfsdelta
                mean_perfsdelta = np.mean(perfsdelta)
                stderr_perfsdelta = (np.std(perfsdelta,ddof=1)/
                                     np.sqrt(num_folds))
                tablerow = settingnmethod_to_tablecontents[settingnmethod]
                tablerow[abstention_fraction] = {
                    'mean': mean_perfsdelta,
                    'stderr': stderr_perfsdelta}
            methods_to_consider = list(method_to_perfs.keys())
            # 1275 equals n*(n+1)/2 for n=50, which matches num_folds --
            # presumably the largest possible signed-rank sum; TODO confirm
            # against get_ustats_mat.
            ustats_mat = get_ustats_mat(
                method_to_perfs,
                methods_to_consider,
                max_ustat=1275)
            tied_top_methods, tied_worst_methods =(
                get_tied_top_and_worst_methods(
                    ustats_mat,
                    methods_to_consider,
                    #NOTE(review): the comment inherited here read "0.05
                    #threshold for one-sided test when N=119 is 50", which
                    #agrees with neither threshold=120 nor the 50 paired folds
                    #implied by max_ustat. The value is left unchanged; confirm
                    #the critical value for N = number of folds against
                    #http://www.real-statistics.com/statistics-tables/wilcoxon-signed-ranks-table/
                    threshold=120
                ))
            # tied_top_methods comes back as indices into methods_to_consider
            tied_top_methods = [methods_to_consider[x]
                                for x in tied_top_methods]
            #print(abstention_fraction)
            #print(tied_top_methods)
            for settingnmethod in settingnmethod_to_tablecontents:
                settingnmethod_to_tablecontents[
                    settingnmethod][abstention_fraction][
                    'istop'] = (settingnmethod in tied_top_methods)
            #print(settingnmethod_to_tablecontents)

        # Two leading columns (method, calibrated?) plus one per abstention
        # fraction. The spec is derived from the data: a hard-coded spec
        # declared one more column than the rows emit once the baseline
        # column was dropped.
        num_columns = 2 + len(abstention_fractions)
        thestr = ("\\begin{tabular}{ | "
                  + " | ".join(["c"]*num_columns)
                  + " | }\n")
        thestr += "\\hline Method & Calibrated? & $\\Delta$ @"
        thestr += " & $\\Delta$ @".join([str(int(100*float(x)))+"\\% Abs."
                              for x in abstention_fractions])
        thestr += "\\\\ \\hline\n"
        for settingnmethod in settingnmethod_to_tablecontents:
            tablerow = settingnmethod_to_tablecontents[settingnmethod]
            thestr += friendly_method_names[tablerow['method']]
            thestr += " & "+("Y" if tablerow['calib'] else "N")
            #thestr += " & "+(str(np.round(tablerow['baseline']['mean'],4))
            #                 +" $\\pm$ "
            #                 +str(np.round(tablerow['baseline']['stderr'],4)))
            for abstention_fraction in abstention_fractions:
                thestr += (
                    " & "+
                    ("\\textbf{" if tablerow[abstention_fraction]['istop'] else "")
                    +str(np.round(tablerow[abstention_fraction]['mean'],4))
                    +" $\\pm$ "
                    +str(np.round(tablerow[abstention_fraction]['stderr'],4))
                    +("}" if tablerow[abstention_fraction]['istop'] else ""))
            # backslash escaped explicitly ("\h" is an invalid escape sequence)
            thestr += "\\\\ \\hline\n"
        thestr += "\\end{tabular}\n"

        print("\nBaseline "+metric+" perfs:")
        baseline_mean = set(
                x['baseline']['mean'] for x in
                settingnmethod_to_tablecontents.values())
        #assert that all the methods have the same baseline
        assert len(baseline_mean)==1
        baseline_mean = list(baseline_mean)[0]
        baseline_stderr = set(
                x['baseline']['stderr'] for x in
                settingnmethod_to_tablecontents.values())
        assert len(baseline_stderr)==1
        baseline_stderr = list(baseline_stderr)[0]
        print(np.round(baseline_mean,4),"$\\pm$",np.round(baseline_stderr,4))

        print("\n Latex Table for metric "
              +metric+" and group "+comparison_group_name
              +"\n\n"+thestr)
        


On comparison group With weight rescaling
On metric weighted_kappa

Baseline weighted_kappa perfs:
0.8147 $\pm$ 0.0014

 Latex Table for metric weighted_kappa and group With weight rescaling

\begin{tabular}{ | c | c | c | c | c | c | c | }
\hline Method & Calibrated? & $\Delta$ @5\% Abs. & $\Delta$ @10\% Abs. & $\Delta$ @15\% Abs. & $\Delta$ @20\% Abs.\\ \hline
E[$\Delta$Kappa] & Y & \textbf{0.0311 $\pm$ 0.0004} & \textbf{0.0478 $\pm$ 0.0005} & \textbf{0.0603 $\pm$ 0.0007} & \textbf{0.07 $\pm$ 0.0008}\\ \hline
E[$\Delta$Kappa] & N & 0.0303 $\pm$ 0.0004 & \textbf{0.0467 $\pm$ 0.0005} & \textbf{0.0592 $\pm$ 0.0007} & 0.0667 $\pm$ 0.0008\\ \hline
Max Class Prob. & Y & 0.0162 $\pm$ 0.0003 & 0.017 $\pm$ 0.0006 & 0.015 $\pm$ 0.0009 & 0.0035 $\pm$ 0.0015\\ \hline
Max Class Prob. & N & 0.0121 $\pm$ 0.0003 & 0.0137 $\pm$ 0.0005 & 0.011 $\pm$ 0.0009 & -0.0013 $\pm$ 0.0015\\ \hline
Entropy & Y & 0.0197 $\pm$ 0.0004 & 0.0308 $\pm$ 0.0006 & 0.0334 $\pm$ 0.0009 & 0.0242 $\pm$ 0.0013\\ \hline
Entrop

In [83]:
# Per-fold accuracy deltas at 5% abstention for the max-class-probability
# criterion, using the uncalibrated deterministic predictions.
(settingsname_to_metric_to_fraction_to_method_to_perfs
    ['uncalib_weightrescalepreds']
    ['delta_accuracy']
    ['0.05']
    ['dist_maxclass_prob_from_one'])

[0.023462290436192412,
 0.0214366488865273,
 0.021476275917065357,
 0.02205111960379591,
 0.023896461911217304,
 0.02309351717497876,
 0.022030682661979717,
 0.021571986856775638,
 0.022255034867693535,
 0.023564593301435455,
 0.023848684210526327,
 0.023895009298009917,
 0.022561272303673885,
 0.023088294400959164,
 0.022722288676236047,
 0.02257453239141738,
 0.02234350079744818,
 0.024645248751643534,
 0.023342393710871767,
 0.022606173047142963,
 0.023083525781202097,
 0.022604393034181935,
 0.022345184488815284,
 0.023021331738437012,
 0.022396509807839582,
 0.023595279055228402,
 0.023372822046464514,
 0.021531680665971775,
 0.022360171579480448,
 0.022368421052631593,
 0.02195972886762365,
 0.02309783164047341,
 0.023507592323884663,
 0.024001598623004328,
 0.022489719082880888,
 0.022229829306120386,
 0.022500051092354534,
 0.021511164274322203,
 0.02097762734874986,
 0.021981953144220356,
 0.024266257019521897,
 0.02314324706673121,
 0.023031299840510444,
 0.021946096430574547

In [84]:
# Per-fold accuracy deltas at 5% abstention for the max-class-probability
# criterion, using the calibrated deterministic predictions.
(settingsname_to_metric_to_fraction_to_method_to_perfs
    ['calib_weightrescalepreds']
    ['delta_accuracy']
    ['0.05']
    ['dist_maxclass_prob_from_one'])

[0.023562090835394067,
 0.021536449285728843,
 0.021476275917065357,
 0.02205111960379591,
 0.0244952643064269,
 0.023891920368591557,
 0.02233008385958457,
 0.022170789251985235,
 0.02215523446849188,
 0.022966507177033524,
 0.02444677033492826,
 0.02479420002635435,
 0.021562171494402205,
 0.023387695598563907,
 0.022124202551834227,
 0.022774133189820578,
 0.0227422248803828,
 0.024745049150845078,
 0.02294319211406537,
 0.023205633532705994,
 0.022883924982798898,
 0.023203195429391532,
 0.022844186484823337,
 0.023220693779904322,
 0.022496419888766717,
 0.02299581856966537,
 0.023173221248061315,
 0.022429884258786115,
 0.02216057078107725,
 0.022268740031897938,
 0.022258771929824617,
 0.023497033237279696,
 0.023707193122287862,
 0.02380199782460113,
 0.02209051748607449,
 0.021830627709313988,
 0.022699651890757733,
 0.021112440191387583,
 0.021077537429676996,
 0.02168222290143884,
 0.024266257019521897,
 0.022843845869126467,
 0.02352970494417872,
 0.0222454976281794,
 0.023