In [1]:
from __future__ import print_function, division

import numpy as np
import gzip
from sklearn.preprocessing import LabelBinarizer

# Load the validation labels. Each line of the gzipped file is tab-separated
# and its second field is the numeric label; LabelBinarizer converts the
# label vector into an indicator matrix.
# The file is opened in a `with` block so the gzip handle is closed as soon as
# parsing finishes instead of lingering until garbage collection.
with gzip.open("valid_labels.txt.gz", 'rb') as labels_fh:
    valid_labels = LabelBinarizer().fit_transform(
        np.array([float(line.decode("utf-8").split("\t")[1])
                  for line in labels_fh]))

In [2]:
#augmenting the dataset with flips and rotations, for more robustness.
#One folder of predictions exists per (rotation, flip) combination; the
# order is rotation-major (0, 90, 180, 270) with the unflipped variant first,
# e.g. "flip-False_rotamt-0", "flip-True_rotamt-0", "flip-False_rotamt-90", ...
parent_folders = ["flip-%s_rotamt-%d" % (is_flipped, rotation_amount)
                  for rotation_amount in (0, 90, 180, 270)
                  for is_flipped in (False, True)]

def _load_pred_matrix(path):
    """Read a gzipped, tab-separated prediction file into a 2D float array.

    The first tab-separated field of every line is skipped (presumably an
    example identifier -- confirm against the file format); every remaining
    field is parsed as a float, giving one row per line of the file.

    The file is read inside a `with` block so the handle is closed right
    away; this notebook opens several hundred of these files.
    """
    with gzip.open(path, 'rb') as preds_fh:
        return np.array([
            [float(y) for y in x.decode("utf-8").split("\t")[1:]]
            for x in preds_fh])


# Deterministic predictions: one array per augmentation folder.
parent_folder_to_det_pred = {}
for parent_folder in parent_folders:
    det_preds = _load_pred_matrix(parent_folder+"/deterministic_preds.txt.gz")
    parent_folder_to_det_pred[parent_folder] = det_preds

# Nondeterministic predictions: every folder holds files
# nondeterministic_preds_0 ... nondeterministic_preds_99, one per sampled
# prediction run.
num_nondet_samples = 100
# folder -> stacked samples, with the sample index as the leading axis
parent_folder_to_nondet_pred = {}
# folder -> average of the samples (the sample axis is reduced away)
parent_folder_to_mean_nondet_pred = {}
for parent_folder in parent_folders:
    nondet_preds = []
    for i in range(num_nondet_samples):
        single_nondet_pred = _load_pred_matrix(
            parent_folder+"/nondeterministic_preds_"+str(i)+".txt.gz")
        nondet_preds.append(single_nondet_pred)
    nondet_preds = np.array(nondet_preds)
    parent_folder_to_nondet_pred[parent_folder] = nondet_preds
    parent_folder_to_mean_nondet_pred[parent_folder] = np.mean(nondet_preds,axis=0)

In [3]:
#Compute the auROC of "not class 0" versus "class 0" for every augmentation,
# once with the deterministic predictions and once with the mean of the
# nondeterministic prediction samples.
from sklearn.metrics import roc_auc_score

# Binary ground truth shared by every folder: 1 when the label is not class 0.
not_class0_labels = 1-valid_labels[:,0]

for parent_folder in parent_folders:
    print(parent_folder)
    det_preds = parent_folder_to_det_pred[parent_folder]
    mean_nondet_preds = parent_folder_to_mean_nondet_pred[parent_folder]
    # The score is the predicted probability mass outside class 0.
    det_auroc = roc_auc_score(y_true=not_class0_labels,
                              y_score=1-det_preds[:,0])
    print("deterministic pred auROC", det_auroc)
    nondet_auroc = roc_auc_score(y_true=not_class0_labels,
                                 y_score=1-mean_nondet_preds[:,0])
    print("nondeterministic pred auROC", nondet_auroc)

flip-False_rotamt-0
deterministic pred auROC 0.9118638796723656
nondeterministic pred auROC 0.9129881925522253
flip-True_rotamt-0
deterministic pred auROC 0.9136845292158645
nondeterministic pred auROC 0.9141930341618936
flip-False_rotamt-90
deterministic pred auROC 0.9077797755493358
nondeterministic pred auROC 0.9079080860318696
flip-True_rotamt-90
deterministic pred auROC 0.9072243126739039
nondeterministic pred auROC 0.9084814670645733
flip-False_rotamt-180
deterministic pred auROC 0.916166708887612
nondeterministic pred auROC 0.9166587373671843
flip-True_rotamt-180
deterministic pred auROC 0.9131712872857287
nondeterministic pred auROC 0.9138490879246036
flip-False_rotamt-270
deterministic pred auROC 0.9074450973244279
nondeterministic pred auROC 0.9075221248051144
flip-True_rotamt-270
deterministic pred auROC 0.9051834963473733
nondeterministic pred auROC 0.9063560028916199


In [41]:
import abstention
reload(abstention.abstention)
reload(abstention.calibration)
from abstention.calibration import (compute_ece, compute_ece_with_bins,
                                    TempScaling,
                                    BiasCorrectionWrapper,
                                    EMImbalanceAdapter,
                                    EMBiasCorrectorFactory)
from abstention.abstention import (weighted_kappa_metric,
                                   WeightedKappa, DistMaxClassProbFromOne,
                                   Entropy, Uncertainty)
from collections import defaultdict, namedtuple
import numpy as np
import random
import sys

def inverse_softmax(preds):
    """Map probabilities back to zero-centred logits.

    Takes the elementwise natural log of `preds` and subtracts, from every
    row, the mean of that row's logs (mean taken along axis 1). Each returned
    row therefore sums to zero, and applying a softmax to it gives back the
    original row whenever that row sums to one.

    `preds` must have at least two axes; entries equal to zero produce
    -inf/nan because of the logarithm.
    """
    log_preds = np.log(preds)
    row_means = np.mean(log_preds, axis=1)
    return log_preds - row_means[:,None]

# 5x5 quadratic disagreement weights for the weighted-kappa metric:
# entry [j, i] is (i - j)**2, so the weight grows with the squared distance
# between the two class indices and is zero on the diagonal.
quadratic_weights = np.subtract.outer(np.arange(5), np.arange(5))**2

# One experimental configuration for the abstention evaluation.
_abstainer_settings_fields = (
    "name",                   # label the results for this configuration are stored under
    "abstainer_factories",    # list of (abstainer name, abstainer factory) pairs
    "preds_lookup",           # augmentation folder -> point predictions
    "predsamples_lookup",     # augmentation folder -> stacked prediction samples
    "calibrator",             # calibrator factory, or None to skip calibration
    "imbalance_subsampling",  # per-class upsample factors, or None
    "imbalance_adapter",      # imbalance-adapter factory, or None
)
AbstainerSettings = namedtuple("AbstainerSettings",
                               _abstainer_settings_fields)

# (name, factory) pairs for every abstention criterion under comparison.
# The name is the key the per-method results are recorded under; the factory
# is later called with validation labels/posteriors to build the abstainer.
abstainer_factories = [
    # Weighted-kappa based criteria, in both 'argmax' and 'optim' modes.
    ("expected_delta_argmaxWeightedKappa",
     WeightedKappa(mode='argmax', weights=quadratic_weights, verbose=False)),
    ("expected_delta_optimWeightedKappa",
     WeightedKappa(mode='optim', weights=quadratic_weights, verbose=False)),
    # Remaining criteria take no constructor arguments.
    ("dist_maxclass_prob_from_one",
     DistMaxClassProbFromOne()),
    ("entropy",
     Entropy()),
    ("variance",
     Uncertainty()),
]

# Fractions of the pseudo-test examples to abstain on.
abstention_fractions = [0.05, 0.1, 0.15, 0.2]

def _make_settings(name, preds_lookup, calibrator=None,
                   imbalance_subsampling=None, imbalance_adapter=None):
    """Build one AbstainerSettings entry.

    Every configuration shares the same abstainer factories and the same
    nondeterministic prediction samples; only the name, the point
    predictions, the calibrator and the imbalance handling vary.
    """
    return AbstainerSettings(
        name=name,
        abstainer_factories=abstainer_factories,
        preds_lookup=preds_lookup,
        predsamples_lookup=parent_folder_to_nondet_pred,
        calibrator=calibrator,
        imbalance_subsampling=imbalance_subsampling,
        imbalance_adapter=imbalance_adapter)


def _tempscaling():
    """Fresh temperature-scaling calibrator factory without bias terms."""
    return TempScaling(ece_bins=15, verbose=False)


def _tempscaling_biascor():
    """Fresh temperature-scaling calibrator factory with a bias on all 5 classes."""
    return TempScaling(ece_bins=15, verbose=False,
                       bias_positions=[0,1,2,3,4])


# Four configurations using the deterministic predictions as point
# predictions, followed by the same four using the mean of the
# nondeterministic samples.
abstainer_settings_list = [
    _make_settings(
        "uncalib_weightrescalepreds",
        parent_folder_to_det_pred),
    _make_settings(
        "tempscale-calib_weightrescalepreds",
        parent_folder_to_det_pred,
        calibrator=_tempscaling()),
    _make_settings(
        "tempscalebiascor-calib_weightrescalepreds_imbalanced_adapted",
        parent_folder_to_det_pred,
        calibrator=_tempscaling_biascor(),
        imbalance_subsampling=[1, 2, 5, 8, 8], #these are upsample factors
        imbalance_adapter=EMImbalanceAdapter(verbose=False)),
    _make_settings(
        "tempscalebiascor-calib_weightrescalepreds_imbalanced_unadapted",
        parent_folder_to_det_pred,
        calibrator=_tempscaling_biascor(),
        imbalance_subsampling=[1, 2, 5, 8, 8]), #these are upsample factors
    _make_settings(
        "uncalib_mcdrpreds",
        parent_folder_to_mean_nondet_pred),
    _make_settings(
        "tempscale-calib_mcdrpreds",
        parent_folder_to_mean_nondet_pred,
        calibrator=_tempscaling()),
    _make_settings(
        "tempscalebiascor-calib_mcdrpreds_imbalanced_adapted",
        parent_folder_to_mean_nondet_pred,
        calibrator=_tempscaling_biascor(),
        imbalance_subsampling=[1, 2, 5, 8, 8], #these are upsample factors
        imbalance_adapter=EMImbalanceAdapter(verbose=False)),
    _make_settings(
        "tempscalebiascor-calib_mcdrpreds_imbalanced_unadapted",
        parent_folder_to_mean_nondet_pred,
        calibrator=_tempscaling_biascor(),
        imbalance_subsampling=[1, 2, 5, 8, 8]), #these are upsample factors
]

# Number of random pseudo-valid / pseudo-test splits evaluated for every
# settings entry.
num_folds = 50

# Results, keyed by settings name:
#  - ..._to_perfs: metric -> abstention fraction -> abstainer name -> list
#    holding one (after-abstention minus baseline) delta per fold
#  - ..._to_baselineperfs: metric -> list holding one no-abstention score
#    per fold
settingsname_to_metric_to_fraction_to_method_to_perfs = {}
settingsname_to_metric_to_baselineperfs = {}

# For every settings entry: split the patients into pseudo-valid and
# pseudo-test halves num_folds times, optionally calibrate / adapt the
# predictions, then measure how each abstention criterion changes the
# metrics at every abstention fraction.
for abstainer_settings in abstainer_settings_list:
    
    # Unpack the namedtuple into locals (note: this rebinds the module-level
    # name abstainer_factories).
    settings_name = abstainer_settings.name
    print("abstainer settings", settings_name)
    abstainer_factories = abstainer_settings.abstainer_factories
    preds_lookup = abstainer_settings.preds_lookup
    predsamples_lookup = abstainer_settings.predsamples_lookup
    calibrator = abstainer_settings.calibrator
    imbalance_subsampling = abstainer_settings.imbalance_subsampling
    imbalance_adapter = abstainer_settings.imbalance_adapter
    
    # The per-settings containers are registered in the global dicts up
    # front and then filled in place fold by fold.
    metric_to_fraction_to_method_to_perfs =\
        defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    settingsname_to_metric_to_fraction_to_method_to_perfs[settings_name] =\
        metric_to_fraction_to_method_to_perfs
    metric_to_baselineperfs = defaultdict(list)   
    settingsname_to_metric_to_baselineperfs[settings_name] =\
        metric_to_baselineperfs
    
    for fold_number in range(num_folds):
        print("on fold",fold_number)

        # The seed depends only on the fold number, so fold k shuffles the
        # patients identically under every settings entry.
        np.random.seed(fold_number*1000)
        random.seed(fold_number*1000)
        #the data is in pairs of (left eye, right eye) per patient (entry for
        # the right eye comes after the entry for the left eye); hence, the number of
        # unique patients is 0.5*len(valid_labels)
        patient_id_ordering = list(range(int(0.5*len(valid_labels))))
        np.random.shuffle(patient_id_ordering)

        pseudovalid_uncalib_preds = []
        pseudotest_uncalib_preds = []
        pseudovalid_uncalib_predsamples = []
        pseudotest_uncalib_predsamples = []
        pseudovalid_labels = []
        pseudotest_labels = []
        # Running per-class totals (5 classes) of the labels assigned to each
        # split so far; used to keep the two splits balanced.
        pseudovalid_label_counts = np.zeros(5)
        pseudotest_label_counts = np.zeros(5)
        for i in patient_id_ordering:
            left_eye_label = valid_labels[2*i]
            right_eye_label = valid_labels[(2*i)+1]
            # Larger class index of the patient's two eyes.
            most_diseased_label = max(np.argmax(left_eye_label),
                                      np.argmax(right_eye_label))
            # Both eyes of a patient always land in the same split. The
            # patient goes to pseudo-valid only when that split currently
            # holds strictly fewer examples of this class; ties go to
            # pseudo-test.
            if (pseudovalid_label_counts[most_diseased_label] <
                pseudotest_label_counts[most_diseased_label]):
                in_test = False
                append_to_uncalib_preds = pseudovalid_uncalib_preds
                append_to_uncalib_predsamples = pseudovalid_uncalib_predsamples
                append_to_labels = pseudovalid_labels
                append_to_label_counts = pseudovalid_label_counts
            else:
                in_test = True
                append_to_uncalib_preds = pseudotest_uncalib_preds
                append_to_uncalib_predsamples = pseudotest_uncalib_predsamples
                append_to_labels = pseudotest_labels
                append_to_label_counts = pseudotest_label_counts
            
            # In-place += on the aliased array updates the chosen split's
            # counts with both eyes' label rows.
            append_to_label_counts += valid_labels[2*i]
            append_to_label_counts += valid_labels[(2*i)+1]
            # Pseudo-valid examples receive every augmentation. When
            # imbalance_subsampling is set, a pseudo-test example of class c
            # only receives the first imbalance_subsampling[c] augmentations,
            # which changes the class proportions of the pseudo-test set.
            # The check is made per eye, using that eye's own class.
            for parent_folder_idx,parent_folder in enumerate(parent_folders):
                if ((not in_test) or (imbalance_subsampling is None) or
                    imbalance_subsampling[np.argmax(valid_labels[2*i])] > parent_folder_idx):
                    append_to_labels.append(valid_labels[2*i])
                    append_to_uncalib_preds.append(
                            preds_lookup[parent_folder][2*i])
                    append_to_uncalib_predsamples.append(
                        predsamples_lookup[parent_folder][:,(2*i)])                    
                if ((not in_test) or (imbalance_subsampling is None) or
                    imbalance_subsampling[np.argmax(valid_labels[(2*i) + 1])] > parent_folder_idx): 
                    append_to_labels.append(valid_labels[(2*i)+1])
                    append_to_uncalib_preds.append(
                        preds_lookup[parent_folder][(2*i)+1])
                    append_to_uncalib_predsamples.append(
                        predsamples_lookup[parent_folder][:,(2*i)+1])
                
        pseudovalid_uncalib_preds = np.array(pseudovalid_uncalib_preds)
        pseudotest_uncalib_preds = np.array(pseudotest_uncalib_preds)
        # Calibrators operate on logits, so convert the probabilities back.
        pseudovalid_uncalib_pred_logits = inverse_softmax(pseudovalid_uncalib_preds)
        pseudotest_uncalib_pred_logits = inverse_softmax(pseudotest_uncalib_preds)
        # The lists hold one (sample, class) slice per example; after
        # stacking, swap the first two axes to get (sample, example, class).
        pseudovalid_uncalib_predsamples = np.array(pseudovalid_uncalib_predsamples).transpose((1,0,2))
        pseudotest_uncalib_predsamples = np.array(pseudotest_uncalib_predsamples).transpose((1,0,2))
        pseudovalid_uncalib_predsamples_logits = np.array([
                inverse_softmax(x) for x in pseudovalid_uncalib_predsamples])        
        pseudotest_uncalib_predsamples_logits = np.array([
                inverse_softmax(x) for x in pseudotest_uncalib_predsamples])
        pseudovalid_labels = np.array(pseudovalid_labels) 
        pseudotest_labels = np.array(pseudotest_labels)
        
        
        if (calibrator is not None):
            # Fit the calibrator on the pseudo-valid split only; the returned
            # callable maps logits to calibrated predictions.
            the_calibrator = calibrator(
                                valid_preacts=pseudovalid_uncalib_pred_logits,
                                valid_labels=pseudovalid_labels)
            pseudovalid_calib_preds = the_calibrator(pseudovalid_uncalib_pred_logits)
            pseudotest_calib_preds = the_calibrator(pseudotest_uncalib_pred_logits)
            
            
            # NOTE: the triple-quoted blocks below are disabled diagnostics.
            # They are bare string expressions, so they are evaluated and
            # discarded without printing or plotting anything.
            """print("(Valid) Accuracy before calib:",
                  np.mean(np.argmax(pseudovalid_uncalib_preds,axis=-1)
                          ==np.argmax(pseudovalid_labels,axis=-1)))
            print("(Valid) Accuracy after calib:",
                  np.mean(np.argmax(pseudovalid_calib_preds,axis=-1)
                          ==np.argmax(pseudovalid_labels,axis=-1)))
            print("(Valid) WKappa-minexpcost before calib:",
                  weighted_kappa_metric(
                        predprobs=pseudovalid_uncalib_preds,
                        true_labels=pseudovalid_labels,
                        weights=quadratic_weights,
                        mode='optim'))
            print("(Valid) WKappa-minexpcost after calib:",
                  weighted_kappa_metric(
                        predprobs=pseudovalid_calib_preds,
                        true_labels=pseudovalid_labels,
                        weights=quadratic_weights,
                        mode='optim'))
            print("(Test) Accuracy before calib:",
                  np.mean(np.argmax(pseudotest_uncalib_preds,axis=-1)
                          ==np.argmax(pseudotest_labels,axis=-1)))
            print("(Test) Accuracy after calib:",
                  np.mean(np.argmax(pseudotest_calib_preds,axis=-1)
                          ==np.argmax(pseudotest_labels,axis=-1)))
            print("(Test) WKappa-minexpcost before calib:",
                  weighted_kappa_metric(
                        predprobs=pseudotest_uncalib_preds,
                        true_labels=pseudotest_labels,
                        weights=quadratic_weights,
                        mode='optim'))
            print("(Test) WKappa-minexpcost after calib:",
                  weighted_kappa_metric(
                        predprobs=pseudotest_calib_preds,
                        true_labels=pseudotest_labels,
                        weights=quadratic_weights,
                        mode='optim'))"""
            
            
            """print("Distribution shift from true labels after calibration:",
                  "\nTrue:\n",np.mean(pseudovalid_labels,axis=0),
                  "\nEstimated (valid):\n",np.mean(pseudovalid_calib_preds,axis=0),
                  "\nDifference (valid):\n", np.mean(pseudovalid_labels-pseudovalid_calib_preds,
                                         axis=0),
                  "\nEstimated (test):\n",np.mean(pseudotest_calib_preds,axis=0),
                  "\nDifference (test):\n", np.mean(pseudovalid_labels,axis=0)
                                            -np.mean(pseudotest_calib_preds,axis=0))"""
            # Apply the fitted calibrator to every individual prediction
            # sample as well (iterating over the leading sample axis).
            pseudovalid_calib_predsamples = np.array(
                [the_calibrator(x) for x in pseudovalid_uncalib_predsamples_logits])
            pseudotest_calib_predsamples = np.array(
                [the_calibrator(x) for x in pseudotest_uncalib_predsamples_logits])
            
            
            """print("ece before calibration - valid",
              compute_ece(softmax_out=pseudovalid_uncalib_preds,
                          labels=pseudovalid_labels,
                          bins=15))
            print("ece after calibration - valid",
                  compute_ece(softmax_out=pseudovalid_calib_preds,
                        labels=pseudovalid_labels,
                        bins=15))
            print("ece before calibration - test",
                  compute_ece(softmax_out=pseudotest_uncalib_preds,
                              labels=pseudotest_labels,
                              bins=15))            
            print("ece after calibration - test",
                  compute_ece(softmax_out=pseudotest_calib_preds,
                        labels=pseudotest_labels,
                        bins=15))"""
            
            
            """#plot the calibration curves for each class
            from sklearn.calibration import calibration_curve
            %matplotlib inline
            from matplotlib import pyplot as plt
            for class_idx in range(valid_labels.shape[1]+1):
                f, axarr = plt.subplots(1,4,figsize=(10,3))
                for axidx,(the_preds,the_labels,the_name) in enumerate([
                    (pseudovalid_uncalib_preds, pseudovalid_labels, "valid_uncalib"),
                    (pseudovalid_calib_preds, pseudovalid_labels, "valid_calib"),
                    (pseudotest_uncalib_preds, pseudotest_labels, "test_uncalib"),
                    (pseudotest_calib_preds, pseudotest_labels, "test_calib")]):                    
                    if (class_idx == the_preds.shape[1]):
                        avg_confidence_bins, accuracy_bins, prop_in_bins, ece =\
                            compute_ece_with_bins(softmax_out=the_preds,
                                                  labels=the_labels, bins=10)
                        print(the_name, ece)
                        axarr[axidx].plot(accuracy_bins, avg_confidence_bins)
                        axarr[axidx].plot([0,1],[0,1])
                        axarr[axidx].set_title(the_name)
                    else:
                        class_preds = the_preds[:,class_idx]
                        class_labels = the_labels[:,class_idx]
                        prob_true, prob_pred = calibration_curve(y_true=class_labels,
                                                                 y_prob=class_preds,
                                                                 n_bins=10)
                        axarr[axidx].plot(prob_true, prob_pred)
                        axarr[axidx].plot([0,1],[0,1])
                        axarr[axidx].set_title(the_name)
                plt.show()"""
                    
                    
            
            sys.stdout.flush()
            
        # Pick the calibrated arrays when a calibrator was configured,
        # otherwise fall back to the raw predictions.
        if (calibrator is not None):
            pseudotest_preds_to_use=pseudotest_calib_preds
            pseudovalid_preds_to_use=pseudovalid_calib_preds
            pseudotest_predsamples_to_use=pseudotest_calib_predsamples
            pseudovalid_predsamples_to_use=pseudovalid_calib_predsamples
        else:
            pseudotest_preds_to_use=pseudotest_uncalib_preds
            pseudovalid_preds_to_use=pseudovalid_uncalib_preds
            pseudotest_predsamples_to_use=pseudotest_uncalib_predsamples
            pseudovalid_predsamples_to_use=pseudovalid_uncalib_predsamples
        
        if (imbalance_adapter is not None):
            imbalance_adaptation_func = imbalance_adapter(
                #set the validation labels to be pseudovalid_preds_to_use
                # (rather than pseudovalid_labels) for consistency;
                # we want no adjustment to happen in the
                # case where tofit_initial_posterior_probs=pseudovalid_preds_to_use
                valid_labels=pseudovalid_preds_to_use,
                tofit_initial_posterior_probs=pseudotest_preds_to_use)
            # Keep the pre-adaptation predictions only for the diagnostic
            # prints; from here on pseudotest_preds_to_use is the adapted
            # version.
            preds_before_adaptation = pseudotest_preds_to_use
            pseudotest_preds_to_use = imbalance_adaptation_func(pseudotest_preds_to_use)
            print("Accuracy before calib:",
                  np.mean(np.argmax(pseudotest_uncalib_preds,axis=-1)
                          ==np.argmax(pseudotest_labels,axis=-1)))
            print("Accuracy before adaptation:",
                  np.mean(np.argmax(preds_before_adaptation,axis=-1)
                          ==np.argmax(pseudotest_labels,axis=-1)))
            print("Accuracy after adaptation:",
                  np.mean(np.argmax(pseudotest_preds_to_use,axis=-1)
                          ==np.argmax(pseudotest_labels,axis=-1)))
            print("WKappa-minexpcost before calib:",
                  weighted_kappa_metric(
                        predprobs=pseudotest_uncalib_preds,
                        true_labels=pseudotest_labels,
                        weights=quadratic_weights,
                        mode='optim'))
            print("WKappa-minexpcost before adaptation:",
                  weighted_kappa_metric(
                        predprobs=preds_before_adaptation,
                        true_labels=pseudotest_labels,
                        weights=quadratic_weights,
                        mode='optim'))
            print("WKappa-minexpcost after adaptation:",
                  weighted_kappa_metric(
                        predprobs=pseudotest_preds_to_use,
                        true_labels=pseudotest_labels,
                        weights=quadratic_weights,
                        mode='optim'))
            sys.stdout.flush()
            #print((zip(preds_before_adaptation,pseudotest_preds_to_use))[:20])
            
            #print("Difference from true imbalance",
            #      np.mean(pseudotest_preds_to_use,axis=0)-
            #      np.mean(pseudotest_labels,axis=0))
            # The same adaptation is applied to every pseudo-test prediction
            # sample; the pseudo-valid samples are left unadapted.
            pseudotest_predsamples_to_use = np.array([
                    imbalance_adaptation_func(x) for
                    x in pseudotest_predsamples_to_use])

        # Per-example uncertainty: variance across the prediction samples
        # (axis 0), summed over the classes (last axis).
        # pseudovalid_variance_to_use is not read again within this cell.
        pseudovalid_variance_to_use = np.sum(np.var(pseudovalid_predsamples_to_use, axis=0),
                                             axis=-1)
        pseudotest_variance_to_use = np.sum(np.var(pseudotest_predsamples_to_use, axis=0),
                                            axis=-1)
            
        # Baseline (no abstention) scores on the full pseudo-test split.
        original_argmaxWeightedKappa_perf = weighted_kappa_metric(
            predprobs=pseudotest_preds_to_use,
            true_labels=pseudotest_labels,
            weights=quadratic_weights,
            mode='argmax')
        original_optimWeightedKappa_perf = weighted_kappa_metric(
            predprobs=pseudotest_preds_to_use,
            true_labels=pseudotest_labels,
            weights=quadratic_weights,
            mode='optim')
        
        #print("\nPseudotest set weighted kappa",
        #      original_weighted_kappa_perf)
        metric_to_baselineperfs["argmaxWeightedKappa"].append(
            original_argmaxWeightedKappa_perf)
        metric_to_baselineperfs["optimWeightedKappa"].append(
            original_optimWeightedKappa_perf)
        original_accuracy_perf = np.mean(
            np.argmax(pseudotest_preds_to_use,axis=-1)
            ==np.argmax(pseudotest_labels,axis=-1))
        #print("Pseudotest set accuracy",original_accuracy_perf)
        metric_to_baselineperfs["accuracy"].append(original_accuracy_perf)
        
        for abstention_fraction in abstention_fractions:
            #print("\nabstention fraction:",abstention_fraction)
            for abstainer_name, abstainer_factory in abstainer_factories:
                # NOTE(review): none of the arguments passed here depend on
                # abstention_fraction, yet the abstainer is rebuilt and
                # re-evaluated for every fraction.
                abstainer = abstainer_factory(
                    valid_labels=pseudovalid_labels,
                    valid_posterior=pseudovalid_preds_to_use)
                abstainer_priorities = abstainer(
                    posterior_probs=pseudotest_preds_to_use,
                    uncertainties=pseudotest_variance_to_use)
                # Sort example indices by ascending priority and keep the
                # first floor(n*(1-fraction)); the highest-priority examples
                # are the ones abstained on.
                indices_to_retain = (
                    [y[0] for y in sorted(enumerate(abstainer_priorities),
                        key=lambda x: x[1])][:int(len(abstainer_priorities)*
                                                     (1-abstention_fraction))])
                retained_pseudotest_preds = np.array(
                    [pseudotest_preds_to_use[i] for i in indices_to_retain])
                retained_pseudotest_labels = np.array(
                    [pseudotest_labels[i] for i in indices_to_retain])
                
                # Re-score the retained examples with the same three metrics
                # used for the baseline.
                argmaxWeightedKappa_perf = weighted_kappa_metric(
                    predprobs=retained_pseudotest_preds,
                    true_labels=retained_pseudotest_labels,
                    weights=quadratic_weights,
                    mode='argmax')
                optimWeightedKappa_perf = weighted_kappa_metric(
                    predprobs=retained_pseudotest_preds,
                    true_labels=retained_pseudotest_labels,
                    weights=quadratic_weights,
                    mode='optim')
                accuracy_perf = (np.mean(np.argmax(
                    retained_pseudotest_preds,axis=-1)
                    ==np.argmax(retained_pseudotest_labels,axis=-1)))
                #print("\nAbstention criterion:",abstainer_name,optimWeightedKappa_perf)

                # Store the improvement over the baseline (positive means
                # abstention helped) for this fold.
                metric_to_fraction_to_method_to_perfs["delta_argmaxWeightedKappa"][
                    abstention_fraction][abstainer_name].append(
                        argmaxWeightedKappa_perf-original_argmaxWeightedKappa_perf)
                metric_to_fraction_to_method_to_perfs["delta_optimWeightedKappa"][
                    abstention_fraction][abstainer_name].append(
                        optimWeightedKappa_perf-original_optimWeightedKappa_perf)
                metric_to_fraction_to_method_to_perfs["delta_accuracy"][
                    abstention_fraction][abstainer_name].append(
                        accuracy_perf-original_accuracy_perf)

abstainer settings uncalib_weightrescalepreds
on fold 0
on fold 1
on fold 2
on fold 3
on fold 4
on fold 5
on fold 6
on fold 7
on fold 8
on fold 9
on fold 10
on fold 11
on fold 12
on fold 13
on fold 14
on fold 15
on fold 16
on fold 17
on fold 18
on fold 19
on fold 20
on fold 21
on fold 22
on fold 23
on fold 24
on fold 25
on fold 26
on fold 27
on fold 28
on fold 29
on fold 30
on fold 31
on fold 32
on fold 33
on fold 34
on fold 35
on fold 36
on fold 37
on fold 38
on fold 39
on fold 40
on fold 41
on fold 42
on fold 43
on fold 44
on fold 45
on fold 46
on fold 47
on fold 48
on fold 49
abstainer settings tempscale-calib_weightrescalepreds
on fold 0
on fold 1
on fold 2
on fold 3
on fold 4
on fold 5
on fold 6
on fold 7
on fold 8
on fold 9
on fold 10
on fold 11
on fold 12
on fold 13
on fold 14
on fold 15
on fold 16
on fold 17
on fold 18
on fold 19
on fold 20
on fold 21
on fold 22
on fold 23
on fold 24
on fold 25
on fold 26
on fold 27
on fold 28
on fold 29
on fold 30
on fold 31
on fold 32
on fold

In [42]:
import json

# Persist both result dictionaries so the analysis cells can be re-run
# without repeating the evaluation.
# Serialise BEFORE opening the file: opening with 'w' truncates it, so a
# serialisation error would otherwise wipe out a previously saved result.
serialized_results = json.dumps({
            "settingsname_to_metric_to_fraction_to_method_to_perfs":
              settingsname_to_metric_to_fraction_to_method_to_perfs,
            "settingsname_to_metric_to_baselineperfs":
              settingsname_to_metric_to_baselineperfs},
             sort_keys=True,
             indent=4,
             separators=(',', ': '))
# `with` guarantees the handle is flushed and closed even if the write fails.
with open("abstention_results.json", 'w') as fh:
    fh.write(serialized_results)

In [43]:
import json

# Reload the saved results; `with` closes the file handle as soon as the
# JSON has been parsed.
# Gotcha: JSON object keys are always strings, so any non-string dictionary
# keys that were saved (e.g. numeric ones) come back as strings, and the
# loaded containers are plain dicts and lists.
with open("abstention_results.json") as fh:
    loaded_data = json.load(fh)
settingsname_to_metric_to_fraction_to_method_to_perfs =\
    loaded_data["settingsname_to_metric_to_fraction_to_method_to_perfs"]
settingsname_to_metric_to_baselineperfs =\
    loaded_data["settingsname_to_metric_to_baselineperfs"]


In [60]:
from abstention.figure_making_utils import (
    wilcox_srs, get_ustats_mat,
    get_tied_top_and_worst_methods)
from collections import OrderedDict

def _interleaved_rows(first_settings, second_settings, methodnames):
    """Return (settingsname, methodname) pairs in table-row order.

    For every method in `methodnames` the row for `first_settings` comes
    first, immediately followed by the row for `second_settings`.
    """
    return [(settingsname, methodname)
            for methodname in methodnames
            for settingsname in (first_settings, second_settings)]


def _mean_and_stderr(values):
    """Return {'mean', 'stderr'} for a sequence of performance values.

    The standard error divides by sqrt(len(values)), so the sample count
    comes from the data itself rather than from a `num_folds` variable
    that is only defined if an earlier cell has been run.
    """
    values = np.array(values)
    return {'mean': np.mean(values),
            'stderr': np.std(values, ddof=1)/np.sqrt(len(values))}


def _format_mean_stderr(stats):
    """Render a {'mean', 'stderr'} dict as 'mean $\\pm$ stderr' (4 d.p.)."""
    return (str(np.round(stats['mean'], 4))
            + " $\\pm$ "
            + str(np.round(stats['stderr'], 4)))


#Abstention methods, in the order their rows appear in each table.
#('expected_delta_optimWeightedKappa' rows are currently not included.)
methods_in_row_order = [
    'expected_delta_argmaxWeightedKappa',
    'dist_maxclass_prob_from_one',
    'entropy',
    'variance']

#group title -> (rows to compare, names of the extra Y/N columns to write)
comparison_groups = OrderedDict([
        ('Imbalanced, with weight rescaling', (
            _interleaved_rows(
                'tempscalebiascor-calib_weightrescalepreds_imbalanced_adapted',
                'tempscalebiascor-calib_weightrescalepreds_imbalanced_unadapted',
                methods_in_row_order),
            ['adapted'])),
        ('Balanced, with weight rescaling', (
            _interleaved_rows(
                'tempscale-calib_weightrescalepreds',
                'uncalib_weightrescalepreds',
                methods_in_row_order),
            ['calib'])),
        ('Imbalanced, with MC dropout', (
            _interleaved_rows(
                'tempscalebiascor-calib_mcdrpreds_imbalanced_adapted',
                'tempscalebiascor-calib_mcdrpreds_imbalanced_unadapted',
                methods_in_row_order),
            ['adapted'])),
        ('Balanced, with MC dropout', (
            _interleaved_rows(
                'tempscale-calib_mcdrpreds',
                'uncalib_mcdrpreds',
                methods_in_row_order),
            ['calib']))
    ])

#Backslashes are escaped explicitly: '\D' is not a valid escape sequence.
friendly_method_names = {
    'expected_delta_argmaxWeightedKappa': 'E[$\\Delta$Kappa]',
    'dist_maxclass_prob_from_one': 'Max Class Prob.',
    'entropy': 'Entropy',
    'variance': 'MC Dropout Var.'
}
#Kept as strings because they are used as keys of the loaded results.
abstention_fractions = ['0.05', '0.1', #'0.15',
                        '0.2']

for comparison_group_name in comparison_groups:

    print("On comparison group", comparison_group_name)
    rows_to_compare, columnstowrite = comparison_groups[comparison_group_name]
    show_calib = 'calib' in columnstowrite
    show_adapted = 'adapted' in columnstowrite

    for metric in ["argmaxWeightedKappa",
                   #"optimWeightedKappa",
                   "accuracy"]:
        print("On metric", metric)

        #gather all the necessary data
        settingnmethod_to_baselineperfs = OrderedDict()
        settingnmethod_to_abstentionfraction_to_perfs = OrderedDict()
        for (settingsname, methodname) in rows_to_compare:
            settingnmethod = settingsname+":"+methodname
            settingnmethod_to_baselineperfs[settingnmethod] =\
                settingsname_to_metric_to_baselineperfs[settingsname][metric]
            fraction_to_method_to_perfs = (
                settingsname_to_metric_to_fraction_to_method_to_perfs[
                    settingsname]["delta_"+metric])
            settingnmethod_to_abstentionfraction_to_perfs[settingnmethod] =\
                OrderedDict(
                    (abstention_fraction,
                     fraction_to_method_to_perfs[
                        abstention_fraction][methodname])
                    for abstention_fraction in abstention_fractions)

        #prepare the table contents
        settingnmethod_to_tablecontents = OrderedDict()
        for settingnmethod in settingnmethod_to_baselineperfs:
            settingsname, methodname = settingnmethod.split(":")[:2]
            settingnmethod_to_tablecontents[settingnmethod] = {
                'baseline': _mean_and_stderr(
                    settingnmethod_to_baselineperfs[settingnmethod]),
                'method': methodname,
                'mcdr': "mcdr" in settingsname,
                'calib': "uncalib" not in settingsname,
                #if 'imbalanced' is not in the name, it means balanced.
                #("balanced" is a substring of "imbalanced", so testing
                # for it in addition adds nothing.)
                'imbalanced': "imbalanced" in settingsname,
                #if neither adapted nor unadapted is in the name,
                # it means no adaptation
                'adapted': (("unadapted" not in settingsname)
                            and ("adapted" in settingsname)),
            }

        for abstention_fraction in abstention_fractions:
            method_to_perfs = OrderedDict()
            for settingnmethod in settingnmethod_to_tablecontents:
                perfsdelta = settingnmethod_to_abstentionfraction_to_perfs[
                    settingnmethod][abstention_fraction]
                #stored values are deltas relative to the baseline, so add
                # the baseline back to get the performance after abstention
                perfs = (np.array(perfsdelta)
                         +np.array(
                            settingnmethod_to_baselineperfs[settingnmethod]))
                method_to_perfs[settingnmethod] = perfs
                settingnmethod_to_tablecontents[
                    settingnmethod][abstention_fraction] =\
                    _mean_and_stderr(perfs)

            methods_to_consider = list(method_to_perfs.keys())
            #NOTE(review): 1275 == 50*51/2; presumably this assumes 50
            # folds -- confirm it matches the number of folds in the data.
            ustats_mat = get_ustats_mat(
                method_to_perfs,
                methods_to_consider,
                max_ustat=1275)
            #Original note: "0.05 threshold for one-sided test when N=119
            # is 50", citing
            #http://www.real-statistics.com/statistics-tables/wilcoxon-signed-ranks-table/
            #NOTE(review): that note does not match threshold=120 below;
            # confirm the intended critical value against the table.
            tied_top_methods, tied_worst_methods = (
                get_tied_top_and_worst_methods(
                    ustats_mat,
                    methods_to_consider,
                    threshold=120))
            #the helper returns indices into methods_to_consider
            tied_top_methods = [methods_to_consider[x]
                                for x in tied_top_methods]
            for settingnmethod in settingnmethod_to_tablecontents:
                settingnmethod_to_tablecontents[
                    settingnmethod][abstention_fraction][
                    'istop'] = (settingnmethod in tied_top_methods)

        #Method + optional Y/N columns + Base. + one column per fraction.
        #Deriving the column spec from this count keeps the tabular
        # preamble in step with the cells that are actually written.
        num_columns = (2 + int(show_calib) + int(show_adapted)
                       + len(abstention_fractions))
        thestr = ("\\begin{table*}\n\\begin{center}\n\\begin{tabular}{ | "
                  +" | ".join(["c"]*num_columns)
                  +" | }\n")
        thestr += ("\\hline Method"
                  +('& Calibrated? ' if show_calib else '')
                  +('& Adapted? ' if show_adapted else '')
                  +"& Base. "
                  +"& @")
        #round before int() so that e.g. 100*0.29 is labelled 29, not 28
        thestr += " & @".join([str(int(round(100*float(x))))+"\\% Abs."
                              for x in abstention_fractions])
        thestr += "\\\\ \\hline\n"
        for settingnmethod in settingnmethod_to_tablecontents:
            tablerow = settingnmethod_to_tablecontents[settingnmethod]
            thestr += friendly_method_names[tablerow['method']]
            if show_calib:
                thestr += " & "+("Y" if tablerow['calib'] else "N")
            if show_adapted:
                thestr += " & "+("Y" if tablerow['adapted'] else "N")
            thestr += " & "+_format_mean_stderr(tablerow['baseline'])
            for abstention_fraction in abstention_fractions:
                cell_stats = tablerow[abstention_fraction]
                #bold the methods tied for best at this abstention fraction
                thestr += (
                    " & "
                    +("\\textbf{" if cell_stats['istop'] else "")
                    +_format_mean_stderr(cell_stats)
                    +("}" if cell_stats['istop'] else ""))
            thestr += "\\\\ \\hline\n"
        thestr += "\\end{tabular}\n\\end{center}\n\\end{table*}\n"

        #Print the distinct baseline means/stderrs across the rows; they
        # are not asserted to be equal because the settings in a group can
        # have different baselines.
        print("\nBaseline "+metric+" perfs:")
        baseline_mean = set(
                x['baseline']['mean'] for x in
                settingnmethod_to_tablecontents.values())
        print(baseline_mean)
        baseline_stderr = set(
                x['baseline']['stderr'] for x in
                settingnmethod_to_tablecontents.values())
        print(baseline_stderr)

        print("\n Latex Table for metric "
              +metric+" and group "+comparison_group_name
              +"\n\n"+thestr)
        


On comparison group Imbalanced, with weight rescaling
On metric argmaxWeightedKappa

Baseline argmaxWeightedKappa perfs:
set([0.7279563070086574, 0.7795793098778482])
set([0.003004075671940214, 0.0030207520124481343])

 Latex Table for metric argmaxWeightedKappa and group Imbalanced, with weight rescaling

\begin{table*}
\begin{center}
\begin{tabular}{ | c | c | c | c | c | c | c | }
\hline Method& Adapted? & Base. & @5\% Abs. & @10\% Abs. & @20\% Abs.\\ \hline
E[$\Delta$Kappa] & Y & 0.7796 $\pm$0.003 & \textbf{0.8012 $\pm$ 0.0028} & \textbf{0.8164 $\pm$ 0.0025} & \textbf{0.8389 $\pm$ 0.0022}\\ \hline
E[$\Delta$Kappa] & N & 0.728 $\pm$0.003 & 0.7605 $\pm$ 0.0028 & 0.778 $\pm$ 0.0027 & 0.798 $\pm$ 0.0026\\ \hline
Max Class Prob. & Y & 0.7796 $\pm$0.003 & 0.7918 $\pm$ 0.0031 & 0.8018 $\pm$ 0.0032 & 0.8177 $\pm$ 0.0032\\ \hline
Max Class Prob. & N & 0.728 $\pm$0.003 & 0.7401 $\pm$ 0.0031 & 0.7497 $\pm$ 0.0031 & 0.7648 $\pm$ 0.0032\\ \hline
Entropy & Y & 0.7796 $\pm$0.003 & 0.7968 $\pm$ 0.

In [63]:
# Display the stored baseline 'accuracy' values for the imbalanced,
# unadapted weight-rescaling setting (presumably one value per fold --
# TODO confirm).
settingsname_to_metric_to_baselineperfs['tempscalebiascor-calib_weightrescalepreds_imbalanced_unadapted']['accuracy']

[0.685374149659864,
 0.6839716312056737,
 0.703483432455395,
 0.7083333333333334,
 0.6933106575963719,
 0.683404255319149,
 0.6918489065606361,
 0.689900426742532,
 0.6831205673758866,
 0.688737973967176,
 0.6810979060554613,
 0.7007403189066059,
 0.6969869243888573,
 0.6879613855763771,
 0.7009637188208617,
 0.7018439716312057,
 0.7140022675736961,
 0.70261066969353,
 0.6961341671404206,
 0.7018192154633315,
 0.6967120181405896,
 0.7080727686185333,
 0.702357284862255,
 0.6983588002263724,
 0.713593616414933,
 0.6947608200455581,
 0.6864527122976427,
 0.6933106575963719,
 0.7036089798238135,
 0.7026643990929705,
 0.6978783592644979,
 0.6942078364565588,
 0.6882453151618398,
 0.6959114139693356,
 0.6790684464640727,
 0.6941209883555808,
 0.6893121091529278,
 0.6859083191850595,
 0.6861853325753269,
 0.681378132118451,
 0.6793524566884408,
 0.6881750994883457,
 0.6816638370118846,
 0.6831065759637188,
 0.6862411347517731,
 0.7010514350667804,
 0.7009637188208617,
 0.6904964539007092,
 0