In [1]:
from __future__ import print_function, division

import numpy as np
import gzip
from sklearn.preprocessing import LabelBinarizer

# Load the validation labels. Every line of the gzipped file is tab-separated
# and the label value is read from the second column. LabelBinarizer then turns
# the label values into indicator rows (later cells take argmax over these rows
# and assume 5 classes -- TODO confirm against the data).
# The file is opened in a `with` block so the handle is closed once parsed.
with gzip.open("valid_labels.txt.gz", 'rb') as valid_labels_file:
    valid_labels = LabelBinarizer().fit_transform(
        np.array([float(line.decode("utf-8").split("\t")[1])
                  for line in valid_labels_file]))

In [2]:
# Augmenting the dataset with flips and rotations, for more robustness.
# Each folder holds the prediction files for one flip/rotation variant.
parent_folders = ["flip-False_rotamt-0",
                  "flip-True_rotamt-0",
                  "flip-False_rotamt-90",
                  "flip-True_rotamt-90",
                  "flip-False_rotamt-180",
                  "flip-True_rotamt-180",]


def read_preds_file(path):
    """Parse one gzipped, tab-separated prediction file into a 2-D array.

    The first column of every line is skipped (presumably an example
    identifier -- TODO confirm); the remaining columns are parsed as floats.

    Args:
        path: path of the ``.txt.gz`` file to read.

    Returns:
        ``np.ndarray`` with one row per line of the file.
    """
    # `with` guarantees the gzip handle is closed; the notebook reads several
    # hundred of these files, so leaked handles add up quickly.
    with gzip.open(path, 'rb') as preds_file:
        return np.array([
            [float(y) for y in line.decode("utf-8").split("\t")[1:]]
            for line in preds_file])


# folder -> predictions from the single deterministic pass
parent_folder_to_det_pred = {}
# folder -> stacked predictions of the 100 nondeterministic passes
# (axis 0 indexes the pass, axis 1 the example)
parent_folder_to_nondet_pred = {}
# folder -> per-example mean over the 100 nondeterministic passes
parent_folder_to_mean_nondet_pred = {}
for parent_folder in parent_folders:
    parent_folder_to_det_pred[parent_folder] = read_preds_file(
        parent_folder+"/deterministic_preds.txt.gz")

    nondet_preds = np.array([
        read_preds_file(
            parent_folder+"/nondeterministic_preds_"+str(i)+".txt.gz")
        for i in range(100)])
    parent_folder_to_nondet_pred[parent_folder] = nondet_preds
    parent_folder_to_mean_nondet_pred[parent_folder] = np.mean(nondet_preds,axis=0)

In [3]:
# Report the auROC of every augmentation folder for the binary task
# "label is not the first class" (only auROC is computed in this cell).
from sklearn.metrics import roc_auc_score

# 1 where the true label is anything other than the first class, else 0
not_first_class_labels = 1-valid_labels[:,0]

for parent_folder in parent_folders:
    print(parent_folder)
    # Score = predicted probability mass outside the first class, for both the
    # deterministic predictions and the mean of the nondeterministic ones.
    preds_to_report = [
        ("deterministic pred auROC",
         parent_folder_to_det_pred[parent_folder]),
        ("nondeterministic pred auROC",
         parent_folder_to_mean_nondet_pred[parent_folder])]
    for description, preds in preds_to_report:
        print(description,
              roc_auc_score(y_true=not_first_class_labels,
                            y_score=1-preds[:,0]))

flip-False_rotamt-0
deterministic pred auROC 0.9118638796723656
nondeterministic pred auROC 0.9129881925522253
flip-True_rotamt-0
deterministic pred auROC 0.9136845292158645
nondeterministic pred auROC 0.9141930341618936
flip-False_rotamt-90
deterministic pred auROC 0.9077797755493358
nondeterministic pred auROC 0.9079080860318696
flip-True_rotamt-90
deterministic pred auROC 0.9072243126739039
nondeterministic pred auROC 0.9084814670645733
flip-False_rotamt-180
deterministic pred auROC 0.916166708887612
nondeterministic pred auROC 0.9166587373671843
flip-True_rotamt-180
deterministic pred auROC 0.9131712872857287
nondeterministic pred auROC 0.9138490879246036


In [None]:
import abstention
# Import the submodule explicitly so that `abstention.abstention` is guaranteed
# to exist as an attribute before it is reloaded below.
import abstention.abstention
from abstention.calibration import compute_ece, TempScaling
try:
    # Python 3 has no builtin reload(); it lives in importlib (3.4+). Without
    # this import the cell only runs if `reload` leaked in from an earlier
    # kernel session.
    from importlib import reload
except ImportError:
    # Python 2: importlib has no reload, but the builtin reload() exists.
    pass
# Pick up edits made to the library source without restarting the kernel.
reload(abstention.abstention)
from abstention.abstention import (weighted_kappa_metric,
                                   WeightedKappa, DistMaxClassProbFromOne,
                                   Entropy, Uncertainty)
from collections import defaultdict, namedtuple
import numpy as np
import random

def inverse_softmax(preds):
    """Recover logits from softmax probabilities, up to an additive constant.

    Softmax is invariant to adding a constant to all logits of an example, so
    the logits are pinned down by centring them: the returned values sum to
    zero along the last (class) axis, and applying softmax to them gives back
    ``preds``.

    Args:
        preds: array-like of probabilities whose last axis indexes the
            classes. Any number of leading axes is accepted (the 2-D
            ``(examples, classes)`` case behaves exactly as before).

    Returns:
        ``np.ndarray`` of the same shape as ``preds`` holding centred logits.

    Note:
        A probability of exactly 0 has log ``-inf``, which turns the whole
        row into ``nan``/``-inf``; the inputs are not clipped here.
    """
    # Take the log once and reuse it for both terms.
    log_preds = np.log(preds)
    # keepdims keeps the class axis so the mean broadcasts for any input rank.
    return log_preds - np.mean(log_preds, axis=-1, keepdims=True)

# Quadratic penalty matrix over the 5 classes: entry [j][i] is (i-j)**2, so a
# confusion between classes that are further apart is weighted more heavily.
quadratic_weights = np.square(np.subtract.outer(np.arange(5), np.arange(5)))

# One evaluation configuration: which abstention methods to run, which
# predictions (and prediction samples) to feed them, and whether to apply
# calibration first.
AbstainerSettings = namedtuple(
    "AbstainerSettings",
    ["name",
     "abstainer_factories",
     "preds_lookup",
     "predsamples_lookup",
     "use_calib"])

# (method name, factory) pairs; each factory is later called with the
# validation labels/posteriors to build the actual abstainer.
abstainer_factories = [
    ("expected_delta_weighted_kappa",
     WeightedKappa(weights=quadratic_weights, verbose=False)),
    ("expected_delta_weighted_kappa_imbalance_from_valid",
     WeightedKappa(weights=quadratic_weights,
                   estimate_class_imbalance_from_valid=True,
                   verbose=False)),
    ("dist_maxclass_prob_from_one", DistMaxClassProbFromOne()),
    ("entropy", Entropy()),
    ("variance", Uncertainty()),
]

# Fractions of the test examples to abstain on
abstention_fractions = [0.05, 0.1, 0.15, 0.2]

# Every combination of {deterministic preds, mean of nondeterministic preds}
# with {calibrated, uncalibrated}; the prediction samples are always the
# nondeterministic ones.
abstainer_settings_list = [
    AbstainerSettings(
        name=label,
        abstainer_factories=abstainer_factories,
        preds_lookup=lookup,
        predsamples_lookup=parent_folder_to_nondet_pred,
        use_calib=calibrate)
    for label, lookup, calibrate in [
        ("calib_weightrescalepreds", parent_folder_to_det_pred, True),
        ("uncalib_weightrescalepreds", parent_folder_to_det_pred, False),
        ("calib_mcdrpreds", parent_folder_to_mean_nondet_pred, True),
        ("uncalib_mcdrpreds", parent_folder_to_mean_nondet_pred, False)]
]

# Evaluation driver. For every abstainer settings and every one of `num_folds`
# random splits:
#   1. split the patients into a pseudo-validation and a pseudo-test half,
#   2. optionally temperature-scale the predictions (fit on pseudo-validation),
#   3. record the baseline weighted kappa / accuracy on the pseudo-test half,
#   4. for every abstention fraction and abstention method, drop the examples
#      the method ranks highest and record the change in both metrics.
num_folds = 50

# settings name -> metric -> abstention fraction -> method name -> list with
# one delta (after abstention minus baseline) per fold
settingsname_to_metric_to_fraction_to_method_to_perfs = {}
# settings name -> metric -> list with one baseline value per fold
settingsname_to_metric_to_baselineperfs = {}

for abstainer_settings in abstainer_settings_list:
    
    settings_name = abstainer_settings.name
    print("abstainer settings", settings_name)
    abstainer_factories = abstainer_settings.abstainer_factories
    preds_lookup = abstainer_settings.preds_lookup
    predsamples_lookup = abstainer_settings.predsamples_lookup
    use_calib = abstainer_settings.use_calib
    
    # The nested defaultdicts are registered in the outer dicts right away, so
    # results appended below are visible there as soon as they are produced.
    metric_to_fraction_to_method_to_perfs =\
        defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    settingsname_to_metric_to_fraction_to_method_to_perfs[settings_name] =\
        metric_to_fraction_to_method_to_perfs
    metric_to_baselineperfs = defaultdict(list)   
    settingsname_to_metric_to_baselineperfs[settings_name] =\
        metric_to_baselineperfs
    
    for fold_number in range(num_folds):
        print("on fold",fold_number)

        # The seed depends only on the fold number, so the shuffle below (and
        # therefore the split) is the same for every abstainer settings.
        np.random.seed(fold_number*1000)
        random.seed(fold_number*1000)
        #the data is in pairs of (left eye, right eye) per patient (entry for
        # the right eye comes after the entry for the left eye); hence, the number of
        # unique patients is 0.5*len(valid_labels)
        patient_id_ordering = list(range(int(0.5*len(valid_labels))))
        np.random.shuffle(patient_id_ordering)

        pseudovalid_uncalib_preds = []
        pseudotest_uncalib_preds = []
        pseudovalid_uncalib_predsamples = []
        pseudotest_uncalib_predsamples = []
        pseudovalid_labels = []
        pseudotest_labels = []
        # Running per-class label totals of each half (5 classes), used to
        # keep the two halves balanced.
        pseudovalid_label_counts = np.zeros(5)
        pseudotest_label_counts = np.zeros(5)
        for i in patient_id_ordering:
            left_eye_label = valid_labels[2*i]
            right_eye_label = valid_labels[(2*i)+1]
            # Larger class index of the two eyes (label rows are indicator
            # vectors, hence the argmax).
            most_diseased_label = max(np.argmax(left_eye_label),
                                      np.argmax(right_eye_label))
            # Greedy balancing: the patient goes to the half that currently
            # has fewer labels of the patient's most-diseased class; ties go
            # to the pseudo-test half. The `append_to_*` names alias the
            # chosen half's containers.
            if (pseudovalid_label_counts[most_diseased_label] <
                pseudotest_label_counts[most_diseased_label]):
                append_to_uncalib_preds = pseudovalid_uncalib_preds
                append_to_uncalib_predsamples = pseudovalid_uncalib_predsamples
                append_to_labels = pseudovalid_labels
                append_to_label_counts = pseudovalid_label_counts
            else:
                append_to_uncalib_preds = pseudotest_uncalib_preds
                append_to_uncalib_predsamples = pseudotest_uncalib_predsamples
                append_to_labels = pseudotest_labels
                append_to_label_counts = pseudotest_label_counts

            # Both eyes of the patient are added once per augmentation folder,
            # so all copies of a patient end up in the same half. The in-place
            # `+=` updates the aliased count array of that half.
            for parent_folder in parent_folders:        
                append_to_labels.append(valid_labels[2*i])
                append_to_labels.append(valid_labels[(2*i)+1])
                append_to_label_counts += valid_labels[2*i]
                append_to_label_counts += valid_labels[(2*i)+1]
                append_to_uncalib_preds.append(
                        preds_lookup[parent_folder][2*i])
                append_to_uncalib_preds.append(
                        preds_lookup[parent_folder][(2*i)+1])
                # [:, idx] takes every prediction sample (axis 0) of one
                # example (axis 1)
                append_to_uncalib_predsamples.append(
                        predsamples_lookup[parent_folder][:,(2*i)])
                append_to_uncalib_predsamples.append(
                        predsamples_lookup[parent_folder][:,(2*i)+1])

        pseudovalid_uncalib_preds = np.array(pseudovalid_uncalib_preds)
        pseudotest_uncalib_preds = np.array(pseudotest_uncalib_preds)
        pseudovalid_uncalib_pred_logits = inverse_softmax(pseudovalid_uncalib_preds)
        pseudotest_uncalib_pred_logits = inverse_softmax(pseudotest_uncalib_preds)
        # The lists are indexed (example, sample, class); transpose to
        # (sample, example, class) so that iterating yields one 2-D
        # (example, class) array per sample.
        pseudovalid_uncalib_predsamples = np.array(pseudovalid_uncalib_predsamples).transpose((1,0,2))
        pseudotest_uncalib_predsamples = np.array(pseudotest_uncalib_predsamples).transpose((1,0,2))
        pseudovalid_uncalib_predsamples_logits = np.array([
                inverse_softmax(x) for x in pseudovalid_uncalib_predsamples])        
        pseudotest_uncalib_predsamples_logits = np.array([
                inverse_softmax(x) for x in pseudotest_uncalib_predsamples])
        pseudovalid_labels = np.array(pseudovalid_labels) 
        pseudotest_labels = np.array(pseudotest_labels)
        #print("valid vs. test distribution shift",
        #      np.abs(pseudovalid_label_counts-pseudotest_label_counts)/
        #            (pseudovalid_label_counts+pseudotest_label_counts))
        
        # Per-example uncertainty: variance across the samples (axis 0),
        # summed over the classes (last axis).
        pseudovalid_uncalib_variance = np.sum(np.var(pseudovalid_uncalib_predsamples,axis=0),axis=-1)
        pseudotest_uncalib_variance = np.sum(np.var(pseudotest_uncalib_predsamples,axis=0),axis=-1)
        
        if (use_calib):
            #print("ece before temp scale - valid",
            #  compute_ece(softmax_out=pseudovalid_uncalib_preds,
            #              labels=pseudovalid_labels,
            #              bins=15))
            #print("ece before temp scale - test",
            #      compute_ece(softmax_out=pseudotest_uncalib_preds,
            #                  labels=pseudotest_labels,
            #                  bins=15))
            # Fit the temperature scaler on the pseudo-validation half only,
            # then apply the returned callable to the logits of both halves
            # and of every prediction sample.
            # NOTE(review): presumably the callable returns calibrated
            # probabilities -- confirm against abstention.calibration.
            temp_scaler = TempScaling(ece_bins=15, verbose=False)(
                                valid_preacts=pseudovalid_uncalib_pred_logits,
                                valid_labels=pseudovalid_labels)
            pseudovalid_calib_preds = temp_scaler(pseudovalid_uncalib_pred_logits)
            pseudotest_calib_preds = temp_scaler(pseudotest_uncalib_pred_logits)
            pseudovalid_calib_predsamples = np.array(
                [temp_scaler(x) for x in pseudovalid_uncalib_predsamples_logits])
            pseudotest_calib_predsamples = np.array(
                [temp_scaler(x) for x in pseudotest_uncalib_predsamples_logits])
            
            # Same per-example uncertainty as above, on the calibrated samples
            pseudovalid_calib_variance = np.sum(np.var(pseudovalid_calib_predsamples, axis=0),axis=-1)
            pseudotest_calib_variance = np.sum(np.var(pseudotest_calib_predsamples, axis=0),axis=-1)

            #print("ece after temp scale - valid",
            #      compute_ece(softmax_out=pseudovalid_calib_preds,
            #            labels=pseudovalid_labels,
            #            bins=15))
            #print("ece after temp scale - test",
            #      compute_ece(softmax_out=pseudotest_calib_preds,
            #            labels=pseudotest_labels,
            #            bins=15))
            
        # Select the calibrated or uncalibrated quantities for the rest of
        # the fold.
        if (use_calib):
            pseudotest_preds_to_use=pseudotest_calib_preds
            pseudovalid_preds_to_use=pseudovalid_calib_preds
            pseudotest_variance_to_use=pseudotest_calib_variance
            pseudovalid_variance_to_use=pseudovalid_calib_variance
        else:
            pseudotest_preds_to_use=pseudotest_uncalib_preds
            pseudovalid_preds_to_use=pseudovalid_uncalib_preds
            pseudotest_variance_to_use=pseudotest_uncalib_variance
            pseudovalid_variance_to_use=pseudovalid_uncalib_variance

        # Baseline metrics on the full pseudo-test half (no abstention)
        original_weighted_kappa_perf = weighted_kappa_metric(
            predprobs=pseudotest_preds_to_use,
            true_labels=pseudotest_labels,
            weights=quadratic_weights)
        
        #print("\nPseudotest set weighted kappa",
        #      original_weighted_kappa_perf)
        metric_to_baselineperfs["weighted_kappa"].append(
            original_weighted_kappa_perf)
        original_accuracy_perf = np.mean(
            np.argmax(pseudotest_preds_to_use,axis=-1)
            ==np.argmax(pseudotest_labels,axis=-1))
        #print("Pseudotest set accuracy",original_accuracy_perf)
        metric_to_baselineperfs["accuracy"].append(original_accuracy_perf)

        for abstention_fraction in abstention_fractions:
            #print("\nabstention fraction:",abstention_fraction)
            for abstainer_name, abstainer_factory in abstainer_factories:
                # Build the abstainer from the pseudo-validation half, then
                # score every pseudo-test example.
                abstainer = abstainer_factory(
                    valid_labels=pseudovalid_labels,
                    valid_posterior=pseudovalid_preds_to_use)
                abstainer_priorities = abstainer(
                    posterior_probs=pseudotest_preds_to_use,
                    uncertainties=pseudotest_variance_to_use)
                # Sort the example indices by ascending priority and keep the
                # first (1 - abstention_fraction) share; the examples with the
                # highest priorities are the ones abstained on.
                indices_to_retain = (
                    [y[0] for y in sorted(enumerate(abstainer_priorities),
                        key=lambda x: x[1])][:int(len(abstainer_priorities)*
                                                     (1-abstention_fraction))])
                retained_pseudotest_preds = np.array(
                    [pseudotest_preds_to_use[i] for i in indices_to_retain])
                retained_pseudotest_labels = np.array(
                    [pseudotest_labels[i] for i in indices_to_retain])
                #print("\nAbstention criterion:",abstainer_name)
                weighted_kappa_perf = weighted_kappa_metric(
                    predprobs=retained_pseudotest_preds,
                    true_labels=retained_pseudotest_labels,
                    weights=quadratic_weights)
                #print("weighted kappa", weighted_kappa_perf)
                accuracy_perf = (np.mean(np.argmax(
                    retained_pseudotest_preds,axis=-1)
                    ==np.argmax(retained_pseudotest_labels,axis=-1)))
                #print("accuracy", accuracy_perf)

                # Store the change relative to the no-abstention baseline of
                # this fold.
                metric_to_fraction_to_method_to_perfs["delta_weighted_kappa"][
                    abstention_fraction][abstainer_name].append(
                        weighted_kappa_perf-original_weighted_kappa_perf)
                metric_to_fraction_to_method_to_perfs["delta_accuracy"][
                    abstention_fraction][abstainer_name].append(
                        accuracy_perf-original_accuracy_perf)

abstainer settings calib_weightrescalepreds
on fold 0
on fold 1
on fold 2
on fold 3
on fold 4
on fold 5
on fold 6
on fold 7
on fold 8
on fold 9
on fold 10
on fold 11
on fold 12
on fold 13
on fold 14
on fold 15
on fold 16
on fold 17
on fold 18
on fold 19
on fold 20
on fold 21
on fold 22
on fold 23
on fold 24
on fold 25
on fold 26
on fold 27
on fold 28
on fold 29
on fold 30
on fold 31
on fold 32
on fold

In [None]:
from abstention.figure_making_utils import (
    wilcox_srs, get_ustats_mat, get_tied_top_and_worst_methods)

# Methods compared against each other in the report below
methods_to_consider = ['expected_delta_weighted_kappa',
                       'dist_maxclass_prob_from_one',
                       'entropy']

# Report every abstainer settings by looking its results up by name, instead of
# relying on whatever `metric_to_fraction_to_method_to_perfs` happened to be
# left over from the last iteration of the evaluation loop (which silently
# restricted the report to the final settings).
for settings_to_report in abstainer_settings_list:
    settings_name_to_report = settings_to_report.name
    if (settings_name_to_report not in
            settingsname_to_metric_to_fraction_to_method_to_perfs):
        # No results stored for this settings (e.g. the evaluation cell was
        # interrupted before reaching it).
        continue
    perfs_for_settings = settingsname_to_metric_to_fraction_to_method_to_perfs[
        settings_name_to_report]
    print("\nAbstainer settings", settings_name_to_report)

    for metric in ["delta_weighted_kappa", "delta_accuracy"]:
        for abstention_fraction_to_consider in [0.05, 0.2]:
            # method name -> list of per-fold deltas versus the baseline
            method_to_perf_deltas = perfs_for_settings[
                metric][abstention_fraction_to_consider]

            ustats_mat = get_ustats_mat(
                method_to_perf_deltas,
                methods_to_consider)

            tied_top_methods, tied_worst_methods =(
                get_tied_top_and_worst_methods(ustats_mat, methods_to_consider))
            print("\nAbstention fraction",abstention_fraction_to_consider,"with metric",metric)

            #sort the different methods by their mean delta perf (best first)
            methods_with_mean = sorted([
                (method_name,np.mean(method_to_perf_deltas[method_name]))
                 for method_name in methods_to_consider],
                key=lambda x: -x[1])

            print("\nMethods sorted by mean delta perf:")
            report_lines = []
            for method_name, mean_delta in methods_with_mean:
                deltas = method_to_perf_deltas[method_name]
                # standard error over the folds actually recorded, which may
                # be fewer than num_folds if the evaluation was cut short
                standard_error = np.std(deltas,ddof=1)/np.sqrt(len(deltas))
                report_lines.append(
                    str(method_name)
                    +"\t"+str(np.round(mean_delta,5))
                    +" +/- "
                    +str(np.round(standard_error,5)))
            print("\n".join(report_lines))

            print("\nTop methods by wilcoxon:",[methods_to_consider[x] for x in tied_top_methods])
            print("Worst methods by wilcoxon:",[methods_to_consider[x] for x in tied_worst_methods])