In [1]:
from __future__ import print_function, division

import numpy as np
import gzip
from sklearn.preprocessing import LabelBinarizer

# Load the validation labels: each line of the gzipped file is tab-separated and
# the second column is parsed as a float label. LabelBinarizer then turns the
# labels into one indicator row per example.
# The file is opened in a `with` block so the handle is closed once it is read.
with gzip.open("valid_labels.txt.gz", 'rb') as labels_file:
    valid_labels = LabelBinarizer().fit_transform(
        np.array([float(line.decode("utf-8").split("\t")[1])
                  for line in labels_file]))

In [2]:
# Augmenting the dataset with flips and rotations, for more robustness.
# Each folder holds the predictions made on one flip/rotation variant.
parent_folders = ["flip-False_rotamt-0",
                  "flip-True_rotamt-0",
                  "flip-False_rotamt-90",
                  "flip-True_rotamt-90",
                  "flip-False_rotamt-180",
                  "flip-True_rotamt-180",]

# Maps each augmentation folder to a 2D array of its predictions
# (one row per line of that folder's deterministic_preds.txt.gz).
parent_folder_to_det_pred = {}
for parent_folder in parent_folders:
    # Each line is tab-separated; the first column is skipped and the remaining
    # columns are parsed as floats. The `with` block makes sure the gzip handle
    # is closed before moving on to the next folder.
    with gzip.open(parent_folder+"/deterministic_preds.txt.gz", 'rb') as preds_file:
        det_preds = np.array([
                [float(y) for y in x.decode("utf-8").split("\t")[1:]]
                 for x in preds_file])
    parent_folder_to_det_pred[parent_folder] = det_preds

In [3]:
import abstention
from abstention.calibration import compute_ece, TempScaling
reload(abstention.abstention)
from abstention.abstention import (weighted_kappa_metric,
                                   WeightedKappa, DistMaxClassProbFromOne,
                                   Entropy)
from collections import defaultdict
import numpy as np
import random

def inverse_softmax(preds):
    """Recover zero-mean logits from softmax probabilities.

    Softmax is invariant to adding a constant to every logit of a row, so the
    logits are only determined up to that constant; this returns the version
    whose entries average to zero along axis 1.

    Args:
        preds: 2D array-like of strictly positive probabilities, one example
            per row (a zero entry yields -inf/nan via the logarithm).

    Returns:
        Array of the same shape as `preds` holding log(preds) with each
        row's mean subtracted.
    """
    # Take the logarithm once and reuse it for both the value and the row mean.
    log_preds = np.log(preds)
    return log_preds - np.mean(log_preds, axis=1, keepdims=True)

# Quadratic penalty matrix over the 5 classes: entry [j, i] equals (i - j)**2.
quadratic_weights = np.subtract.outer(np.arange(5), np.arange(5))**2

# (name, factory) pairs; each factory is later called with the validation
# labels/posteriors to build the abstention scorer for that criterion.
abstainer_factories = [
    ("expected_delta_weighted_kappa",
     WeightedKappa(weights=quadratic_weights)),
    ("expected_delta_weighted_kappa_imbalance_from_valid",
     WeightedKappa(weights=quadratic_weights,
                   estimate_class_imbalance_from_valid=True)),
    ("dist_maxclass_prob_from_one",
     DistMaxClassProbFromOne()),
    ("entropy",
     Entropy()),
]

# Fractions of the pseudotest set on which to abstain.
abstention_fractions = [0.05, 0.1, 0.15, 0.2]

# Results: metric name -> abstention fraction -> method name -> list of
# per-fold values.
metric_to_fraction_to_method_to_perfs = defaultdict(
    lambda: defaultdict(lambda: defaultdict(list)))
# Results with no abstention: metric name -> list of per-fold values.
metric_to_baselineperfs = defaultdict(list)

# Repeated random pseudovalid/pseudotest splits of the validation data.
# For every fold: split patients, calibrate with temperature scaling on the
# pseudovalid half, then measure how weighted kappa and accuracy on the
# pseudotest half change when a fraction of examples is abstained on.
num_folds = 10
for fold_number in range(num_folds):
    print("on fold",fold_number)
    # Seed both RNGs from the fold number so each fold's split is reproducible.
    np.random.seed(fold_number*1000)
    random.seed(fold_number*1000)
    
    # The data is in pairs of (left eye, right eye) per patient (the entry for
    # the right eye comes after the entry for the left eye); hence, the number of
    # unique patients is 0.5*len(valid_labels)
    patient_id_ordering = list(range(int(0.5*len(valid_labels))))
    np.random.shuffle(patient_id_ordering)
    
    pseudovalid_predictions = []
    pseudovalid_labels = []
    # Running per-class label totals for each split (5 entries; assumes the
    # label rows in valid_labels have 5 columns - TODO confirm).
    pseudovalid_label_counts = np.zeros(5)
    pseudotest_predictions = []
    pseudotest_labels = []
    pseudotest_label_counts = np.zeros(5)
    for i in patient_id_ordering:
        left_eye_label = valid_labels[2*i]
        right_eye_label = valid_labels[(2*i)+1]
        # The larger of the two eyes' class indices is used as the
        # patient-level class when balancing the two splits.
        most_diseased_label = max(np.argmax(left_eye_label),
                                  np.argmax(right_eye_label))
        # Greedy balancing: the patient goes to pseudovalid only if that split
        # currently has strictly fewer labels of this class than pseudotest;
        # otherwise (including ties) the patient goes to pseudotest.
        if (pseudovalid_label_counts[most_diseased_label] <
            pseudotest_label_counts[most_diseased_label]):
            append_to_predictions = pseudovalid_predictions
            append_to_labels = pseudovalid_labels
            append_to_label_counts = pseudovalid_label_counts
        else:
            append_to_predictions = pseudotest_predictions
            append_to_labels = pseudotest_labels
            append_to_label_counts = pseudotest_label_counts

        # The append_to_* names alias the chosen split's containers, so the
        # appends and the in-place += below update that split directly.
        # Both eyes are added once per augmentation folder, which keeps every
        # eye and every augmentation of a patient in the same split.
        for parent_folder in parent_folders:        
            append_to_labels.append(valid_labels[2*i])
            append_to_labels.append(valid_labels[(2*i)+1])
            append_to_label_counts += valid_labels[2*i]
            append_to_label_counts += valid_labels[(2*i)+1]
            append_to_predictions.append(
                parent_folder_to_det_pred[parent_folder][2*i])
            append_to_predictions.append(
                parent_folder_to_det_pred[parent_folder][(2*i)+1])

    # Stack the per-example rows into arrays and derive logits from the
    # stored probabilities with inverse_softmax.
    pseudovalid_predictions = np.array(pseudovalid_predictions)
    pseudovalid_pred_logits = inverse_softmax(pseudovalid_predictions)
    pseudovalid_labels = np.array(pseudovalid_labels)
    pseudotest_predictions = np.array(pseudotest_predictions)
    pseudotest_pred_logits = inverse_softmax(pseudotest_predictions)
    pseudotest_labels = np.array(pseudotest_labels)
    # Per-class relative difference between the two splits' label totals
    # (0 means the class is perfectly balanced across the splits).
    print("valid vs. test distribution shift",
          np.abs(pseudovalid_label_counts-pseudotest_label_counts)/
                (pseudovalid_label_counts+pseudotest_label_counts))
    
    print("ece before temp scale - valid",
      compute_ece(softmax_out=pseudovalid_predictions,
            labels=pseudovalid_labels,
            bins=15))
    print("ece before temp scale - test",
          compute_ece(softmax_out=pseudotest_predictions,
                labels=pseudotest_labels,
                bins=15))

    # Fit the calibrator on the pseudovalid split only, then apply the
    # returned callable to the logits of both splits.
    # NOTE(review): the outputs are used as softmax probabilities below -
    # presumably that is what the fitted scaler returns; confirm.
    temp_scaler = TempScaling(ece_bins=15)(
                        valid_preacts=pseudovalid_pred_logits,
                        valid_labels=pseudovalid_labels)
    temp_scaled_valid = temp_scaler(pseudovalid_pred_logits)
    temp_scaled_test = temp_scaler(pseudotest_pred_logits)

    print("ece after temp scale - valid",
          compute_ece(softmax_out=temp_scaled_valid,
                labels=pseudovalid_labels,
                bins=15))
    print("ece after temp scale - test",
          compute_ece(softmax_out=temp_scaled_test,
                labels=pseudotest_labels,
                bins=15))
    
    # Disabled diagnostic, kept for reference: it would score the calibrated
    # probabilities against themselves as the labels.
    #print("Est valid perf",weighted_kappa_metric(predprobs=temp_scaled_valid,
    #                            true_labels=temp_scaled_valid,
    #                            weights=quadratic_weights))
    #print("Est test perf",weighted_kappa_metric(predprobs=temp_scaled_test,
    #                            true_labels=temp_scaled_test,
    #                            weights=quadratic_weights))
    # Baseline (no abstention) performance on the pseudotest split.
    original_weighted_kappa_perf = weighted_kappa_metric(predprobs=temp_scaled_test,
                                            true_labels=pseudotest_labels,
                                            weights=quadratic_weights)
    print("\nPseudotest set weighted kappa",original_weighted_kappa_perf)
    metric_to_baselineperfs["weighted_kappa"].append(original_weighted_kappa_perf)
    original_accuracy_perf = np.mean(np.argmax(temp_scaled_test,axis=-1)
                                     ==np.argmax(pseudotest_labels,axis=-1))
    print("Pseudotest set accuracy",original_accuracy_perf)
    metric_to_baselineperfs["accuracy"].append(original_accuracy_perf)

    for abstention_fraction in abstention_fractions:
        print("\nabstention fraction:",abstention_fraction)
        for abstainer_name, abstainer_factory in abstainer_factories:
            # NOTE(review): neither call below receives abstention_fraction, so
            # the abstainer is rebuilt and re-applied for every fraction; this
            # may be hoistable if the abstainers are deterministic - confirm.
            abstainer = abstainer_factory(valid_labels=pseudovalid_labels,
                                          valid_posterior=temp_scaled_valid)
            abstainer_priorities = abstainer(temp_scaled_test)
            # Sort example indices by ascending priority and keep the first
            # (1 - abstention_fraction) share; the examples with the largest
            # priority values are the ones abstained on.
            indices_to_retain = ([y[0] for y in sorted(enumerate(abstainer_priorities),
                                  key=lambda x: x[1])][:int(len(abstainer_priorities)*
                                                               (1-abstention_fraction))])
            retained_temp_scaled_test = np.array([temp_scaled_test[i] for i in indices_to_retain])
            retained_pseudotest_labels = np.array([pseudotest_labels[i] for i in indices_to_retain])
            print("\nAbstention criterion:",abstainer_name)
            weighted_kappa_perf = weighted_kappa_metric(predprobs=retained_temp_scaled_test,
                                                          true_labels=retained_pseudotest_labels,
                                                          weights=quadratic_weights)
            print("weighted kappa", weighted_kappa_perf)
            accuracy_perf = (np.mean(np.argmax(retained_temp_scaled_test,axis=-1)
                                       ==np.argmax(retained_pseudotest_labels,axis=-1)))
            print("accuracy", accuracy_perf)
            
            # Record the change relative to the no-abstention baseline of
            # this fold, keyed by metric, fraction and abstention method.
            metric_to_fraction_to_method_to_perfs["delta_weighted_kappa"][
                abstention_fraction][abstainer_name].append(weighted_kappa_perf
                                                            -original_weighted_kappa_perf)
            metric_to_fraction_to_method_to_perfs["delta_accuracy"][
                abstention_fraction][abstainer_name].append(accuracy_perf
                                                               -original_accuracy_perf)

Couldn't import dot_parser, loading of dot files will not be possible.
on fold 0
valid vs. test distribution shift [0.00038926 0.         0.         0.02325581 0.01333333]
ece before temp scale - valid 5.233412822513292
ece before temp scale - test 7.176854767444066
Original NLL & grad is:  (18.90354760257945, array([-0.22010595]))
Original ECE is:  5.233412681771793
      fun: 14.098479819251333
 hess_inv: <1x1 LbfgsInvHessProduct with dtype=float64>
      jac: array([-2.25208524e-09])
  message: 'CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL'
     nfev: 7
      nit: 6
   status: 0
  success: True
        x: array([1.38349462])
Final NLL & grad is:  (14.098479819251333, array([-2.25208524e-09]))
Final ECE is:  1.9532845206743972
ece after temp scale - valid 1.9532845206743972
ece after temp scale - test 1.7508179232902705

Pseudotest set weighted kappa 0.7987515509427573
Pseudotest set accuracy 0.820250284414107

abstention fraction: 0.05

Abstention criterion: expected_delta_weigh

Using TensorFlow backend.


In [6]:
from abstention.figure_making_utils import (
    wilcox_srs, get_ustats_mat, get_tied_top_and_worst_methods)

# Abstention methods compared in the summary below.
methods_to_consider = ['expected_delta_weighted_kappa',
                       'dist_maxclass_prob_from_one',
                       'entropy']
# For each metric and a low/high abstention fraction, report every method's
# mean per-fold improvement with its standard error, plus the methods that
# the Wilcoxon-based comparison ranks as tied-best and tied-worst.
for metric in ["delta_weighted_kappa", "delta_accuracy"]:
    for abstention_fraction_to_consider in [0.05, 0.2]:
        # method name -> list of per-fold deltas for this metric/fraction
        method_to_perf_deltas = metric_to_fraction_to_method_to_perfs[
                                 metric][abstention_fraction_to_consider]

        ustats_mat = get_ustats_mat(method_to_perf_deltas, methods_to_consider)
        tied_top_methods, tied_worst_methods = (
            get_tied_top_and_worst_methods(ustats_mat, methods_to_consider))
        print("\nAbstention fraction",abstention_fraction_to_consider,"with metric",metric)

        # Build (method, mean delta, standard error) rows. The standard error
        # divides by the number of deltas actually recorded for the method
        # rather than a global fold count, so it stays correct when the lists
        # hold a different number of entries (e.g. an interrupted run).
        summary_rows = []
        for method_name in methods_to_consider:
            perf_deltas = method_to_perf_deltas[method_name]
            mean_delta = np.mean(perf_deltas)
            standard_error = (np.std(perf_deltas, ddof=1)/
                              np.sqrt(len(perf_deltas)))
            summary_rows.append((method_name, mean_delta, standard_error))
        # Sort the different methods by their mean delta perf, best first.
        summary_rows.sort(key=lambda row: -row[1])

        print("\nMethods sorted by mean delta perf:")
        print("\n".join(
                [str(method_name)
                 +"\t"+str(np.round(mean_delta,5))
                 +" +/- "
                 +str(np.round(standard_error,5))
                 for method_name, mean_delta, standard_error in summary_rows]))

        print("\nTop methods by wilcoxon:",[methods_to_consider[x] for x in tied_top_methods])
        print("Worst methods by wilcoxon:",[methods_to_consider[x] for x in tied_worst_methods])


Abstention fraction 0.05 with metric delta_weighted_kappa

Methods sorted by mean delta perf:
expected_delta_weighted_kappa	0.03176 +/- 0.00096
entropy	0.01992 +/- 0.00105
dist_maxclass_prob_from_one	0.01587 +/- 0.00082

Top methods by wilcoxon: ['expected_delta_weighted_kappa']
Worst methods by wilcoxon: ['dist_maxclass_prob_from_one']

Abstention fraction 0.2 with metric delta_weighted_kappa

Methods sorted by mean delta perf:
expected_delta_weighted_kappa	0.07235 +/- 0.00139
entropy	0.02474 +/- 0.00417
dist_maxclass_prob_from_one	0.00426 +/- 0.00443

Top methods by wilcoxon: ['expected_delta_weighted_kappa']
Worst methods by wilcoxon: ['dist_maxclass_prob_from_one']

Abstention fraction 0.05 with metric delta_accuracy

Methods sorted by mean delta perf:
dist_maxclass_prob_from_one	0.02266 +/- 0.00032
entropy	0.02017 +/- 0.00042
expected_delta_weighted_kappa	0.01459 +/- 0.00026

Top methods by wilcoxon: ['dist_maxclass_prob_from_one']
Worst methods by wilcoxon: ['expected_delta_weig