In [1]:
from __future__ import print_function, division

import numpy as np
import gzip
from sklearn.preprocessing import LabelBinarizer

# Read one label per line from the gzipped, tab-separated label file (the
# label is the second column) and one-hot encode it; closing the file
# deterministically via `with` instead of leaving the handle to the GC.
with gzip.open("valid_labels.txt.gz", 'rb') as valid_labels_fh:
    raw_valid_labels = np.array(
        [float(line.decode("utf-8").split("\t")[1])
         for line in valid_labels_fh])
# rows follow the file order; one indicator column per distinct label value
valid_labels = LabelBinarizer().fit_transform(raw_valid_labels)

In [2]:
#augmenting the dataset with flips and rotations, for more robustness;
# each folder holds the predictions for one (flip, rotation) combination
parent_folders = ["flip-False_rotamt-0",
                  "flip-True_rotamt-0",
                  "flip-False_rotamt-90",
                  "flip-True_rotamt-90",
                  "flip-False_rotamt-180",
                  "flip-True_rotamt-180",
                  "flip-False_rotamt-270",
                  "flip-True_rotamt-270",]

#number of nondeterministic prediction files per folder
# (nondeterministic_preds_0.txt.gz ... nondeterministic_preds_99.txt.gz)
num_nondet_samples = 100


def _read_pred_table(path):
    """Parse a gzipped, tab-separated prediction file into a 2d float array.

    The first column of every line is skipped (presumably an example
    identifier - TODO confirm); all remaining columns are parsed as floats.
    The file handle is closed as soon as the file has been read.
    """
    with gzip.open(path, 'rb') as preds_fh:
        return np.array([
            [float(y) for y in line.decode("utf-8").split("\t")[1:]]
            for line in preds_fh])


#folder -> (num_examples x num_classes) deterministic predictions
parent_folder_to_det_pred = {}
#folder -> (num_nondet_samples x num_examples x num_classes) predictions
parent_folder_to_nondet_pred = {}
#folder -> (num_examples x num_classes) mean over the nondeterministic samples
parent_folder_to_mean_nondet_pred = {}
for parent_folder in parent_folders:
    parent_folder_to_det_pred[parent_folder] = _read_pred_table(
        parent_folder+"/deterministic_preds.txt.gz")
    nondet_preds = np.array([
        _read_pred_table(
            parent_folder+"/nondeterministic_preds_"+str(i)+".txt.gz")
        for i in range(num_nondet_samples)])
    parent_folder_to_nondet_pred[parent_folder] = nondet_preds
    parent_folder_to_mean_nondet_pred[parent_folder] = np.mean(nondet_preds,axis=0)

In [3]:
#Compute the auROC (only auROC is reported here) for separating
# "not class 0" from "class 0", once per augmentation folder, for both the
# deterministic predictions and the mean of the nondeterministic ones
from sklearn.metrics import roc_auc_score

#binary ground truth: 1 wherever the one-hot label is NOT class 0
not_class0_truth = 1-valid_labels[:,0]
auroc_inputs = (
    ("deterministic pred auROC", parent_folder_to_det_pred),
    ("nondeterministic pred auROC", parent_folder_to_mean_nondet_pred))

for parent_folder in parent_folders:
    print(parent_folder)
    for auroc_description, folder_to_preds in auroc_inputs:
        #score for "not class 0" is one minus the predicted class-0 prob.
        not_class0_score = 1-folder_to_preds[parent_folder][:,0]
        print(auroc_description,
              roc_auc_score(y_true=not_class0_truth,
                            y_score=not_class0_score))

flip-False_rotamt-0
deterministic pred auROC 0.9118638796723656
nondeterministic pred auROC 0.9129881925522253
flip-True_rotamt-0
deterministic pred auROC 0.9136845292158645
nondeterministic pred auROC 0.9141930341618936
flip-False_rotamt-90
deterministic pred auROC 0.9077797755493358
nondeterministic pred auROC 0.9079080860318696
flip-True_rotamt-90
deterministic pred auROC 0.9072243126739039
nondeterministic pred auROC 0.9084814670645733
flip-False_rotamt-180
deterministic pred auROC 0.916166708887612
nondeterministic pred auROC 0.9166587373671843
flip-True_rotamt-180
deterministic pred auROC 0.9131712872857287
nondeterministic pred auROC 0.9138490879246036
flip-False_rotamt-270
deterministic pred auROC 0.9074450973244279
nondeterministic pred auROC 0.9075221248051144
flip-True_rotamt-270
deterministic pred auROC 0.9051834963473733
nondeterministic pred auROC 0.9063560028916199


In [None]:
import abstention
#import the submodules explicitly so that `abstention.abstention` and
# `abstention.calibration` are guaranteed to exist as attributes on a
# fresh kernel, whatever the package's __init__ does
import abstention.abstention
import abstention.calibration
try:
    reload  #builtin on Python 2
except NameError:
    from importlib import reload  #Python 3
#pick up edits made to the library since it was first imported
reload(abstention.abstention)
reload(abstention.calibration)
from abstention.calibration import (compute_ece, TempScaling,
                                    EMImbalanceAdapter)
from abstention.abstention import (weighted_kappa_metric,
                                   WeightedKappa, DistMaxClassProbFromOne,
                                   Entropy, Uncertainty)
from collections import defaultdict, namedtuple
import numpy as np
import random

def inverse_softmax(preds):
    """Recover zero-centred logits from softmax probabilities.

    Softmax is unchanged when a constant is added to every logit of a row,
    so the logits can only be recovered up to that constant; the row-wise
    mean of the log-probabilities is subtracted so that each returned row
    sums to zero.

    Args:
        preds: array-like of shape (num_examples, num_classes) holding
            probabilities.

    Returns:
        Float array of the same shape; applying softmax along axis 1
        reproduces `preds` (for rows without zeros).

    Probabilities that are exactly zero are clipped to the smallest
    positive normal float before the log is taken; without this a single
    zero makes the whole row inf/NaN.
    """
    preds = np.asarray(preds, dtype=float)
    log_preds = np.log(np.clip(preds, np.finfo(preds.dtype).tiny, None))
    return log_preds - np.mean(log_preds, axis=1)[:,None]

#5x5 disagreement penalties for the quadratic weighted kappa:
# entry [j, i] is the squared distance (i-j)**2 between class i and class j
class_indices = np.arange(5)
quadratic_weights = (class_indices[None,:] - class_indices[:,None])**2

#One experimental configuration of the evaluation loop:
# name                  - key under which the results are stored
# abstainer_factories   - list of (abstainer name, abstainer factory) pairs
# preds_lookup          - augmentation folder -> per-example predictions
# predsamples_lookup    - augmentation folder -> nondeterministic samples
# use_calib             - whether to temperature-scale the predictions
# imbalance_subsampling - per-class value compared against the index of the
#                         augmentation folder to subsample the pseudo-test set
# imbalance_adapter     - adapter applied to the pseudo-test predictions,
#                         or None for no adaptation
AbstainerSettings = namedtuple(
    "AbstainerSettings",
    "name abstainer_factories preds_lookup predsamples_lookup "
    "use_calib imbalance_subsampling imbalance_adapter")

#the two WeightedKappa factories differ only in whether the class
# imbalance is estimated from the validation set
kappa_factory = WeightedKappa(
    weights=quadratic_weights, verbose=False)
kappa_factory_imbalance_from_valid = WeightedKappa(
    weights=quadratic_weights,
    estimate_class_imbalance_from_valid=True,
    verbose=False)

#(name, factory) pairs; the name is the key under which the results of
# the corresponding abstention criterion are recorded
abstainer_factories = [
    ("expected_delta_weighted_kappa", kappa_factory),
    ("expected_delta_weighted_kappa_imbalance_from_valid",
     kappa_factory_imbalance_from_valid),
    ("dist_maxclass_prob_from_one", DistMaxClassProbFromOne()),
    ("entropy", Entropy()),
    ("variance", Uncertainty()),
]

#fractions of the pseudo-test set on which to abstain
abstention_fractions = [0.05, 0.1, 0.15, 0.2]

#Per-class number of augmented copies kept for each pseudo-test example.
# The evaluation loop keeps the copy from parent_folders[k] only when
# imbalance_subsampling[class] > k, so class 0 keeps all 8 folders, class 1
# keeps the first 5 and the remaining classes keep only the first one.
#All "imbalanced" settings must share this value so that the adapted and
# unadapted variants are scored on the same shifted pseudo-test set.
# Three of them previously used [1.0, 0.8, 0.5, 0.2, 0.2], which under the
# "> folder index" rule keeps exactly one copy for every class, i.e. it
# induces no imbalance at all.
imbalanced_test_subsampling = [8, 5, 1, 1, 1]

abstainer_settings_list = [
    AbstainerSettings(
        name="calib_weightrescalepreds_imbalanced_adapted",
        abstainer_factories=abstainer_factories,
        preds_lookup=parent_folder_to_det_pred,
        predsamples_lookup=parent_folder_to_nondet_pred,
        use_calib=True,
        imbalance_subsampling=imbalanced_test_subsampling,
        imbalance_adapter=EMImbalanceAdapter(verbose=True)),
    AbstainerSettings(
        name="calib_weightrescalepreds_imbalanced_unadapted",
        abstainer_factories=abstainer_factories,
        preds_lookup=parent_folder_to_det_pred,
        predsamples_lookup=parent_folder_to_nondet_pred,
        use_calib=True,
        imbalance_subsampling=imbalanced_test_subsampling,
        imbalance_adapter=None),
    AbstainerSettings(
        name="calib_mcdrpreds_imbalanced_adapted",
        abstainer_factories=abstainer_factories,
        preds_lookup=parent_folder_to_mean_nondet_pred,
        predsamples_lookup=parent_folder_to_nondet_pred,
        use_calib=True,
        imbalance_subsampling=imbalanced_test_subsampling,
        imbalance_adapter=EMImbalanceAdapter(verbose=False)),
    AbstainerSettings(
        name="calib_mcdrpreds_imbalanced_unadapted",
        abstainer_factories=abstainer_factories,
        preds_lookup=parent_folder_to_mean_nondet_pred,
        predsamples_lookup=parent_folder_to_nondet_pred,
        use_calib=True,
        imbalance_subsampling=imbalanced_test_subsampling,
        imbalance_adapter=None),
#Balanced (no subsampling) settings, kept for reference.
# NOTE: as written they omit the required `imbalance_adapter` field of
# AbstainerSettings; add `imbalance_adapter=None` before re-enabling them.
#    AbstainerSettings(
#        name="calib_weightrescalepreds",
#        abstainer_factories=abstainer_factories,
#        preds_lookup=parent_folder_to_det_pred,
#        predsamples_lookup=parent_folder_to_nondet_pred,
#        use_calib=True,
#        imbalance_subsampling=None),  
#    AbstainerSettings(
#        name="uncalib_weightrescalepreds",
#        abstainer_factories=abstainer_factories,
#        preds_lookup=parent_folder_to_det_pred,
#        predsamples_lookup=parent_folder_to_nondet_pred,
#        use_calib=False,
#        imbalance_subsampling=None),
#    AbstainerSettings(
#        name="calib_mcdrpreds",
#        abstainer_factories=abstainer_factories,
#        preds_lookup=parent_folder_to_mean_nondet_pred,
#        predsamples_lookup=parent_folder_to_nondet_pred,
#        use_calib=True,
#        imbalance_subsampling=None),
#    AbstainerSettings(
#        name="uncalib_mcdrpreds",
#        abstainer_factories=abstainer_factories,
#        preds_lookup=parent_folder_to_mean_nondet_pred,
#        predsamples_lookup=parent_folder_to_nondet_pred,
#        use_calib=False,
#        imbalance_subsampling=None)
]

#number of random pseudo-validation / pseudo-test splits per settings entry
num_folds = 50

#settings name -> metric -> abstention fraction -> abstainer name
#  -> list (one entry per fold) of (score after abstention - baseline score)
settingsname_to_metric_to_fraction_to_method_to_perfs = {}
#settings name -> metric -> list (one entry per fold) of the baseline
#  (no abstention) score on the pseudo-test set
settingsname_to_metric_to_baselineperfs = {}

for abstainer_settings in abstainer_settings_list:

    settings_name = abstainer_settings.name
    print("abstainer settings", settings_name)
    abstainer_factories = abstainer_settings.abstainer_factories
    preds_lookup = abstainer_settings.preds_lookup
    predsamples_lookup = abstainer_settings.predsamples_lookup
    use_calib = abstainer_settings.use_calib
    imbalance_subsampling = abstainer_settings.imbalance_subsampling
    imbalance_adapter = abstainer_settings.imbalance_adapter

    metric_to_fraction_to_method_to_perfs =\
        defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    settingsname_to_metric_to_fraction_to_method_to_perfs[settings_name] =\
        metric_to_fraction_to_method_to_perfs
    metric_to_baselineperfs = defaultdict(list)
    settingsname_to_metric_to_baselineperfs[settings_name] =\
        metric_to_baselineperfs

    for fold_number in range(num_folds):
        print("on fold",fold_number)

        #the seed depends only on the fold number, so every settings entry
        # is evaluated on the same sequence of patient shuffles
        np.random.seed(fold_number*1000)
        random.seed(fold_number*1000)
        #the data is in pairs of (left eye, right eye) per patient (entry for
        # the right eye comes after the entry for the left eye); hence, the number of
        # unique patients is 0.5*len(valid_labels)
        patient_id_ordering = list(range(int(0.5*len(valid_labels))))
        np.random.shuffle(patient_id_ordering)

        num_classes = valid_labels.shape[1]
        pseudovalid_uncalib_preds = []
        pseudotest_uncalib_preds = []
        #the nondeterministic samples are only ever consumed for the
        # pseudo-test set (as per-example uncertainties), so they are not
        # collected, calibrated or reduced for the pseudo-validation set
        pseudotest_uncalib_predsamples = []
        pseudovalid_labels = []
        pseudotest_labels = []
        pseudovalid_label_counts = np.zeros(num_classes)
        pseudotest_label_counts = np.zeros(num_classes)
        for i in patient_id_ordering:
            eye_indices = (2*i, (2*i)+1)
            most_diseased_label = max(np.argmax(valid_labels[eye_idx])
                                      for eye_idx in eye_indices)
            #greedy stratification: both eyes of a patient go to the
            # pseudo-validation set only if it currently holds fewer
            # examples of the patient's highest class than the pseudo-test
            # set does; otherwise they go to the pseudo-test set
            in_test = not (pseudovalid_label_counts[most_diseased_label] <
                           pseudotest_label_counts[most_diseased_label])
            if in_test:
                append_to_uncalib_preds = pseudotest_uncalib_preds
                append_to_labels = pseudotest_labels
                append_to_label_counts = pseudotest_label_counts
            else:
                append_to_uncalib_preds = pseudovalid_uncalib_preds
                append_to_labels = pseudovalid_labels
                append_to_label_counts = pseudovalid_label_counts

            #the counts track eyes, not augmented copies
            for eye_idx in eye_indices:
                append_to_label_counts += valid_labels[eye_idx]

            for parent_folder_idx,parent_folder in enumerate(parent_folders):
                for eye_idx in eye_indices:
                    #pseudo-validation keeps every augmented copy; the
                    # pseudo-test set keeps the copy from folder k only if
                    # imbalance_subsampling[class] > k (None: keep all)
                    keep_copy = (
                        (not in_test)
                        or (imbalance_subsampling is None)
                        or (imbalance_subsampling[
                                np.argmax(valid_labels[eye_idx])]
                            > parent_folder_idx))
                    if (not keep_copy):
                        continue
                    append_to_labels.append(valid_labels[eye_idx])
                    append_to_uncalib_preds.append(
                        preds_lookup[parent_folder][eye_idx])
                    if in_test:
                        pseudotest_uncalib_predsamples.append(
                            predsamples_lookup[parent_folder][:,eye_idx])
        print(np.sum(pseudovalid_labels,axis=0))

        pseudovalid_uncalib_preds = np.array(pseudovalid_uncalib_preds)
        pseudotest_uncalib_preds = np.array(pseudotest_uncalib_preds)
        #(examples x samples x classes) -> (samples x examples x classes)
        pseudotest_uncalib_predsamples = np.array(
            pseudotest_uncalib_predsamples).transpose((1,0,2))
        pseudovalid_labels = np.array(pseudovalid_labels)
        pseudotest_labels = np.array(pseudotest_labels)
        print("valid vs. test distribution shift",
              "Valid dist:",np.mean(pseudovalid_labels,axis=0),
              "Test dist:",np.mean(pseudotest_labels,axis=0))

        if (use_calib):
            #fit the temperature on the pseudo-validation set only and
            # apply it to both sets; the ECE before/after scaling can be
            # inspected with compute_ece(softmax_out=..., labels=..., bins=15)
            pseudovalid_uncalib_pred_logits = inverse_softmax(
                pseudovalid_uncalib_preds)
            pseudotest_uncalib_pred_logits = inverse_softmax(
                pseudotest_uncalib_preds)
            temp_scaler = TempScaling(ece_bins=15, verbose=False)(
                                valid_preacts=pseudovalid_uncalib_pred_logits,
                                valid_labels=pseudovalid_labels)
            pseudovalid_calib_preds = temp_scaler(pseudovalid_uncalib_pred_logits)
            pseudotest_calib_preds = temp_scaler(pseudotest_uncalib_pred_logits)
            print("Distribution shift from true labels after calibration:",
                  "True:",np.mean(pseudovalid_labels,axis=0),
                  "Estimated:",np.mean(pseudovalid_calib_preds,axis=0),
                  "Difference:", np.mean(pseudovalid_labels-pseudovalid_calib_preds,
                                         axis=0))
            pseudotest_calib_predsamples = np.array(
                [temp_scaler(inverse_softmax(x))
                 for x in pseudotest_uncalib_predsamples])
            pseudovalid_preds_to_use = pseudovalid_calib_preds
            pseudotest_preds_to_use = pseudotest_calib_preds
            pseudotest_predsamples_to_use = pseudotest_calib_predsamples
        else:
            pseudovalid_preds_to_use = pseudovalid_uncalib_preds
            pseudotest_preds_to_use = pseudotest_uncalib_preds
            pseudotest_predsamples_to_use = pseudotest_uncalib_predsamples

        if (imbalance_adapter is not None):
            imbalance_adaptation_func = imbalance_adapter(
                #set the validation labels to be pseudovalid_preds_to_use
                # (rather than pseudovalid_labels) for consistency;
                # we want no adjustment to happen in the
                # case where tofit_initial_posterior_probs=pseudovalid_preds_to_use
                valid_labels=pseudovalid_preds_to_use,
                tofit_initial_posterior_probs=pseudotest_preds_to_use)
            preds_before_adaptation = pseudotest_preds_to_use
            pseudotest_preds_to_use = imbalance_adaptation_func(pseudotest_preds_to_use)
            for adaptation_stage, stage_preds in (
                    ("before", preds_before_adaptation),
                    ("after", pseudotest_preds_to_use)):
                print("Accuracy "+adaptation_stage+" adaptation:",
                      np.mean(np.argmax(stage_preds,axis=-1)
                              ==np.argmax(pseudotest_labels,axis=-1)))
                print("WKappa "+adaptation_stage+" adaptation:",
                      weighted_kappa_metric(
                            predprobs=stage_preds,
                            true_labels=pseudotest_labels,
                            weights=quadratic_weights))
            #adapt every nondeterministic sample with the same function
            pseudotest_predsamples_to_use = np.array([
                    imbalance_adaptation_func(x) for
                    x in pseudotest_predsamples_to_use])

        #per-example uncertainty: variance across the nondeterministic
        # samples, summed over the classes
        pseudotest_variance_to_use = np.sum(np.var(pseudotest_predsamples_to_use, axis=0),
                                            axis=-1)

        #baseline (no abstention) performance on the pseudo-test set
        original_weighted_kappa_perf = weighted_kappa_metric(
            predprobs=pseudotest_preds_to_use,
            true_labels=pseudotest_labels,
            weights=quadratic_weights)
        metric_to_baselineperfs["weighted_kappa"].append(
            original_weighted_kappa_perf)
        original_accuracy_perf = np.mean(
            np.argmax(pseudotest_preds_to_use,axis=-1)
            ==np.argmax(pseudotest_labels,axis=-1))
        metric_to_baselineperfs["accuracy"].append(original_accuracy_perf)

        for abstention_fraction in abstention_fractions:
            for abstainer_name, abstainer_factory in abstainer_factories:
                abstainer = abstainer_factory(
                    valid_labels=pseudovalid_labels,
                    valid_posterior=pseudovalid_preds_to_use)
                abstainer_priorities = abstainer(
                    posterior_probs=pseudotest_preds_to_use,
                    uncertainties=pseudotest_variance_to_use)
                #retain the examples with the LOWEST priorities; the
                # remaining fraction is abstained on. sorted() is stable,
                # so ties keep their original order
                num_to_retain = int(len(abstainer_priorities)*
                                    (1-abstention_fraction))
                indices_to_retain = (
                    [y[0] for y in sorted(enumerate(abstainer_priorities),
                                          key=lambda x: x[1])][:num_to_retain])
                retained_pseudotest_preds = np.array(
                    [pseudotest_preds_to_use[i] for i in indices_to_retain])
                retained_pseudotest_labels = np.array(
                    [pseudotest_labels[i] for i in indices_to_retain])
                weighted_kappa_perf = weighted_kappa_metric(
                    predprobs=retained_pseudotest_preds,
                    true_labels=retained_pseudotest_labels,
                    weights=quadratic_weights)
                accuracy_perf = (np.mean(np.argmax(
                    retained_pseudotest_preds,axis=-1)
                    ==np.argmax(retained_pseudotest_labels,axis=-1)))

                #record the change relative to the no-abstention baseline
                metric_to_fraction_to_method_to_perfs["delta_weighted_kappa"][
                    abstention_fraction][abstainer_name].append(
                        weighted_kappa_perf-original_weighted_kappa_perf)
                metric_to_fraction_to_method_to_perfs["delta_accuracy"][
                    abstention_fraction][abstainer_name].append(
                        accuracy_perf-original_accuracy_perf)

abstainer settings calib_weightrescalepreds_imbalanced_adapted
on fold 0
[10280   992  2144   336   296]
valid vs. test distribution shift Valid dist: [0.73177677 0.07061503 0.15261959 0.023918   0.02107062] Test dist: [0.91371642 0.05515033 0.02383917 0.00391389 0.00338018]
Distribution shift from true labels after calibration: True: [0.73177677 0.07061503 0.15261959 0.023918   0.02107062] Estimated: [0.70398613 0.1021053  0.11840535 0.05697056 0.01853267] Difference: [ 0.02779063 -0.03149026  0.03421424 -0.03305256  0.00253795]
Original class imbalance [0.70398613 0.1021053  0.11840535 0.05697056 0.01853267]
Finished on iteration 8 with delta 0.0020559468131290566
Final imbalance [0.88337398 0.0741724  0.03079887 0.0055813  0.00607346]
Multiplier: [1.2548173  0.72643046 0.26011385 0.09796807 0.32771641]
Accuracy before adaptation: 0.9042874933285893
WKappa before adaptation: 0.6467983358357531
Accuracy after adaptation: 0.9162960327343889
WKappa after adaptation: 0.6794150406334958
o

In [None]:
import json

#serialise BEFORE opening the output file: open(..., 'w') truncates the
# file, so a failure inside json.dumps must not be able to wipe out
# previously saved results. Float abstention-fraction keys become strings.
imbalanced_results_json = json.dumps({
            "settingsname_to_metric_to_fraction_to_method_to_perfs":
              settingsname_to_metric_to_fraction_to_method_to_perfs,
            "settingsname_to_metric_to_baselineperfs":
              settingsname_to_metric_to_baselineperfs},
             sort_keys=True,
             indent=4,
             separators=(',', ': '))
#`with` closes the handle even if the write fails
with open("imbalanced_abstention_results.json", 'w') as fh:
    fh.write(imbalanced_results_json)

In [None]:
import json

#merge the previously saved balanced results into the in-memory results;
# entries with the same settings name are overwritten by the loaded ones
with open("balanced_abstention_results.json") as fh:
    loaded_data = json.load(fh)
settingsname_to_metric_to_fraction_to_method_to_perfs.update(
    loaded_data["settingsname_to_metric_to_fraction_to_method_to_perfs"])
settingsname_to_metric_to_baselineperfs.update(
    loaded_data["settingsname_to_metric_to_baselineperfs"])

#serialise before opening so that a failure cannot truncate the output
merged_results_json = json.dumps({
            "settingsname_to_metric_to_fraction_to_method_to_perfs":
              settingsname_to_metric_to_fraction_to_method_to_perfs,
            "settingsname_to_metric_to_baselineperfs":
              settingsname_to_metric_to_baselineperfs},
             sort_keys=True,
             indent=4,
             separators=(',', ': '))
with open("abstention_results.json", 'w') as fh:
    fh.write(merged_results_json)

In [None]:
import json

#reload the merged results from disk; after this, every abstention
# fraction key is a string (JSON object keys are always strings)
with open("abstention_results.json") as fh:
    loaded_data = json.load(fh)
settingsname_to_metric_to_fraction_to_method_to_perfs =\
    loaded_data["settingsname_to_metric_to_fraction_to_method_to_perfs"]
settingsname_to_metric_to_baselineperfs =\
    loaded_data["settingsname_to_metric_to_baselineperfs"]


In [None]:
from abstention.figure_making_utils import (
    wilcox_srs, get_ustats_mat,
    get_tied_top_and_worst_methods)
from collections import OrderedDict

#abstention criteria compared within every group, in table-row order
_compared_methods = ['expected_delta_weighted_kappa',
                     'dist_maxclass_prob_from_one',
                     'entropy',
                     'variance']


def _paired_rows(first_settingsname, second_settingsname):
    """List the (settings name, method name) rows of one comparison group.

    For every compared method the row for `first_settingsname` is
    immediately followed by the row for `second_settingsname`.
    """
    return [(settingsname, methodname)
            for methodname in _compared_methods
            for settingsname in (first_settingsname, second_settingsname)]


#group title -> (rows to compare, names of the Y/N columns to write);
# the title is printed with the generated table (fixed the "Imalanced" typo)
comparison_groups = OrderedDict([
        ('Imbalanced, with weight rescaling', (
            _paired_rows('calib_weightrescalepreds_imbalanced_adapted',
                         'calib_weightrescalepreds_imbalanced_unadapted'),
            ['adapted'])),
        ('Balanced, with weight rescaling', (
            _paired_rows('calib_weightrescalepreds',
                         'uncalib_weightrescalepreds'),
            ['calib'])),
        ('Imbalanced, with MC dropout', (
            _paired_rows('calib_mcdrpreds_imbalanced_adapted',
                         'calib_mcdrpreds_imbalanced_unadapted'),
            ['adapted'])),
        ('Balanced, with MC dropout', (
            _paired_rows('calib_mcdrpreds',
                         'uncalib_mcdrpreds'),
            ['calib']))
    ])

#method name -> label used in the first column of the LaTeX tables.
# The backslash of \Delta is escaped explicitly: "\D" is an invalid escape
# sequence that newer Python versions warn about (the string is unchanged).
friendly_method_names = {
    'expected_delta_weighted_kappa': 'E[$\\Delta$Kappa]',
    'dist_maxclass_prob_from_one': 'Max Class Prob.',
    'entropy': 'Entropy',
    'variance': 'MC Dropout Var.'
}
#abstention fractions as STRINGS, matching the keys of results that went
# through a JSON round trip. NOTE: this rebinds the name used (with float
# values) by the evaluation cell, which must not be re-run after this one
# without restoring the floats.
abstention_fractions = ['0.05', '0.1', '0.15', '0.2']

#For every comparison group and metric: gather the per-fold results,
# summarise them as mean +/- standard error, mark the methods that are tied
# for best at each abstention fraction, and print a LaTeX table.
for comparison_group_name in comparison_groups:

    print("On comparison group", comparison_group_name)
    settingnmethod_pairs, columnstowrite = comparison_groups[comparison_group_name]
    for metric in ["weighted_kappa",
                   "accuracy"]:
        print("On metric", metric)

        #gather all the necessary data
        settingnmethod_to_baselineperfs = OrderedDict()
        settingnmethod_to_abstentionfraction_to_perfs = OrderedDict()

        for (settingsname, methodname) in settingnmethod_pairs:
            settingnmethod = settingsname+"-"+methodname
            settingnmethod_to_baselineperfs[settingnmethod] =\
                settingsname_to_metric_to_baselineperfs[settingsname][metric]

            abstentionfraction_to_perfs = OrderedDict()
            settingnmethod_to_abstentionfraction_to_perfs[settingnmethod] =\
                abstentionfraction_to_perfs
            for abstention_fraction in abstention_fractions:
                abstentionfraction_to_perfs[abstention_fraction] = (
                    settingsname_to_metric_to_fraction_to_method_to_perfs[
                        settingsname]["delta_"+metric][abstention_fraction][
                        methodname])

        #prepare the table contents
        settingnmethod_to_tablecontents = OrderedDict()
        for settingnmethod in settingnmethod_to_baselineperfs:
            name_parts = settingnmethod.split("-")
            row_settingsname, row_methodname = name_parts[0], name_parts[1]
            baselineperfs = settingnmethod_to_baselineperfs[settingnmethod]
            tablerow = {}
            settingnmethod_to_tablecontents[settingnmethod] = tablerow
            #the standard error uses the number of folds actually present
            # in the data, so this cell also works on results loaded from
            # disk in a kernel where `num_folds` was never defined
            tablerow['baseline'] = {
                'mean': np.mean(baselineperfs),
                'stderr': np.std(baselineperfs,
                                 ddof=1)/np.sqrt(len(baselineperfs))}
            tablerow['method'] = row_methodname
            tablerow['mcdr'] = "mcdr" in row_settingsname
            tablerow['calib'] = "uncalib" not in row_settingsname
            #if neither 'balanced' nor 'imbalanced' is in the name, it means
            # balanced ("balanced" is a substring of "imbalanced", so testing
            # for "imbalanced" alone is sufficient)
            tablerow['imbalanced'] = "imbalanced" in row_settingsname
            #if neither adapted nor unadapted is in the name, it means no adaptation
            tablerow['adapted'] = (("unadapted" not in row_settingsname)
                                   and ("adapted" in row_settingsname))

        for abstention_fraction in abstention_fractions:
            method_to_perfs = OrderedDict()
            for settingnmethod in settingnmethod_to_tablecontents:
                perfsdelta = settingnmethod_to_abstentionfraction_to_perfs[
                    settingnmethod][abstention_fraction]
                method_to_perfs[settingnmethod] = perfsdelta
                settingnmethod_to_tablecontents[settingnmethod][
                    abstention_fraction] = {
                        'mean': np.mean(perfsdelta),
                        'stderr': (np.std(perfsdelta,ddof=1)/
                                   np.sqrt(len(perfsdelta)))}
            methods_to_consider = list(method_to_perfs.keys())
            #1275 = 50*51/2, the largest signed-rank statistic for 50 folds
            ustats_mat = get_ustats_mat(
                method_to_perfs,
                methods_to_consider,
                max_ustat=1275)
            tied_top_methods, tied_worst_methods =(
                get_tied_top_and_worst_methods(
                    ustats_mat,
                    methods_to_consider,
                    #NOTE(review): the original comment read "0.05 threshold
                    # for one-sided test when N=119 is 50", which looks
                    # garbled; confirm that 120 is the intended critical
                    # value for N=50 paired folds using the table at
                    #http://www.real-statistics.com/statistics-tables/wilcoxon-signed-ranks-table/
                    threshold=120
                ))
            tied_top_methods = [methods_to_consider[x]
                                for x in tied_top_methods]
            for settingnmethod in settingnmethod_to_tablecontents:
                settingnmethod_to_tablecontents[
                    settingnmethod][abstention_fraction][
                    'istop'] = (settingnmethod in tied_top_methods)

        #header cells are built in the same order as the row cells below so
        # the two cannot drift apart; previously the baseline column written
        # in every row had no header cell (6 header cells vs. 7 row cells)
        header_cells = ["Method", "Baseline"]
        if ('calib' in columnstowrite):
            header_cells.append('Calibrated?')
        if ('adapted' in columnstowrite):
            header_cells.append('Adapted?')
        #round() before int() so that e.g. 100*0.29 (=28.999...) is not
        # truncated to 28
        header_cells.extend(
            "$\\Delta$ @"+str(int(round(100*float(x))))+"\\% Abs."
            for x in abstention_fractions)

        thestr = ("\\begin{tabular}{ | "
                  +" | ".join(["c"]*len(header_cells))
                  +" | }\n")
        thestr += "\\hline "+" & ".join(header_cells)+"\\\\ \\hline\n"
        for settingnmethod in settingnmethod_to_tablecontents:
            tablerow = settingnmethod_to_tablecontents[settingnmethod]
            row_cells = [friendly_method_names[tablerow['method']],
                         str(tablerow['baseline']['mean'])]
            if ('calib' in columnstowrite):
                row_cells.append("Y" if tablerow['calib'] else "N")
            if ('adapted' in columnstowrite):
                row_cells.append("Y" if tablerow['adapted'] else "N")
            for abstention_fraction in abstention_fractions:
                fraction_summary = tablerow[abstention_fraction]
                cell = (str(np.round(fraction_summary['mean'],4))
                        +" $\\pm$ "
                        +str(np.round(fraction_summary['stderr'],4)))
                #bold the methods tied for best at this abstention fraction
                if fraction_summary['istop']:
                    cell = "\\textbf{"+cell+"}"
                row_cells.append(cell)
            thestr += " & ".join(row_cells)+"\\\\ \\hline\n"
        thestr += "\\end{tabular}\n"

        #the baselines can differ between rows of a group (e.g. adapted
        # vs. unadapted settings), so the distinct values are printed as sets
        print("\nBaseline "+metric+" perfs:")
        baseline_mean = set(
                x['baseline']['mean'] for x in
                settingnmethod_to_tablecontents.values())
        print(baseline_mean)
        baseline_stderr = set(
                x['baseline']['stderr'] for x in
                settingnmethod_to_tablecontents.values())
        print(baseline_stderr)

        print("\n Latex Table for metric "
              +metric+" and group "+comparison_group_name
              +"\n\n"+thestr)
        
