In [1]:
from __future__ import division, print_function
from importlib import reload

In [2]:
import abstention
reload(abstention)
reload(abstention.calibration)
reload(abstention.label_shift)
from abstention.calibration import TempScaling, ConfusionMatrix, softmax
from abstention.label_shift import EMImbalanceAdapter, BBSEImbalanceAdapter, ShiftWeightFromImbalanceAdapter
import glob
import gzip
import numpy as np
from collections import defaultdict

def read_preds(fh):
    return np.array([[float(x) for x in y.decode("utf-8").rstrip().split("\t")]
                     for y in fh])

def sample_from_probs_arr(arr_with_probs):
    rand_num = np.random.random()
    cdf_so_far = 0
    for (idx, prob) in enumerate(arr_with_probs):
        cdf_so_far += prob
        if (cdf_so_far >= rand_num
            or idx == (len(arr_with_probs) - 1)):  # need the
            # letterIdx==(len(row)-1) clause because of potential floating point errors
            # that mean arrWithProbs doesn't sum to 1
            return idx
        
test_labels = read_preds(gzip.open(glob.glob("test_labels.txt.gz")[0]))
test_class_to_indices = defaultdict(list)
for index,row in enumerate(test_labels):
    row_label = np.argmax(row)
    test_class_to_indices[row_label].append(index)
def draw_test_indices(total_to_return, label_proportions):
    indices_to_use = []
    for class_index, class_proportion in enumerate(label_proportions):
        indices_to_use.extend(np.random.choice(
                test_class_to_indices[class_index],
                int(total_to_return*class_proportion),
                replace=True))
    for i in range(total_to_return-len(indices_to_use)):
        class_index = sample_from_probs_arr(label_proportions)
        indices_to_use.append(
            np.random.choice(test_class_to_indices[class_index]))
    return indices_to_use

valid_labels = read_preds(gzip.open(glob.glob("valid_labels.txt.gz")[0]))

imbalance_adapters = [
    ('em_calib-confusionmat_init-default', EMImbalanceAdapter(calibrator_factory=ConfusionMatrix(), verbose=False)),
    ('em_calib-confusionmat_init-BBSE-hard', EMImbalanceAdapter(calibrator_factory=ConfusionMatrix(), verbose=False,
                                                                    initialization_weight_ratio=
                                                                     ShiftWeightFromImbalanceAdapter(BBSEImbalanceAdapter(soft=False)))),
    ('em_calib-None_init-default', EMImbalanceAdapter(calibrator_factory=None)),
    ('em_calib-tsnobiascorr_init-default', EMImbalanceAdapter(calibrator_factory=TempScaling(verbose=False))),
    ('em_calib-tswithbiascorr_init-default', EMImbalanceAdapter(calibrator_factory=
                                                       TempScaling(verbose=False,bias_positions=[0,1,2,3,4,5,6,7,8,9]))),
    ('bbse-hard_calib-None', BBSEImbalanceAdapter(soft=False, calibrator_factory=None)),
    ('bbse-soft_calib-None', BBSEImbalanceAdapter(soft=True, calibrator_factory=None)),
    ('bbse-soft_calib-tsnobiascorr', BBSEImbalanceAdapter(soft=True, calibrator_factory=TempScaling(verbose=False))),
    ('bbse-hard_calib-tsnobiascorr', BBSEImbalanceAdapter(soft=False, calibrator_factory=TempScaling(verbose=False))),
    ('bbse-soft_calib-tswithbiascorr', BBSEImbalanceAdapter(soft=True, calibrator_factory=TempScaling(verbose=False,
                                                                                      bias_positions=[0,1,2,3,4,5,6,7,8,9]))), 
    ('bbse-hard_calib-tswithbiascorr', BBSEImbalanceAdapter(soft=False, calibrator_factory=TempScaling(verbose=False,
                                                                                      bias_positions=[0,1,2,3,4,5,6,7,8,9]))),
]

In [3]:
import numpy as np
import random
import sys

dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals = defaultdict(
                                                                  lambda: defaultdict(
                                                                           lambda: defaultdict(
                                                                                    lambda: defaultdict(list))))
dirichletalpha_to_samplesize_to_baselineacc = defaultdict(lambda: defaultdict(list))
num_trials = 10
dirichlet_alphas_and_samplesize = [(0.01, 500), (0.1,500), (1.0,500), (10.0,500),
                                   (0.01, 1000), (0.1,1000), (1.0,1000), (10.0,1000),
                                   (0.01, 2000), (0.1,2000), (1.0,2000), (10.0,2000),
                                   (0.01, 4000), (0.1,4000), (1.0,4000), (10.0,4000),
                                   (0.01, 8000), (0.1,8000), (1.0,8000), (10.0,8000),]
for (dirichlet_alpha,samplesize) in dirichlet_alphas_and_samplesize:
    #for model_idx,train_set_size in enumerate([250, 500, 1000, 2000, 4000, 8000, 16000]):
    for seed in [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]:
        print("Seed",seed)
        test_preds = softmax(preact=read_preds(gzip.open(glob.glob("testpreacts_model_mnist_set-16000_seed-"+str(seed)+".txt.gz")[0])),
                             temp=1, biases=None)
        valid_preds = softmax(preact=read_preds(gzip.open(glob.glob("validpreacts_model_mnist_set-16000_seed-"+str(seed)+".txt.gz")[0])),
                              temp=1, biases=None)
        sample_valid_preds = valid_preds[:samplesize]
        sample_valid_labels = valid_labels[:samplesize]
        for trial_num in range(num_trials):
            #print("On trial num",trial_num)
            sys.stdout.flush()
            np.random.seed(trial_num*100)
            random.seed(trial_num*100)
            dirichlet_dist = np.random.dirichlet([dirichlet_alpha for x in range(10)])
            test_indices = draw_test_indices(total_to_return=samplesize,
                                             label_proportions=dirichlet_dist)
            shifted_test_labels = test_labels[test_indices]
            shifted_test_preds = test_preds[test_indices]
            
            shifted_test_baseline_accuracy = np.mean(np.argmax(shifted_test_labels,axis=-1)==
                                                     np.argmax(shifted_test_preds,axis=-1))
            dirichletalpha_to_samplesize_to_baselineacc[dirichlet_alpha][samplesize].append(shifted_test_baseline_accuracy)
            
            ideal_shift_weights = np.mean(shifted_test_labels,axis=0)/np.mean(sample_valid_labels,axis=0)
            for adapter_name,imbalance_adapter in imbalance_adapters:
                #print(adapter_name)
                imbalance_adapter_func = imbalance_adapter(valid_labels=sample_valid_labels,
                                                           tofit_initial_posterior_probs=shifted_test_preds,
                                                           valid_posterior_probs=sample_valid_preds)  
                shift_weights = imbalance_adapter_func.multipliers
                adapted_shifted_test_preds = imbalance_adapter_func(shifted_test_preds)
                adapted_shifted_test_accuracy = np.mean(np.argmax(shifted_test_labels,axis=-1)==
                                                        np.argmax(adapted_shifted_test_preds,axis=-1))
                delta_from_baseline = adapted_shifted_test_accuracy-shifted_test_baseline_accuracy
                
                dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[
                    dirichlet_alpha][samplesize][adapter_name]['weightdiffnorm'].append(
                    np.linalg.norm(shift_weights-ideal_shift_weights))
                dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[
                    dirichlet_alpha][samplesize][adapter_name]['delta_acc'].append(
                    delta_from_baseline)
                    
    print("On alpha",dirichlet_alpha,"sample size", samplesize)
    for metric_name in ['delta_acc', 'weightdiffnorm']:
        print("Metric",metric_name)
        for adapter_name in [x[0] for x in imbalance_adapters]:
            n = len(dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[
                                dirichlet_alpha][samplesize][adapter_name][metric_name])
            
            print(adapter_name,
                  np.mean(dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[
                            dirichlet_alpha][samplesize][adapter_name][metric_name]),
                  "+/-",
                  (1.0/np.sqrt(n))*np.std(dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[
                                  dirichlet_alpha][samplesize][adapter_name][metric_name],
                                 ddof=1))
            sys.stdout.flush()

Seed 0
Seed 10
Seed 20
Seed 30
Seed 40
Seed 50
Seed 60
Seed 70
Seed 80
Seed 90
On alpha 0.01 sample size 500
Metric delta_acc
em_calib-confusionmat_init-default 0.0028000000000000026 +/- 0.0004512608598542134
em_calib-confusionmat_init-BBSE-hard 0.0028000000000000026 +/- 0.0004512608598542134
em_calib-None_init-default 0.012800000000000011 +/- 0.0006240953048172611
em_calib-tsnobiascorr_init-default 0.013400000000000013 +/- 0.0006730002176294549
em_calib-tswithbiascorr_init-default 0.01260000000000001 +/- 0.0006166052385457174
bbse-hard_calib-None 0.012200000000000011 +/- 0.0006126610281008056
bbse-soft_calib-None 0.01260000000000001 +/- 0.0006033576087532209
bbse-soft_calib-tsnobiascorr 0.013200000000000012 +/- 0.000667877687949378
bbse-hard_calib-tsnobiascorr 0.01260000000000001 +/- 0.0006033576087532209
bbse-soft_calib-tswithbiascorr 0.01100000000000001 +/- 0.0007205497340169658
bbse-hard_calib-tswithbiascorr 0.01100000000000001 +/- 0.0006977597630049242
Metric weightdiffnorm
em_cal

In [4]:
import json
import os
file_out = "label_shift_adaptation_results.json"
dict_to_write = {
    "dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals":
     dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals,
    "dirichletalpha_to_samplesize_to_baselineacc": dirichletalpha_to_samplesize_to_baselineacc,
}
open(file_out, 'w').write(
    json.dumps(dict_to_write,
               sort_keys=True, indent=4, separators=(',', ': ')))
os.system("gzip -f "+file_out)

0

In [5]:
import gzip
import json
loaded_dicts = json.loads(gzip.open("label_shift_adaptation_results.json.gz").read())
dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals =\
    loaded_dicts['dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals']
dirichletalpha_to_samplesize_to_baselineacc = loaded_dicts['dirichletalpha_to_samplesize_to_baselineacc']

In [6]:
import numpy as np

from abstention.figure_making_utils import (
    wilcox_srs, get_ustats_mat,
    get_tied_top_and_worst_methods)
from scipy.stats import norm

#columns are: method type, calibration strategy alpha perfs
methods_to_consider = [
        'bbse-hard_calib-None',
        'bbse-soft_calib-None',
        'em_calib-None_init-default',
        'bbse-soft_calib-tsnobiascorr',
        'bbse-soft_calib-tswithbiascorr',        
        'em_calib-tsnobiascorr_init-default',
        'em_calib-tswithbiascorr_init-default',
]
our_proposed_methods = set([
           'em_calib-tswithbiascorr_init-default',
           'em_calib-tsnobiascorr_init-default',
           #'em_calib-None_init-default',
           #'bbse-hard_calib-None',
           #'bbse-soft_calib-None',
           'bbse-soft_calib-tsnobiascorr',
           'bbse-soft_calib-tswithbiascorr'])

metrics = ["weightdiffnorm", "delta_acc"]
#metrics = ["weightdiffnorm"]
metric_to_nicename = {'delta_acc': "$\\bm{\\Delta}$\\textbf{\\%Accuracy}",
                      'weightdiffnorm': "$\\bm{| w - \hat{w} |}$"}
adaptmethod_to_nicename = {'em': 'EM',
                           'bbse-soft': 'BBSE-soft',
                           'bbse-hard': 'BBSE-hard'}
calibmethod_to_nicename = {'None': 'None',
                           'tsnobiascorr': 'Temp. Scale',
                           'tswithbiascorr': 'B.C. Temp. Scale'}
metric_to_largerisbetter = {'delta_acc':True, 'weightdiffnorm': False}

sets = [
    (" \\textbf{under different} $\\bm{\\alpha}$ ", [('0.01', '8000'), ('0.1', '8000'), ('1.0', '8000')],
      'alpha'),
    (" \\textbf{under different} $\\bm{n}$ ", [
            ('0.1', '500'), #('0.1', '1000'),
            ('0.1', '2000'),# ('0.1', '4000'),
            ('0.1', '8000')], 'n')
]

for metric in metrics:
    for set_name, set_cols, varyingparam in sets:
    #print("Set",set_name)
    
        #print("Metric:",metric)
        
        condition_to_best_methods = {}
        for alpha,samplesize in set_cols:
            method_to_perfs = dict([(method_name,
                                    dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[alpha][samplesize][method_name][metric])
                                    for method_name in methods_to_consider])
            ustats_mat = get_ustats_mat(
                method_to_perfs=method_to_perfs,
                method_names=methods_to_consider,
                max_ustat=(101*50))
            #print(ustats_mat)
            tied_top_methods, tied_worst_methods =(
                get_tied_top_and_worst_methods(
                    ustats_mat=ustats_mat,
                    method_names=methods_to_consider,
                    #Using the normal approximation at N=100;
                    # variance from https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
                    #Note that T = ((N+1)*N/2 - W)/2
                    threshold=((100*101)/2 - norm.ppf(0.95)*np.sqrt(100*(100+1)*(200+1)/6.0))/2.0
                ))
            if metric_to_largerisbetter[metric]:
                condition_to_best_methods[(alpha,samplesize)] = [methods_to_consider[x] for x in tied_top_methods]
            else:
                condition_to_best_methods[(alpha,samplesize)] = [methods_to_consider[x] for x in tied_worst_methods]
    
        table_rows = []
        for method_name in methods_to_consider:
            table_row = {'adapt_method_name': method_name.split("_")[0]}
            table_rows.append(table_row)
            table_row['proposed_here'] = method_name in our_proposed_methods
            table_row['calib_method_name'] = method_name.split("_")[1].split("-")[1]
            for alpha,samplesize in set_cols:
                
                vals_arr = dirichletalpha_to_samplesize_to_adaptername_to_metric_to_vals[
                                              alpha][samplesize][method_name][metric]
                if (metric=="delta_acc"):
                    vals_arr = 100*np.array(vals_arr)
               
                table_row[(alpha,samplesize)] = {'mean': np.mean(vals_arr),
                                                 'std': (1.0/np.sqrt(len(vals_arr)))*np.std(vals_arr, ddof=1),
                                                 'is_best': (method_name in condition_to_best_methods[(alpha,samplesize)])}


        #method name, calib name, conditions...
        the_str = "\\begin{table*}\n\\adjustbox{max width=\\textwidth}{\\centering\n\\begin{tabular}{ | c | c | c |"+("".join([" c |" for x in set_cols]))+"}\n"
        the_str += "\\hline"
        the_str += ("\multirow{2}{*}{\\begin{tabular}{c}\\textbf{Proposed} \\\\ \\textbf{Here?}\end{tabular}}"
                    +"& \multirow{2}{*}{\\begin{tabular}{c}\\textbf{Shift} \\\\ \\textbf{Estimator}\end{tabular}}"
                    +"& \multirow{2}{*}{\\begin{tabular}{c}\\textbf{Calibration} \\\\ \\textbf{Method}\end{tabular}} "
                    +"& \multicolumn{"+str(len(set_cols))+"}{c|}{"
                    +metric_to_nicename[metric]+set_name+"}\\\\ \\cline{4-"+str(4+len(set_cols)-1)+"}\n")
        the_str += "& & & "+(" & ".join([("$\\bm{\\alpha="+alpha+"}$"
                                          if varyingparam=="alpha" else "$\\bm{n="+n+"}$")
                                         for (alpha,n) in set_cols]))+"\\\\ \\hline\n"
        for idx,table_row in enumerate(table_rows):
            the_str += (("Y" if table_row['proposed_here'] else "N")
                        +" & "+adaptmethod_to_nicename[table_row['adapt_method_name']]
                        +" & "+calibmethod_to_nicename[table_row['calib_method_name']])
            for (alpha,samplesize) in set_cols:
                the_str += " & "+("\\textbf{" if table_row[(alpha,samplesize)]['is_best'] else "")
                the_str += str(np.round(table_row[(alpha,samplesize)]['mean'],4))
                the_str += " $\\pm$ "
                the_str += str(np.round(table_row[(alpha,samplesize)]['std'],4))
                the_str += ("}" if table_row[(alpha,samplesize)]['is_best'] else "")
            
            the_str += "\\\\\n"
            if (idx==2):
                the_str += "\\hline\n"
        the_str += "\\hline \\end{tabular}}\n"
        the_str += ("\\caption{\\textbf{"
            +("Difference from ideal weights" if metric=="weightdiffnorm" else "Improvement in \\%Accuracy")
            +" for CIFAR10 under different "
            +("degrees of dirichlet shift $\\bm{\\alpha}$" if varyingparam=="alpha" else "values of $\\bm{n}$")
            +"}. The value of "
            +("$\\alpha$ was fixed at "+str(alpha) if varyingparam=="n" else "$n$ was fixed at "+str(n))
            +". Table shows mean value of "
            +("$\\bm{|w - \\hat{w}|}$" if metric=="weightdiffnorm" else "$\\Delta$\\%Accuracy")
            +" for each set of 100 experiments along with the standard error."
            +" Bold numbers in a column were significantly better than"
            +" non-bold numbers by a Wilcoxon signed rank test. See main text for more details.}")
        the_str += "\\label{tab:varying"+str(varyingparam)+"_"+str(metric)+"}\n"
        the_str += "\\end{table*}\n"
        print(the_str)

\begin{table*}
\adjustbox{max width=\textwidth}{\centering
\begin{tabular}{ | c | c | c | c | c | c |}
\hline\multirow{2}{*}{\begin{tabular}{c}\textbf{Proposed} \\ \textbf{Here?}\end{tabular}}& \multirow{2}{*}{\begin{tabular}{c}\textbf{Shift} \\ \textbf{Estimator}\end{tabular}}& \multirow{2}{*}{\begin{tabular}{c}\textbf{Calibration} \\ \textbf{Method}\end{tabular}} & \multicolumn{3}{c|}{$\bm{| w - \hat{w} |}$ \textbf{under different} $\bm{\alpha}$ }\\ \cline{4-6}
& & & $\bm{\alpha=0.01}$ & $\bm{\alpha=0.1}$ & $\bm{\alpha=1.0}$\\ \hline
N & BBSE-hard & None & 0.0897 $\pm$ 0.0043 & 0.0673 $\pm$ 0.0026 & 0.0519 $\pm$ 0.0012\\
N & BBSE-soft & None & 0.0809 $\pm$ 0.0052 & 0.0605 $\pm$ 0.0021 & 0.0499 $\pm$ 0.0012\\
N & EM & None & 0.0928 $\pm$ 0.0106 & 0.07 $\pm$ 0.0032 & 0.0515 $\pm$ 0.0018\\
\hline
Y & BBSE-soft & Temp. Scale & 0.0803 $\pm$ 0.0056 & 0.0601 $\pm$ 0.0021 & 0.0498 $\pm$ 0.0012\\
Y & BBSE-soft & B.C. Temp. Scale & 0.0737 $\pm$ 0.0044 & 0.0559 $\pm$ 0.0018 & \textbf{0.0428 $\p