#### Import necessary modules for training a model

In [1]:
%matplotlib inline
from __future__ import division, print_function
import h5py
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import model_from_json
import os
from collections import OrderedDict
import numpy as np
# Seed NumPy's global random state. This statement runs after the Keras
# imports above but before any model is constructed in this notebook.
np.random.seed(0) # for reproducibility

# Restrict the GPUs visible to CUDA to the listed device indices (index 4 is
# deliberately absent from the list). The value is specific to the machine the
# notebook was run on. NOTE(review): this is assigned after Keras has already
# been imported -- confirm the backend has not initialised its devices yet at
# this point, otherwise move this line above the Keras imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3,5,6"

Using TensorFlow backend.


#### Load and preprocess the data

In [2]:
# Hyper-parameters used by the data-loading and model-building cells.
max_features = 5000    # vocabulary cap passed to imdb.load_data; also the Embedding input size
maxlen = 400           # target sequence length handed to pad_sequences
batch_size = 32        # mini-batch size used by model.fit
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

print('Loading data...')
# `num_words` is the Keras 2 name of this argument; the Keras 1 spelling
# `nb_words` is deprecated, and the rest of this notebook already uses the
# Keras 2 API (Conv1D, filters, kernel_size, epochs).
(X_set1, y_set1), (X_set2, y_set2) = imdb.load_data(num_words=max_features)
print(len(X_set1)+len(X_set2), ' sequences')

print('Pad sequences (samples x time)')
# Bring every review to exactly `maxlen` entries so each partition becomes a
# rectangular (samples, maxlen) array.
X_set1 = sequence.pad_sequences(X_set1, maxlen=maxlen)
X_set2 = sequence.pad_sequences(X_set2, maxlen=maxlen)

Loading data...




50000  sequences
Pad sequences (samples x time)


#### Split the data into training, validation and test sets (the validation set also borrows 5000 examples from the original test partition)

In [3]:
# Carve the two loaded partitions into train / validation / test subsets:
#   train = first 20000 rows of set 1
#   valid = remaining rows of set 1 followed by the first 5000 rows of set 2
#   test  = remaining rows of set 2
n_train = 20000
n_valid_from_set2 = 5000

X_train, y_train = X_set1[:n_train], y_set1[:n_train]
X_valid = np.concatenate((X_set1[n_train:], X_set2[:n_valid_from_set2]), axis=0)
y_valid = np.concatenate((y_set1[n_train:], y_set2[:n_valid_from_set2]), axis=0)
X_test, y_test = X_set2[n_valid_from_set2:], y_set2[n_valid_from_set2:]

# Report the shape of each feature matrix as a quick sanity check.
for caption, split_inputs in (('X_train shape:', X_train),
                              ('X_valid shape:', X_valid),
                              ('X_test shape:', X_test)):
    print(caption, split_inputs.shape)

X_train shape: (20000, 400)
X_valid shape: (10000, 400)
X_test shape: (20000, 400)


#### Train a model (the architecture ships with Keras)

In [4]:
# Build, train and save the 1D-convolutional sentiment classifier.
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

print('Build model...')
# The layers are handed to the Sequential constructor in the order in which
# they are stacked.
model = Sequential([
    # Map vocabulary indices to dense vectors of size `embedding_dims`.
    Embedding(max_features, embedding_dims, input_length=maxlen),
    Dropout(0.2),
    # Learn `filters` word-group filters, each spanning `kernel_size` positions.
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    # Keep the maximum response of every filter over the sequence axis.
    GlobalMaxPooling1D(),
    # Plain hidden layer; dropout is applied before the ReLU here.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
    # Single output unit squashed by a sigmoid.
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train on the training split, reporting metrics on the validation split after
# each epoch, then write the trained model to disk.
model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=epochs,
    batch_size=batch_size,
)
model.save("imdb_model_k2.h5")

Build model...
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Train on 20000 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


#### Compute model outputs (with and without test-time dropout) on the validation and test sets

In [5]:
import abstention as ab

# Settings for collecting raw model outputs.
num_dropout_runs = 100  # number of passes made with test-time dropout enabled
# NOTE: this rebinds the notebook-level `batch_size` that the parameter cell set
# to 32; re-running the training cell after this one would train with 50.
batch_size = 50
task_idx = 0  # index of the output task the abstention metrics are evaluated for

# Function mapping an input batch to the model output *before* the final
# sigmoid nonlinearity.
preact_func = ab.util.get_preact_func(model=model, task_idx=task_idx)


def _collect_preacts(data):
    """Return (deterministic preacts, test-time-dropout preacts) for `data`."""
    return ab.util.obtain_raw_data(
        preact_func=preact_func,
        data=data,
        num_dropout_runs=num_dropout_runs,
        batch_size=batch_size)


# Pre-activation outputs on the validation set, without and with test-time dropout.
print("On validation set")
valid_preacts, valid_dropout_preacts = _collect_preacts(X_valid)

# Same for the testing set.
print("On testing set")
test_preacts, test_dropout_preacts = _collect_preacts(X_test)

On validation set
Computing deterministic activations
Computing nondeterministic activations
Done 10 runs
Done 20 runs
Done 30 runs
Done 40 runs
Done 50 runs
Done 60 runs
Done 70 runs
Done 80 runs
Done 90 runs
Done 100 runs
On testing set
Computing deterministic activations
Computing nondeterministic activations
Done 10 runs
Done 20 runs
Done 30 runs
Done 40 runs
Done 50 runs
Done 60 runs
Done 70 runs
Done 80 runs
Done 90 runs
Done 100 runs


#### Put model outputs through calibration

In [6]:
# Calibration methods to apply, keyed by the name under which their posterior
# probabilities are stored. Insertion order is preserved.
cb_method_name_to_factory = OrderedDict()
# Expit is just the sigmoid, i.e. no calibration.
cb_method_name_to_factory["uncalibrated_posterior"] = ab.calibration.Expit()
cb_method_name_to_factory["platt_calibrated_posterior"] = ab.calibration.PlattScaling()

# Run every calibration method on the validation and test pre-activations; the
# validation labels are supplied alongside the validation pre-activations.
_calibration_results = ab.util.obtain_posterior_probs_and_uncert_estimates(
    cb_method_name_to_factory=cb_method_name_to_factory,
    valid_labels=y_valid,
    valid_preacts=valid_preacts,
    valid_dropout_preacts=valid_dropout_preacts,
    test_preacts=test_preacts,
    test_dropout_preacts=test_dropout_preacts,
)

# Unpack the four results: posterior probabilities and uncertainty estimates,
# each for the validation and the test set.
(cb_method_name_to_valid_posterior_prob,
 cb_method_name_to_test_posterior_prob,
 transform_name_to_valid_uncert,
 transform_name_to_test_uncert) = _calibration_results

Platt scaling coef: 1.010432843598933 ; intercept: -0.6110752598492817


`cb_method_name_to_valid_posterior_prob` and `cb_method_name_to_test_posterior_prob` have keys of:
- `uncalibrated_posterior`, which has the uncalibrated (i.e. the original) probabilities
- `platt_calibrated_posterior` which has the calibrated probabilities using platt scaling

`transform_name_to_valid_uncert` and `transform_name_to_test_uncert` have keys of:
- `preactivation`, which has the population standard deviation computed on the output before the final nonlinearity
- `uncalibrated_posterior`, which has the population standard deviation computed on the output after the final nonlinearity, but without any calibration applied
- `platt_calibrated_posterior`, which has the population standard deviation computed on the output after the final nonlinearity and after calibration is applied using Platt scaling

In [7]:
from matplotlib import pyplot as plt
import sklearn.calibration

# Reliability curve of the Platt-calibrated posteriors on the validation set,
# using 10 bins. calibration_curve returns the fraction of positives and the
# mean predicted value per bin, in that order.
# `normalize=False` used to be passed explicitly here; it is the default, and
# the parameter has since been deprecated and removed from scikit-learn, so
# omitting it gives the same result on old versions and keeps the call valid on
# new ones.
platt_frac_pos, platt_mean_pred_val = sklearn.calibration.calibration_curve(
        y_true=y_valid,
        y_prob=cb_method_name_to_valid_posterior_prob['platt_calibrated_posterior'],
        n_bins=10)

In [8]:
import abstention
# `reload` is a builtin only on Python 2; on Python 3 it lives in importlib.
# Resolve it explicitly so this cell does not raise NameError on Python 3.
try:
    reload
except NameError:
    from importlib import reload
# Re-execute the abstention module so that edits made to it on disk are picked
# up without restarting the kernel.
reload(abstention.abstention)
from abstention.abstention import AuPrcAbstentionEval, AuRocAbstentionEval
from abstention.abstention import (FixedThreshold, RandomAbstention,
                                   NegPosteriorDistanceFromThreshold,
                                   Uncertainty,
                                   MarginalDeltaAuRoc, MarginalDeltaAuPrc,
                                   RecursiveMarginalDeltaAuRoc,
                                   RecursiveMarginalDeltaAuPrc)
from collections import namedtuple

# Describes one abstention method to evaluate:
#   method_name - label under which its scores are stored and reported
#   factory     - object called with the validation data to produce the
#                 abstention-scoring function
#   posterior   - key into the cb_method_name_to_*_posterior_prob dicts
#   uncert      - key into the transform_name_to_*_uncert dicts, or None when
#                 the method does not use an uncertainty estimate
AbstentionFuncInfo = namedtuple('AbstentionFuncInfo',
                                ('method_name','factory', 'posterior', 'uncert'))

# Value handed to both evaluation objects.
# NOTE(review): presumably the fraction of examples kept after abstaining --
# confirm against the abstention package.
proportion_to_retain = 0.95
evaluation_functions = OrderedDict([('auPRC',AuPrcAbstentionEval(proportion_to_retain)),
                                    ('auROC',AuRocAbstentionEval(proportion_to_retain))])

# Label statistics over train + validation. With 0/1 labels `imbalance` is the
# ratio of negatives to positives. Neither name is referenced again in the
# cells visible here.
num_positives = np.sum(y_train)+np.sum(y_valid)
imbalance = (len(y_train)+len(y_valid)-num_positives)/float(num_positives)

abstention_func_infos = [
         AbstentionFuncInfo(method_name='random', factory=RandomAbstention(),
                            posterior='platt_calibrated_posterior', uncert=None),
         AbstentionFuncInfo(method_name='calibrated_marginal_auroc',
                            factory=MarginalDeltaAuRoc(),
                            posterior='platt_calibrated_posterior', uncert=None),
         AbstentionFuncInfo(method_name='calibrated_marginal_auprc',
                            factory=MarginalDeltaAuPrc(),
                            posterior='platt_calibrated_posterior', uncert=None),
         AbstentionFuncInfo(method_name='uncalibrated_prob_distance_point_five',
                            factory=NegPosteriorDistanceFromThreshold(FixedThreshold(0.5)),
                            posterior='uncalibrated_posterior', uncert=None),
         AbstentionFuncInfo(method_name='calibrated_prob_distance_point_five',
                            factory=NegPosteriorDistanceFromThreshold(FixedThreshold(0.5)),
                            posterior='platt_calibrated_posterior', uncert=None),
         AbstentionFuncInfo(method_name='preactivation_uncertainty',
                            factory=Uncertainty(),
                            posterior='uncalibrated_posterior', uncert='preactivation'),
         AbstentionFuncInfo(method_name='uncalibrated_posterior_uncertainty',
                            factory=Uncertainty(),
                            posterior='uncalibrated_posterior', uncert='uncalibrated_posterior'),
         # NOTE(review): this entry pairs the *uncalibrated* posterior with the
         # Platt-calibrated uncertainty although its name says "calibrated" --
         # confirm this pairing is intended.
         AbstentionFuncInfo(method_name='calibrated_posterior_uncertainty',
                            factory=Uncertainty(),
                            posterior='uncalibrated_posterior', uncert='platt_calibrated_posterior')]

In [9]:
# Result tables: metric name -> method name -> list of scores, kept separately
# for the test and the validation set. Every method starts with an empty list.
metric_to_method_name_to_test_perfs = OrderedDict()
metric_to_method_name_to_valid_perfs = OrderedDict()
for metric_name in evaluation_functions:
    method_name_to_test_perfs = OrderedDict(
        (info.method_name, []) for info in abstention_func_infos)
    method_name_to_valid_perfs = OrderedDict(
        (info.method_name, []) for info in abstention_func_infos)
    metric_to_method_name_to_test_perfs[metric_name] = method_name_to_test_perfs
    metric_to_method_name_to_valid_perfs[metric_name] = method_name_to_valid_perfs

for abstention_func_info in abstention_func_infos:
    method_name = abstention_func_info.method_name
    print("\nCalling method", method_name)
    factory = abstention_func_info.factory
    posterior_name = abstention_func_info.posterior
    uncert_name = abstention_func_info.uncert

    # Look up the posteriors (always) and the uncertainty estimates (only for
    # methods that name one) for both data sets.
    valid_posterior = cb_method_name_to_valid_posterior_prob[posterior_name]
    test_posterior = cb_method_name_to_test_posterior_prob[posterior_name]
    if uncert_name:
        valid_uncert = transform_name_to_valid_uncert[uncert_name]
        test_uncert = transform_name_to_test_uncert[uncert_name]
    else:
        valid_uncert = None
        test_uncert = None

    # Build the scoring function from the validation data, then score the test
    # set first and the validation set second.
    abstention_func = factory(
        valid_labels=y_valid,
        valid_posterior=valid_posterior,
        valid_uncert=valid_uncert)
    test_abstention_scores = abstention_func(
        posterior_probs=test_posterior, uncertainties=test_uncert)
    valid_abstention_scores = abstention_func(
        posterior_probs=valid_posterior, uncertainties=valid_uncert)

    # Evaluate every metric on both sets and record the scores for this method.
    for evaluation_func_name, evaluation_func in evaluation_functions.items():
        valid_perf = evaluation_func(
            abstention_scores=valid_abstention_scores,
            y_true=y_valid,
            y_score=valid_posterior)
        test_perf = evaluation_func(
            abstention_scores=test_abstention_scores,
            y_true=y_test,
            y_score=test_posterior)
        test_table = metric_to_method_name_to_test_perfs[evaluation_func_name]
        test_table[method_name].append(test_perf)
        valid_table = metric_to_method_name_to_valid_perfs[evaluation_func_name]
        valid_table[method_name].append(valid_perf)


Calling method random

Calling method calibrated_marginal_auroc
valid est metric 0.9577813763154428
data est metric 0.9559907651869655
valid est metric 0.9577813763154428
data est metric 0.9574339675295698

Calling method calibrated_marginal_auprc
valid est metric 0.9564477426081814
data est metric 0.9535672687991844
valid est metric 0.9564477426081814
data est metric 0.9542574597727642

Calling method uncalibrated_prob_distance_point_five

Calling method calibrated_prob_distance_point_five

Calling method preactivation_uncertainty

Calling method uncalibrated_posterior_uncertainty

Calling method calibrated_posterior_uncertainty


In [10]:
from collections import defaultdict


def _rank_methods_on_test(metric_name):
    """Rank the abstention methods by their first recorded test score.

    Returns a pair (method names ordered from best to worst, mapping from
    method name to its zero-based rank) for the given metric.
    """
    perfs_by_method = metric_to_method_name_to_test_perfs[metric_name]
    ordered_names = sorted(perfs_by_method.keys(),
                           key=lambda name: -perfs_by_method[name][0])
    rank_by_method = defaultdict(int)
    for position, name in enumerate(ordered_names):
        rank_by_method[name] += position
    return ordered_names, rank_by_method


def _format_rank_report(rank_by_method, metric_name, separator):
    """Build one report line per method, ordered by ascending rank.

    Each line is the (method name, rank) pair, followed by `separator` and the
    mean of that method's recorded test scores for `metric_name`.
    """
    perfs_by_method = metric_to_method_name_to_test_perfs[metric_name]
    report_lines = []
    for entry in sorted(rank_by_method.items(), key=lambda pair: pair[1]):
        mean_perf = np.mean(perfs_by_method[entry[0]])
        report_lines.append(str(entry) + separator + str(mean_perf))
    return "\n".join(report_lines)


print("\nBest auROC methods - test")
metric_name_ranks, metric_to_auroc_test_score = _rank_methods_on_test('auROC')
print("AuROC ranks:")
print(_format_rank_report(metric_to_auroc_test_score, 'auROC', ", average auROC = "))

print("\nBest auPRC methods - test")
metric_name_ranks, metric_to_auprc_test_score = _rank_methods_on_test('auPRC')
print("AuPRC ranks:")
print(_format_rank_report(metric_to_auprc_test_score, 'auPRC', ", average auPRC = "))


Best auROC methods - test
AuROC ranks:
('calibrated_marginal_auroc', 0), average auROC = 0.9594569584487534
('calibrated_prob_distance_point_five', 1), average auROC = 0.9594419735971772
('calibrated_marginal_auprc', 2), average auROC = 0.9593564954958288
('calibrated_posterior_uncertainty', 3), average auROC = 0.959317798185798
('uncalibrated_posterior_uncertainty', 4), average auROC = 0.9585043518572365
('uncalibrated_prob_distance_point_five', 5), average auROC = 0.9585003604979381
('preactivation_uncertainty', 6), average auROC = 0.9550453860182624
('random', 7), average auROC = 0.9522179216047506

Best auPRC methods - test
AuPRC ranks:
('calibrated_marginal_auprc', 0), average auPRC = 0.9569414386015427
('calibrated_prob_distance_point_five', 1), average auPRC = 0.956921591776704
('calibrated_marginal_auroc', 2), average auPRC = 0.9568865103881449
('calibrated_posterior_uncertainty', 3), average auPRC = 0.9568526143158497
('uncalibrated_prob_distance_point_five', 4), average auPR