In [1]:
import os

# Restrict the GPUs this process may use. This is done before Keras (and
# therefore its TensorFlow backend) is imported, so the mask is already in the
# environment no matter when the backend first initialises CUDA.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"

import numpy as np
import h5py
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Convolution1D, GlobalMaxPooling1D

Using TensorFlow backend.


In [2]:
# Hyper-parameters for the IMDB sentiment CNN.
# -- data
max_features = 5000    # forwarded to imdb.load_data as nb_words
maxlen = 400           # length every review is padded to further down
# -- network
embedding_dims = 50
nb_filter = 250
filter_length = 3
hidden_dims = 250
# -- training
batch_size = 32
nb_epoch = 2

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(
    nb_words=max_features)
# Two-argument print is kept as-is so the recorded output format is unchanged.
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

Loading data...
(25000, 'train sequences')
(25000, 'test sequences')


In [3]:
print('Pad sequences (samples x time)')
# Pad both splits with the same `maxlen`. A generator is used (rather than a
# list comprehension) so no loop variable is left behind in the notebook
# namespace.
X_train, X_test = (sequence.pad_sequences(split, maxlen=maxlen)
                   for split in (X_train, X_test))

Pad sequences (samples x time)


In [4]:
# Hold out everything after the first `num_train_examples` training rows as a
# validation set.
#
# The split overwrites X_train / y_train, so it is guarded on the current
# length: re-running this cell after the split has already happened leaves the
# existing X_valid / y_valid untouched instead of replacing them with empty
# slices of the already-truncated training arrays.
num_train_examples = 20000
if len(X_train) > num_train_examples:
    X_valid = X_train[num_train_examples:]
    y_valid = y_train[num_train_examples:]
    X_train = X_train[:num_train_examples]
    y_train = y_train[:num_train_examples]
print('X_train shape:', X_train.shape)
print('X_valid shape:', X_valid.shape)
print('X_test shape:', X_test.shape)

('X_train shape:', (20000, 400))
('X_valid shape:', (5000, 400))
('X_test shape:', (25000, 400))


In [5]:
# Building the model that ships with Keras

print('Build model...')
# The layer stack is listed in the order it is applied.
model = Sequential([
    # Token ids -> `embedding_dims`-sized vectors, with dropout on the embedding.
    Embedding(max_features, embedding_dims,
              input_length=maxlen, dropout=0.2),
    # 1D convolution over the sequence, followed by a max over time.
    Convolution1D(nb_filter=nb_filter, filter_length=filter_length,
                  border_mode='valid', activation='relu',
                  subsample_length=1),
    GlobalMaxPooling1D(),
    # Hidden layer: dense -> dropout -> relu.
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
    # Single sigmoid output unit.
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# Left as the last bare expression so the returned History object is still
# shown as the cell's output.
model.fit(X_train, y_train,
          validation_data=(X_valid, y_valid),
          batch_size=batch_size,
          nb_epoch=nb_epoch)

Build model...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f58abace950>

In [6]:
import abstention as ab
reload(ab.util)

num_dropout_runs = 100
# NOTE: this rebinds the notebook-level `batch_size` that the training cell
# also reads (it was 32 there).
batch_size = 50
task_idx = 0

preact_func = ab.util.get_preact_func(model=model, task_idx=task_idx)

# Arguments shared by both obtain_raw_data calls; only `data` differs.
raw_data_kwargs = dict(preact_func=preact_func,
                       num_dropout_runs=num_dropout_runs,
                       batch_size=batch_size)

print("On validation set")
valid_preacts, valid_dropout_preacts = ab.util.obtain_raw_data(
    data=X_valid, **raw_data_kwargs)

print("On testing set")
test_preacts, test_dropout_preacts = ab.util.obtain_raw_data(
    data=X_test, **raw_data_kwargs)

On validation set
Computing deterministic activations
Computing nondeterministic activations
Done 10 runs
Done 20 runs
Done 30 runs
Done 40 runs
Done 50 runs
Done 60 runs
Done 70 runs
Done 80 runs
Done 90 runs
Done 100 runs
On testing set
Computing deterministic activations
Computing nondeterministic activations
Done 10 runs
Done 20 runs
Done 30 runs
Done 40 runs
Done 50 runs
Done 60 runs
Done 70 runs
Done 80 runs
Done 90 runs
Done 100 runs


In [7]:
reload(ab)
reload(ab.util)
reload(ab.calibration)
from collections import OrderedDict

# Calibration methods, keyed by the name later cells use to look up the
# resulting posteriors. Insertion order is kept by the OrderedDict.
cb_method_name_to_factory = OrderedDict()
cb_method_name_to_factory["uncalibrated_posterior"] = ab.calibration.Expit()
cb_method_name_to_factory["platt_scaled_posterior"] = ab.calibration.PlattScaling()

posterior_and_uncert_estimates = ab.util.obtain_posterior_probs_and_uncert_estimates(
    cb_method_name_to_factory=cb_method_name_to_factory,
    valid_labels=y_valid,
    valid_preacts=valid_preacts,
    valid_dropout_preacts=valid_dropout_preacts,
    test_preacts=test_preacts,
    test_dropout_preacts=test_dropout_preacts)

# Four results, in this order: valid/test posteriors, then valid/test
# uncertainty estimates.
(cb_method_name_to_valid_posterior_prob,
 cb_method_name_to_test_posterior_prob,
 transform_name_to_valid_uncert,
 transform_name_to_test_uncert) = posterior_and_uncert_estimates

Platt scaling coef: 1.2654751364264774 ; intercept: 0.3391775087713138


In [8]:
%matplotlib inline
reload(ab)
reload(ab.util)
reload(ab.calibration)
reload(ab.abstention)
from abstention.abstention import AuPrcAbstentionEval, AuRocAbstentionEval
from abstention.abstention import (FixedThreshold, OptimalF1, RandomAbstention,
                                   NegPosteriorDistanceFromThreshold, Uncertainty,
                                   ConvexHybrid, MarginalDeltaAuRoc, MarginalDeltaAuPrc)
from collections import OrderedDict
from collections import namedtuple

AbstentionFuncInfo = namedtuple('AbstentionFuncInfo',
                                ('method_name','factory', 'posterior', 'uncert'))

abstention_fraction = 0.95
evaluation_functions = OrderedDict([('auPRC',AuPrcAbstentionEval(abstention_fraction)),
                                    ('auROC',AuRocAbstentionEval(abstention_fraction))])

num_positives = np.sum(y_train)+np.sum(y_valid)
imbalance = (len(y_train)+len(y_valid)-num_positives)/float(num_positives)

abstention_func_infos = [
         AbstentionFuncInfo(method_name='random', factory=RandomAbstention(),
                            posterior='platt_scaled_posterior', uncert=None),
         AbstentionFuncInfo(method_name='calibrated_marginal_auroc',
                            factory=MarginalDeltaAuRoc(),
                            posterior='platt_scaled_posterior', uncert=None),
         AbstentionFuncInfo(method_name='calibrated_marginal_auprc',
                            factory=MarginalDeltaAuPrc(),
                            posterior='platt_scaled_posterior', uncert=None),
         AbstentionFuncInfo(method_name='uncalibrated_prob_distance_point_five',
                            factory=NegPosteriorDistanceFromThreshold(FixedThreshold(0.5)),
                            posterior='uncalibrated_posterior', uncert=None),
         AbstentionFuncInfo(method_name='calibrated_prob_distance_point_five',
                            factory=NegPosteriorDistanceFromThreshold(FixedThreshold(0.5)),
                            posterior='platt_scaled_posterior', uncert=None),
         AbstentionFuncInfo(method_name='preactivation_uncertainty',
                            factory=Uncertainty(),
                            posterior='uncalibrated_posterior', uncert='preactivation'),
         AbstentionFuncInfo(method_name='uncalibrated_posterior_uncertainty',
                            factory=Uncertainty(),
                            posterior='uncalibrated_posterior', uncert='uncalibrated_posterior'),
         AbstentionFuncInfo(method_name='calibrated_posterior_uncertainty',
                            factory=Uncertainty(),
                            posterior='uncalibrated_posterior', uncert='platt_scaled_posterior')]

metric_to_method_name_to_test_perfs = OrderedDict()
metric_to_method_name_to_valid_perfs = OrderedDict()
for metric_name in evaluation_functions:
    method_name_to_test_perfs = OrderedDict([
            (abstention_func.method_name, []) for abstention_func in abstention_func_infos])
    method_name_to_valid_perfs = OrderedDict([
            (abstention_func.method_name, []) for abstention_func in abstention_func_infos])
    metric_to_method_name_to_test_perfs[metric_name] = method_name_to_test_perfs
    metric_to_method_name_to_valid_perfs[metric_name] = method_name_to_valid_perfs

In [9]:
# For every configured abstention method: build its scoring function from the
# validation data, score both splits, then record each evaluation metric.
for abstention_func_info in abstention_func_infos:
    method_name = abstention_func_info.method_name
    factory = abstention_func_info.factory
    posterior_name = abstention_func_info.posterior
    uncert_name = abstention_func_info.uncert

    valid_posterior = cb_method_name_to_valid_posterior_prob[posterior_name]
    test_posterior = cb_method_name_to_test_posterior_prob[posterior_name]
    # Methods configured without an uncertainty source get None for both splits.
    if uncert_name:
        valid_uncert = transform_name_to_valid_uncert[uncert_name]
        test_uncert = transform_name_to_test_uncert[uncert_name]
    else:
        valid_uncert = None
        test_uncert = None

    abstention_func = factory(valid_labels=y_valid,
                              valid_posterior=valid_posterior,
                              valid_uncert=valid_uncert)
    test_abstention_scores = abstention_func(
        posterior_probs=test_posterior, uncertainties=test_uncert)
    valid_abstention_scores = abstention_func(
        posterior_probs=valid_posterior, uncertainties=valid_uncert)

    for evaluation_func_name, evaluation_func in evaluation_functions.items():
        # Validation is evaluated before test, as in the recorded run.
        valid_perf = evaluation_func(
            abstention_scores=valid_abstention_scores,
            y_true=y_valid, y_score=valid_posterior)
        test_perf = evaluation_func(
            abstention_scores=test_abstention_scores,
            y_true=y_test, y_score=test_posterior)

        test_perfs = metric_to_method_name_to_test_perfs[evaluation_func_name]
        valid_perfs = metric_to_method_name_to_valid_perfs[evaluation_func_name]
        test_perfs[method_name].append(test_perf)
        valid_perfs[method_name].append(valid_perf)

valid est metric 0.9555875711449394
data est metric 0.954362194651254
Difference is: -0.0012253764936853706
valid est metric 0.9555875711449394
data est metric 0.9557433075977577
Difference is: 0.00015573645281830029
valid est metric 0.9546182070727567
data est metric 0.9534657536033662
Difference is: -0.0011524534693905242
valid est metric 0.9546182070727567
data est metric 0.9545317135687087
Difference is: -8.649350404799083e-05


In [10]:
from collections import defaultdict


def _rank_methods_by_first_perf(method_name_to_perfs):
    """Rank methods by their first recorded performance, best first.

    Returns a pair ``(ranked_names, method_to_rank)``: the method names in
    descending order of ``perfs[0]``, and a defaultdict mapping each name to
    its 0-based position in that order.
    """
    ranked_names = sorted(
        method_name_to_perfs.keys(),
        key=lambda method: -method_name_to_perfs[method][0])
    method_to_rank = defaultdict(int)
    for rank, method in enumerate(ranked_names):
        method_to_rank[method] += rank
    return ranked_names, method_to_rank


def _format_rank_report(separator, method_to_rank, method_name_to_perfs):
    """Build one line per method, ordered by rank.

    Each line is the ``(name, rank)`` tuple, then ``separator``, then the
    mean of that method's recorded performances.
    """
    entries_by_rank = sorted(method_to_rank.items(),
                             key=lambda entry: entry[1])
    return "\n".join(
        str(entry) + separator + str(np.mean(method_name_to_perfs[entry[0]]))
        for entry in entries_by_rank)


print("\nBest auROC methods - test")
metric_name_ranks, metric_to_auroc_test_score = _rank_methods_by_first_perf(
    metric_to_method_name_to_test_perfs['auROC'])
print("AuROC ranks:")
print(_format_rank_report(", average auROC = ",
                          metric_to_auroc_test_score,
                          metric_to_method_name_to_test_perfs['auROC']))

print("\nBest auPRC methods - test")
metric_name_ranks, metric_to_auprc_test_score = _rank_methods_by_first_perf(
    metric_to_method_name_to_test_perfs['auPRC'])
print("AuPRC ranks:")
print(_format_rank_report(", average auPRC = ",
                          metric_to_auprc_test_score,
                          metric_to_method_name_to_test_perfs['auPRC']))


Best auROC methods - test
AuROC ranks:
('calibrated_marginal_auroc', 0), average auROC = 0.9603504608822637
('calibrated_prob_distance_point_five', 1), average auROC = 0.9603498226551124
('calibrated_marginal_auprc', 2), average auROC = 0.960321199894073
('calibrated_posterior_uncertainty', 3), average auROC = 0.9602138606446952
('uncalibrated_prob_distance_point_five', 4), average auROC = 0.9601101175652056
('uncalibrated_posterior_uncertainty', 5), average auROC = 0.9599061214308432
('preactivation_uncertainty', 6), average auROC = 0.9567924548291684
('random', 7), average auROC = 0.9540346855466172

Best auPRC methods - test
AuPRC ranks:
('calibrated_marginal_auprc', 0), average auPRC = 0.9574095902200023
('calibrated_marginal_auroc', 1), average auPRC = 0.9573449188548278
('calibrated_prob_distance_point_five', 2), average auPRC = 0.957339472109674
('calibrated_posterior_uncertainty', 3), average auPRC = 0.957237552425522
('uncalibrated_prob_distance_point_five', 4), average auPRC