<a href="https://colab.research.google.com/github/kundajelab/label_shift_experiments/blob/master/cifar10/Download_CIFAR10_models_from_zenodo_and_make_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from __future__ import print_function
import keras
from keras.models import load_model
from keras.models import Sequential, Model
print("keras version:", keras.__version__)
import tensorflow as tf
print("tensorflow version:", tf.__version__)
import random
import os
import sys
import numpy as np
from keras.datasets import mnist
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, Activation
from keras import backend as K
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


keras version: 2.2.4
tensorflow version: 1.14.0


In [2]:
# Training hyper-parameters used by the cells below
batch_size = 128
num_classes = 10
epochs = 10

# Height and width of each input image
img_rows, img_cols = 28, 28

# Load MNIST, which arrives already split into train and test sets
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Work out where the backend wants the single channel axis, then reshape
# both image arrays to (num_examples,) + input_shape
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_rows, img_cols)
else:
    input_shape = (img_rows, img_cols, 1)
x_train = x_train.reshape((x_train.shape[0],) + input_shape)
x_test = x_test.reshape((x_test.shape[0],) + input_shape)

# Cast to float32 and divide by 255; the last 10000 rows of the training
# images are used as the validation set
full_x_train = x_train.astype('float32') / 255
x_test = x_test.astype('float32') / 255
x_valid = full_x_train[-10000:]

# One-hot encode the integer class labels; the validation labels are the
# same trailing 10000 rows as x_valid
full_y_train = keras.utils.to_categorical(y_train, num_classes)
y_valid = full_y_train[-10000:]
y_test = keras.utils.to_categorical(y_test, num_classes)

# Report the size of each split
print('x_train shape:', full_x_train.shape)
print(len(full_x_train), 'train samples')
print(len(x_valid), 'valid samples')
print(len(x_test), 'test samples')

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 valid samples
10000 test samples


In [3]:
def write_labels_gz(labels, output_file):
    """Write label rows to `output_file` as tab-separated text, then gzip it.

    Each row of `labels` becomes one line whose values are converted with
    `str` and joined by tabs. Lines are joined by newlines, with no trailing
    newline. The file is then compressed with `gzip -f`, which forces
    overwriting of an existing archive.

    Args:
        labels: iterable of rows, each row an iterable of values.
        output_file: path of the uncompressed text file to write.

    Raises:
        RuntimeError: if the `gzip` command returns a non-zero status.
    """
    # `with` guarantees the handle is closed even if the write raises
    with open(output_file, 'w') as f:
        f.write("\n".join("\t".join(str(x) for x in row) for row in labels))
    # Check the exit status so a failed compression is not silently ignored
    status = os.system("gzip -f "+output_file)
    if status != 0:
        raise RuntimeError(
            "gzip failed with status "+str(status)+" for "+output_file)


# Dump the one-hot labels of every split (test, valid, train)
for labels, output_file in [(y_test, "test_labels.txt"),
                            (y_valid, "valid_labels.txt"),
                            (full_y_train, "train_labels.txt")]:
    write_labels_gz(labels, output_file)

0

In [4]:
from keras import optimizers
model_files = []
for seed in range(0,100,10):
    np.random.seed(seed)
    random.seed(seed)
    for model_idx,train_set_size in enumerate([30000]):
        model_file = "model_mnist_set-"+str(train_set_size)+"_seed-"+str(seed)+".h5"
        model_files.append(model_file)
        print("On train set size",train_set_size)

        model = Sequential()
        model.add(Flatten(input_shape=input_shape))
        model.add(Dense(256, activation='relu'))
        model.add(Dense(num_classes))
        model.add(Activation("softmax"))

        optimizer = optimizers.SGD(lr=0.01, momentum=0.5, decay=5e-4)
        model.compile(loss=keras.losses.categorical_crossentropy,
                      optimizer=optimizer,
                      metrics=['accuracy'])
        x_train = full_x_train[:train_set_size] 
        y_train = full_y_train[:train_set_size]
        print("Mean y train:",np.mean(y_train, axis=0))
        print("Mean y valid:",np.mean(y_valid, axis=0))
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=1,
                  validation_data=(x_valid, y_valid),
                  callbacks=[EarlyStopping(
                    monitor='val_loss', patience=10,
                    restore_best_weights=True)])
        model.save(model_file)

        pre_softmax_model = Model(input=model.input,
                            output=model.layers[-2].output)
        print("Making predictions on validation set")
        valid_preacts = pre_softmax_model.predict(x_valid)
        print("Making predictions on test set")
        test_preacts = pre_softmax_model.predict(x_test)
        print('Test accuracy:', np.mean(np.argmax(test_preacts,axis=-1)
                                        ==np.argmax(y_test,axis=-1)))
        print('Valid accuracy:', np.mean(np.argmax(valid_preacts,axis=-1)
                                        ==np.argmax(y_valid,axis=-1)))
        sys.stdout.flush()
        test_predictions_file = ("testpreacts_"+model_file.split(".")[0])+".txt"
        print("Saving", test_predictions_file)
        f = open(test_predictions_file,'w')
        for test_preact in test_preacts:
            f.write("\t".join([str(x) for x in test_preact])+"\n") 
        f.close()
        !md5sum $test_predictions_file
        !gzip $test_predictions_file

        valid_predictions_file = ("validpreacts_"+model_file.split(".")[0])+".txt"
        print("Saving", valid_predictions_file)
        f = open(valid_predictions_file,'w')
        for valid_preact in valid_preacts:
            f.write("\t".join([str(x) for x in valid_preact])+"\n") 
        f.close()
        !md5sum $valid_predictions_file
        !gzip $valid_predictions_file

On train set size 30000





Mean y train: [0.0987     0.1141     0.09826667 0.10243333 0.09753333 0.0903
 0.09916667 0.10356667 0.09583333 0.1001    ]
Mean y valid: [0.0991 0.1064 0.099  0.103  0.0983 0.0915 0.0967 0.109  0.1009 0.0961]
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Train on 30000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Making predictions on validation set




Making predictions on test set
Test accuracy: 0.9178
Valid accuracy: 0.9194
Saving testpreacts_model_mnist_set-30000_seed-0.txt
f0f52eca81d56c315628f9effb3dfbb0  testpreacts_model_mnist_set-30000_seed-0.txt
Saving validpreacts_model_mnist_set-30000_seed-0.txt
f8b8c16e39c2dc3dc98f917d9828de7f  validpreacts_model_mnist_set-30000_seed-0.txt
On train set size 30000
Mean y train: [0.0987     0.1141     0.09826667 0.10243333 0.09753333 0.0903
 0.09916667 0.10356667 0.09583333 0.1001    ]
Mean y valid: [0.0991 0.1064 0.099  0.103  0.0983 0.0915 0.0967 0.109  0.1009 0.0961]
Train on 30000 samples, validate on 10000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Making predictions on validation set
Making predictions on test set
Test accuracy: 0.9183
Valid accuracy: 0.921
Saving testpreacts_model_mnist_set-30000_seed-10.txt
260be080ee3540ded634d773be845134  testpreacts_model_mnist_set-30000_seed-10.txt
Saving validpreacts_m