# First neural network using MNIST data

In [1]:
from theano.sandbox import cuda

Using cuDNN version 5110 on context None
Mapped name None to device cuda: GeForce GTX 1060 6GB (0000:23:00.0)


In [2]:
%matplotlib inline 
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential 
from keras.utils.np_utils import to_categorical

np.set_printoptions(precision=4, linewidth=100)

Using Theano backend.


In [3]:
from keras.layers.core import Flatten, Dense, Dropout, Lambda

from keras.optimizers import Adam

from keras.preprocessing import image

from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D

from keras.layers.normalization import BatchNormalization

# Setup 

In [4]:
batch_size = 64

In [5]:
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((60000, 28, 28), (60000,), (10000, 28, 28), (10000,))

In [6]:
# keras expect the images to be in rank2 so we need to add the channel into the image. 
# because we are using theano as backend, image is channelxheightxwidth
x_train = np.expand_dims(x_train,1)
x_test = np.expand_dims(x_test, 1)

In [7]:
# final shape of the images after expansion
x_train.shape, x_test.shape

((60000, 1, 28, 28), (10000, 1, 28, 28))

In [8]:
# the labels are not one hot encoded. 
y_train[:5]

array([5, 0, 4, 1, 9], dtype=uint8)

In [9]:
# one-hot encode the labels
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [10]:
y_train[:5]

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [11]:
# normalizing the input helps with training 
mean_px = x_train.mean().astype(np.float32)
std_px = x_train.std().astype(np.float32)
print('The pixel mean is {0} and the standard deviation is {1}'.format(mean_px, std_px))

The pixel mean is 33.31842041015625 and the standard deviation is 78.56748962402344


In [12]:
def norm_input(x): return (x-mean_px)/std_px

# Linear Model 

In [21]:
# Create a simple linear model which is similar to logistical regression. 
# First input is normalized
# Second flatten to create a single vector with length of 1x28x28
# Third a non-linear activation 
def get_lin_model():
    model = Sequential([
        Lambda(norm_input, input_shape = (1,28,28), output_shape = (1,28,28)),
        Flatten(),
        Dense(10, activation = 'softmax')   
        ])
    model.compile(Adam(), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

In [22]:
lm = get_lin_model()

In [14]:
# Generate image batches 
gen = image.ImageDataGenerator()
batches = gen.flow(x_train, y_train, batch_size = batch_size)
test_batches = gen.flow(x_test, y_test, batch_size = batch_size)

In [34]:
# Train the model
lm.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch=5, 
                 validation_data=test_batches, nb_val_samples = test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb7c3a7b978>

In [36]:
# Reduce the learning rate since it looks like it is close to the minimum 
lm.optimizer.lr=0.001

In [38]:
lm.fit_generator(batches, batches.n, nb_epoch = 3, 
                 validation_data = test_batches, nb_val_samples=test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb7c3a7b080>

There is not much we can do to improve the accuracy. 

# Single Dense Layer

In [17]:
def get_fc_model():
    model = Sequential([
        Lambda(norm_input, input_shape = (1,28,28), output_shape = (1,28,28)),
        Flatten(),
        Dense(512, activation = 'softmax'),
        Dense(10, activation = 'softmax')
    ])
    model.compile(Adam(), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

## Show that you should start with a small learning rate

In [26]:
fc_test = get_fc_model()
fc_test.optimizer.lr = 0.1
fc_test.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 5, 
                     validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f4e03051c50>

As you can see the training after 5 epoch isn't even close to 1 epoch with lr = 0.001. 

In [23]:
fc = get_fc_model()

In [24]:
fc.fit_generator(batches, samples_per_epoch = batches.n, nb_epoch = 2,
                validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4e037b3b70>

In [27]:
fc.optimizer.lr = 0.1

In [29]:
fc.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 3,
                validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4e03057320>

In [30]:
fc.optimizer.lr = 0.01

In [31]:
fc.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 3,
                validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4e03057358>

Looks like it is already overfitting use this nn architecture.

# Basic VGG-Style CNN 

In [16]:
def get_vgg_model():
    model = Sequential([
        Lambda(norm_input, input_shape = (1,28,28), output_shape = (1,28,28)),
        Convolution2D(32,3,3, activation = 'relu'),
        Convolution2D(32,3,3, activation = 'relu'),
        MaxPooling2D(),
        Convolution2D(64,3,3, activation = 'relu'),
        Convolution2D(64,3,3, activation = 'relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(512, activation = 'relu'),
        Dense(10, activation = 'softmax')
    ])
    model.compile(Adam(), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

In [39]:
vgg = get_vgg_model()

In [40]:
vgg.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 1, 
                 validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/1


<keras.callbacks.History at 0x7f4df5533b70>

In [42]:
vgg.optimizer.lr = 0.1

In [44]:
vgg.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 3,
                 validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4df553a400>

Lower learning rate because it is overfitting

In [45]:
vgg.optimizer.lr = 0.01

In [47]:
vgg.fit_generator(generator = batches, samples_per_epoch = batches.n, nb_epoch = 3,
                 validation_data = test_batches, nb_val_samples = test_batches.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f4df553ab38>

# Data Augmentation

In [14]:
gen = image.ImageDataGenerator(rotation_range = 8, width_shift_range = 0.08, shear_range = 0.3,
                              height_shift_range = 0.08, zoom_range = 0.08)

batches_aug = gen.flow(x_train, y_train, batch_size = batch_size)
test_batches_aug = gen.flow(x_test, y_test, batch_size = batch_size)

In [17]:
vgg = get_vgg_model()

In [19]:
vgg.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 1,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/1


<keras.callbacks.History at 0x7f99ae7126a0>

In [20]:
vgg.optimizer.lr = 0.1

In [21]:
vgg.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 3,
                  validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f99ae7127f0>

In [22]:
vgg.optimizer.lr = 0.01
vgg.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 6,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f99ae712828>

In [23]:
vgg.optimizer.lr = 0.001
vgg.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 6,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f99ae712b70>

# Batchnorm + Data Augmentation

In [31]:
def get_vgg_bn():
    model = Sequential([
        Lambda(norm_input, input_shape = (1,28,28), output_shape = (1,28,28)),
        Convolution2D(32,3,3, activation = 'relu'),
        BatchNormalization(axis = 1),
        Convolution2D(32,3,3, activation = 'relu'),
        MaxPooling2D(),
        BatchNormalization(axis = 1),
        Convolution2D(64,3,3, activation = 'relu'),
        BatchNormalization(axis = 1),
        Convolution2D(64,3,3, activation = 'relu'),
        MaxPooling2D(),
        Flatten(),
        BatchNormalization(axis = 1),
        Dense(512, activation = 'relu'),
        BatchNormalization(axis = 1),
        Dense(10, activation = 'softmax')
    ])
    model.compile(Adam(), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

In [32]:
vgg_bn = get_vgg_bn()

In [33]:
vgg_bn.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 1,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/1


<keras.callbacks.History at 0x7f999543bd30>

In [35]:
vgg_bn.optimizer.lr = 0.1
vgg_bn.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 3,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f999541fac8>

In [36]:
vgg_bn.optimizer.lr = 0.01
vgg_bn.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 3,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f999541fcf8>

In [37]:
vgg_bn.optimizer.lr = 0.001
vgg_bn.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 6,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f999541feb8>

# Batchnorm + dropout + data augmentation

In [38]:
def get_vgg_bn_do():
    model = Sequential([
        Lambda(norm_input, input_shape = (1,28,28), output_shape = (1,28,28)),
        Convolution2D(32,3,3, activation = 'relu'),
        BatchNormalization(axis = 1),
        Convolution2D(32,3,3, activation = 'relu'),
        MaxPooling2D(),
        BatchNormalization(axis = 1),
        Convolution2D(64,3,3, activation = 'relu'),
        BatchNormalization(axis = 1),
        Convolution2D(64,3,3, activation = 'relu'),
        MaxPooling2D(),
        Flatten(),
        BatchNormalization(axis = 1),
        Dense(512, activation = 'relu'),
        BatchNormalization(axis = 1),
        Dropout(0.5),
        Dense(10, activation = 'softmax')
    ])
    model.compile(Adam(), loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

In [39]:
vgg_bn_do = get_vgg_bn_do()

In [40]:
vgg_bn_do.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 1,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/1


<keras.callbacks.History at 0x7f9991487c88>

In [41]:
vgg_bn_do.optimizer.lr = 0.1
vgg_bn_do.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 3,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f9991487e10>

In [42]:
vgg_bn_do.optimizer.lr = 0.001
vgg_bn_do.fit_generator(generator = batches_aug, samples_per_epoch = batches_aug.n, nb_epoch = 8,
                 validation_data = test_batches_aug, nb_val_samples = test_batches_aug.n)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f9991487eb8>

# Ensembling 

In [45]:
def fit_model(train_batch):
    model = get_vgg_bn_do()
    model.fit_generator(generator = train_batch, samples_per_epoch = train_batch.n, nb_epoch = 1,
                       verbose = 0)
    
    model.optimizer.lr = 0.1
    model.fit_generator(generator = train_batch, samples_per_epoch = train_batch.n, nb_epoch = 4,
                       verbose = 0)
    
    model.optimizer.lr = 0.1
    model.fit_generator(generator = train_batch, samples_per_epoch = train_batch.n, nb_epoch = 4,
                       verbose = 0)
    
    model.optimizer.lr = 0.01
    model.fit_generator(generator = train_batch, samples_per_epoch = train_batch.n, nb_epoch = 6,
                       verbose = 0)
    
    model.optimizer.lr = 0.001
    model.fit_generator(generator = train_batch, samples_per_epoch = train_batch.n, nb_epoch = 12,
                       verbose = 0)
    
    return model

In [46]:
models = [fit_model(batches_aug) for i in range(6)]

In [47]:
path = 'data/'
model_path = path + 'model/'

In [49]:
for i,m in enumerate(models):
    m.save_weights(model_path+'cnn_mnist-'+str(i)+'.pkl')

In [51]:
evals = np.array([m.evaluate(x_test, y_test, batch_size=256) for m in models])



In [52]:
evals.shape

(6, 2)

In [54]:
evals

array([[ 0.0133,  0.996 ],
       [ 0.0142,  0.9964],
       [ 0.0171,  0.9942],
       [ 0.0133,  0.9956],
       [ 0.0145,  0.9954],
       [ 0.0156,  0.9954]])

In [53]:
evals.mean(axis = 0)

array([ 0.0147,  0.9955])

In [58]:
all_preds = np.stack([m.predict(x_test, batch_size = 256) for m in models])

In [59]:
all_preds.shape

(6, 10000, 10)

In [60]:
avg_preds = all_preds.mean(axis = 0)

In [61]:
keras.metrics.categorical_accuracy(y_test, avg_preds).eval()

array(0.9969000220298767, dtype=float32)