Hyperparameter optimization is done in 2 groups of 3 parameters each, with 3 values evaluated for each parameter. Each group therefore has 3 × 3 × 3 = 27 combinations, and with 2 cross-validation folds this requires 54 training runs per group. The first group optimizes the parameters of the convolutional layers and the second group optimizes the remaining model parameters (optimizer, learning rate and dropout rate). To reduce search time, the procedure uses only 2 cross-validation folds and 3 epochs per run, which should be sufficient to determine which hyperparameter combination is optimal. The accuracy achieved after the optimization is above 0.99.

In [1]:
'''Trains a simple convnet on the MNIST dataset for ONLY digits 2 and 7.
Hyperparameters are optimized in 2 groups of 3 parameters.
The optimization leads to accuracy above 0.99
'''

from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
from keras.wrappers.scikit_learn import KerasClassifier
from keras import optimizers
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search budget shared by the grid searches below; the values are kept low
# on purpose to reduce search time.
folds = 2   # number of cross-validation folds passed to GridSearchCV
epochs = 3  # training epochs for each candidate model during the search

def create_model(input_shape,
                 optimizer='Adam',
                 learn_rate=0.01,
                 init_mode='uniform',
                 activation='relu',
                 dropout_rate=0.25,
                 filters=4,
                 kernel_size=3,
                 print_progress=True):
    """Build and compile the two-class convnet for one hyperparameter setting.

    The network is: Conv2D(filters) -> Conv2D(2 * filters) -> 2x2 max pooling
    -> Dropout(dropout_rate) -> Flatten -> Dense(16) -> Dropout(2 * dropout_rate)
    -> Dense(2, softmax). It is compiled with categorical cross-entropy and
    the accuracy metric.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv2D.
        optimizer: Name of an optimizer class in ``keras.optimizers``.
        learn_rate: Learning rate handed to the optimizer as ``lr``.
        init_mode: Kernel initializer used by both convolutional layers.
        activation: Activation of the convolutional and hidden dense layers.
        dropout_rate: Rate of the first dropout layer; the second dropout
            layer uses twice this value.
        filters: Filter count of the first convolution; the second
            convolution uses twice as many.
        kernel_size: Side length of the square convolution kernels.
        print_progress: When true, print which pass/fold/combination is
            being built.

    Returns:
        The compiled ``Sequential`` model.

    Note:
        This function reads the module globals ``total_iter``, ``folds`` and
        ``params`` (when printing) and always increments the global pass
        counter ``i``. ``grid_search`` initialises ``i``, ``total_iter`` and
        ``params``, so it has to run before this function is called.
    """
    # Snapshot the arguments before any other local exists, so the progress
    # report can look hyperparameter values up by name without eval().
    hyperparameters = dict(locals())

    model = Sequential()
    model.add(Conv2D(filters=filters,
                     kernel_size=(kernel_size, kernel_size),
                     activation=activation,
                     input_shape=input_shape,
                     kernel_initializer=init_mode))
    model.add(Conv2D(filters=filters*2,
                     kernel_size=(kernel_size, kernel_size),
                     activation=activation,
                     kernel_initializer=init_mode))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(dropout_rate))
    model.add(Flatten())
    model.add(Dense(16, activation=activation))
    model.add(Dropout(dropout_rate*2))
    model.add(Dense(2, activation='softmax'))

    # Resolve the optimizer class from its name, e.g. 'Adam' -> optimizers.Adam.
    selected_optimizer = getattr(optimizers, optimizer)
    model.compile(loss=keras.losses.categorical_crossentropy,
                  optimizer=selected_optimizer(lr=learn_rate),
                  metrics=['accuracy'])

    global i
    if print_progress:
        # The fold number is derived from the pass counter, which assumes the
        # models are built one after another, fold by fold.
        print('\n\nPass #%d/%d. Fold #%d/%d. Combination [ ' %
              (i+1, total_iter, (i % folds + 1), folds), end="")
        for param in params:
            print('%s=%s ' % (param, hyperparameters[param]), end="")
        print(']\n', end="")
    i += 1
    return model

def grid_search(param_grid, sk_params, x_train, y_train):
    """Grid-search one group of hyperparameters and fold the winners into sk_params.

    Args:
        param_grid: Maps each hyperparameter name to the list of values to try.
        sk_params: Model parameters that are already fixed; passed on to
            ``KerasClassifier`` and updated in place with the best values found.
        x_train: Training inputs handed to ``GridSearchCV.fit``.
        y_train: Training targets handed to ``GridSearchCV.fit``.

    Returns:
        The same ``sk_params`` dict, now including the best parameters.

    Side effects:
        Resets the module globals ``total_iter``, ``i`` and ``params``, which
        ``create_model`` reads to print its progress line.
    """
    global total_iter, i, params

    # One model is built per (combination, fold) pair.
    combination_count = np.prod([len(candidates)
                                 for candidates in param_grid.values()])
    total_iter = combination_count * folds
    i = 0  # pass counter, advanced by create_model
    params = param_grid.keys()

    classifier = KerasClassifier(build_fn=create_model, **sk_params,
                                 epochs=epochs)
    # refit=False: only the best parameters are needed here, not a refitted model.
    search = GridSearchCV(estimator=classifier,
                          param_grid=param_grid,
                          cv=folds,
                          refit=False)

    print('\nOptimizing hyperparameters: ' + str(list(params)), end="")
    outcome = search.fit(x_train, y_train)

    best_score = outcome.best_score_
    best_params = outcome.best_params_
    print('\nBest score on training set: %f using %s' %
          (best_score, best_params))

    sk_params.update(best_params)
    return sk_params

def main():
    """Tune and evaluate the 2-vs-7 MNIST classifier.

    Steps:
        1. Load MNIST and keep only the digits 2 and 7 (label 0 for a 2,
           label 1 for a 7).
        2. Reshape the images for the backend's channel ordering and scale
           the pixel values to [0, 1].
        3. Run two grid searches: first over the convolution parameters,
           then over optimizer, learning rate and dropout rate. The second
           search reuses the best values from the first.
        4. Train a model with the selected parameters for 12 epochs and
           print its loss and accuracy on the test set.
    """
    np.random.seed(1)
    batch_size = 128
    num_classes = 2
    # input image dimensions
    img_rows, img_cols = 28, 28
    # the data, shuffled and split between train and test sets
    (x_train, y_train), (x_test, y_test) = mnist.load_data()
    # only look at 2s and 7s
    train_picks = np.logical_or(y_train == 2, y_train == 7)
    test_picks = np.logical_or(y_test == 2, y_test == 7)
    x_train = x_train[train_picks]
    x_test = x_test[test_picks]
    # binary labels: 1 where the digit is a 7, 0 where it is a 2
    y_train = np.array(y_train[train_picks] == 7, dtype=int)
    y_test = np.array(y_test[test_picks] == 7, dtype=int)
    # add the single grey-scale channel where the backend expects it
    if K.image_data_format() == 'channels_first':
        x_train = x_train.reshape(x_train.shape[0], 1, img_rows, img_cols)
        x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
        input_shape = (1, img_rows, img_cols)
    else:
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)
    # scale pixel values from 0..255 to 0..1
    x_train = x_train.astype('float32')
    x_test = x_test.astype('float32')
    x_train /= 255
    x_test /= 255
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train samples')
    print(x_test.shape[0], 'test samples')
    # convert class vectors to binary class matrices
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    # hyperparameter optimization in 2 groups of 3 parameters follows
    sk_params = dict(input_shape=input_shape)  # holds model parameters
    # optimize 2DConv layer hyperparameters
    param_grid = dict(init_mode=['uniform', 'lecun_uniform', 'normal'],
                      filters=[4, 6, 8],
                      kernel_size=[1, 3, 5])
    sk_params = grid_search(param_grid, sk_params, x_train, y_train)
    # optimize model hyperparameters
    param_grid = dict(optimizer=['Adadelta', 'Adam', 'Nadam'],
                      learn_rate=[0.001, 0.01, 0.1],
                      dropout_rate=[0.0, 0.2, 0.4])
    sk_params = grid_search(param_grid, sk_params, x_train, y_train)
    # evaluate on test set
    print('\nEvaluating model on test set with optimized hyperparameters:')
    # Compare the key for equality: the previous `k not in 'input_shape'` was
    # a substring test and would also have hidden keys such as 'input'.
    print([str(k)+'='+str(v) for k, v in sk_params.items()
           if k != 'input_shape'])
    # the final model is built outside the grid search, so no progress line
    sk_params.update(print_progress=False)
    model = create_model(**sk_params)
    # NOTE(review): the test set doubles as validation data here; presumably
    # only for per-epoch reporting, since no callbacks are passed -- confirm.
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=12,
              verbose=1,
              validation_data=(x_test, y_test))
    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])
    
# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()

Using TensorFlow backend.


x_train shape: (12223, 28, 28, 1)
12223 train samples
2060 test samples

Optimizing hyperparameters: ['init_mode', 'filters', 'kernel_size']

Pass #1/54. Fold #1/2. Combination [ init_mode=uniform filters=4 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #2/54. Fold #2/2. Combination [ init_mode=uniform filters=4 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #3/54. Fold #1/2. Combination [ init_mode=uniform filters=4 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #4/54. Fold #2/2. Combination [ init_mode=uniform filters=4 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #5/54. Fold #1/2. Combination [ init_mode=uniform filters=4 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #6/54. Fold #2/2. Combination [ init_mode=uniform filters=4 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #7/54. Fold #1/2. Combination [ init_mode=lecun_uniform filters=4 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #8/54. Fold #2/2. Combination [ init_mode=lecun_uniform filter

Epoch 2/3
Epoch 3/3

Pass #21/54. Fold #1/2. Combination [ init_mode=uniform filters=6 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #22/54. Fold #2/2. Combination [ init_mode=uniform filters=6 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #23/54. Fold #1/2. Combination [ init_mode=uniform filters=6 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #24/54. Fold #2/2. Combination [ init_mode=uniform filters=6 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #25/54. Fold #1/2. Combination [ init_mode=lecun_uniform filters=6 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #26/54. Fold #2/2. Combination [ init_mode=lecun_uniform filters=6 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #27/54. Fold #1/2. Combination [ init_mode=lecun_uniform filters=6 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #28/54. Fold #2/2. Combination [ init_mode=lecun_uniform filters=6 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #29/54. Fold #1/2. Combination [ init_mode=l

Epoch 3/3

Pass #40/54. Fold #2/2. Combination [ init_mode=uniform filters=8 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #41/54. Fold #1/2. Combination [ init_mode=uniform filters=8 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #42/54. Fold #2/2. Combination [ init_mode=uniform filters=8 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #43/54. Fold #1/2. Combination [ init_mode=lecun_uniform filters=8 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #44/54. Fold #2/2. Combination [ init_mode=lecun_uniform filters=8 kernel_size=1 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #45/54. Fold #1/2. Combination [ init_mode=lecun_uniform filters=8 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #46/54. Fold #2/2. Combination [ init_mode=lecun_uniform filters=8 kernel_size=3 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #47/54. Fold #1/2. Combination [ init_mode=lecun_uniform filters=8 kernel_size=5 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #48/54. Fold #2/2. Combination [ init_mode=lec

Epoch 2/3
Epoch 3/3


Pass #5/54. Fold #1/2. Combination [ optimizer=Nadam learn_rate=0.001 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #6/54. Fold #2/2. Combination [ optimizer=Nadam learn_rate=0.001 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #7/54. Fold #1/2. Combination [ optimizer=Adadelta learn_rate=0.01 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #8/54. Fold #2/2. Combination [ optimizer=Adadelta learn_rate=0.01 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #9/54. Fold #1/2. Combination [ optimizer=Adam learn_rate=0.01 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #10/54. Fold #2/2. Combination [ optimizer=Adam learn_rate=0.01 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #11/54. Fold #1/2. Combination [ optimizer=Nadam learn_rate=0.01 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #12/54. Fold #2/2. Combination [ optimizer=Nadam learn_rate=0.01 dropout_rate=0.0 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #13/54. 


Pass #23/54. Fold #1/2. Combination [ optimizer=Nadam learn_rate=0.001 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #24/54. Fold #2/2. Combination [ optimizer=Nadam learn_rate=0.001 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #25/54. Fold #1/2. Combination [ optimizer=Adadelta learn_rate=0.01 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #26/54. Fold #2/2. Combination [ optimizer=Adadelta learn_rate=0.01 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #27/54. Fold #1/2. Combination [ optimizer=Adam learn_rate=0.01 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #28/54. Fold #2/2. Combination [ optimizer=Adam learn_rate=0.01 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #29/54. Fold #1/2. Combination [ optimizer=Nadam learn_rate=0.01 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #30/54. Fold #2/2. Combination [ optimizer=Nadam learn_rate=0.01 dropout_rate=0.2 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #31/54. Fold #1/2. Combina

Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #42/54. Fold #2/2. Combination [ optimizer=Nadam learn_rate=0.001 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #43/54. Fold #1/2. Combination [ optimizer=Adadelta learn_rate=0.01 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #44/54. Fold #2/2. Combination [ optimizer=Adadelta learn_rate=0.01 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #45/54. Fold #1/2. Combination [ optimizer=Adam learn_rate=0.01 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #46/54. Fold #2/2. Combination [ optimizer=Adam learn_rate=0.01 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3


Pass #47/54. Fold #1/2. Combination [ optimizer=Nadam learn_rate=0.01 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #48/54. Fold #2/2. Combination [ optimizer=Nadam learn_rate=0.01 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3

Pass #49/54. Fold #1/2. Combination [ optimizer=Adadelta learn_rate=0.1 dropout_rate=0.4 ]
Epoch 1/3
Epoch 2/3
Epoch 3/3
