#                                                                                CNN

In [2]:
from keras import optimizers
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras import backend as K
import keras

batch_size = 128
num_classes = 10
epochs = 15

# input image dimensions
img_rows, img_cols = 28, 28

# the data, split between train and test sets
(x, y), (x_test, y_test) = mnist.load_data()

if K.image_data_format() == 'channels_first':
    x = x.reshape(x.shape[0], 1, img_rows, img_cols)
    x_test = x_test.reshape(x_test.shape[0], 1, img_rows, img_cols)
    input_shape = (1, img_rows, img_cols)
else:
    x = x.reshape(x.shape[0], img_rows, img_cols, 1)
    x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
    input_shape = (img_rows, img_cols, 1)

x = x.astype('float32')
x_test = x_test.astype('float32')

x_train = x[0:50000]
x_val = x[50000::]


x_train /= 255
x_test /= 255
print(x.shape)
print(x_train.shape, 'train samples')
print(x_val.shape, 'validation samples')
print(x_test.shape, 'test samples')

# convert class vectors to binary class matrices
y = keras.utils.to_categorical(y, num_classes)
y_train = y[0:50000]
y_val=y[50000::]
y_test = keras.utils.to_categorical(y_test, num_classes)


Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/img-datasets/mnist.npz
(60000, 28, 28, 1)
(50000, 28, 28, 1) train samples
(10000, 28, 28, 1) validation samples
(10000, 28, 28, 1) test samples


# CNN with 1 Hidden layer 128 kernels (with size 3x3 and maxpooling of 2x2 followed by a feed forward layer with 512 units

In [0]:
def CNN_1hidden(batchsize,lr):
    model = Sequential()
    model.add(Conv2D(128, kernel_size=(3, 3),
                     activation='relu',
                     input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    sgd = optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=False)

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])


    model.fit(x_train, y_train,
              batch_size=batchsize,
              epochs=epochs,
              verbose=1,
              validation_data=(x_val, y_val))
    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

In [4]:
CNN_1hidden(1024,0.1)

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 13, 13, 128)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 21632)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               2769024   
_________________________________________________________________
dense_2 (Dense)              (None, 10)                1290      
Total params: 2,771,594
Trainable params: 2,771,594
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 500

# CNN with 2 Hidden layer 128 kernels (with size 3x3 and maxpooling of 2x2 followed by a feed forward layer with 512 units

In [0]:
def CNN_2hidden(batchsize,lr):
    model = Sequential()
    model.add(Conv2D(128, kernel_size=(3, 3),
                     activation='relu',
                     input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, kernel_size=(3, 3),
                     activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    sgd = optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=False)

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])


    model.fit(x_train, y_train,
              batch_size=batchsize,
              epochs=epochs,
              verbose=1,
              validation_data=(x_val, y_val))
    score = model.evaluate(x_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

## Batch Size =1024 Learning rate =0.1

In [7]:
CNN_2hidden(1024,0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               409728    
_________________________________________________________________
dense_6 (Dense)              (None, 10)                1290      
Total para


**Batch size:1024; lr=0.1; With the addition of one hidden layer the accuracy has increased from 91.5 to 92.2 **

one hidden layer :- Trainable params: 2,771,594
Two hidden layer:- Trainable params: 559,882 


## Batch Size =128; learning rate =0.1

In [8]:
CNN_2hidden(128,0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 128)               409728    
_________________________________________________________________
dense_8 (Dense)              (None, 10)                1290      
Total para

### By Reducing batch size from 1024 to 128 there is frequent update so the accuracy has improved from 92% to 97%

## Batch Size =32 and Learning rate =0.1

In [9]:
CNN_2hidden(32,0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               409728    
_________________________________________________________________
dense_10 (Dense)             (None, 10)                1290      
Total para

### By reducing batch size from 128 to 32 the accuracy has improved significantly to 98.7% from 97%

### Training time for 15 iteration also doubled

## BatchSize =1 learning rate =0.1 

In [10]:
CNN_2hidden(1,0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               409728    
_________________________________________________________________
dense_12 (Dense)             (None, 10)                1290      
Total para

KeyboardInterrupt: ignored

**With Batchsize of 1 Training compute is heavy so interupted in single iteration.**

**Here update is after each sample so for the single iteration accuracy achieved is 97%.**

**Its Computation need makes it inefficient in terms of time complexity **

### Batch size of 32 would be appropriate for MNIST dataset interms of accuracy(98.9%) as well as training time 

## Lets investigate Learning rate

## Learning rate =0.05  Batch size =32 

### Learning rate is halved

In [11]:
CNN_2hidden(32,0.05)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 128)               409728    
_________________________________________________________________
dense_14 (Dense)             (None, 10)                1290      
Total para

### With learning rate of 0.05 accuracy achieved is almost the same as 0.1 learning rate

## Learning rate =0.01  Batch size =32 

In [13]:
CNN_2hidden(32,0.01)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_17 (Dense)             (None, 128)               409728    
_________________________________________________________________
dense_18 (Dense)             (None, 10)                1290      
Total para

### By reducing learning rate to 0.01 update will be slower than 0.1 so the accuracy achieved is 98.4 which 0.5% less

## Learning rate =0.01  Batch size =32 

In [14]:
CNN_2hidden(32,0.001)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_17 (Conv2D)           (None, 26, 26, 128)       1280      
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 13, 13, 128)       0         
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 11, 11, 128)       147584    
_________________________________________________________________
max_pooling2d_18 (MaxPooling (None, 5, 5, 128)         0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 3200)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 128)               409728    
_________________________________________________________________
dense_20 (Dense)             (None, 10)                1290      
Total para

#### By reducing learning rate to 0.001 update will be slower than 0.1 so the accuracy achieved is 98.4 which 0.5% less

#### Accuracy achieved is almost similar to the 0.01 learning rate


## CNN 2 hidden layer networks performs better than 1 hidden layer network; Deep networks provides extra dimensional freedom for classification

## For the CNN 2 hidden layer architecture; Batch size =32 and Learning rate =0.1 The accuracy achieved is the highest validation accuracy : 98.75%; Training Accuracy : 99.12;Test Accuracy :98.9%