## DNN WITH 3 HIDDEN LAYER 

In [30]:
import keras as K
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop


num_classes = 10
epochs = 15


# the data, split between train and test sets
(x, y), (x_test, y_test) = mnist.load_data()

x = x.reshape(60000, 784)
x_train = x[0:50000]
x_val = x[50000::]
x_test = x_test.reshape(10000, 784)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255
print(x_train.shape[0], 'training samples')
print(x_val.shape[0], 'validation samples')
print(x_test.shape[0], 'test samples')

# convert class vectors to binary class matrices
y = keras.utils.to_categorical(y, num_classes)
y_train = y[0:50000]
y_val=y[50000::]
y_test = keras.utils.to_categorical(y_test, num_classes)


50000 training samples
10000 validation samples
10000 test samples


### DNN function with SGD optimizer (Momentum = 0.9)

In [31]:
 def DNN(batchsize,lr):
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(784,)))
    model.add(Dense(512, activation='relu', input_shape=(784,)))
    model.add(Dense(512, activation='relu', input_shape=(784,)))
    model.add(Dense(num_classes, activation='softmax'))
    sgd = K.optimizers.SGD(lr=lr, decay=1e-6, momentum=0.9, nesterov=False)

    model.summary()

    model.compile(loss='categorical_crossentropy',
                  optimizer='sgd',
                  metrics=['accuracy'])

    history = model.fit(x_train, y_train,
                        batch_size=batchsize,
                        epochs=epochs,
                        verbose=1,
                        validation_data=(x_val, y_val))
    score = model.evaluate(x_test, y_test, verbose=0)
    print('test loss:', score[0])
    print('test accuracy:', score[1])


In [32]:
DNN(batchsize=1024,lr=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_61 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_62 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_63 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_64 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss: 0.3733648595571518
test accuracy: 0.

### Reducing Batch size to 128 Batch size =128 and learning rate =0.1

In [33]:
DNN(batchsize=128,lr=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_65 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_66 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_67 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_68 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss: 0.15726898441538215
test accuracy: 0

#### By reducing the batchsize from 1024 to 128 Accuracy has increased from 90 to 95%

### Reducing Batch size further to 32 to investigate performance; Batch size =32 and learning rate =0.1

In [34]:
DNN(batchsize=32,lr=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_69 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_70 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_71 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_72 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss: 0.07710769472182728
test accuracy: 0

#### By reducing Batch size from 128 to 32 the accuracy has further increased to 32

## Batchsize=1 and Learning rate =0.1

In [37]:
epochs = 1
DNN(batchsize=1,lr=0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_77 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_78 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_79 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_80 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/1
test loss: 0.11279717418067157
test accuracy: 0.9652


#### By reducing Batch size from 32 to 1 .The Compute time is too high and for single iteration training accuracy achieved is 92

#### As the compute time is too high we will fix our Batch size to 32 and vary the learning rate and check

### Reducing Learning rate by half to lr=0.05



In [38]:
epochs = 15
DNN(batchsize=32,lr=0.05)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_81 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_82 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_83 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_84 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss: 0.07967538918443025
test accuracy: 0


## Learning rate =0.01 and Batch Size=32

In [40]:
epochs = 15
DNN(batchsize=32,lr=0.01)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_89 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_90 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_91 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_92 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss: 0.07659204743197187
test accuracy: 0

## Learning rate =0.001 and Batch Size=32

In [39]:
epochs = 15
DNN(batchsize=32,lr=0.001)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_85 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_86 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_87 (Dense)             (None, 512)               262656    
_________________________________________________________________
dense_88 (Dense)             (None, 10)                5130      
Total params: 932,362
Trainable params: 932,362
Non-trainable params: 0
_________________________________________________________________
Train on 50000 samples, validate on 10000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
test loss: 0.0820373574842699
test accuracy: 0.

####  By reducing learning rate to 0.05, 0.01 and 0.01 accuracy is almost similar 97% and the update is comparitively slower
###  For the Given DNN architecture appropriate hyperparameter choice would be
### Batchsize of 32 and learning rate = 0.1 can give us fast convergence as well as accuracy of 97%