In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Read the data from the HDF5 file with h5py and understand the train/test split**

In [None]:
import h5py as h5fy

In [None]:
# Open the dataset file in read-only mode.
h5f = h5fy.File('../input/street-view-house-nos-h5-file/SVHN_single_grey1.h5', mode='r')

In [None]:
# Call keys() (the original only referenced the method, which displays a
# bound-method repr) and materialize it so the dataset names are shown.
list(h5f.keys())

In [None]:
# Read each dataset fully into memory ([:] copies it out of the file) ...
X_train, y_train, X_test, y_test = (
    h5f[dataset_name][:]
    for dataset_name in ('X_train', 'y_train', 'X_test', 'y_test')
)

# ... and report the shape of every split.
for split_label, split_array in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(split_label, split_array.shape)

**Observations about the dataset:**

1) The input is a 3-D tensor of shape n × 32 × 32, where n is the number of samples (rows) and each sample is a 32 × 32 matrix.

2) The training values are pixel intensities: each 32 × 32 matrix is the image of a single digit.

3) We have to predict the digit (0 to 9), which is our target variable. This is a multi-class classification problem: the input has shape n × 32 × 32 (one 32 × 32 pixel image per row), while the target is a single column holding a label from 0 to 9.

**let's plot one input row of our tensor**

In [None]:
import matplotlib.pyplot as plt
# Display training sample 9 and print its label.
sample_image = X_train[9]
plt.imshow(sample_image)
print('Label: ', y_train[9])

**Data Pre-processing**

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# Inspect the shape of the training tensor before it is flattened.
X_train.shape

In [None]:
# Flatten every image: keep the sample axis and collapse the remaining axes
# into one, turning the 3-D arrays into 2-D NumPy arrays.
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
# Convert the integer labels to one-hot vectors.
# num_classes is pinned to 10 (digits 0-9) so both splits always get the same
# width as the 10-unit softmax output layer, instead of being inferred from the
# largest label that happens to appear in each split.
y_train = to_categorical(y_train, num_classes=10)
y_test = to_categorical(y_test, num_classes=10)

In [None]:
# Summarize the shapes of all four arrays after preprocessing.
shape_report = (
    "X_train", X_train.shape,
    " X_test", X_test.shape,
    'y_train', y_train.shape,
    'y_test', y_test.shape,
)
print(*shape_report)

Let's Build a NN 

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras import optimizers
from tensorflow.keras.layers import BatchNormalization, Dropout

Let's try a vanilla model:

1) Sequential model
2) ReLU as the activation function for the input and hidden layers, and softmax as the activation function for the output layer
3) SGD optimizer with the categorical_crossentropy loss function


In [None]:
def create_vanila_model():
    """Build and compile the baseline fully connected classifier.

    Four Dense(50) + ReLU hidden layers on a 1024-feature input, followed by a
    Dense(10) + softmax output layer. Compiled with plain SGD (learning rate
    0.001), categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, )))
    model.add(Activation('relu'))
    for _ in range(3):
        model.add(Dense(50))
        model.add(Activation('relu'))

    model.add(Dense(10))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
    # Train the baseline model silently: 200 epochs, mini-batches of 200.
    model = create_vanila_model()
    history = model.fit(x=X_train, y=y_train, epochs=200, batch_size=200, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

An accuracy of just 13% is not acceptable. Let's add BatchNormalization.

In [None]:
def create_batchnorm_model():
    """Build and compile the MLP with batch normalization before each ReLU.

    Four hidden blocks of Dense(50) -> BatchNormalization -> ReLU on a
    1024-feature input, followed by a Dense(10) + softmax output layer.
    Compiled with SGD (learning rate 0.001), categorical cross-entropy loss and
    an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, )))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    for _ in range(3):
        model.add(Dense(50))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(10))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
    # Train the batch-norm model silently: 100 epochs, mini-batches of 100.
    model = create_batchnorm_model()
    history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

Let's use a kernel initializer (a fancy term for how the weights are initialized :))

In [None]:
def mlp_kernel_init_Batch_model():
    """Build and compile the batch-norm MLP with he_normal weight initialization.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU on
    a 1024-feature input, followed by a Dense(10, he_normal) + softmax output
    layer. Compiled with SGD (learning rate 0.001), categorical cross-entropy
    loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='he_normal'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the he_normal-initialized model defined just above. The original cell
# called create_batchnorm_model() again, so the kernel-initializer variant was
# never actually trained or evaluated.
model = mlp_kernel_init_Batch_model()
history = model.fit(X_train, y_train, batch_size=100, epochs = 100, verbose = 0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

**It seems that just adding a kernel_initializer brings no improvement. Let's try adding dropout to train our model a little better.**

In [None]:
def bn_ki_dropout_model():
    """Build and compile the he_normal + batch-norm MLP with dropout after each hidden block.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU ->
    Dropout(0.2) on a 1024-feature input, followed by a Dense(10, he_normal) +
    softmax output layer. Compiled with SGD (learning rate 0.001), categorical
    cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='he_normal'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.2))

    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    sgd = optimizers.SGD(learning_rate = 0.001)
    model.compile(optimizer = sgd , loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the dropout variant silently: 100 epochs, mini-batches of 100.
model = bn_ki_dropout_model()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

Observation: This shows that **we need all the features and can't afford to ignore any of them**. Let's go back to not using dropout. Instead, maybe we can try a different optimizer altogether.

In [None]:
def adam_optimizer_model():
    """Build and compile the he_normal + batch-norm MLP trained with Adam.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU on
    a 1024-feature input, followed by a Dense(10, he_normal) + softmax output
    layer. Compiled with Adam (learning rate 0.001), categorical cross-entropy
    loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='he_normal'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    adam = optimizers.Adam(learning_rate = 0.001)
    model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the Adam variant silently: 100 epochs, mini-batches of 100.
model = adam_optimizer_model()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

So with **Adam as the optimizer we got an over-fit model**. Let's go back to the dropout strategy, this time with Adam. It will be worth observing!

In [None]:
def adam_with_dropout_model():
    """Build and compile the he_normal + batch-norm + dropout MLP trained with Adam.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU ->
    Dropout(0.2) on a 1024-feature input, followed by a Dense(10, he_normal) +
    softmax output layer. Compiled with Adam (learning rate 0.001), categorical
    cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.2))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='he_normal'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.2))

    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    adam = optimizers.Adam(learning_rate = 0.001)
    model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the Adam + dropout variant silently: 100 epochs, mini-batches of 100.
model = adam_with_dropout_model()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

**Observation : Adam optimizer with dropout strategy looks a bit better as our test accuracy is ~76%.**

In [None]:
def sgd_momentum_model():
    """Build and compile the he_normal + batch-norm MLP trained with SGD plus momentum.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU on
    a 1024-feature input, followed by a Dense(10, he_normal) + softmax output
    layer. Compiled with SGD (learning rate 0.01, momentum 0.9), categorical
    cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='he_normal'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    sdgm = optimizers.SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer = sdgm, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the SGD-with-momentum variant silently: 100 epochs, mini-batches of 100.
model = sgd_momentum_model()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

### **Observation : With SGD momentum optimizer model looks a bit better as our test accuracy is ~80%.**

In [None]:
def sgd_momentum_ki_u_model():
    """Build and compile the batch-norm MLP with 'uniform' weight initialization and SGD plus momentum.

    Four hidden blocks of Dense(50, uniform) -> BatchNormalization -> ReLU on a
    1024-feature input, followed by a Dense(10, uniform) + softmax output
    layer. Compiled with SGD (learning rate 0.01, momentum 0.9), categorical
    cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='uniform'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='uniform'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    model.add(Dense(10, kernel_initializer='uniform'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
    # releases no longer accept.
    sdgm = optimizers.SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer = sdgm, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the uniform-initializer model defined just above. The original cell
# called sgd_momentum_ki_hu_model(), a name that is never defined in this
# notebook and would raise NameError on a fresh kernel.
model = sgd_momentum_ki_u_model()
history = model.fit(X_train, y_train, batch_size=100, epochs = 100, verbose = 0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

Observation: SGD with momentum and the uniform kernel initializer does not seem better than SGD with momentum and the he_normal kernel initializer.

In [None]:
def adam_ki_hn_model():
    """Build and compile the he_normal + batch-norm MLP using the 'adam' optimizer preset.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU on
    a 1024-feature input, followed by a Dense(10, he_normal) + softmax output
    layer. Compiled with the string optimizer 'adam', categorical cross-entropy
    loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    hidden_block_count = 4
    for position in range(hidden_block_count):
        # Only the very first layer declares the input width.
        first_layer_args = {'input_shape': (1024, )} if position == 0 else {}
        model.add(Dense(50, kernel_initializer='he_normal', **first_layer_args))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    # Output layer: one unit per class, turned into probabilities by softmax.
    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [None]:
# Train the 'adam' preset variant silently: 100 epochs, mini-batches of 100.
model = adam_ki_hn_model()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

### Observation : With adam optimizer, model looks a bit better as our test accuracy is > 80%

Let's Test by changing our network with a new hidden layer and observe if we get better result!

In [None]:
def adam_ki_hn_model_plus_one_Hiddnen():
    """Build and compile the 'adam' preset MLP with one extra hidden block.

    Five hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU on
    a 1024-feature input, followed by a Dense(10, he_normal) + softmax output
    layer. Compiled with the string optimizer 'adam', categorical cross-entropy
    loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    hidden_block_count = 5
    for position in range(hidden_block_count):
        # Only the very first layer declares the input width.
        first_layer_args = {'input_shape': (1024, )} if position == 0 else {}
        model.add(Dense(50, kernel_initializer='he_normal', **first_layer_args))
        model.add(BatchNormalization())
        model.add(Activation('relu'))

    # Output layer: one unit per class, turned into probabilities by softmax.
    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    return model

In [None]:
# Train the deeper variant silently: 100 epochs, mini-batches of 100.
model = adam_ki_hn_model_plus_one_Hiddnen()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=0)

In [None]:
# Score the trained model on the held-out test split.
results = model.evaluate(x=X_test, y=y_test)

**Observation: As we can see here, our model accuracy slightly decreased after adding a new hidden layer. This can be attributed to the model having sort of started memorizing the training dataset, so it will not generalize well. So let's not change the structure of our NN.**

Observation: We have an accuracy of 80%. I have not tried image augmentation, which might have helped. Please do let me know if you have any comments on that!

In [None]:
def adam2_with_dropout_model():
    """Build and compile the Adam + dropout MLP with heavier dropout and a larger learning rate.

    Four hidden blocks of Dense(50, he_normal) -> BatchNormalization -> ReLU ->
    Dropout(0.5) on a 1024-feature input, followed by a Dense(10, he_normal) +
    softmax output layer. Compiled with Adam (learning rate 0.01, beta_1 0.9),
    categorical cross-entropy loss and an accuracy metric.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()

    # The first Dense layer is the only one that declares the input width.
    model.add(Dense(50, input_shape = (1024, ), kernel_initializer='he_normal'))
    model.add(BatchNormalization())
    model.add(Activation('relu'))
    model.add(Dropout(0.5))
    for _ in range(3):
        model.add(Dense(50, kernel_initializer='he_normal'))
        model.add(BatchNormalization())
        model.add(Activation('relu'))
        model.add(Dropout(0.5))

    model.add(Dense(10, kernel_initializer='he_normal'))
    model.add(Activation('softmax'))

    # `learning_rate` replaces the deprecated `lr` alias. The original also
    # passed decay=0, which requests no decay at all; it is dropped because
    # newer Keras optimizers no longer accept a `decay` argument.
    adam = optimizers.Adam(learning_rate = 0.01, beta_1 = 0.9)
    model.compile(optimizer = adam, loss = 'categorical_crossentropy', metrics = ['accuracy'])

    return model

In [None]:
# Train the tuned Adam + dropout variant with per-epoch progress output:
# 100 epochs, mini-batches of 100.
model = adam2_with_dropout_model()
history = model.fit(x=X_train, y=y_train, epochs=100, batch_size=100, verbose=1)

**Observation**: With the above hyperparameter tuning, our best model has an accuracy of about 80%.