In [1]:
import pandas as pd
import numpy as np

np.random.seed(1212)

import keras
from keras.models import Model
from keras.layers import *
from keras import optimizers

In [2]:
df_train = pd.read_csv('mnist_digit_train.csv')
df_test = pd.read_csv('mnist_digit_test.csv')

In [3]:
df_train.head() # 784 features, 1 label

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df_features = df_train.iloc[:, 1:785]
df_label = df_train.iloc[:, 0]

X_test = df_test.iloc[:, 0:784]

print(X_test.shape)

(28000, 784)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_cv, y_train, y_cv = train_test_split(df_features, df_label, 
                                                test_size=0.2,
                                                random_state=1212)

X_train = X_train.values.reshape(33600, 784)
X_cv = X_cv.values.reshape(8400, 784)
X_test = X_test.values.reshape(28000, 784)


# Data cleaning, normalization and selection

In [7]:
print((min(X_train[1]), max(X_train[1])))

(0, 255)


In [8]:
# Feature Normalization 
X_train = X_train.astype('float32'); X_cv= X_cv.astype('float32'); X_test = X_test.astype('float32')
X_train /= 255; X_cv /= 255; X_test /= 255

# Convert labels to One Hot Encoded
num_digits = 10
y_train = keras.utils.to_categorical(y_train, num_digits)
y_cv = keras.utils.to_categorical(y_cv, num_digits)

In [9]:
# Printing 2 examples of labels after conversion
print(y_train[0]) # 2
print(y_train[3]) # 7

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]


# Model Fitting

In [10]:
# Input Parameters
n_input = 784 # number of features
n_hidden_1 = 300
n_hidden_2 = 100
n_hidden_3 = 100
n_hidden_4 = 200
num_digits = 10

In [11]:
Inp = Input(shape=(784,))
x = Dense(n_hidden_1, activation='relu', name = "Hidden_Layer_1")(Inp)
x = Dense(n_hidden_2, activation='relu', name = "Hidden_Layer_2")(x)
x = Dense(n_hidden_3, activation='relu', name = "Hidden_Layer_3")(x)
x = Dense(n_hidden_4, activation='relu', name = "Hidden_Layer_4")(x)
output = Dense(num_digits, activation='softmax', name = "Output_Layer")(x)

In [12]:
# Our model would have '6' layers - input layer, 4 hidden layer and 1 output layer
model = Model(Inp, output)
model.summary() # We have 297,910 parameters to estimate

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 784)]             0         
                                                                 
 Hidden_Layer_1 (Dense)      (None, 300)               235500    
                                                                 
 Hidden_Layer_2 (Dense)      (None, 100)               30100     
                                                                 
 Hidden_Layer_3 (Dense)      (None, 100)               10100     
                                                                 
 Hidden_Layer_4 (Dense)      (None, 200)               20200     
                                                                 
 Output_Layer (Dense)        (None, 10)                2010      
                                                                 
Total params: 297910 (1.14 MB)
Trainable params: 297910 (1.14

In [13]:
# Insert Hyperparameters
learning_rate = 0.1
training_epochs = 20
batch_size = 100
sgd = optimizers.SGD(lr=learning_rate)



In [14]:
# We rely on the plain vanilla Stochastic Gradient Descent as our optimizing methodology
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])

In [15]:
history1 = model.fit(X_train, y_train,
                     batch_size = batch_size,
                     epochs = training_epochs,
                     verbose = 2,
                     validation_data=(X_cv, y_cv))

Epoch 1/20
336/336 - 3s - loss: 1.7596 - accuracy: 0.5315 - val_loss: 0.8909 - val_accuracy: 0.7824 - 3s/epoch - 8ms/step
Epoch 2/20
336/336 - 2s - loss: 0.5928 - accuracy: 0.8421 - val_loss: 0.4409 - val_accuracy: 0.8758 - 2s/epoch - 5ms/step
Epoch 3/20
336/336 - 2s - loss: 0.3968 - accuracy: 0.8875 - val_loss: 0.3578 - val_accuracy: 0.8968 - 2s/epoch - 5ms/step
Epoch 4/20
336/336 - 2s - loss: 0.3343 - accuracy: 0.9046 - val_loss: 0.3110 - val_accuracy: 0.9114 - 2s/epoch - 5ms/step
Epoch 5/20
336/336 - 2s - loss: 0.2976 - accuracy: 0.9143 - val_loss: 0.2862 - val_accuracy: 0.9162 - 2s/epoch - 5ms/step
Epoch 6/20
336/336 - 2s - loss: 0.2719 - accuracy: 0.9207 - val_loss: 0.2653 - val_accuracy: 0.9239 - 2s/epoch - 5ms/step
Epoch 7/20
336/336 - 2s - loss: 0.2503 - accuracy: 0.9274 - val_loss: 0.2571 - val_accuracy: 0.9265 - 2s/epoch - 5ms/step
Epoch 8/20
336/336 - 2s - loss: 0.2324 - accuracy: 0.9326 - val_loss: 0.2432 - val_accuracy: 0.9292 - 2s/epoch - 5ms/step
Epoch 9/20
336/336 - 2s 

Achieved a training score of around 96-98% and a test score of around 95 - 97%.