In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [2]:
data

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
data = np.array(data)
m, n = data.shape
m, n

(42000, 785)

In [4]:
number_of_tests = int(m * 0.15)

np.random.shuffle(data)

test_data = data[0:number_of_tests].T
Y_test = test_data[0]
X_test = test_data[1:n]
X_test = X_test / 255.0

train_data = data[number_of_tests:m].T
Y_train = train_data[0]
X_train = train_data[1:n]
X_train = X_train / 255.0

In [5]:
class NeuralNetwork:
    def __init__(self, X_train, Y_train, X_test, Y_test, LR, iterations, layer_dimensions, mini_batch_size):
        self.X_train = X_train
        self.Y_train = Y_train
        self.Y_one_hot = self.one_hot(Y_train)
        self.X_test = X_test
        self.Y_test = Y_test
        self.LR = LR
        self.iterations = iterations
        self.layer_dimensions = layer_dimensions
        self.mini_batch_size = mini_batch_size
        self.num_of_batches = len(X_train[0]) // mini_batch_size

        W, B = self.init_params(layer_dimensions)
        self.W = W
        self.B = B

        self.Z = [None for i in range(len(layer_dimensions))]
        self.Z_test = [None for i in range(len(layer_dimensions))]
        self.dZ = [None for i in range(len(layer_dimensions))]
        self.dW = [None for i in range(len(layer_dimensions))]
        self.dB = [None for i in range(len(layer_dimensions))]

    def init_params(self, layer_dimensions):
        W = [None]
        B = [None]
    
        for l in range(1, len(layer_dimensions)):
            current_layer_dimension = layer_dimensions[l]
            previous_layer_dimension = layer_dimensions[l - 1]
            w = np.random.randn(current_layer_dimension, previous_layer_dimension) * np.sqrt(2 / previous_layer_dimension)
            b = np.random.randn(current_layer_dimension, 1)
            W.append(w)
            B.append(b)

        return W, B

    def one_hot(self, Y):
        one_hot_Y = np.zeros((Y.size, Y.max() + 1))
        one_hot_Y[np.arange(Y.size), Y] = 1
        return one_hot_Y.T
    
    def ReLU(self, Z):
        return np.maximum(Z, 0)

    def derivative_of_ReLU(self, Z):
        return Z > 0
    
    def LeakyReLU(self, Z, alpha=0.01):
        return np.where(Z > 0, Z, alpha * Z)
    
    def derivative_of_LeakyReLU(self, Z, alpha=0.01):
        return np.where(Z > 0, 1, alpha)
    
    def softmax(self, Z):
        expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
        return expZ / np.sum(expZ, axis=0, keepdims=True)
        
    def _is_last_layer(self, layer):
        return layer == len(self.layer_dimensions) - 1

    def forward_propagation(self):
        for layer in range(1, len(self.layer_dimensions)):
            self.Z[layer] = self.W[layer].dot(self.A[layer - 1]) + self.B[layer]

            if self._is_last_layer(layer):
                self.A[layer] = self.softmax(self.Z[layer])
            else:
                self.A[layer] = self.LeakyReLU(self.Z[layer])

    def backward_propagation(self, lambda_reg=0.01):
        for layer in range(len(self.layer_dimensions) - 1, 0, -1):
            if self._is_last_layer(layer):
                self.dZ[layer] = self.A[layer] - self.Y_one_hot_batch
            else:
                self.dZ[layer] = self.W[layer + 1].T.dot(self.dZ[layer + 1]) * self.derivative_of_LeakyReLU(self.Z[layer])

            self.dW[layer] = 1 / self.mini_batch_size * self.dZ[layer].dot(self.A[layer - 1].T) + (lambda_reg / self.mini_batch_size) * self.W[layer]
            self.dB[layer] = 1 / self.mini_batch_size * np.sum(self.dZ[layer]) 
    
    def update_params(self):
        for layer in range(1, len(self.layer_dimensions)):
            self.W[layer] = self.W[layer] - self.LR * self.dW[layer]
            self.B[layer] = self.B[layer] - self.LR * self.dB[layer]
    
    def get_predictions(self, A):
        return np.argmax(A, 0)
    
    def get_accuracy(self, predictions, Y):
        return np.sum(predictions == Y) / Y.size
    
    def predict(self):
        for layer in range(1, len(self.layer_dimensions)):
            self.Z_test[layer] = self.W[layer].dot(self.A_test[layer - 1]) + self.B[layer]

            if self._is_last_layer(layer):
                self.A_test[layer] = self.softmax(self.Z_test[layer])
            else:
                self.A_test[layer] = self.LeakyReLU(self.Z_test[layer])

        return self.get_predictions(self.A_test[-1])
    
    def gradient_descent(self):
        train = []
        test = []
        
        for i in range(self.iterations + 1):
            if i % 10 == 0:
                print("==== Iteration:", i, "====")

            for current_batch in range(0, self.num_of_batches):
                self.current_batch = current_batch
                self.batch_starting_point = current_batch * self.mini_batch_size
                self.X_train_batch = self.X_train[:, self.batch_starting_point:self.batch_starting_point + self.mini_batch_size]
                self.Y_train_batch = self.Y_train[self.batch_starting_point:self.batch_starting_point + self.mini_batch_size]
                self.A = [self.X_train_batch] + [None for i in range(len(layer_dimensions) - 1)]
                self.A_test = [self.X_test] + [None for i in range(len(layer_dimensions) - 1)]
                self.Y_one_hot_batch = self.Y_one_hot[:, self.batch_starting_point:self.batch_starting_point + self.mini_batch_size]
                
                self.forward_propagation()
                self.backward_propagation()
                self.update_params()
                
                training_predictions = self.get_predictions(self.A[-1])
                training_accuracy = self.get_accuracy(training_predictions, self.Y_train_batch)
                test_predictions = self.predict()
                test_accuracy = self.get_accuracy(test_predictions, self.Y_test)
        
                train.append(training_accuracy)
                test.append(test_accuracy)
        
                if i % 10 == 0:
                    print("==== Batch:", current_batch + 1, '====')
                    print('- training_accuracy: ', training_accuracy)
                    print('- test_accuracy: ', test_accuracy)
    
        return train, test

LR = 0.1
ITERATIONS = 3000
layer_dimensions = [784, 200, 200, 200, 10]
mini_batch_size = 2100 # 17 mini batches of 2100 training examples (35700 examples in total)
number_of_batches = len(X_train[0]) // mini_batch_size

nn = NeuralNetwork(X_train, Y_train, X_test, Y_test, LR, ITERATIONS, layer_dimensions, mini_batch_size)
train, test = nn.gradient_descent()

==== Iteration: 0 ====
==== Batch: 1 ====
- training_accuracy:  0.09571428571428571
- test_accuracy:  0.10158730158730159
==== Batch: 2 ====
- training_accuracy:  0.10285714285714286
- test_accuracy:  0.15317460317460319
==== Batch: 3 ====
- training_accuracy:  0.1376190476190476
- test_accuracy:  0.17682539682539683
==== Batch: 4 ====
- training_accuracy:  0.18619047619047618
- test_accuracy:  0.24174603174603174
==== Batch: 5 ====
- training_accuracy:  0.23761904761904762
- test_accuracy:  0.2722222222222222
==== Batch: 6 ====
- training_accuracy:  0.28095238095238095
- test_accuracy:  0.37873015873015875
==== Batch: 7 ====
- training_accuracy:  0.3880952380952381
- test_accuracy:  0.4695238095238095
==== Batch: 8 ====
- training_accuracy:  0.4742857142857143
- test_accuracy:  0.43793650793650796
==== Batch: 9 ====
- training_accuracy:  0.4328571428571429
- test_accuracy:  0.4922222222222222
==== Batch: 10 ====
- training_accuracy:  0.48095238095238096
- test_accuracy:  0.38333333333

KeyboardInterrupt: 

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

iterations = [i for i in range((ITERATIONS + 1) * number_of_batches)]

train_data = pd.DataFrame({
    'x': iterations,
    'y': train
})

test_data = pd.DataFrame({
    'x': iterations,
    'y': test
})

trace1 = go.Scatter(x = train_data.x,
                    y = train_data.y,
                    mode = "lines",
                    name = "training",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'))

trace2 = go.Scatter(x = test_data.x,
                    y = test_data.y,
                    mode = "lines+markers",
                    name = "test",
                    marker = dict(color = 'rgba(80, 26, 80, 0.8)'))

data = [trace1, trace2]
fig = dict(data = data)
pio.show(fig)