In [5]:
import numpy as np
import pandas as pd
import seaborn as sns

data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [6]:
data

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
data = np.array(data)
m, n = data.shape
m, n

(42000, 785)

In [8]:
number_of_tests = int(m * 0.3)

np.random.shuffle(data)

test_data = data[0:number_of_tests].T
Y_test = test_data[0]
X_test = test_data[1:n]
X_test = X_test / 255.0

train_data = data[number_of_tests:m].T
Y_train = train_data[0]
X_train = train_data[1:n]
X_train = X_train / 255.0

In [16]:
class NeuralNetwork:
    def __init__(self, X_train, Y_train, X_test, Y_test, LR, iterations):
        self.X_train = X_train
        self.Y_train = Y_train
        self.Y_one_hot = self.one_hot(Y_train)
        self.X_test = X_test
        self.Y_test = Y_test
        self.LR = LR
        self.iterations = iterations

        params = self.init_params([784, 100, 100, 10])

        self.W1 = params['W1']
        self.B1 = params['B1']
        self.W2 = params['W2']
        self.B2 = params['B2']
        self.W3 = params['W3']
        self.B3 = params['B3']

        self.Z1 = None
        self.Z2 = None
        self.Z3 = None
        self.A1 = None
        self.A2 = None
        self.A3 = None

        self.dW1 = None
        self.dB1 = None
        self.dW2 = None
        self.dB2 = None
        self.dW3 = None
        self.dB3 = None

    def init_params(self, layer_dimensions):
        params = {}
    
        for l in range(1, len(layer_dimensions)):
            current_layer_dimension = layer_dimensions[l]
            previous_layer_dimension = layer_dimensions[l - 1]
            params[f"W{l}"] = np.random.randn(current_layer_dimension, previous_layer_dimension) * np.sqrt(2 / previous_layer_dimension)
            params[f"B{l}"] = np.random.randn(current_layer_dimension, 1)
    
        return params

    def one_hot(self, Y):
        one_hot_Y = np.zeros((Y.size, Y.max() + 1))
        one_hot_Y[np.arange(Y.size), Y] = 1
        return one_hot_Y.T
    
    def ReLU(self, Z):
        return np.maximum(Z, 0)

    def derivative_of_ReLU(self, Z):
        return Z > 0
    
    def LeakyReLU(self, Z, alpha=0.01):
        return np.where(Z > 0, Z, alpha * Z)
    
    def derivative_of_LeakyReLU(self, Z, alpha=0.01):
        return np.where(Z > 0, 1, alpha)
    
    def softmax(self, Z):
        expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
        return expZ / np.sum(expZ, axis=0, keepdims=True)
        
    def forward_propagation(self, X):
        self.Z1 = self.W1.dot(X) + self.B1
        self.A1 = self.LeakyReLU(self.Z1)
        
        self.Z2 = self.W2.dot(self.A1) + self.B2
        self.A2 = self.LeakyReLU(self.Z2)
        
        self.Z3 = self.W3.dot(self.A2) + self.B3
        self.A3 = self.softmax(self.Z3)
    
    def backward_propagation(self, lambda_reg=0.01):
        self.dZ3 = self.A3 - self.Y_one_hot
        self.dW3 = 1 / m * self.dZ3.dot(self.A2.T) + (lambda_reg / m) * self.W3
        self.dB3 = 1 / m * np.sum(self.dZ3)
        
        self.dZ2 = self.W3.T.dot(self.dZ3) * self.derivative_of_LeakyReLU(self.Z2)
        self.dW2 = 1 / m * self.dZ2.dot(self.A1.T) + (lambda_reg / m) * self.W2
        self.dB2 = 1 / m * np.sum(self.dZ2)
        
        self.dZ1 = self.W2.T.dot(self.dZ2) * self.derivative_of_LeakyReLU(self.Z1)
        self.dW1 = 1 / m * self.dZ1.dot(self.X_train.T) + (lambda_reg / m) * self.W1
        self.dB1 = 1 / m * np.sum(self.dZ1)
    
    def update_params(self):
        self.W1 = self.W1 - self.LR * self.dW1
        self.B1 = self.B1 - self.LR * self.dB1    
        
        self.W2 = self.W2 - self.LR * self.dW2  
        self.B2 = self.B2 - self.LR * self.dB2
        
        self.W3 = self.W3 - self.LR * self.dW3  
        self.B3 = self.B3 - self.LR * self.dB3
    
    def get_predictions(self, A):
        return np.argmax(A, 0)
    
    def get_accuracy(self, predictions, Y):
        return np.sum(predictions == Y) / Y.size
    
    def predict(self, X):
        self.forward_propagation(X)
        return self.get_predictions(self.A3)
    
    def gradient_descent(self):
        train = []
        test = []
        
        for i in range(self.iterations + 1):
            self.forward_propagation(self.X_train)
            self.backward_propagation()
            self.update_params()
            
            training_predictions = self.get_predictions(self.A3)
            training_accuracy = self.get_accuracy(training_predictions, self.Y_train)
            test_predictions = self.predict(self.X_test)
            test_accuracy = self.get_accuracy(test_predictions, self.Y_test)
    
            train.append(training_accuracy)
            test.append(test_accuracy)
    
            if i % 10 == 0:
                print("Iteration: ", i)
                print(training_accuracy)
    
        return train, test

In [18]:
LR = 0.1
ITERATIONS = 3000
layer_dimensions = [784, 100, 100, 10]

nn = NeuralNetwork(X_train, Y_train, X_test, Y_test, LR, ITERATIONS, layer_dimensions)
train, test = nn.gradient_descent()

Iteration:  0
0.09925170068027211
Iteration:  10
0.5733333333333334


KeyboardInterrupt: 

In [None]:
index = 2900
train[index], test[index]

In [None]:
import plotly.graph_objs as go
import plotly.io as pio

iterations = [i for i in range(ITERATIONS + 1)]

train_data = pd.DataFrame({
    'x': iterations,
    'y': train
})

test_data = pd.DataFrame({
    'x': iterations,
    'y': test
})

trace1 = go.Scatter(x = train_data.x,
                    y = train_data.y,
                    mode = "lines",
                    name = "training",
                    marker = dict(color = 'rgba(16, 112, 2, 0.8)'))

trace2 = go.Scatter(x = test_data.x,
                    y = test_data.y,
                    mode = "lines+markers",
                    name = "test",
                    marker = dict(color = 'rgba(80, 26, 80, 0.8)'))

data = [trace1, trace2]
fig = dict(data = data)
pio.show(fig)

In [9]:
class NeuralNetwork:
    def __init__(self, X_train, Y_train, X_test, Y_test, LR, iterations, layer_dimensions):
        self.X_train = X_train
        self.Y_train = Y_train
        self.Y_one_hot = self.one_hot(Y_train)
        self.X_test = X_test
        self.Y_test = Y_test
        self.LR = LR
        self.iterations = iterations
        self.layer_dimensions = layer_dimensions

        W, B = self.init_params(layer_dimensions)
        self.W = W
        self.B = B

        self.Z = [None for i in range(len(layer_dimensions))]
        self.Z1 = None
        self.Z2 = None
        self.Z3 = None

        self.A = [X_train] + [None for i in range(len(layer_dimensions) - 1)]
        self.A1 = None
        self.A2 = None
        self.A3 = None

        self.dW1 = None
        self.dB1 = None
        self.dW2 = None
        self.dB2 = None
        self.dW3 = None
        self.dB3 = None

    def init_params(self, layer_dimensions):
        params = {}
        W = []
        B = []
    
        for l in range(1, len(layer_dimensions)):
            current_layer_dimension = layer_dimensions[l]
            previous_layer_dimension = layer_dimensions[l - 1]
            w = np.random.randn(current_layer_dimension, previous_layer_dimension) * np.sqrt(2 / previous_layer_dimension)
            b = np.random.randn(current_layer_dimension, 1)
            W.append(w)
            B.append(b)
    
        return B, W

    def one_hot(self, Y):
        one_hot_Y = np.zeros((Y.size, Y.max() + 1))
        one_hot_Y[np.arange(Y.size), Y] = 1
        return one_hot_Y.T
    
    def ReLU(self, Z):
        return np.maximum(Z, 0)

    def derivative_of_ReLU(self, Z):
        return Z > 0
    
    def LeakyReLU(self, Z, alpha=0.01):
        return np.where(Z > 0, Z, alpha * Z)
    
    def derivative_of_LeakyReLU(self, Z, alpha=0.01):
        return np.where(Z > 0, 1, alpha)
    
    def softmax(self, Z):
        expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
        return expZ / np.sum(expZ, axis=0, keepdims=True)
        
    def _is_last_layer(layer):
        layer == len(self.layer_dimensions) - 1

    def forward_propagation(self):
        for layer in range(1, len(self.layer_dimensions)):
            self.Z[layer] = self.W[layer].dot(A[layer - 1]) + self.B[layer]

            if is_last_layer(layer):
                self.A[layer] = self.softmax(self.Z[layer])
            else:
                self.A[layer] = self.LeakyReLU(self.Z[layer])
    
    def backward_propagation(self, lambda_reg=0.01):
        self.dZ3 = self.A3 - self.Y_one_hot
        self.dW3 = 1 / m * self.dZ3.dot(self.A2.T) + (lambda_reg / m) * self.W3
        self.dB3 = 1 / m * np.sum(self.dZ3)
        
        self.dZ2 = self.W3.T.dot(self.dZ3) * self.derivative_of_LeakyReLU(self.Z2)
        self.dW2 = 1 / m * self.dZ2.dot(self.A1.T) + (lambda_reg / m) * self.W2
        self.dB2 = 1 / m * np.sum(self.dZ2)
        
        self.dZ1 = self.W2.T.dot(self.dZ2) * self.derivative_of_LeakyReLU(self.Z1)
        self.dW1 = 1 / m * self.dZ1.dot(self.X_train.T) + (lambda_reg / m) * self.W1
        self.dB1 = 1 / m * np.sum(self.dZ1)
    
    def update_params(self):
        self.W1 = self.W1 - self.LR * self.dW1
        self.B1 = self.B1 - self.LR * self.dB1    
        
        self.W2 = self.W2 - self.LR * self.dW2  
        self.B2 = self.B2 - self.LR * self.dB2
        
        self.W3 = self.W3 - self.LR * self.dW3  
        self.B3 = self.B3 - self.LR * self.dB3
    
    def get_predictions(self, A):
        return np.argmax(A, 0)
    
    def get_accuracy(self, predictions, Y):
        return np.sum(predictions == Y) / Y.size
    
    def predict(self, X):
        self.forward_propagation(X)
        return self.get_predictions(self.A3)
    
    def gradient_descent(self):
        train = []
        test = []
        
        for i in range(self.iterations + 1):
            self.forward_propagation(self.X_train)
            self.backward_propagation()
            self.update_params()
            
            training_predictions = self.get_predictions(self.A3)
            training_accuracy = self.get_accuracy(training_predictions, self.Y_train)
            test_predictions = self.predict(self.X_test)
            test_accuracy = self.get_accuracy(test_predictions, self.Y_test)
    
            train.append(training_accuracy)
            test.append(test_accuracy)
    
            if i % 10 == 0:
                print("Iteration: ", i)
                print(training_accuracy)
    
        return train, test

LR = 0.1
ITERATIONS = 3000
layer_dimensions = [784, 100, 100, 10]

nn = NeuralNetwork(X_train, Y_train, X_test, Y_test, LR, ITERATIONS, layer_dimensions)
train, test = nn.gradient_descent()
nn.predict(X_test)

[array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 None,
 None,
 None]