# Neural Network from scratch

In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

data = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')

In [5]:
data

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41997,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41998,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
data = np.array(data)
m, n = data.shape
m, n

(42000, 785)

In [7]:
number_of_tests = int(m * 0.3)

np.random.shuffle(data) # shuffle before splitting into test and training sets

test_data = data[0:number_of_tests].T
Y_test = test_data[0]
X_test = test_data[1:n]
X_test = X_test / 255.0

train_data = data[number_of_tests:m].T
Y_train = train_data[0]
X_train = train_data[1:n]
X_train = X_train / 255.0

In [None]:
def init_params():
    W1 = np.random.rand(100, 784) * 0.01
    B1 = np.zeros((100, 1))
    W2 = np.random.rand(10, 100) * 0.01
    B2 = np.zeros((10, 1))
    return W1, B1, W2, B2

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z):
    return np.exp(Z) / sum(np.exp(Z))
    
def forward_propagation(W1, B1, W2, B2, X):
    Z1 = W1.dot(X) + B1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + B2
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2

def derivative_of_ReLU(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y.T

def backward_propagation(Z1, A1, Z2, A2, W1, W2, X, Y):
    dZ2 = A2 - Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    dB2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * derivative_of_ReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    dB1 = 1 / m * np.sum(dZ1)
    return dW1, dB1, dW2, dB2

def update_params(W1, B1, W2, B2, dW1, dB1, dW2, dB2, LR):
    W1 = W1 - LR * dW1
    B1 = B1 - LR * dB1    
    W2 = W2 - LR * dW2  
    B2 = B2 - LR * dB2    
    return W1, B1, W2, B2

In [None]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, LR, iterations):
    W1, B1, W2, B2 = init_params()
    one_hot_Y = one_hot(Y)
    
    for i in range(iterations + 1):
        Z1, A1, Z2, A2 = forward_propagation(W1, B1, W2, B2, X)
        dW1, db1, dW2, db2 = backward_propagation(Z1, A1, Z2, A2, W1, W2, X, one_hot_Y)
        W1, B1, W2, B2 = update_params(W1, B1, W2, B2, dW1, db1, dW2, db2, LR)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A2)
            print(get_accuracy(predictions, Y))
    return W1, B1, W2, B2

In [None]:
# W1, B1, W2, B2, W3, B3 = gradient_descent(X_train, Y_train, 0.0001, 500)
W1, B1, W2, B2 = gradient_descent(X_train, Y_train, 0.1, 500)

~85% accuracy on training set.

In [None]:
def make_predictions(X, W1, B1, W2, B2):
    _, _, _, A2 = forward_propagation(W1, B1, W2, B2, X)
    predictions = get_predictions(A2)
    return predictions

Let's look at a couple of examples:

In [None]:
test_predictions = make_predictions(X_test, W1, B1, W2, B2)
get_accuracy(test_predictions, Y_test)

Still 84% accuracy, so our model generalized from the training data pretty well.

In [12]:
def init_params():
    W1 = np.random.randn(100, 784) * np.sqrt(2 / 784)
    B1 = np.zeros((100, 1))
    W2 = np.random.randn(100, 100) * np.sqrt(2 / 10) 
    B2 = np.zeros((100, 1))
    W3 = np.random.randn(10, 100) * np.sqrt(2 / 10) 
    B3 = np.zeros((10, 1))
    return W1, B1, W2, B2, W3, B3

def ReLU(Z):
    return np.maximum(Z, 0)

def LeakyReLU(Z, alpha=0.01):
    return np.where(Z > 0, Z, alpha * Z)

def derivative_of_LeakyReLU(Z, alpha=0.01):
    return np.where(Z > 0, 1, alpha)

def softmax(Z):
    expZ = np.exp(Z - np.max(Z, axis=0, keepdims=True))
    return expZ / np.sum(expZ, axis=0, keepdims=True)
    
def forward_propagation(W1, B1, W2, B2, W3, B3, X):
    Z1 = W1.dot(X) + B1
    A1 = LeakyReLU(Z1)
    Z2 = W2.dot(A1) + B2
    A2 = LeakyReLU(Z2)
    Z3 = W3.dot(A2) + B3
    A3 = softmax(Z3)
    return Z1, A1, Z2, A2, Z3, A3

def derivative_of_ReLU(Z):
    return Z > 0

def one_hot(Y):
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    return one_hot_Y.T

def backward_propagation(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, Y, lambda_reg=0.01):
    dZ3 = A3 - Y
    dW3 = 1 / m * dZ3.dot(A2.T) + (lambda_reg / m) * W3
    dB3 = 1 / m * np.sum(dZ3)
    
    dZ2 = W3.T.dot(dZ3) * derivative_of_LeakyReLU(Z2)
    dW2 = 1 / m * dZ2.dot(A1.T) + (lambda_reg / m) * W2
    dB2 = 1 / m * np.sum(dZ2)
    
    dZ1 = W2.T.dot(dZ2) * derivative_of_LeakyReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T) + (lambda_reg / m) * W1
    dB1 = 1 / m * np.sum(dZ1)
    
    return dW1, dB1, dW2, dB2, dW3, dB3

def update_params(W1, B1, W2, B2, W3, B3, dW1, dB1, dW2, dB2, dW3, dB3, LR):
    W1 = W1 - LR * dW1
    B1 = B1 - LR * dB1    
    W2 = W2 - LR * dW2  
    B2 = B2 - LR * dB2
    W3 = W3 - LR * dW3  
    B3 = B3 - LR * dB3
    return W1, B1, W2, B2, W3, B3

def get_predictions(A):
    return np.argmax(A, 0)

def get_accuracy(predictions, Y):
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, LR, iterations):
    W1, B1, W2, B2, W3, B3 = init_params()
    one_hot_Y = one_hot(Y)
    
    for i in range(iterations + 1):
        Z1, A1, Z2, A2, Z3, A3 = forward_propagation(W1, B1, W2, B2, W3, B3, X)
        dW1, dB1, dW2, dB2, dW3, dB3 = backward_propagation(Z1, A1, Z2, A2, Z3, A3, W1, W2, W3, X, one_hot_Y)
        W1, B1, W2, B2, W3, B3 = update_params(W1, B1, W2, B2, W3, B3, dW1, dB1, dW2, dB2, dW3, dB3, LR)
        if i % 10 == 0:
            print("Iteration: ", i)
            predictions = get_predictions(A3)
            print(get_accuracy(predictions, Y))
    return W1, B1, W2, B2, W3, B3

In [14]:
W1, B1, W2, B2, W3, B3 = gradient_descent(X_train, Y_train, 0.1, 900)

Iteration:  0
0.10595238095238095
Iteration:  10
0.21636054421768708
Iteration:  20
0.31010204081632653
Iteration:  30
0.47183673469387755
Iteration:  40
0.4496938775510204
Iteration:  50
0.6715646258503402
Iteration:  60
0.704047619047619
Iteration:  70
0.7734353741496599
Iteration:  80
0.8345578231292518
Iteration:  90
0.809625850340136
Iteration:  100
0.8687414965986394
Iteration:  110
0.8777891156462585
Iteration:  120
0.8601360544217687
Iteration:  130
0.8849319727891156
Iteration:  140
0.8929931972789116
Iteration:  150
0.8976190476190476
Iteration:  160
0.9012585034013605
Iteration:  170
0.9044897959183673
Iteration:  180
0.906734693877551
Iteration:  190
0.8131292517006803
Iteration:  200
0.9093877551020408
Iteration:  210
0.9130952380952381
Iteration:  220
0.9151700680272109
Iteration:  230
0.9172789115646258
Iteration:  240
0.9192857142857143
Iteration:  250
0.9209863945578232
Iteration:  260
0.9226190476190477
Iteration:  270
0.9235714285714286
Iteration:  280
0.921836734693

In [15]:
def make_predictions(X, W1, B1, W2, B2, W3, B3):
    _, _, _, _, _, A3 = forward_propagation(W1, B1, W2, B2, W3, B3, X)
    predictions = get_predictions(A3)
    return predictions

In [17]:
test_predictions = make_predictions(X_test, W1, B1, W2, B2, W3, B3)
get_accuracy(test_predictions, Y_test)

0.9466666666666667