In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the training CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../input/digit-recognizer/train.csv")
# Kept as the cell's last expression so the first rows are rendered.
data.head()

In [3]:
# Fixed seed so the row shuffle below (and later np.random draws, e.g. the
# weight initialisation) are repeatable after Restart & Run All.
SEED = 42
np.random.seed(SEED)

# np.array() always returns a fresh, writable copy and accepts both a
# DataFrame and an ndarray, so re-running this cell no longer fails on
# `.to_numpy()`, and the in-place shuffle never touches (or is blocked by)
# the frame's own buffer.
data = np.array(data)
samples, features = data.shape

# Shuffle the rows in place before the train/test split.
np.random.shuffle(data)

In [4]:
# Number of (already shuffled) rows held out for testing.
N_TEST = 1000
# Scaling constant for the feature rows.
# NOTE(review): assumes 8-bit pixel intensities in 0-255 -- confirm against
# the dataset description.
PIXEL_MAX = 255.0

# Transpose so that every column is one sample: row 0 is read as the label,
# rows 1..features-1 as the inputs.
test_data = data[0:N_TEST].T
# Scale inputs to [0, 1]; raw intensities drive the pre-activations into the
# hundreds/thousands and saturate (or overflow) the softmax.
X_test = test_data[1:features] / PIXEL_MAX
Y_test = test_data[0]

train_data = data[N_TEST:samples].T
X_train = train_data[1:features] / PIXEL_MAX
Y_train = train_data[0]

In [5]:
# Sanity check: X_train was built from the transposed split, so its shape
# reads (feature rows, sample columns).
X_train.shape

In [18]:
# Sanity check: Y_train is row 0 of the transposed split, i.e. one label per
# training sample.
Y_train.shape

In [22]:
def initial_parameters(n_inputs=784, n_hidden=10, n_outputs=10):
    """Create the starting weights and biases of the two-layer network.

    Values are drawn uniformly from [-0.5, 0.5). Centring the draw on zero
    matters: with an all-positive [0, 1) initialisation every hidden unit
    receives a large positive pre-activation, so all units start out
    switched on and the softmax input is pushed far from zero.

    Args:
        n_inputs: number of input features (default 784).
        n_hidden: number of hidden units (default 10).
        n_outputs: number of output classes (default 10).

    Returns:
        Tuple ``(first_weights, first_bias, second_weights, second_bias)``
        with shapes ``(n_hidden, n_inputs)``, ``(n_hidden, 1)``,
        ``(n_outputs, n_hidden)`` and ``(n_outputs, 1)``.
    """
    first_weights = np.random.rand(n_hidden, n_inputs) - 0.5
    first_bias = np.random.rand(n_hidden, 1) - 0.5
    second_weights = np.random.rand(n_outputs, n_hidden) - 0.5
    second_bias = np.random.rand(n_outputs, 1) - 0.5
    return first_weights, first_bias, second_weights, second_bias

"""def Leaky_ReLU(step_1):
    for i in step_1:
        if i>0:
            return i
        else:
            return 0.01*i
        
def LR_derivative(step_2, alpha):
    derivatives = np.ones_like(step_2).astype(np.float32)
    derivatives[step_2 < 0] = alpha
    return derivatives
"""
def ReLU(Z):
    """Rectified linear unit: element-wise maximum of ``Z`` and zero."""
    floor = 0
    rectified = np.maximum(Z, floor)
    return rectified
def ReLU_deriv(Z):
    """Derivative of ReLU as a boolean mask: True where ``Z`` is strictly positive."""
    is_positive = 0 < Z
    return is_positive
        
def Softmax(step_3):
    """Convert class scores into probabilities, independently per column.

    Args:
        step_3: array of scores laid out as (classes, samples); a 1-D vector
            of scores for a single sample is also accepted.

    Returns:
        Array of the same shape whose entries along axis 0 sum to 1.
    """
    # Subtracting the per-column maximum leaves the result unchanged
    # mathematically but keeps np.exp from overflowing on large scores.
    shifted = step_3 - np.max(step_3, axis=0, keepdims=True)
    exp_scores = np.exp(shifted)
    # Normalise each sample (column) on its own; dividing by the sum over the
    # whole matrix would make all samples share a single distribution.
    prob_array = exp_scores / exp_scores.sum(axis=0, keepdims=True)
    return prob_array

def One_Hot_Encoder(Y_train):
    """One-hot encode integer labels into a (classes, samples) matrix.

    The number of classes is taken as ``Y_train.max() + 1``; column ``j``
    holds a single 1 in the row given by label ``j``.
    """
    n_samples = Y_train.size
    n_classes = Y_train.max() + 1
    encoded = np.zeros((n_classes, n_samples))
    sample_columns = np.arange(n_samples)
    encoded[Y_train, sample_columns] = 1
    return encoded

def forward_pass(first_weights, first_bias, second_weights, second_bias, X=None):
    """Run the network forward: linear -> ReLU -> linear -> Softmax.

    Args:
        first_weights, first_bias: parameters of the hidden layer.
        second_weights, second_bias: parameters of the output layer.
        X: input matrix with one sample per column. When omitted, the
            notebook-level ``X_train`` is used, which is what the original
            four-argument call did; passing ``X`` lets the same function be
            used on held-out data.

    Returns:
        Tuple ``(step_1, step_2, step_3, step_4)``: hidden pre-activation,
        hidden activation, output scores and output probabilities.
    """
    if X is None:
        X = X_train
    step_1 = first_weights.dot(X) + first_bias
    step_2 = ReLU(step_1)
    step_3 = second_weights.dot(step_2) + second_bias
    step_4 = Softmax(step_3)
    return step_1, step_2, step_3, step_4

def backward_pass(step_1, step_2, step_3, step_4, seond_weights, X_train, Y_train):
    """Compute the parameter gradients for one full-batch step.

    Args:
        step_1: hidden-layer pre-activation from the forward pass.
        step_2: hidden-layer activation (ReLU of ``step_1``).
        step_3: output scores; accepted for symmetry with the forward pass
            but not needed by the gradient formulas.
        step_4: output probabilities (Softmax of ``step_3``).
        seond_weights: output-layer weight matrix. The parameter keeps its
            original spelling so the signature is unchanged.
        X_train: input matrix with one sample per column.
        Y_train: 1-D array of integer labels, one per sample.

    Returns:
        Tuple ``(d_first_weights, d_first_bias, d_second_weights,
        d_second_bias)``; each gradient has the shape of the parameter it
        belongs to (biases as column vectors).
    """
    # Use the weights that were passed in rather than a same-named global.
    second_weights = seond_weights
    # Labels are 1-D, so the sample count is their size (shape[1] does not exist).
    m = Y_train.size
    binary_labels = One_Hot_Encoder(Y_train)

    # Gradient of the loss w.r.t. the output scores.
    d_step_3 = step_4 - binary_labels
    # The output layer's input is the hidden *activation* step_2, not step_1.
    d_second_weights = 1/m * d_step_3.dot(step_2.T)
    # One bias gradient per output unit: sum over samples only.
    d_second_bias = 1/m * np.sum(d_step_3, axis=1, keepdims=True)

    # Back-propagate through the ReLU, gated by the sign of its input.
    d_step_1 = second_weights.T.dot(d_step_3) * ReLU_deriv(step_1)
    d_first_weights = 1/m * d_step_1.dot(X_train.T)
    d_first_bias = 1/m * np.sum(d_step_1, axis=1, keepdims=True)
    return d_first_weights, d_first_bias, d_second_weights, d_second_bias

def update_parameters(first_weights, first_bias, second_weights, second_bias,
                      d_first_weights, d_first_bias, d_second_weights, d_second_bias,
                      learning_rate):
    """Apply one gradient-descent step to all four parameters.

    Each parameter is moved against its gradient, scaled by
    ``learning_rate``. Returns the updated
    ``(first_weights, first_bias, second_weights, second_bias)`` tuple.
    """
    current = (first_weights, first_bias, second_weights, second_bias)
    gradients = (d_first_weights, d_first_bias, d_second_weights, d_second_bias)
    stepped = tuple(
        value - gradient * learning_rate
        for value, gradient in zip(current, gradients)
    )
    return stepped

def predictions(step_4):
    """Return the predicted class of every sample.

    Args:
        step_4: array laid out as (classes, samples), e.g. the Softmax output.

    Returns:
        1-D array holding, for each column, the row index of its largest entry.
    """
    return np.argmax(step_4, 0)


# The helper was originally defined under this misspelled name; keep it as an
# alias so code using either spelling resolves to the same function.
preditions = predictions

def accuracy(predictions, Y_train):
    """Return the fraction of predictions that equal the true labels.

    Args:
        predictions: 1-D array of predicted class indices.
        Y_train: 1-D array of true labels, same length as ``predictions``.

    Returns:
        Share of matching positions, between 0 and 1.
    """
    # The leftover debug print of both full arrays was removed: it flooded the
    # cell output on every call without adding information.
    return np.sum(predictions == Y_train)/Y_train.shape[0]


# Alias so that code calling this helper as `get_accuracy` also resolves.
get_accuracy = accuracy


In [23]:
def gradient_descent(X_train, Y_train, learning_rate, epochs):
    """Train the two-layer network with full-batch gradient descent.

    Args:
        X_train: input matrix with one sample per column.
        Y_train: 1-D array of integer labels, one per sample.
        learning_rate: step size applied to every gradient.
        epochs: number of full-batch update steps to run.

    Returns:
        Tuple ``(first_weights, first_bias, second_weights, second_bias)``
        holding the parameters after the last update.
    """
    first_weights, first_bias, second_weights, second_bias = initial_parameters()
    for epoch in range(epochs):
        # NOTE(review): forward_pass is called without an input matrix, so it
        # operates on the notebook-level X_train; the X_train argument of this
        # function only reaches backward_pass. Keep the two in sync.
        step_1, step_2, step_3, step_4 = forward_pass(first_weights, first_bias,
                                                      second_weights, second_bias)

        d_first_weights, d_first_bias, d_second_weights, d_second_bias = backward_pass(
            step_1, step_2, step_3, step_4, second_weights, X_train, Y_train)

        first_weights, first_bias, second_weights, second_bias = update_parameters(
            first_weights, first_bias, second_weights, second_bias,
            d_first_weights, d_first_bias, d_second_weights, d_second_bias,
            learning_rate)

        # Report progress every 10th epoch, using the probabilities computed
        # before this epoch's update.
        if (epoch % 10) == 0:
            print(f"Epochs: {epoch}/{epochs}")
            # `preditions` and `accuracy` are the names these helpers are
            # available under in this notebook; the previous calls to
            # `predictions` / `get_accuracy` raised NameError.
            pred = preditions(step_4)
            print(accuracy(pred, Y_train))
    return first_weights, first_bias, second_weights, second_bias

In [24]:
# Hyperparameters of the training run.
learning_rate, epochs = 0.01, 150

# Train on the training split and keep the learned parameters.
first_weights, first_bias, second_weights, second_bias = gradient_descent(
    X_train=X_train,
    Y_train=Y_train,
    learning_rate=learning_rate,
    epochs=epochs,
)