In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import time

In [123]:
# Load the labelled training table and separate the 'label' column from the features.
train_data = pd.read_csv('train.csv')
y_train = train_data['label']
X_train = train_data.drop(labels=['label'], axis=1)
del train_data  # the combined frame is not used again after the split above

# Divide the features by 255 and one-hot encode the labels into 10 classes.
X_train /= 255.0
y_train = to_categorical(y_train, num_classes=10)

# Hold out 10% of the rows for validation, then transpose every split so that
# samples run along the columns (features/classes along the rows).
X_train, X_val, y_train, y_val = (
    split.T
    for split in train_test_split(X_train, y_train, test_size=0.1)
)

print('X_train.shape', X_train.shape)
print('X_val.shape', X_val.shape)
print('y_train.shape', y_train.shape)
print('y_val.shape', y_val.shape)

X_train.shape (784, 37800)
X_val.shape (784, 4200)
y_train.shape (10, 37800)
y_val.shape (10, 4200)


In [191]:
def ReLU(z):
    """Rectified linear unit: element-wise maximum of 0 and ``z``."""
    rectified = np.maximum(0, z)
    return rectified
def ReLU_deriv(z):
    """Return the mask of strictly positive entries of ``z``.

    The boolean result acts as the ReLU derivative (1 where ``z > 0``,
    0 elsewhere) when multiplied into a numeric array.
    """
    positive_mask = 0 < z
    return positive_mask
def softmax(z):
    """Softmax taken along axis 0 (one distribution per column for 2-D input).

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps every exponent <= 0, so
    large logits no longer overflow ``np.exp`` and produce ``inf / inf = nan``.

    Parameters
    ----------
    z : array-like
        Logits; normalisation is performed along axis 0.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``z`` whose entries along axis 0 sum to 1.
    """
    shifted = z - np.max(z, axis=0, keepdims=True)
    # Exponentiate once and reuse the result for numerator and denominator.
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=0, keepdims=True)

In [195]:
def init_params(n_input=784, n_hidden=800, n_output=10, seed=None):
    """Create random initial weights and biases for the two-layer network.

    Every entry is drawn uniformly from [-0.5, 0.5).

    Parameters
    ----------
    n_input : int, optional
        Number of input features (rows of ``X``). Defaults to 784.
    n_hidden : int, optional
        Number of hidden units. Defaults to 800.
    n_output : int, optional
        Number of output classes. Defaults to 10.
    seed : int or None, optional
        When given, the values are drawn from a private
        ``np.random.RandomState(seed)`` so the initialisation is reproducible
        and NumPy's global random state is left untouched. When ``None``
        (default), the global ``np.random`` generator is used, exactly as
        before.

    Returns
    -------
    tuple
        ``(w1, b1, w2, b2)`` with shapes ``(n_hidden, n_input)``,
        ``(n_hidden, 1)``, ``(n_output, n_hidden)`` and ``(n_output, 1)``.
    """
    # Both the np.random module and RandomState expose the same rand(d0, d1) API.
    rand = np.random.rand if seed is None else np.random.RandomState(seed).rand
    w1 = rand(n_hidden, n_input) - 0.5
    b1 = rand(n_hidden, 1) - 0.5
    w2 = rand(n_output, n_hidden) - 0.5
    b2 = rand(n_output, 1) - 0.5
    return w1, b1, w2, b2

def forward_prop(w1, b1, w2, b2, X):
    """Run one forward pass through the two-layer network.

    Hidden layer: affine map followed by ``ReLU``.
    Output layer: affine map followed by ``softmax``.

    Returns the tuple ``(Z1, A1, Z2, A2)``: the pre-activation and activation
    of the hidden layer, then those of the output layer.
    """
    hidden_pre = w1.dot(X) + b1
    hidden_act = ReLU(hidden_pre)

    output_pre = w2.dot(hidden_act) + b2
    output_act = softmax(output_pre)

    return hidden_pre, hidden_act, output_pre, output_act

def back_prop(Z1, A1, Z2, A2, w2, X, Y):
    """Compute batch-averaged parameter gradients for the two-layer network.

    The bias gradients are reduced with ``keepdims=True`` instead of being
    reshaped to hard-coded ``(10, 1)`` / ``(800, 1)`` columns, so the function
    works for any hidden/output layer size while returning the same values
    and shapes as before for the 800/10 configuration.

    Parameters
    ----------
    Z1 : ndarray
        Hidden-layer pre-activations, as returned by ``forward_prop``.
    A1 : ndarray
        Hidden-layer activations.
    Z2 : ndarray
        Output-layer pre-activations (unused; kept for interface compatibility).
    A2 : ndarray
        Output-layer activations (network predictions).
    w2 : ndarray
        Output-layer weights.
    X : array-like
        Inputs with samples along axis 1; ``X.shape[1]`` is the batch size.
    Y : ndarray
        Targets with the same shape as ``A2``.

    Returns
    -------
    tuple
        ``(dw1, db1, dw2, db2)``; each ``db`` is a column vector with one row
        per unit of its layer.
    """
    m = X.shape[1]
    inv_m = 1 / m

    # Output-layer error; for a softmax output trained with cross-entropy
    # against one-hot targets this is the gradient w.r.t. Z2.
    dz2 = A2 - Y
    dw2 = inv_m * dz2.dot(A1.T)
    db2 = inv_m * np.sum(dz2, axis=1, keepdims=True)

    # Propagate the error back through w2 and gate it with the ReLU derivative.
    dz1 = (w2.T).dot(dz2) * ReLU_deriv(Z1)
    dw1 = inv_m * dz1.dot(X.T)
    db1 = inv_m * np.sum(dz1, axis=1, keepdims=True)
    return dw1, db1, dw2, db2

def update_params(w1, b1, w2, b2, dw1, db1, dw2, db2, learning_rate):
    """Apply one plain gradient-descent step to every parameter.

    Each parameter is replaced by ``param - learning_rate * grad``; the inputs
    are not modified in place. Returns the new ``(w1, b1, w2, b2)`` tuple.
    """
    params = (w1, b1, w2, b2)
    grads = (dw1, db1, dw2, db2)
    return tuple(
        param - learning_rate * grad
        for param, grad in zip(params, grads)
    )



def get_predictions(a2):
    """Return, for each column of ``a2``, the row index of its largest entry."""
    winning_rows = np.argmax(a2, axis=0)
    return winning_rows

def predict(w1, b1, w2, b2, X):
    """Predict a class index for every column of ``X``.

    Runs ``forward_prop`` with the given parameters and converts the final
    activations into class indices via ``get_predictions``.
    """
    # Only the last element (the output activations) is needed here.
    *_, output_act = forward_prop(w1, b1, w2, b2, X)
    return get_predictions(output_act)

def get_acc(y_hat, y):
    """Fraction of columns whose predicted class matches the target.

    ``y_hat`` holds predicted class indices; ``y`` holds per-class target
    scores with one column per sample and is reduced to class indices with
    ``get_predictions`` before comparison.
    """
    true_classes = get_predictions(y)
    n_correct = np.sum(y_hat == true_classes)
    n_samples = y.shape[1]
    return n_correct / n_samples



In [199]:
def gradient_descent(X, y, learning_rate, iterations):
    """Train the two-layer network with full-batch gradient descent.

    Starts from ``init_params()`` and performs ``iterations`` rounds of
    forward pass, back-propagation and parameter update on the whole of
    ``X``/``y``. Every 10th iteration the training accuracy is printed, using
    the activations computed in that iteration's forward pass (i.e. before
    that iteration's update is applied).

    Returns the trained ``(w1, b1, w2, b2)``.
    """
    params = init_params()
    for step in range(iterations):
        i = step + 1  # report 1-based iteration numbers
        Z1, A1, Z2, A2 = forward_prop(*params, X)
        grads = back_prop(Z1, A1, Z2, A2, params[2], X, y)
        params = update_params(*params, *grads, learning_rate)

        if i % 10 != 0:
            continue
        print('iteration ', i, end='')
        print(' acc: ', get_acc(get_predictions(A2), y))

    w1, b1, w2, b2 = params
    return w1, b1, w2, b2
    
    

In [207]:
# Train on the full training split: 200 full-batch steps with step size 0.04.
w1, b1, w2, b2 = gradient_descent(
    X_train,
    y_train,
    learning_rate=0.04,
    iterations=200,
)

iteration  10 acc:  0.49727513227513226
iteration  20 acc:  0.6536243386243387
iteration  30 acc:  0.7138359788359788
iteration  40 acc:  0.7484920634920635
iteration  50 acc:  0.7722486772486773
iteration  60 acc:  0.7894973544973545
iteration  70 acc:  0.8032010582010582
iteration  80 acc:  0.8135978835978837
iteration  90 acc:  0.8222486772486772
iteration  100 acc:  0.8287566137566138
iteration  110 acc:  0.8357671957671957
iteration  120 acc:  0.8408201058201058
iteration  130 acc:  0.8444973544973545
iteration  140 acc:  0.8490740740740741
iteration  150 acc:  0.8523015873015873
iteration  160 acc:  0.855925925925926
iteration  170 acc:  0.8595502645502645
iteration  180 acc:  0.8624338624338624
iteration  190 acc:  0.8650793650793651
iteration  200 acc:  0.8674338624338624


In [208]:
# Accuracy of the trained parameters on the held-out validation split.
get_acc(y_hat=predict(w1, b1, w2, b2, X=X_val), y=y_val)

0.8590476190476191