In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

#The point of this project was to familiarize myself with one of the most fundamental projects in machine learning, programming an artificial neuronetwork to classify numerical characters on the MNIST data set. 

#The project below was completed using a tutorial created by @Samason Zhang. What makes this project different from typical ANNs trained on MNIST is the fact that no machine learning libraries were used. The only libraries used were Numpy, Pandas, and Matplotlib. 

#The goal of this project was to gain a more intuitive understanding of the mathematical machinery that ANNs are composed of. The hope is to provide a strong theoretical foundation to build on for more advanced ML algorithms. 

In [2]:
data = pd.read_csv('../input/digit-recognizer/train.csv')

In [3]:
print(data.shape) #tells us there are 42,000 samples with 783 inputs + 2 columns for labels
data.head()


(42000, 785)


Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data = np.array(data)
m, n = data.shape                #this is the shape where m is the rows and n is the columns +1 because label 
np.random.shuffle(data)

data_dev = data[0:1000].T    #this gives you 1000 columns for training set
Y_dev  = data_dev[0]
X_dev = data_dev[1:n]
X_dev = X_dev / 255

data_train = data[1000:m].T 
Y_train = data_train[0]
X_train = data_train[1:n]
X_train = X_train / 255
_,m_train = X_train.shape 

In [5]:
def init_params(): 
    W1 = np.random.rand(10, 784) - 0.5 
    b1 = np.random.rand(10, 1) - 0.5 
    W2 = np.random.rand(10,10) - 0.5 
    b2 = np.random.rand(10, 1) - 0.5 
    return W1, b1, W2, b2 

def ReLU(Z):
    return np.maximum(Z, 0)

def softmax(Z): 
    A = np.exp(Z) / sum(np.exp(Z))
    return A

def forward_prop(W1, b1, W2, b2, X): 
    Z1 = W1.dot(X) + b1
    A1 = ReLU(Z1)
    Z2 = W2.dot(A1) + b2 
    A2 = softmax(Z2)
    return Z1, A1, Z2, A2 

def one_hot(Y): 
    one_hot_Y = np.zeros((Y.size, Y.max() + 1))
    one_hot_Y[np.arange(Y.size), Y] = 1
    one_hot_Y = one_hot_Y.T
    return one_hot_Y 

def deriv_ReLU(Z): 
    return Z > 0

def back_prop(Z1, A1, Z2, A2, W1, W2, X, Y): 
    m = Y.size
    one_hot_Y = one_hot(Y)
    dZ2 = A2 - one_hot_Y
    dW2 = 1 / m * dZ2.dot(A1.T)
    db2 = 1 / m * np.sum(dZ2)
    dZ1 = W2.T.dot(dZ2) * deriv_ReLU(Z1)
    dW1 = 1 / m * dZ1.dot(X.T)
    db1 = 1 / m * np.sum(dZ1)
    return dW1, db1, dW2, db2

def update_params(W1,b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1 
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2 
    b2 = b2 - alpha * db2 
    return W1, b1, W2, b2



    
    

In [6]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y): 
    print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

def gradient_descent(X, Y, iterations, alpha): 
    W1, b1, W2, b2 = init_params()
    for i in range(iterations): 
        Z1, A1, Z2, A2 = forward_prop(W1, b1, W2, b2, X)
        dW1, db1, dW2, db2 = back_prop(Z1, A1, Z2, A2, W1, W2, X, Y)
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        if i%10 == 0:
            print("iteration: ", i)
            print("Accuracy: ", get_accuracy(get_predictions(A2), Y))
    return W1, b1, W2, b2

In [7]:
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 500, 0.2)


iteration:  0
[2 9 7 ... 2 6 9] [8 3 3 ... 2 1 6]
Accuracy:  0.08475609756097562
iteration:  10
[2 9 3 ... 7 6 6] [8 3 3 ... 2 1 6]
Accuracy:  0.2553170731707317
iteration:  20
[2 6 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.39778048780487807
iteration:  30
[2 6 3 ... 4 1 2] [8 3 3 ... 2 1 6]
Accuracy:  0.5113414634146342
iteration:  40
[7 6 3 ... 4 1 2] [8 3 3 ... 2 1 6]
Accuracy:  0.5877560975609756
iteration:  50
[7 2 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.6387073170731707
iteration:  60
[8 3 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.6743170731707318
iteration:  70
[8 3 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.7013170731707317
iteration:  80
[8 3 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.7230487804878049
iteration:  90
[8 3 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.7403170731707317
iteration:  100
[8 3 3 ... 8 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.7537073170731707
iteration:  110
[1 3 3 ... 4 1 6] [8 3 3 ... 2 1 6]
Accuracy:  0.7305365853658536
iteration:  120
[8 3 3 ..