In [22]:
import numpy as np
import pandas as pd
data = pd.read_csv('train.csv')

In [23]:
data = np.array(data)
m, n = data.shape

np.random.shuffle(data)

# cross validation data
data_dev = data[0:1000].T

Y_dev = data_dev[0]
X_dev = data_dev[1:n]

data_train = data[1000:m].T
Y_train = data_train[0]
X_train = data_train[1:n]

# Normalize the training and validation data
X_train = X_train / 255.0
X_dev = X_dev / 255.0


In [20]:
def __init__params():
    W1 = np.random.randn(10, 784) * .1
    b1 = np.random.rand(10, 1) * .1
    W2 = np.random.randn(10, 10)
    b2 = np.random.rand(10, 1)
    return W1, b1, W2, b2
    
def forward_prop(W1, b1, W2, b2, X):
    z1 = np.dot(W1, X) + b1
    A1 = ReLu(z1)
    z2 = np.dot(W2, A1) + b2
    A2 = softmax(z2)
    return z1, A1, z2, A2

def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=0, keepdims=True))  # Ensure numerical stability
    return exp_z / np.sum(exp_z, axis=0, keepdims=True)
    
def ReLu(z):
    return np.maximum(0, z)

def one_hot(y):
    one_hot_y = np.zeros((y.size, y.max() + 1)) # Create a zero matrix with size (number of samples, number of classes)
    one_hot_y[np.arange(y.size), y] = 1 # Set the corresponding class indices to 1
    return one_hot_y.T
    
def deriv_ReLu(z):
    return z > 0

def back_prop(z1, A1, z2, A2, W2, x, y):
    m = y.size
    one_hot_y = one_hot(y)
    dz2 = A2 - one_hot_y
    dW2 = 1 / m * np.dot(dz2, A1.T)
    db2 = 1 / m * np.sum(dz2, axis=1, keepdims=True)
    dz1 = np.dot(W2.T, dz2) * deriv_ReLu(z1)
    dW1 = 1 / m * np.dot(dz1, x.T)
    db1 = 1 / m * np.sum(dz1, axis=1, keepdims=True)
    return dW1, db1, dW2, db2

def update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha):
    W1 = W1 - alpha * dW1
    b1 = b1 - alpha * db1
    W2 = W2 - alpha * dW2
    b2 = b2 - alpha * db2
    return W1, b1, W2, b2

In [26]:
def get_predictions(A2):
    return np.argmax(A2, 0)

def get_accuracy(predictions, Y):
    #print(predictions, Y)
    return np.sum(predictions == Y) / Y.size

# def gradient_descent(X, Y, iterations, alpha):
#     W1, b1, W2, b2 = __init__params()
#     for i in range(iterations):
#         z1, A1, z2, A2 = forward_prop(W1, b1, W2, b2, X)
#         dw1, db1, dw2, db2 = back_prop(z1, A1, z2, A2, W2, X, Y)
#         W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dw1, db1, dw2, db2, alpha)
#         if i % 50 == 0:
#             print("Iteration: ", i)
#             print("Accuracy: ", get_accuracy(get_predictions(A2), Y))
#     return W1, b1, W2, b2

def gradient_descent(X_train, Y_train, X_dev, Y_dev, iterations, alpha):
    # Initialize parameters (weights and biases)
    W1, b1, W2, b2 = __init__params()
    
    for i in range(iterations):
        # Forward propagation on training data
        z1, A1, z2, A2 = forward_prop(W1, b1, W2, b2, X_train)
        
        # Backward propagation to compute gradients
        dW1, db1, dW2, db2 = back_prop(z1, A1, z2, A2, W2, X_train, Y_train)
        
        # Update parameters using the gradients
        W1, b1, W2, b2 = update_params(W1, b1, W2, b2, dW1, db1, dW2, db2, alpha)
        
        # Print accuracies every 50 iterations
        if i % 50 == 0:
            # Training set accuracy
            print(f"Iteration: {i}")
            train_predictions = get_predictions(A2)
            train_accuracy = get_accuracy(train_predictions, Y_train)
            print(f"Training Accuracy: {train_accuracy}")
            
            # Evaluate on development set (dev set) without updating weights
            _, _, _, A2_dev = forward_prop(W1, b1, W2, b2, X_dev)
            dev_predictions = get_predictions(A2_dev)
            dev_accuracy = get_accuracy(dev_predictions, Y_dev)
            print(f"Dev Accuracy: {dev_accuracy}")
            
    return W1, b1, W2, b2


#W1, b1, W2, b2 = gradient_descent(X_train, Y_train, 500, 0.1)
W1, b1, W2, b2 = gradient_descent(X_train, Y_train, X_dev, Y_dev, 550, 0.1)

        

Iteration: 0
Training Accuracy: 0.1203170731707317
Dev Accuracy: 0.185
Iteration: 50
Training Accuracy: 0.5270731707317073
Dev Accuracy: 0.544
Iteration: 100
Training Accuracy: 0.6883170731707317
Dev Accuracy: 0.734
Iteration: 150
Training Accuracy: 0.7420487804878049
Dev Accuracy: 0.774
Iteration: 200
Training Accuracy: 0.780829268292683
Dev Accuracy: 0.807
Iteration: 250
Training Accuracy: 0.7980731707317074
Dev Accuracy: 0.818
Iteration: 300
Training Accuracy: 0.8100975609756098
Dev Accuracy: 0.832
Iteration: 350
Training Accuracy: 0.8178292682926829
Dev Accuracy: 0.835
Iteration: 400
Training Accuracy: 0.823780487804878
Dev Accuracy: 0.842
Iteration: 450
Training Accuracy: 0.8298292682926829
Dev Accuracy: 0.849
Iteration: 500
Training Accuracy: 0.8341951219512195
Dev Accuracy: 0.851
