In [1]:
# library import

import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import torch
import torch.nn.functional as F


In [2]:
# data prep
#
# Loads the training CSV, shuffles its rows, holds out the first DEV_SIZE
# shuffled rows as a dev set, and converts both splits to torch tensors laid
# out with one example per column.

DATA_PATH = 'digit-recognizer/train.csv'
DEV_SIZE = 1000   # number of shuffled rows held out as the dev set
SPLIT_SEED = 0    # fixed seed so the train/dev split is identical on every run

data = pd.read_csv(DATA_PATH)

data = np.array(data)
m, n = data.shape

# Shuffle rows in place with a dedicated seeded generator: the split becomes
# reproducible under Restart & Run All without touching NumPy's global RNG state.
np.random.default_rng(SPLIT_SEED).shuffle(data)

# After the transpose, row 0 is used as the labels and the remaining rows as
# features. Dividing by 255 presumably rescales 8-bit pixel values to [0, 1]
# (TODO confirm against the dataset description).
data_dev = data[:DEV_SIZE].T
Y_dev = data_dev[0]
X_dev = data_dev[1:] / 255

data_train = data[DEV_SIZE:].T
Y_train = data_train[0]
X_train = data_train[1:] / 255

# Features become float32 and labels int64 (torch.long).
# Note: X_dev / Y_dev are rebound from NumPy arrays to tensors here, whereas the
# training tensors get separate *_torch names.
X_train_torch = torch.tensor(X_train, dtype=torch.float32)
Y_train_torch = torch.tensor(Y_train, dtype=torch.long)
X_dev = torch.tensor(X_dev, dtype=torch.float32)
Y_dev = torch.tensor(Y_dev, dtype=torch.long)

In [8]:
# functions

def init_params():
    """Create the parameters of a 784 -> 128 -> 64 -> 10 fully connected network.

    Weights are drawn from a standard normal and scaled by sqrt(2 / fan_in);
    biases start at zero as column vectors of shape (units, 1).

    Returns:
        Tuple ``(W1, b1, W2, b2, W3, b3)``. Every tensor is a leaf tensor with
        ``requires_grad=True``.
    """

    def _scaled_normal(fan_out, fan_in):
        # Scale first, then mark as requiring grad. Setting requires_grad=True
        # on the raw randn tensor and multiplying afterwards would return the
        # *result of an autograd op* (a non-leaf tensor), which never receives
        # .grad and is rejected by torch.optim optimizers.
        scale = torch.sqrt(torch.tensor(2.0 / fan_in))
        return (torch.randn((fan_out, fan_in)) * scale).requires_grad_()

    W1 = _scaled_normal(128, 784)
    b1 = torch.zeros((128, 1), requires_grad=True)

    W2 = _scaled_normal(64, 128)
    b2 = torch.zeros((64, 1), requires_grad=True)

    W3 = _scaled_normal(10, 64)
    b3 = torch.zeros((10, 1), requires_grad=True)

    return W1, b1, W2, b2, W3, b3



def ReLU(Z):
    """Element-wise rectifier: each entry of ``Z`` is replaced by max(entry, 0)."""
    zero = torch.tensor(0.0)
    return torch.maximum(Z, zero)

def softmax(Z):
    """Softmax along dim 0, so every column of the result sums to 1.

    The per-column maximum is subtracted before exponentiating to keep
    ``exp`` from overflowing on large inputs.
    """
    column_max = Z.max(dim=0, keepdim=True).values
    exponentials = (Z - column_max).exp()
    return exponentials / exponentials.sum(dim=0, keepdim=True)

def forward_prop(W1, b1, W2, b2, W3, b3, X):
    """Run the three-layer forward pass.

    Two affine + ReLU layers are followed by an affine + softmax output layer.
    The column-wise softmax and the (units, 1) biases used in this notebook
    imply X holds one example per column.

    Returns:
        ``(Z1, A1, Z2, A2, Z3, A3)``: the pre-activation and activation of each
        layer, in order; ``A3`` is the softmax output.
    """
    pre1 = torch.matmul(W1, X) + b1
    act1 = ReLU(pre1)

    pre2 = torch.matmul(W2, act1) + b2
    act2 = ReLU(pre2)

    pre3 = torch.matmul(W3, act2) + b3
    probs = softmax(pre3)

    return pre1, act1, pre2, act2, pre3, probs

def one_hot(Y, num_classes):
    """One-hot encode integer labels and transpose the result.

    For a 1-D ``Y`` of length N this yields a float tensor of shape
    (num_classes, N) with a single 1.0 per column.
    """
    encoded = F.one_hot(Y, num_classes=num_classes)
    return encoded.float().T

def back_prop(Z1, A1, Z2, A2, Z3, A3, W2, W3, X, Y):
    """Hand-derived gradients for the three-layer network.

    The output-layer error is ``A3 - one_hot(Y)``; it is propagated backwards
    through the two ReLU layers using the masks ``Z2 > 0`` and ``Z1 > 0``.
    Every gradient is averaged over the ``X.shape[1]`` examples.
    ``Z3`` is accepted but not used.

    Returns:
        ``(dW1, db1, dW2, db2, dW3, db3)``.
    """
    inv_m = 1 / X.shape[1]
    targets = one_hot(Y, num_classes=A3.shape[0])

    def _param_grads(delta, layer_input):
        # The 1/m factor is applied to delta *before* the matmul, matching the
        # evaluation order of `(1 / m) * delta @ layer_input.T`.
        weight_grad = (inv_m * delta) @ layer_input.T
        bias_grad = inv_m * torch.sum(delta, dim=1, keepdim=True)
        return weight_grad, bias_grad

    # Output layer.
    delta3 = A3 - targets
    dW3, db3 = _param_grads(delta3, A2)

    # Second hidden layer: back-propagated error gated by the ReLU mask.
    delta2 = (W3.T @ delta3) * (Z2 > 0)
    dW2, db2 = _param_grads(delta2, A1)

    # First hidden layer.
    delta1 = (W2.T @ delta2) * (Z1 > 0)
    dW1, db1 = _param_grads(delta1, X)

    return dW1, db1, dW2, db2, dW3, db3


def update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha):
    """Apply one gradient-descent step, ``p -= alpha * dp``, to every parameter.

    The update is done in place under ``torch.no_grad()``, so the caller's
    tensors are modified and the same objects are returned.

    Returns:
        ``(W1, b1, W2, b2, W3, b3)`` after the step.
    """
    parameters = (W1, b1, W2, b2, W3, b3)
    gradients = (dW1, db1, dW2, db2, dW3, db3)

    with torch.no_grad():
        for param, grad in zip(parameters, gradients):
            param -= alpha * grad  # in-place on the tensor itself

    return parameters


def get_predictions(A2):
    """Return, for each column, the row index holding the largest value.

    Despite the parameter name, ``gradient_descent`` passes the output-layer
    activations (``A3``) here.
    """
    return A2.argmax(dim=0)

def get_accuracy(predictions, Y):
    """Fraction of positions where ``predictions`` equals ``Y``, as a Python float."""
    matches = (predictions == Y).float()
    return matches.mean().item()

def gradient_descent(X, Y, iterations, alpha):
    """Train the three-layer network with full-batch gradient descent.

    Args:
        X: input features passed to ``forward_prop`` (one example per column).
        Y: integer class labels, one per example.
        iterations: number of gradient-descent steps to run.
        alpha: learning rate used by ``update_params``.

    Returns:
        The trained parameters ``(W1, b1, W2, b2, W3, b3)``.

    Every 10th iteration the accuracy on ``(X, Y)`` is printed. It is computed
    from the activations produced *before* that iteration's parameter update.
    """
    W1, b1, W2, b2, W3, b3 = init_params()

    # Gradients are derived by hand in back_prop, but init_params creates the
    # parameters with requires_grad=True. Without no_grad, every forward and
    # backward pass would record an autograd graph over the whole batch that is
    # never used. Disabling tracking leaves the numerics unchanged.
    with torch.no_grad():
        for i in range(iterations):
            Z1, A1, Z2, A2, Z3, A3 = forward_prop(W1, b1, W2, b2, W3, b3, X)
            dW1, db1, dW2, db2, dW3, db3 = back_prop(Z1, A1, Z2, A2, Z3, A3, W2, W3, X, Y)
            W1, b1, W2, b2, W3, b3 = update_params(W1, b1, W2, b2, W3, b3, dW1, db1, dW2, db2, dW3, db3, alpha)

            if i % 10 == 0:
                predictions = get_predictions(A3)
                accuracy = get_accuracy(predictions, Y)
                print(f"Iteration {i}, Accuracy: {accuracy:.4f}")

    return W1, b1, W2, b2, W3, b3





In [12]:
# Train on the full training split: 400 iterations with learning rate 0.2.
W1, b1, W2, b2, W3, b3 = gradient_descent(
    X_train_torch,
    Y_train_torch,
    iterations=400,
    alpha=0.2,
)

Iteration 0, Accuracy: 0.0851
Iteration 10, Accuracy: 0.7394
Iteration 20, Accuracy: 0.6248
Iteration 30, Accuracy: 0.8240
Iteration 40, Accuracy: 0.8555
Iteration 50, Accuracy: 0.8588
Iteration 60, Accuracy: 0.8930
Iteration 70, Accuracy: 0.8956
Iteration 80, Accuracy: 0.8941
Iteration 90, Accuracy: 0.9083
Iteration 100, Accuracy: 0.9128
Iteration 110, Accuracy: 0.9149
Iteration 120, Accuracy: 0.9130
Iteration 130, Accuracy: 0.9180
Iteration 140, Accuracy: 0.9237
Iteration 150, Accuracy: 0.9264
Iteration 160, Accuracy: 0.9284
Iteration 170, Accuracy: 0.9306
Iteration 180, Accuracy: 0.9324
Iteration 190, Accuracy: 0.9338
Iteration 200, Accuracy: 0.9357
Iteration 210, Accuracy: 0.9371
Iteration 220, Accuracy: 0.9385
Iteration 230, Accuracy: 0.9400
Iteration 240, Accuracy: 0.9410
Iteration 250, Accuracy: 0.9422
Iteration 260, Accuracy: 0.9431
Iteration 270, Accuracy: 0.9444
Iteration 280, Accuracy: 0.9454
Iteration 290, Accuracy: 0.9467
Iteration 300, Accuracy: 0.9478
Iteration 310, Accu