### Building a two-layer Neural Net in Numpy using the MNIST dataset

##### Load MNIST datasets from:
https://s3.amazonaws.com/jrwprojects/fashion_mnist_train_images.npy
https://s3.amazonaws.com/jrwprojects/fashion_mnist_train_labels.npy
https://s3.amazonaws.com/jrwprojects/fashion_mnist_test_images.npy
https://s3.amazonaws.com/jrwprojects/fashion_mnist_test_labels.npy

In [57]:
import numpy as np

In [None]:
X_train = np.load('data/fashion_mnist_train_images.npy')
y_train = np.load('data/fashion_mnist_train_labels.npy')

X_test = np.load('data/fashion_mnist_test_images.npy')
y_test = np.load('data/fashion_mnist_test_labels.npy')

In [59]:
print(X_train.shape)

(60000, 784)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)

x_scaler = MinMaxScaler()
X_train_scaled = x_scaler.fit_transform(X_train)
X_val_scaled = x_scaler.transform(X_val)
X_test_scaled = x_scaler.transform(X_test)

(48000, 784)
(12000, 784)
(48000,)


In [72]:
def softmax(logits):
        exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))  # Numerical stability improvement
        return exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

def relu(x):
    return np.maximum(x, 0)

class NN():
    def __init__(self, input_size, hidden_size, output_size):
        np.random.seed(91)
        self.w1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))

        self.w2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    def forward(self, x):
        z = np.dot(x, self.w1) + self.b1 # shape (batch_size, hidden_size)
        a = relu(z)
        z2 = np.dot(a, self.w2) + self.b2 # shape (batch_size, output_size)
        y_pred = softmax(z2)
        return z, a, z2, y_pred # shape (batch_size, output_size)
    
    def loss_fn(self, batch_size, y, y_pred, alpha):
        predicted_probs = y_pred[range(batch_size), y]
        log_likelihood = -np.log(predicted_probs)
        loss = np.sum(log_likelihood) / batch_size

        l2 = (alpha / 2) * (np.sum(self.w1**2) + np.sum(self.w2**2))
        loss += l2
        return loss
    
    def backprop(self, batch_size, y, y_pred, x, z, a, lr):
        y_onehot = np.zeros_like(y_pred)
        y_onehot[np.arange(batch_size), y] = 1

        # gradients
        d_z2 = (y_pred - y_onehot) / y_pred.shape[0]
        d_w2 = a.T @ d_z2  # Gradient of W2
        d_b2 = np.sum(d_z2, axis=0, keepdims=True)  # Gradient of b2
        d_a = d_z2 @ self.w2.T  # Backpropagate through W2
        d_z1 = d_a * (z > 0)  # Backpropagate through ReLU (derivative of ReLU)
        d_w1 = x.T @ d_z1  # Gradient of W1
        d_b1 = np.sum(d_z1, axis=0, keepdims=True)  # Gradient of b1

        # Gradient descent update (with learning rate)
        self.w1 -= lr * d_w1
        self.b1 -= lr * d_b1
        self.w2 -= lr * d_w2
        self.b2 -= lr * d_b2 

In [91]:
input_size = X_train_scaled.shape[1]
hidden_size = 40
output_size = 10 # 10 class labels
alpha = 0.1
batch_size = 64
epochs = 10
lr = 0.0002

model = NN(input_size, hidden_size, output_size)

def train(X, y, epochs, num_samples):
    total_loss = 0
    for _ in range(epochs):
        indices = np.arange(num_samples)
        np.random.shuffle(indices)

        epoch_loss = 0
        for i in range(0, num_samples, batch_size):
            batch = indices[i : i + batch_size]
            X_batch, y_batch = X[batch], y[batch]

            cur_batch_size = len(X_batch)
            z, a, _, y_pred = model.forward(X_batch)
            loss = model.loss_fn(cur_batch_size, y_batch, y_pred, alpha)
            model.backprop(cur_batch_size, y_batch, y_pred, X_batch, z, a, lr)
            epoch_loss += loss
        epoch_loss = epoch_loss / (num_samples // batch_size) # num batches
        # print(f"Epoch loss: {epoch_loss}")
        total_loss += epoch_loss
    # print(f"Avg_loss: {total_loss / epochs}") 

def val(X, y, num_samples):
    total_loss = 0
    for i in range(0, num_samples, batch_size):
        X_batch = X[i : i + batch_size]
        y_batch = y[i : i + batch_size]
        cur_batch_size = len(X_batch)

        _, _, _, y_pred = model.forward(X_batch)
        prediction = np.argmax(y_pred, axis=1)
        loss = model.loss_fn(cur_batch_size, y_batch, y_pred, alpha)
        total_loss += (loss / (num_samples // batch_size))
        accuracy = np.sum(prediction == y_batch) / batch_size
    # print(f"Val loss: {total_loss}")
    # print(f"Accuracy: {accuracy}")
    return total_loss, accuracy


In [80]:
train(X_train_scaled, y_train, num_samples = X_train_scaled.shape[0])
val(X_val_scaled, y_val, num_samples = X_val_scaled.shape[0])

Epoch loss: 2.4591702298250753
Epoch loss: 2.45752670747476
Epoch loss: 2.45566420908833
Epoch loss: 2.4534551569578116
Epoch loss: 2.4507302094204086
Epoch loss: 2.447303064011403
Epoch loss: 2.442968021391681
Epoch loss: 2.43748816391725
Epoch loss: 2.430606421757479
Epoch loss: 2.422077535169948
Avg_loss: 2.4456989719014146
Total loss: 2.429888749565764


In [None]:
from itertools import product

mini_batches = [32, 64, 128]
learning_rates = [1e-3, 1e-4]
epochs = [5, 10, 20]
l2_alphas = [1e-3, 1.0]
hidden_size = [20, 40]

best_params = {}
loss = np.inf

for b, lr, e, l2, h in product(mini_batches, learning_rates, epochs, l2_alphas, hidden_size):
    alpha = l2 
    lr = lr
    model = NN(input_size, h, output_size)
    train(X_train_scaled, y_train, e, num_samples = X_train_scaled.shape[0])
    val_loss, accuracy = val(X_val_scaled, y_val, num_samples = X_val_scaled.shape[0])
    if val_loss <= min(val_loss, loss):
        loss = val_loss
        best_params = {"Batch size" : b, "Learning rate" : lr, "Epochs" : e, "Alpha" : l2, "Hidden size" : h}
        print(f"\nCurrent best params: {best_params}")
        print(f"\nLoss: {val_loss.item():.4f}, Accuracy: {accuracy:.4f}")
print(f"Best params: {best_params}")


Current best params: ('Batch size: 32', 'Learning rate: 0.001', 'Epochs: 5', 'Alpha: 0.001', 'Hidden size: 20')

Loss: 1.9323, Accuracy: 0.2188

Current best params: ('Batch size: 32', 'Learning rate: 0.001', 'Epochs: 5', 'Alpha: 0.001', 'Hidden size: 40')

Loss: 1.8051, Accuracy: 0.2500

Current best params: ('Batch size: 32', 'Learning rate: 0.001', 'Epochs: 10', 'Alpha: 0.001', 'Hidden size: 20')

Loss: 1.2309, Accuracy: 0.2812

Current best params: ('Batch size: 32', 'Learning rate: 0.001', 'Epochs: 10', 'Alpha: 0.001', 'Hidden size: 40')

Loss: 1.1725, Accuracy: 0.2969

Current best params: ('Batch size: 32', 'Learning rate: 0.001', 'Epochs: 20', 'Alpha: 0.001', 'Hidden size: 20')

Loss: 0.8441, Accuracy: 0.3125

Current best params: ('Batch size: 32', 'Learning rate: 0.001', 'Epochs: 20', 'Alpha: 0.001', 'Hidden size: 40')

Loss: 0.8233, Accuracy: 0.3125

Current best params: ('Batch size: 64', 'Learning rate: 0.001', 'Epochs: 20', 'Alpha: 0.001', 'Hidden size: 40')

Loss: 0.823