# Lab Assignment 1

Student name: [fill in]

## Notebook version

This notebook includes all the codes in the codebase of lab assignment 1. Completing and submitting this script is equivalent to submitting the codebase. Please note that your submitted script should include errorless cell outputs that contain necessary information that proves you have successfully run the notebook in your own directory.

You can choose to (1) run this notebook locally on your end or (2) run this notebook on colab. For the former, you will need to download the dataset to your device that resembles the instructions for the codebase. For the latter, **you will need to upload the dataset to your Google Drive** account, and connect your colab notebook to your Google Drive. Then, go to "File->Save a copy in Drive" to create a copy you can edit.


#### Colab (if applicable)

If you are running this script on colab, uncomment and run the cell below:

In [2]:
import os
os.chdir('/home/jake/ghub/csci-5922-deep_learning')

Note that the Google Drive directory has the root `/content/drive/`. For instance, my directory to the dataset is `'/content/drive/My Drive/Courses/CSCI 5922/CSCI 5922 SP25/Demo/MNIST/'`.

### mnist.py

In [3]:
#Original source: https://www.kaggle.com/code/hojjatk/read-mnist-dataset
#It has been modified for ease of use w/ pytorch

#You do NOT need to modify ANY code in this file!

import numpy as np
import struct
from array import array
import torch

class MnistDataloader(object):
    def __init__(self, training_images_filepath,training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        n = 60000 if "train" in images_filepath else 10000
        labels = torch.zeros((n, 10))
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            l = torch.tensor(array("B", file.read())).unsqueeze(-1)
            l = torch.concatenate((torch.arange(0, n).unsqueeze(-1), l), dim = 1).type(torch.int32)
            labels[l[:,0], l[:,1]] = 1

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())
        images = torch.zeros((n, 28**2))
        for i in range(size):
            img = np.array(image_data[i * rows * cols:(i + 1) * rows * cols])
            #img = img.reshape(28, 28)
            images[i, :] = torch.tensor(img)

        return images, labels

    def load_data(self):
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train),(x_test, y_test)

### activations.py

In [37]:
import torch

class ReLU():
    #Complete this class
    def forward(self,x: torch.tensor) -> torch.tensor:
        #implement ReLU(x) here
         return torch.maximum(torch.tensor(0.0), x)

    def backward(self,delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        #implement delta * ReLU'(x) here
        return delta * torch.where(x > 0, torch.tensor(1.0), torch.tensor(0.0))

class LeakyReLU():
    #Complete this class
    def forward(self,x: torch.tensor) -> torch.tensor:
        #implement LeakyReLU(x) here
        return torch.maximum(0.1 * x, x)

    def backward(self,delta: torch.tensor, x: torch.tensor) -> torch.tensor:
        #implement delta * LeakyReLU'(x) here
        return delta * torch.where(x > 0, torch.tensor(1.0), torch.tensor(0.1))

In [38]:
r = ReLU()
x = torch.tensor([-1.0, 0.0, 1.0])
print(r.forward(x))
delta = torch.tensor([1.0, 2.0, 3.0])
print(r.backward(delta, x))

tensor([0., 0., 1.])
tensor([0., 0., 3.])


### framework.py

In [40]:
import torch
import numpy as np
import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MLP:
    '''
    This class should implement a generic MLP learning framework. The core structure of the program has been provided for you.
    But, you need to complete the following functions:
    1: initialize()
    2: forward(), including activations
    3: backward(), including activations
    4: TrainMLP()
    '''
    def __init__(self, layer_sizes: list[int], device: torch.device):

        # Storage for model parameters
        self.layer_sizes: list[int] = layer_sizes
        self.num_layers = len(layer_sizes)
        self.weights: list[torch.tensor] = []
        self.biases: list[torch.tensor] = []
        self.device = device

        # Temporary data
        self.features = {}

    def set_hp(self, lr: float, bs: int, activation: object) -> None:
        self.learning_rate = lr
        self.batch_size = bs
        self.activation_function = activation

    def initialize(self):
        for i in range(len(self.layer_sizes) - 1): # create l-1 weight matrices (l includes input layer)
            d_in = self.layer_sizes[i]
            d_out = self.layer_sizes[i + 1]
            weight = torch.empty(d_in, d_out, device=self.device)
            torch.nn.init.uniform_(weight, -np.sqrt(6 / (d_in + d_out)), np.sqrt(6 / (d_in + d_out)))
            self.weights.append(weight)
            self.biases.append(torch.zeros(d_out, device=self.device))

    def forward(self, x):
        x = x.to(self.device)
        
        for i in range(len(self.layer_sizes) - 2): # loop through hidden layers (l-2, including input layer)
            # net input
            z = x @ self.weights[i] + self.biases[i]
            self.features[f'h_{i}'] = {'z': z}
            # activation
            a = self.activation_function.forward(z)
            self.features[f'h_{i}']['a'] = a
            # update x
            x = a
        # output layer
        z = x @ self.weights[-1] + self.biases[-1]
        self.features[f'z_o'] = {'z': z}
        # not sure why we do ReLU before softmax
        a = self.activation_function.forward(z)
        self.features[f'a_o'] = {'a': a}
        yhat = torch.softmax(z, dim=1)
        self.features[f'yhat'] = yhat
        return yhat

    def backward(self, delta: torch.tensor) -> None:
        #Complete this function

        '''
        This function should backpropagate the provided delta through the entire MLP, and update the weights according to the hyper-parameters
        stored in the class variables.
        '''
        return


def TrainMLP(model: MLP, x_train: torch.tensor, y_train: torch.tensor) -> MLP:
    #Complete this function

    '''
    This function should train the MLP for 1 epoch, using the provided data and forward/backward propagating as necessary.
    '''

    #set up a random sampling of the data
    bs = model.batch_size
    N = x_train.shape[0]
    rng = np.random.default_rng()
    idx = rng.permutation(N)

    #variable to accumulate total loss over the epoch
    L = 0

    for i in tqdm.tqdm(range(N // bs)):
        x = x_train[idx[i * bs:(i + 1) * bs], ...]
        y = y_train[idx[i * bs:(i + 1) * bs], ...]

        #forward propagate and compute loss (l) here
        y_hat = model.forward(x)
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim = 1, keepdim = True)
        l = -1 * torch.sum(y * torch.log(p))
        L += l
        print(i)

        # #backpropagate here
        # delta = p - y # b x output
        # # grad relu
        # delta_aout = model.activation_function.backward(delta, model.features['z_o']['z']) # b x output
        # # gradient of matrix product w.r.t. matrix is the other matrix transposed
        # delta_w2 = model.features['h_0']['a'].T @ delta_aout # h1 x output 
        # # update w2
        # model.weights[-1] = model.weights[-1] - (model.learning_rate * delta_w2)
        # # gradient of bias is just the delta
        # delta_b2 = torch.sum(delta_aout, dim=0)  
        # # update b2
        # model.biases[-1] = model.biases[-1] - (model.learning_rate * delta_b2)
        # # now do the same for the hidden layer parameters
        # # delta for hidden layer
        # # need to sum over the output dimension
        # delta_a1 

        # delta_a1 = model.activation_function
        # break
        # self.weights[-1] = self.weights[-1] - (model.learning_rate * delta_w2)
        # # bias gradient is just the delta
        # break

        grads_w = [None] * len(model.weights)
        grads_b = [None] * len(model.biases)
        
        # Compute gradient of loss w.r.t. logits
        y_pred = self.features[-1]
        delta = y_pred - y_true  # dCE/dO
        
        # Backpropagate through layers
        for i in reversed(range(len(self.weights))):
            grads_w[i] = self.features[i].T @ delta / y_true.shape[0]
            grads_b[i] = delta.mean(dim=0, keepdim=True)
            
            if i > 0:
                delta = (delta @ self.weights[i].T) * (self.features[i] > 0).float()  # ReLU derivative
        
        # Update parameters
        for i in range(len(self.weights)):
            self.weights[i] -= self.learning_rate * grads_w[i]
            self.biases[i] -= self.learning_rate * grads_b[i]




    print("Train Loss:", L / ((N // bs) * bs))


def TestMLP(model: MLP, x_test: torch.tensor, y_test: torch.tensor) -> tuple[float, float]:
    bs = model.batch_size
    N = x_test.shape[0]

    rng = np.random.default_rng()
    idx = rng.permutation(N)

    L = 0
    A = 0

    for i in tqdm.tqdm(range(N // bs)):
        x = x_test[idx[i * bs:(i + 1) * bs], ...]
        y = y_test[idx[i * bs:(i + 1) * bs], ...]

        y_hat = model.forward(x)
        p = torch.exp(y_hat)
        p /= torch.sum(p, dim = 1, keepdim = True)
        l = -1 * torch.sum(y * torch.log(p))
        L += l

        A += torch.sum(torch.where(torch.argmax(p, dim = 1) == torch.argmax(y, dim = 1), 1, 0))

    print("Test Loss:", L / ((N // bs) * bs), "Test Accuracy: {:.2f}%".format(100 * A / ((N // bs) * bs)))

def normalize_mnist() -> tuple[torch.tensor, torch.tensor, torch.tensor, torch.tensor]:
    '''
    This function loads the MNIST dataset, then normalizes the "X" values to have zero mean, unit variance.
    '''

    base_path = 'mnist/'
    mnist = MnistDataloader(base_path + "train-images.idx3-ubyte", base_path + "train-labels.idx1-ubyte",
                            base_path + "t10k-images.idx3-ubyte", base_path + "t10k-labels.idx1-ubyte")
    (x_train, y_train), (x_test, y_test) = mnist.load_data()

    x_mean = torch.mean(x_train, dim=0, keepdim=True)
    x_std = torch.std(x_train, dim=0, keepdim=True)

    x_train -= x_mean
    x_train /= x_std
    x_train[x_train != x_train] = 0

    x_test -= x_mean
    x_test /= x_std
    x_test[x_test != x_test] = 0

    return x_train.to(device), y_train.to(device), x_test.to(device), y_test.to(device)

def main():
    '''
    This is an example of how to use the framework when completed. You can build off of this code to design your experiments for part 2.
    '''

    x_train, y_train, x_test, y_test = normalize_mnist()
    print("Data loaded and normalized.")

    '''
    For the experiment, adjust the list [784,...,10] as desired to test other architectures.
    You are encouraged to play around with any of the following values if you so desire:
    E, lr, bs, activation
    '''

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MLP([784, 256, 10], device)
    model.initialize()
    model.set_hp(lr=1e-6, bs=512, activation=ReLU())
    print("Model initialized and hyperparameters set.")

    # E = 1
    E = 1
    for epoch in range(E):
        print(f"Epoch {epoch+1}/{E}")
        TrainMLP(model, x_train, y_train)
        # TestMLP(model, x_test, y_test)

if __name__ == "__main__":
    main()

Data loaded and normalized.
Model initialized and hyperparameters set.
Epoch 1/1


  0%|                                                   | 0/117 [00:00<?, ?it/s]

0
shape torch.Size([256, 10]) torch.Size([256, 10])
Train Loss: tensor(0.0198, device='cuda:0')





In [30]:
m = MLP([784, 256, 10], device=device)
m.initialize()
len(m.weights)
for i in range(0,len(m.layer_sizes)-1):
    print(i)


0
1


In [42]:
torch.log(torch.tensor(torch.e))

tensor(1.0000)