[Homework for AI-For-Beginners course: 04 - Own Framework](https://github.com/microsoft/AI-For-Beginners/blob/main/lessons/3-NeuralNetworks/04-OwnFramework/lab/MyFW_MNIST.ipynb)

### Instructions

1. Take the framework code from the lesson and paste it into this notebook, or (even better) into a separate Python module
1. Define and train a one-layered perceptron, observing training and validation accuracy during training
1. Try to understand if overfitting took place, and adjust layer parameters to improve accuracy
1. Repeat the previous steps for 2- and 3-layered perceptrons. Try experimenting with different activation functions between layers.
1. Try to answer the following questions:
    - Does the inter-layer activation function affect network performance?
    - Do we need a 2- or 3-layered network for this task?
    - Did you experience any problems training the network? Especially as the number of layers increased.
    - How do the weights of the network behave during training? You may plot the maximum absolute value of the weights vs. epoch to understand the relation.

In [None]:
# Display the value of every top-level expression in a cell, not just the last one.
# Side effect: any bare call that returns a non-None value will be echoed in the cell output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from datasets import load_dataset
import torch
import numpy as np

# Pick the seed for reproducibility - change it to explore the effects of random variations.
# This seeds NumPy's global random generator only.
np.random.seed(1)
import random

In [2]:
# Fetch the "mnist" dataset and keep handles to its two splits.
dataset = load_dataset("mnist")
dataset_train = dataset["train"]
dataset_test = dataset["test"]

In [3]:
# Flatten every image into one row of 784 values; -1 lets NumPy infer the number of samples.
# NOTE(review): pixel values are used as-is, without rescaling (presumably 0-255 - confirm).
# Rescaling them would interact with the learning rates used in the training cells.
train_x = np.array(dataset_train['image']).reshape(-1, 784)
# Keep labels as NumPy arrays (like the features) so that batch slicing, fancy indexing and
# comparisons do not have to convert a Python list on every call.
train_labels = np.asarray(dataset_train['label'])

test_x = np.array(dataset_test['image']).reshape(-1, 784)
test_labels = np.asarray(dataset_test['label'])

In [4]:
class Linear:
    """Fully connected layer computing ``x @ W.T + b``.

    Attributes:
        W:  weight matrix, shape (nout, nin).
        b:  bias row vector, shape (1, nout).
        dW: gradient w.r.t. W from the latest ``backward`` call, same shape as W.
        db: gradient w.r.t. b from the latest ``backward`` call, same shape as b.
    """

    def __init__(self, nin, nout):
        """Create the layer.

        Args:
            nin:  number of input features.
            nout: number of output features (classes, for the last layer).
        """
        # Weights are drawn from N(0, 1/sqrt(nin)); biases start at zero.
        self.W = np.random.normal(0, 1.0 / np.sqrt(nin), (nout, nin))
        self.b = np.zeros((1, nout))
        self.dW = np.zeros_like(self.W)
        self.db = np.zeros_like(self.b)

    def forward(self, x):
        """Return ``x @ W.T + b`` and remember ``x`` for the backward pass.

        Args:
            x: input batch, shape (batch, nin).
        """
        self.x = x
        return np.dot(x, self.W.T) + self.b

    def backward(self, dz):
        """Store parameter gradients and return the gradient w.r.t. the input.

        Args:
            dz: gradient of the loss w.r.t. this layer's output, shape (batch, nout).

        Returns:
            Gradient w.r.t. the layer input, shape (batch, nin).
        """
        self.dW = np.dot(dz.T, self.x)
        # keepdims=True keeps db shaped (1, nout), matching b and the zeros created in __init__.
        self.db = dz.sum(axis=0, keepdims=True)
        return np.dot(dz, self.W)

    def update(self, lr):
        """Apply one gradient-descent step with learning rate ``lr`` (in place)."""
        self.W -= lr * self.dW
        self.b -= lr * self.db

class ReLU:
    """Rectified Linear Unit activation: f(z) = max(0, z), applied element-wise.

    Adds non-linearity between layers by replacing every negative input value
    with zero. Typically placed after the hidden layers of a network.
    """

    def forward(self, z):
        """Clamp negative entries of ``z`` to zero, remembering ``z`` for ``backward``."""
        self.z = z
        activated = np.maximum(0, z)
        return activated

    def backward(self, dp):
        """Pass the incoming gradient ``dp`` through only where the input was positive."""
        # 1.0 where the stored input was strictly positive, 0.0 elsewhere.
        gate = (self.z > 0).astype(np.float64)
        return dp * gate

class Softmax:
    """Row-wise softmax turning scores into probabilities that sum to 1 per row."""

    def forward(self, z):
        """Return softmax probabilities for each row of ``z``.

        Args:
            z: scores, shape (batch, num_classes).

        Returns:
            Array of the same shape as ``z``; it is also cached on ``self.p``
            for use by ``backward``.
        """
        self.z = z
        # Subtracting the per-row maximum (shape (batch, 1)) leaves the result
        # unchanged but keeps np.exp from overflowing on large scores.
        shifted = z - z.max(axis=1, keepdims=True)
        expz = np.exp(shifted)
        self.p = expz / expz.sum(axis=1, keepdims=True)
        return self.p

    def backward(self, dp):
        """Return the gradient w.r.t. the scores ``z``.

        Args:
            dp: gradient of the loss w.r.t. the probabilities, shape (batch, num_classes).

        Returns:
            ``p * dp - p * sum(p * dp, axis=1)``, same shape as ``dp``.
        """
        # Reuse the probabilities cached by forward() instead of recomputing the softmax.
        pdp = self.p * dp
        return pdp - self.p * pdp.sum(axis=1, keepdims=True)
    
class CrossEntropyLoss:
    """Mean negative log-probability assigned to the true class of each sample."""

    def forward(self, p, y):
        """Return the batch-averaged cross-entropy.

        Args:
            p: predicted probabilities, indexed as p[sample, class].
            y: true class index for each sample.
        """
        self.p, self.y = p, y
        rows = np.arange(len(y))
        true_class_prob = p[rows, y]
        # The 1e-9 offset keeps log() finite when a true-class probability is 0.
        return -np.log(true_class_prob + 1e-9).mean()

    def backward(self, loss):
        """Return the gradient of the loss w.r.t. the probabilities stored by ``forward``.

        The ``loss`` argument is accepted but not used in the computation.
        """
        n_samples = len(self.y)
        grad = np.zeros_like(self.p)
        # Only the true-class entry of every row receives a non-zero gradient.
        grad[np.arange(n_samples), self.y] -= 1.0 / n_samples
        # Same 1e-9 offset as in forward(), here to avoid dividing by zero.
        return grad / (self.p + 1e-9)

class Net:
    """Sequential container that chains layers for forward, backward and update passes."""

    def __init__(self):
        """Start with an empty list of layers."""
        self.layers = []

    def add(self, l):
        """Append layer ``l`` to the end of the network."""
        self.layers.append(l)

    def forward(self, x):
        """Feed ``x`` through every layer in insertion order and return the final output."""
        for layer in self.layers:
            x = layer.forward(x)
        return x

    def backward(self, z):
        """Propagate gradient ``z`` through the layers in reverse order and return the result."""
        for layer in reversed(self.layers):
            z = layer.backward(z)
        return z

    def update(self, lr):
        """Call ``update(lr)`` on every layer that provides a callable ``update``.

        Layers without an ``update`` method (or with a non-callable attribute
        of that name) are skipped.
        """
        for layer in self.layers:
            step = getattr(layer, 'update', None)
            if callable(step):
                step(lr)

In [5]:
def get_loss_acc(x, y, loss=None, model=None):
    """Evaluate a network on ``(x, y)`` and return ``(loss, accuracy)``.

    Args:
        x: input batch passed to the network's ``forward``.
        y: true class index per sample.
        loss: object with ``forward(p, y)``; a fresh ``CrossEntropyLoss`` is
            created when omitted, so no instance is shared between calls.
        model: network with ``forward(x)``; when omitted, the notebook-global
            ``net`` is used.

    Returns:
        Tuple ``(l, acc)``: the loss value and the fraction of rows whose
        argmax prediction equals the label.
    """
    if loss is None:
        loss = CrossEntropyLoss()
    if model is None:
        model = net  # fall back to the network currently bound in the notebook
    p = model.forward(x)
    l = loss.forward(p, y)
    pred = np.argmax(p, axis=1)
    acc = (pred == np.asarray(y)).mean()
    return l, acc

def train_epoch(net, train_x, train_labels, loss=None, batch_size=4, lr=0.1):
    """Run one pass of mini-batch gradient descent over the data, in the given order.

    Args:
        net: network exposing ``forward``, ``backward`` and ``update``.
        train_x: training inputs, sliceable along the first axis.
        train_labels: labels aligned with ``train_x``.
        loss: object with ``forward(p, y)`` and ``backward(l)``; a fresh
            ``CrossEntropyLoss`` is created when omitted, so no instance is
            shared between calls.
        batch_size: number of samples per update; the last batch may be smaller.
        lr: learning rate passed to ``net.update``.

    Returns:
        None. The network is updated in place.
    """
    if loss is None:
        loss = CrossEntropyLoss()
    for start in range(0, len(train_x), batch_size):
        stop = start + batch_size
        xb = train_x[start:stop]
        yb = train_labels[start:stop]

        p = net.forward(xb)
        l = loss.forward(p, yb)
        # The gradient w.r.t. the network input is not needed, so it is discarded.
        net.backward(loss.backward(l))
        net.update(lr)

In [6]:
# 1 layer network: one linear map from the 28*28 = 784 input values to 10 class scores,
# followed by softmax to turn the scores into probabilities.
net = Net()
net.add(Linear(28*28, 10))
net.add(Softmax())
loss = CrossEntropyLoss()

# Pass the loss object explicitly so that training and evaluation use this instance
# rather than the helpers' own defaults.
print("Initial loss={}, accuracy={}: ".format(*get_loss_acc(train_x, train_labels, loss=loss)))

# A single pass over the training set.
train_epoch(net, train_x, train_labels, loss=loss, batch_size=320, lr=0.001)

print("Final loss={}, accuracy={}: ".format(*get_loss_acc(train_x, train_labels, loss=loss)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(test_x, test_labels, loss=loss)))

Initial loss=19.06245656631888, accuracy=0.0655: 
Final loss=4.352638784997269, accuracy=0.7744333333333333: 
Test loss=4.162335114860613, accuracy=0.7846: 


In [7]:
# 2 layer network: 784 inputs -> 100 hidden units -> 10 class scores.
net = Net()
net.add(Linear(28*28, 100))  # first layer with 100 neurons
net.add(ReLU())              # non-linear activation between the two linear layers
net.add(Linear(100, 10))     # second layer with 10 neurons (one per class)
net.add(Softmax())
loss = CrossEntropyLoss()

# Pass the loss object explicitly so that training and evaluation use this instance
# rather than the helpers' own defaults.
print("Initial loss={}, accuracy={}: ".format(*get_loss_acc(train_x, train_labels, loss=loss)))

# A single pass over the training set.
train_epoch(net, train_x, train_labels, loss=loss, batch_size=320, lr=0.001)

print("Final loss={}, accuracy={}: ".format(*get_loss_acc(train_x, train_labels, loss=loss)))
print("Test loss={}, accuracy={}: ".format(*get_loss_acc(test_x, test_labels, loss=loss)))

Initial loss=17.070339247873864, accuracy=0.12121666666666667: 
Final loss=3.7820837971444985, accuracy=0.78795: 
Test loss=3.7585837164578204, accuracy=0.7902: 
