# Backpropagation

## We have to consider the following steps

## How to start now?

## Prepare your dataset

In [36]:
# Your code
import torch 
import torch.nn as nn
import torchvision
import numpy as np
import torchvision.transforms as transforms

def load_mnist_data(root_path='./data', batch_size=4):
    """
    Download MNIST into ``root_path`` (if needed) and build its data loaders.

    Point ``root_path`` at an already existing copy of the dataset to avoid
    storing it a second time.

    Args:
        root_path: Directory passed to ``torchvision.datasets.MNIST`` as ``root``.
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(trainloader, testloader)`` tuple. Only the training loader
        shuffles its samples.
    """
    # Normalize expects one mean/std entry per channel. ``(0.5)`` is only a
    # parenthesised float, so spell out real one-element tuples.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])

    trainset = torchvision.datasets.MNIST(
        root=root_path, train=True, download=True, transform=transform
    )
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=2
    )

    testset = torchvision.datasets.MNIST(
        root=root_path, train=False, download=True, transform=transform
    )
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=2
    )

    return trainloader, testloader



## Building your neural network

In [37]:
# Your code
class LinearLayer:
    """Fully connected layer computing ``x @ weights + bias``.

    The activation is the identity. Every forward pass caches the
    pre-activation in ``currentZ`` and the activation in
    ``currentActivation`` so they can be read afterwards when gradients are
    computed by hand.
    """

    def __init__(self, input_size, output_size) -> None:
        """Create Xavier-normal weights and a zero bias.

        Args:
            input_size: Number of input features.
            output_size: Number of output features.
        """
        self.input_size = input_size
        self.output_size = output_size
        # weights has shape (input_size, output_size) so that forward can
        # right-multiply a (batch, input_size) input.
        self.weights = nn.init.xavier_normal_(torch.empty(input_size, output_size))
        self.bias = torch.zeros(output_size)

    def forward(self, x):
        """Apply the layer to ``x`` and cache the result.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            The layer output, ``x @ weights + bias``.
        """
        self.currentZ = torch.matmul(x, self.weights) + self.bias
        # Identity activation: the activation is the pre-activation itself.
        self.currentActivation = self.currentZ
        return self.currentActivation

    def derivative(self, x):
        """Return the derivative of the identity activation evaluated at ``x``.

        Args:
            x: Any object with a ``shape`` attribute (normally ``currentZ``).

        Returns:
            A float32 tensor of ones with the same shape as ``x``.
        """
        # Build the ones directly in torch instead of allocating a float64
        # NumPy array and casting it.
        return torch.ones(x.shape, dtype=torch.float32)


class NeuralNet:
    """Feed-forward network made of ``LinearLayer`` objects applied in order.

    ``forward`` stores the batch it received in ``currentInput`` so that the
    first layer's input is still available after the forward pass.
    """

    def __init__(self, layer_sizes=(784, 64, 64, 64, 64, 64, 10)) -> None:
        """Build one ``LinearLayer`` per consecutive pair of sizes.

        Args:
            layer_sizes: Input width followed by the output width of every
                layer. The default builds 784 -> 64 -> 64 -> 64 -> 64 -> 64
                -> 10, i.e. six layers.

        Raises:
            ValueError: If fewer than two sizes are given.
        """
        sizes = list(layer_sizes)
        if len(sizes) < 2:
            raise ValueError(
                "layer_sizes needs at least an input size and an output size"
            )
        # Layers are created front to back, so the random initialisation
        # draws happen in the same order as the layers are applied.
        self.layers = [
            LinearLayer(input_size=n_in, output_size=n_out)
            for n_in, n_out in zip(sizes, sizes[1:])
        ]

    def __call__(self, x):
        """Make the network callable; equivalent to ``forward(x)``."""
        return self.forward(x)

    def forward(self, x):
        """Run ``x`` through every layer in order.

        Args:
            x: Input batch handed to the first layer.

        Returns:
            The output of the last layer.
        """
        # Remember the raw input for code that needs it after the pass.
        self.currentInput = x

        for layer in self.layers:
            x = layer.forward(x)
        return x
    

class SGDoptim:
    """Plain mini-batch gradient descent with hand-computed gradients.

    ``model`` must expose ``layers`` (objects with ``weights``, ``bias`` and
    ``currentActivation``) and ``currentInput``, all as left behind by the
    most recent forward pass.
    """

    def __init__(self, model, lr) -> None:
        """
        Args:
            model: The network whose layers are updated in place.
            lr: Learning rate.
        """
        self.model = model
        self.lr = lr

    def step(self, deltas):
        """Apply one gradient-descent update to every layer.

        Args:
            deltas: One tensor per layer, in layer order. ``deltas[i]`` is
                multiplied with the transposed input of layer ``i``, so it is
                expected to have shape (batch, layer output size).
        """
        layers = self.model.layers
        network_input = self.model.currentInput
        # Both gradients below are sums over the batch; dividing the learning
        # rate by the batch size turns them into batch averages.
        scale = self.lr / network_input.shape[0]

        for index, delta in enumerate(deltas):
            layer = layers[index]
            # The first layer saw the raw network input, every other layer
            # saw the cached activation of its predecessor.
            if index == 0:
                layer_input = network_input
            else:
                layer_input = layers[index - 1].currentActivation

            weight_grad = torch.matmul(layer_input.T, delta)
            # Sum (not mean) over the batch: ``scale`` already divides by the
            # batch size, and taking the mean here would divide by it twice.
            bias_grad = torch.sum(delta, dim=0)

            layer.weights = layer.weights - scale * weight_grad
            layer.bias = layer.bias - scale * bias_grad
       
        

        

## Implement your loss function(s)

In [38]:
# Your code 

import torch.nn.functional as F
class crossEntropyLoss:
    """Cross-entropy loss with a hand-written backward pass.

    NOTE: despite the parameter name ``logits``, ``__call__`` takes the
    logarithm of its input directly, so it has to be given class
    probabilities (for example softmax output), not raw scores.
    """

    def __init__(self, model) -> None:
        """
        Args:
            model: Network whose ``layers`` are walked in ``backward``.
        """
        self.model = model

    def __call__(self, logits, target):
        """Compute the per-sample loss ``-log(p[target])``.

        Also stores the one-hot encoded targets in ``self.currentTarget``
        (float64, same shape as ``logits``) for use by ``derivative``.

        Args:
            logits: Tensor of shape (batch, classes) holding probabilities.
            target: Integer class index per sample.

        Returns:
            A list with one 0-dim tensor per sample.
        """
        device = logits.device
        labels = torch.as_tensor(target, dtype=torch.long, device=device).reshape(-1)
        rows = torch.arange(logits.shape[0], device=device)

        # float64 keeps the stored attribute the same dtype it had when it
        # was built from a NumPy array; derivative casts to float32 anyway.
        one_hot = torch.zeros(logits.shape, dtype=torch.float64, device=device)
        one_hot[rows, labels] = 1.0
        self.currentTarget = one_hot

        # Probability assigned to the correct class of every sample.
        picked = logits[rows, labels]
        # A probability that underflowed to 0 would give an infinite loss;
        # clamp to the smallest normal float so the loss stays finite.
        floor = torch.finfo(picked.dtype).tiny
        losses = -torch.log(torch.clamp(picked, min=floor))

        # Callers receive a list of scalars, one per sample.
        return list(losses.unbind(0))

    def backward(self, logits):
        """Propagate the output error back through every layer.

        Must be called after ``__call__`` (which sets ``currentTarget``) and
        after a forward pass of the model (which sets each ``currentZ``).

        Args:
            logits: The same probabilities that were passed to ``__call__``.

        Returns:
            A list with one delta tensor per layer, in layer order.
        """
        layers = self.model.layers
        # Collected back to front and reversed at the end.
        deltas = [self.derivative(logits, layers[-1])]
        for index in range(len(layers) - 2, -1, -1):
            # Error arriving from the layer above, mapped through its weights.
            upstream = torch.matmul(deltas[-1], layers[index + 1].weights.T)
            # The activation derivative belongs to the layer whose
            # pre-activation is being differentiated, i.e. layer ``index``.
            local_slope = layers[index].derivative(layers[index].currentZ)
            deltas.append(local_slope * upstream)
        deltas.reverse()
        return deltas

    def derivative(self, activation, layer):
        """Return ``activation - one_hot(target)`` as float32.

        This is the output-layer error when ``activation`` is the softmax of
        the network output and the loss is cross-entropy.

        Args:
            activation: Probabilities of shape (batch, classes).
            layer: Unused; kept so existing callers keep working.
        """
        return torch.sub(activation, self.currentTarget).type(torch.float32)


## Implement the training loop

In [39]:
# Your code
import torch.nn.functional as F

# Consider the following steps:
# 1) Loop through your training data
#   1.1) Choose the number of epochs (how often do you want to loop through your complete dataset?)
# 2) Forward the data through your network
# 3) Calculate the loss
# 4) Perform backpropagation with SGD and update the weights
#   4.1) Choose a learning rate to update your weights
# Repeat steps 1-4 until the training converges or the maximum number of epochs is reached

import tqdm
def train_model(model, batch_size: int = 4, epochs: int = 20, lr: float = 0.001):
    """Train ``model`` on the MNIST training split with manual backpropagation.

    After every epoch the mean loss and mean accuracy over all batches are
    printed.

    Args:
        model: Callable network; it is also handed to ``crossEntropyLoss``
            and ``SGDoptim``, which read its layers.
        batch_size: Samples per mini-batch.
        epochs: Number of passes over the training data.
        lr: Learning rate given to ``SGDoptim``. The default keeps the value
            that used to be hard-coded.
    """
    # we only consider the mnist train data for this example
    train_loader, _ = load_mnist_data(batch_size=batch_size)
    criterion = crossEntropyLoss(model)
    optimizer = SGDoptim(model, lr=lr)

    for epoch in range(epochs):
        running_loss = 0.0
        running_accuracy = []
        for imgs, targets in tqdm.tqdm(train_loader, desc=f'Training iteration {epoch + 1}'):
            # Flatten every image into one row vector per sample.
            flat_imgs = imgs.reshape(imgs.shape[0], -1)
            # The criterion works on probabilities, so apply softmax here.
            outputs = F.softmax(model(flat_imgs), dim=1)

            loss = criterion(outputs, targets)
            deltas = criterion.backward(outputs)
            optimizer.step(deltas)

            # The criterion returns one scalar per sample. Average them in
            # torch and accumulate as a Python float so the running sum is
            # kept in double precision.
            running_loss += torch.stack(list(loss)).mean().item()

            # Calculate the Accuracy (how many of all samples are correctly classified?)
            predicted = torch.max(outputs, dim=1).indices
            accuracy = (predicted == targets).to(dtype=torch.float32).mean()
            running_accuracy.append(accuracy.item())

        print(f'Epoch {epoch + 1} finished with loss: {running_loss / len(train_loader):.3f} and accuracy {torch.tensor(running_accuracy).mean():.3f}')

In [40]:

# Train a freshly initialised network on MNIST with mini-batches of 32
# samples; the number of epochs is left at train_model's default.
train_model(NeuralNet(), batch_size=32)

Training iteration 1: 100%|██████████| 1875/1875 [00:08<00:00, 231.70it/s]


Epoch 1 finished with loss: 0.789 and accuracy 0.764


Training iteration 2: 100%|██████████| 1875/1875 [00:06<00:00, 293.48it/s]


Epoch 2 finished with loss: 0.416 and accuracy 0.880


Training iteration 3: 100%|██████████| 1875/1875 [00:06<00:00, 291.60it/s]


Epoch 3 finished with loss: 0.370 and accuracy 0.893


Training iteration 4: 100%|██████████| 1875/1875 [00:06<00:00, 306.38it/s]


Epoch 4 finished with loss: 0.349 and accuracy 0.900


Training iteration 5: 100%|██████████| 1875/1875 [00:06<00:00, 295.39it/s]


Epoch 5 finished with loss: 0.335 and accuracy 0.904


Training iteration 6: 100%|██████████| 1875/1875 [00:06<00:00, 292.58it/s]


Epoch 6 finished with loss: 0.325 and accuracy 0.906


Training iteration 7: 100%|██████████| 1875/1875 [00:06<00:00, 309.21it/s]


Epoch 7 finished with loss: 0.319 and accuracy 0.909


Training iteration 8: 100%|██████████| 1875/1875 [00:06<00:00, 308.31it/s]


Epoch 8 finished with loss: 0.313 and accuracy 0.911


Training iteration 9: 100%|██████████| 1875/1875 [00:06<00:00, 305.21it/s]


Epoch 9 finished with loss: 0.308 and accuracy 0.912


Training iteration 10: 100%|██████████| 1875/1875 [00:06<00:00, 291.05it/s]


Epoch 10 finished with loss: 0.304 and accuracy 0.913


Training iteration 11: 100%|██████████| 1875/1875 [00:05<00:00, 317.90it/s]


Epoch 11 finished with loss: 0.301 and accuracy 0.914


Training iteration 12: 100%|██████████| 1875/1875 [00:06<00:00, 308.04it/s]


Epoch 12 finished with loss: 0.298 and accuracy 0.916


Training iteration 13: 100%|██████████| 1875/1875 [00:06<00:00, 297.57it/s]


Epoch 13 finished with loss: 0.295 and accuracy 0.915


Training iteration 14: 100%|██████████| 1875/1875 [00:05<00:00, 312.93it/s]


Epoch 14 finished with loss: 0.293 and accuracy 0.916


Training iteration 15: 100%|██████████| 1875/1875 [00:06<00:00, 288.64it/s]


Epoch 15 finished with loss: 0.290 and accuracy 0.918


Training iteration 16: 100%|██████████| 1875/1875 [00:06<00:00, 297.67it/s]


Epoch 16 finished with loss: 0.289 and accuracy 0.918


Training iteration 17: 100%|██████████| 1875/1875 [00:06<00:00, 302.67it/s]


Epoch 17 finished with loss: 0.287 and accuracy 0.919


Training iteration 18: 100%|██████████| 1875/1875 [00:05<00:00, 313.32it/s]


Epoch 18 finished with loss: 0.285 and accuracy 0.919


Training iteration 19: 100%|██████████| 1875/1875 [00:05<00:00, 323.41it/s]


Epoch 19 finished with loss: 0.283 and accuracy 0.920


Training iteration 20: 100%|██████████| 1875/1875 [00:06<00:00, 296.65it/s]


Epoch 20 finished with loss: 0.282 and accuracy 0.920
