# Evaluating performance of the SGD algorithm
In this example, a classic neural network is created using the `neural` framework. This network is then trained on the MNIST data set of hand-written digits using the SGD (*Stochastic Gradient Descent*) algorithm. This notebook serves as an evaluation of SGD performance.

In [None]:
import matplotlib.pyplot as plt

In [None]:
import sys
sys.path.append("..")

In [None]:
from itertools import product

In [None]:
import numpy as np
import time

from neural import MNIST, Tensor, nn, optim
from utils import *

## Importing MNIST training data

In [None]:
# Loading training set: MNIST.get("train") is unpacked into images and labels
allTrainImages, allTrainLabels = MNIST.get("train")
# Images are normalized with normalize(images, 0.5, 0.5).
# NOTE(review): the intended result is values in the range [-1, 1]; this
# presumes the raw pixels are in [0, 1] and that the two arguments are the
# mean and std -- confirm against utils.normalize.
allTrainImages = normalize(allTrainImages, 0.5, 0.5)

## Defining the Neural Network architecture

In [None]:
class Network(nn.Module):
    """Fully connected classifier: 784 -> 128 -> 64 -> 10 with log-softmax output.

    The two hidden layers are followed by ReLU activations; the final layer
    is followed by a LogSoftmax taken over dimension 1.
    """

    def __init__(self):
        """Create the three linear layers and the output log-softmax."""
        super().__init__()
        # Layer sizes: 784 input features, two hidden layers, 10 outputs
        self.fc1 = nn.Linear(784, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Run `x` through the network and return the log-softmax output.

        `x` is fed directly into the first linear layer, which was built
        with 784 input features.
        """
        hidden1 = nn.ReLU()(self.fc1(x))
        hidden2 = nn.ReLU()(self.fc2(hidden1))
        logits = self.fc3(hidden2)
        return self.logSoftmax(logits)

## Choosing training criterion (loss function)

In [None]:
# Loss function: NLLLoss, applied to the log-softmax output of the network.
# NOTE(review): reduction="mean" presumably averages the loss over the batch
# -- confirm against neural.nn.NLLLoss.
reduction = "mean"
criterion = nn.NLLLoss(reduction=reduction)

## Training

### Choosing training parameters

In [None]:
# Hyper-parameter grid for the sweep: every combination of the values
# below (learning rate x momentum x batch size) is trained once.
lrs = [0.001, 0.01, 0.03]
momentums = [0.0, 0.5, 0.9]
batchSizes = [10, 50, 100]

## Training for each parameter combination

In [None]:
# Train one fresh model per (lr, momentum, batchSize) combination, making a
# single pass over the training set, and save the per-batch loss of each run
# to a text file named "lossTrack_{lr}_{momentum}_{batchSize}".
numConfigs = np.prod([len(lrs), len(momentums), len(batchSizes)])
configs = product(lrs, momentums, batchSizes)

print(f"Total number of configurations: {numConfigs}")
for j, (lr, momentum, batchSize) in enumerate(configs):
    # A new model and optimizer are created for every configuration
    model = Network()

    optimizer = optim.SGD(
        model.parameters(),
        lr=lr,
        momentum=momentum)

    # Only whole batches are used; trailing samples that do not fill a
    # complete batch are dropped by the slice below.
    numBatches = allTrainImages.shape[0] // batchSize
    numTraining = int(numBatches * batchSize)

    # Group the samples into numBatches batches, keeping the last two image axes
    trainImages = allTrainImages[:numTraining].reshape(
        numBatches, -1, allTrainImages.shape[-2], allTrainImages.shape[-1])
    trainLabels = allTrainLabels[:numTraining].reshape(numBatches, -1)

    lossTrack = np.zeros(numBatches)
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments (unlike time.time).
    startTime = time.perf_counter()
    for i, (images, labels) in enumerate(zip(trainImages, trainLabels)):
        # Flatten every image of the batch into a single feature vector
        images = images.reshape(images.shape[0], -1)
        optimizer.zeroGrad()
        out = model(images)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        lossTrack[i] = loss.item()

    # The batch loop never breaks, so this runs once per configuration
    # (previously expressed as a `for ... else` clause).
    endTime = time.perf_counter()
    print(f"Finished configuration {j} in {endTime - startTime:.2f}s")
    np.savetxt(f"lossTrack_{lr}_{momentum}_{batchSize}", lossTrack)

## Convergence rate vs `momentum`

In [None]:
# Fix the batch size and learning rate; momentum is the varied parameter
batchSize, lr = batchSizes[2], lrs[0]

# One (loss track, batch size, legend label) tuple per momentum value,
# with the loss track read back from the file saved during training
plots = [
    (
        np.loadtxt(f"lossTrack_{lr}_{momentum}_{batchSize}"),
        batchSize,
        f"Learning rate: {lr}\nMomentum: {momentum}\nBatch size: {batchSize}",
    )
    for momentum in momentums
]

plotLossTrack(plots)

## Convergence rate vs batch size

In [None]:
# Fix the momentum and learning rate; batch size is the varied parameter
momentum, lr = momentums[0], lrs[2]

# One (loss track, batch size, legend label) tuple per batch size,
# with the loss track read back from the file saved during training
plots = [
    (
        np.loadtxt(f"lossTrack_{lr}_{momentum}_{batchSize}"),
        batchSize,
        f"Learning rate: {lr}\nMomentum: {momentum}\nBatch size: {batchSize}",
    )
    for batchSize in batchSizes
]

plotLossTrack(plots)

## Convergence rate vs learning rate

In [None]:
# Fix the batch size and momentum; learning rate is the varied parameter
batchSize, momentum = batchSizes[1], momentums[0]

# One (loss track, batch size, legend label) tuple per learning rate,
# with the loss track read back from the file saved during training
plots = [
    (
        np.loadtxt(f"lossTrack_{lr}_{momentum}_{batchSize}"),
        batchSize,
        f"Learning rate: {lr}\nMomentum: {momentum}\nBatch size: {batchSize}",
    )
    for lr in lrs
]

plotLossTrack(plots)

## Model performance

In [None]:
# Image iterator over the training images; the cell that calls
# next(imgIter) advances it by one image each time it is run.
imgIter = iter(allTrainImages)

Run the cell below multiple times to check model performance for different images.

In [None]:
# Take the next image from the iterator and flatten it into a one-row batch
img = next(imgIter)
img_ = img.reshape(1, -1)

# `model` is whatever network is currently bound to that name; after the
# training sweep this is the model of the last configuration trained.
logps = model(img_)
# The network outputs log-probabilities (LogSoftmax), so exponentiate
# them to obtain probabilities
ps = np.exp(logps)

# Pass the image and its class probabilities to the utils display helper
showMNIST(img.squeeze(), ps.squeeze())