# Lecture 25: SGD and Adam Learning Rules

## Load Packages

In [None]:
%matplotlib inline
import torch
import torchvision
import numpy as np
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from torchvision import datasets, transforms

# Structural similarity index. Newer scikit-image releases expose it as
# skimage.metrics.structural_similarity; the old skimage.measure.compare_ssim
# name is kept as a fallback so the notebook imports on either version.
try:
    from skimage.metrics import structural_similarity as ssim
except ImportError:
    from skimage.measure import compare_ssim as ssim
print(torch.__version__) # This code has been updated for PyTorch 1.0.0

## Load Data

In [None]:
# Mini-batch size shared by the training and the test loader.
BatchSize = 100

# ToTensor is the only preprocessing step applied to the images.
transform = transforms.Compose([transforms.ToTensor()])

# MNIST splits; both are downloaded into ./MNIST when not already present.
trainset = datasets.MNIST(root='./MNIST', train=True,
                          download=True, transform=transform)
testset = datasets.MNIST(root='./MNIST', train=False,
                         download=True, transform=transform)

# Training batches are reshuffled on every pass; test batches keep a fixed order.
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=BatchSize, shuffle=True, num_workers=4)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=BatchSize, shuffle=False, num_workers=4)

In [None]:
# Select the compute device: CUDA when PyTorch reports one, otherwise the CPU.

use_gpu = torch.cuda.is_available()
print('GPU is available!' if use_gpu else 'GPU is not available!')
device = "cuda" if use_gpu else "cpu"

## Neural Network

In [None]:
class NeuralNet(nn.Module):
    """Fully connected classifier: 784 -> 400 -> 256 -> 10.

    ``Layer1`` holds the two hidden linear layers, each followed by a ReLU;
    ``Layer2`` holds the final linear layer. No softmax is applied in
    ``forward``; the output is whatever the last linear layer produces.
    """

    def __init__(self):
        super(NeuralNet, self).__init__()
        # Layers are created in the same order as they are applied, so the
        # parameter registration (and seeded initialisation) order is fixed.
        hidden_stack = [
            nn.Linear(28*28, 400),
            nn.ReLU(),
            nn.Linear(400, 256),
            nn.ReLU(),
        ]
        self.Layer1 = nn.Sequential(*hidden_stack)
        self.Layer2 = nn.Sequential(nn.Linear(256, 10))

    def forward(self, x):
        """Run ``x`` (last dimension 28*28) through both stages and return 10 scores per row."""
        hidden = self.Layer1(x)
        return self.Layer2(hidden)

In [None]:
# Three independently initialised copies of the same architecture, one per
# learning rule, each moved to the selected device as soon as it is built.
net1 = NeuralNet().to(device)  # Network to be trained using SGD
net2 = NeuralNet().to(device)  # Network to be trained using SGD with momentum
net3 = NeuralNet().to(device)  # Network to be trained using Adam

## Training with Different Optimizers

In [None]:
def Train(model,optimizer,criterion,datainput,label):
    """Perform one optimisation step on a single batch.

    Args:
        model: network to update; it is switched to training mode.
        optimizer: optimizer whose gradients are cleared and which is stepped once.
        criterion: callable ``criterion(output, label)`` returning the loss.
        datainput: batch fed to ``model``.
        label: targets passed to ``criterion``.

    Returns:
        The batch loss as a Python number (``loss.item()``).
    """
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    model.train()

    prediction = model(datainput)
    batch_loss = criterion(prediction, label)

    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [None]:
# Number of passes over the training set.
iterations = 10
# Step size shared by all three optimizers. The value matches the rate the
# optimizers were already using, so the comparison itself is unchanged; it is
# now defined once instead of being hard-coded in each constructor.
learning_rate = 1e-4
criterion = nn.CrossEntropyLoss()

optimizer1 = optim.SGD(net1.parameters(), lr=learning_rate) # SGD
optimizer2 = optim.SGD(net2.parameters(), lr=learning_rate, momentum=0.9) # SGD with momentum
optimizer3 = optim.Adam(net3.parameters(), lr=learning_rate) # Adam

# Per-epoch test accuracy (in percent) for net1 / net2 / net3.
Plotacc1 = []
Plotacc2 = []
Plotacc3 = []

# Per-epoch mean training loss for net1 / net2 / net3.
plotLoss1 = []
plotLoss2 = []
plotLoss3 = []

# One entry per learning rule:
# (model, optimizer, name used in the log, loss history, accuracy history).
runs = (
    (net1, optimizer1, 'SGD', plotLoss1, Plotacc1),
    (net2, optimizer2, 'SGD with momentum', plotLoss2, Plotacc2),
    (net3, optimizer3, 'Adam', plotLoss3, Plotacc3),
)

for epoch in range(iterations):  # loop over the dataset multiple times

    # ---- Training: every network is updated on exactly the same mini-batches ----
    running_losses = [0.0 for _ in runs]
    num_batches = 0
    for inputs, labels in trainloader:
        # Flatten each image to a 28*28 vector and move the batch to the device.
        inputs, labels = inputs.view(-1, 28*28).to(device), labels.to(device)
        for k, (model, optimizer, _, _, _) in enumerate(runs):
            running_losses[k] += Train(model, optimizer, criterion, inputs, labels)
        num_batches += 1

    # Mean loss per batch; the batches are counted explicitly instead of relying
    # on a leaked loop index, and an empty loader no longer raises.
    epoch_losses = [loss_sum / max(num_batches, 1) for loss_sum in running_losses]

    # ---- Evaluation on the test set ----
    for model, _, _, _, _ in runs:
        model.eval() # For testing [Affects batch-norm and dropout layers (if any)]

    correct_counts = [0 for _ in runs]
    total = 0
    with torch.no_grad(): # Gradient computation is not involved in inference
        for inputs, labels in testloader:
            inputs, labels = inputs.view(-1, 28*28).to(device), labels.to(device)
            total += labels.size(0)
            for k, (model, _, _, _, _) in enumerate(runs):
                outputs = model(inputs)
                _, predicted = torch.max(outputs, 1)
                # .item() keeps the running count as a plain Python int rather
                # than a tensor living on the device.
                correct_counts[k] += (predicted == labels).sum().item()

    accuracies = [float(hits)*100/float(total) for hits in correct_counts]

    # ---- Bookkeeping and logging ----
    print('At Epoch '+str(epoch+1))
    for k, (_, _, rule_name, loss_history, acc_history) in enumerate(runs):
        loss_history.append(epoch_losses[k])
        acc_history.append(accuracies[k])
        print('{}: Loss = {:.6f} , Acc = {:.4f}'.format(rule_name, epoch_losses[k], accuracies[k]))
    
def _plot_histories(histories, ylabel):
    """Draw one curve per (values, line style, legend label) entry on a new figure.

    The x-axis is derived from the length of each history, so plotting does not
    depend on the loop variable of the training loop still being defined.
    Returns the created figure.
    """
    figure = plt.figure()
    for values, line_style, legend_label in histories:
        plt.plot(range(len(values)), values, line_style, label=legend_label)
    plt.legend(loc='best')
    plt.xlabel('Epochs')
    plt.ylabel(ylabel)
    return figure


# Mean training loss per epoch for the three learning rules.
fig = _plot_histories(
    ((plotLoss1, 'r-', 'SGD'),
     (plotLoss2, 'g-', 'SGD with momentum'),
     (plotLoss3, 'b-', 'Adam')),
    'Training Loss')

# Test accuracy (in percent) per epoch for the three learning rules.
fig = _plot_histories(
    ((Plotacc1, 'r-', 'SGD'),
     (Plotacc2, 'g-', 'SGD with momentum'),
     (Plotacc3, 'b-', 'Adam')),
    'Testing Accuracy')
print('Finished Training')