# Stochastic Gradient Descent and Learning Rate Dynamics


Load Packages
==

In [None]:
%matplotlib inline
import torch
import matplotlib.pyplot as plt
import numpy as np
import torchvision
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math

Load Data:
===============

In [None]:
# Shared preprocessing: each image is only converted to a tensor.
transform = transforms.Compose([transforms.ToTensor()])
BatchSize = 100

# Training split, stored under ./MNIST (downloaded if missing) and
# reshuffled by the loader.
trainset = torchvision.datasets.MNIST(
    root='./MNIST', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=BatchSize, shuffle=True, num_workers=4)

# Test split: same preprocessing, served in a fixed order.
testset = torchvision.datasets.MNIST(
    root='./MNIST', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=BatchSize, shuffle=False, num_workers=4)

# Human-readable class names; position i names digit i.
classes = tuple(
    'zero one two three four five six seven eight nine'.split())

In [None]:
# Record once whether CUDA can be used; the later cells branch on this flag
# to decide if models and batches are moved to the GPU.
use_gpu = False
if torch.cuda.is_available():
    use_gpu = True
    print('GPU is available!')

Neural Network:
===============

In [None]:
class NeuralNet(nn.Module):
    """Fully connected classifier with layer sizes 784 -> 400 -> 256 -> 10.

    ``Layer1`` contains the two hidden linear layers, each followed by a
    ReLU; ``Layer2`` is the final linear projection to 10 outputs. No
    softmax is applied inside the network.
    """

    def __init__(self):
        super(NeuralNet, self).__init__()
        hidden_stack = [
            nn.Linear(28*28, 400),
            nn.ReLU(),
            nn.Linear(400, 256),
            nn.ReLU(),
        ]
        self.Layer1 = nn.Sequential(*hidden_stack)
        self.Layer2 = nn.Sequential(nn.Linear(256, 10))

    def forward(self, x):
        """Return the raw 10-way scores for a batch of 784-feature rows."""
        return self.Layer2(self.Layer1(x))


# Four independently initialised copies of the same architecture, one for
# each optimizer that is compared below.
net1, net2, net3, net4 = NeuralNet(), NeuralNet(), NeuralNet(), NeuralNet()

# Move every network to the GPU when one was detected.
if use_gpu:
    net1, net2, net3, net4 = net1.cuda(), net2.cuda(), net3.cuda(), net4.cuda()

Training with Different Optimizers:
===========

In [None]:
def Train(model,optimizer,IP,LB):
    """Perform one optimisation step on a single mini-batch.

    Args:
        model: network to update.
        optimizer: optimizer that owns ``model``'s parameters.
        IP: input batch passed to ``model``.
        LB: target labels for the batch.

    Returns:
        The loss tensor computed by the module-level ``criterion`` on the
        log-softmax (over dim 1) of the model output.
    """
    # Clear gradients left over from the previous step before the new pass.
    optimizer.zero_grad()
    log_probs = F.log_softmax(model(IP), dim=1)
    batch_loss = criterion(log_probs, LB)
    batch_loss.backward()
    optimizer.step()
    return batch_loss

In [None]:
iterations = 10
# Train() hands log-probabilities to the criterion, which is the input
# NLLLoss expects.
criterion = nn.NLLLoss()

# One optimizer per network, all with the same base learning rate.
optimizer1 = optim.SGD(net1.parameters(), lr=1e-4)
optimizer2 = optim.SGD(net2.parameters(), lr=1e-4, momentum=0.9)
optimizer3 = optim.Adagrad(net3.parameters(), lr=1e-4)
optimizer4 = optim.Adam(net4.parameters(), lr=1e-4)

# Test accuracy after every epoch, one history per optimizer.
PlotAcc1 = []
PlotAcc2 = []
PlotAcc3 = []
PlotAcc4 = []

# (network, optimizer, accuracy history) for each experiment, so the
# training and evaluation code is written once instead of four times.
experiments = [
    (net1, optimizer1, PlotAcc1),
    (net2, optimizer2, PlotAcc2),
    (net3, optimizer3, PlotAcc3),
    (net4, optimizer4, PlotAcc4),
]

for epoch in range(iterations):  # loop over the dataset multiple times

    # ---- training: every network sees exactly the same mini-batches ----
    for inputs, labels in trainloader:
        # Flatten on CPU as well as GPU: the first nn.Linear expects 784
        # features per sample. (Previously the reshape only happened inside
        # the GPU branch, so CPU runs failed with a shape mismatch.)
        inputs = inputs.view(-1, 28*28)
        if use_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        for model, optimizer, _history in experiments:
            Train(model, optimizer, inputs, labels)

    # ---- evaluation on the test split ----
    hits = [0] * len(experiments)
    total = 0
    with torch.no_grad():  # no gradients are needed for accuracy
        for inputs, labels in testloader:
            inputs = inputs.view(-1, 28*28)
            if use_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()
            total += labels.size(0)

            for idx, (model, _optimizer, _history) in enumerate(experiments):
                _scores, predicted = torch.max(model(inputs), 1)
                # .item() yields a plain int on CPU and GPU alike.
                hits[idx] += (predicted == labels).sum().item()

    for (_model, _optimizer, history), n_correct in zip(experiments, hits):
        history.append(n_correct / float(total))

    print('Epoch %d '%(epoch+1))
print('Finished Training')

# 10x10 inch figures for this and all later plots.
fig_size = [10, 10]
plt.rcParams["figure.figsize"] = fig_size
fig = plt.figure()
# Use the configured epoch count rather than the leaked loop variable.
epoch_axis = range(iterations)
plt.plot(epoch_axis,PlotAcc1,'r-',label='SGD')
plt.plot(epoch_axis,PlotAcc2,'c-',label='SGD with momentum')
plt.plot(epoch_axis,PlotAcc3,'g-',label='Adagrad')
plt.plot(epoch_axis,PlotAcc4,'b-',label='Adam')
plt.legend(loc='best')
plt.xlabel('Epochs')
plt.ylabel('Testing Accuracy')

Neural Network:
===

In [None]:
class NeuralNet(nn.Module):
    """Multi-layer perceptron mapping 784 input features to 10 scores.

    Same architecture as the network defined earlier in this notebook:
    two ReLU-activated hidden layers (400 and 256 units) in ``Layer1`` and
    a linear output layer in ``Layer2``. The outputs are unnormalised.
    """

    def __init__(self):
        super(NeuralNet, self).__init__()
        n_in, n_hidden1, n_hidden2, n_out = 28*28, 400, 256, 10
        self.Layer1 = nn.Sequential(
            nn.Linear(n_in, n_hidden1), nn.ReLU(),
            nn.Linear(n_hidden1, n_hidden2), nn.ReLU())
        self.Layer2 = nn.Sequential(nn.Linear(n_hidden2, n_out))

    def forward(self, x):
        """Apply the hidden stack, then the output layer, to ``x``."""
        hidden = self.Layer1(x)
        return self.Layer2(hidden)


# Fresh, independently initialised networks for the three learning-rate
# strategies compared below.
net1, net2, net3 = (NeuralNet() for _ in range(3))

# Move every network to the GPU when one was detected.
if use_gpu:
    net1, net2, net3 = (net.cuda() for net in (net1, net2, net3))

Training with Learning Rate Dynamics:
===========

In [None]:
iterations = 10
# NOTE: Train() already applies log_softmax to the model output before
# calling the criterion, so CrossEntropyLoss normalises a second time here.
# Log-softmax is idempotent, so the loss value is the same as with NLLLoss.
criterion = nn.CrossEntropyLoss()

# Three plain-SGD optimizers with the same base learning rate; they differ
# only in the schedule (if any) attached below.
optimizer1 = optim.SGD(net1.parameters(), lr=1e-2)
optimizer2 = optim.SGD(net2.parameters(), lr=1e-2)
optimizer3 = optim.SGD(net3.parameters(), lr=1e-2)

# optimizer2: multiply the rate by 0.9 at epochs 3, 7 and 9.
scheduler2 = optim.lr_scheduler.MultiStepLR(optimizer2, milestones=[3,7,9], gamma=0.9)
# optimizer3: multiply the rate by 0.99 after every epoch.
scheduler3 = optim.lr_scheduler.ExponentialLR(optimizer3, gamma=0.99)

# Mean training loss per epoch, one history per strategy.
Plotloss1 = []
Plotloss2 = []
Plotloss3 = []

# Number of mini-batches per epoch, taken from the loader instead of the
# hard-coded 60000/BatchSize so it stays correct if the data or batch size
# changes.
num_batches = len(trainloader)

for epoch in range(iterations):  # loop over the dataset multiple times

    loss1 = 0
    loss2 = 0
    loss3 = 0

    for inputs, labels in trainloader:
        # Flatten each image to the 784 features the first layer expects.
        inputs = inputs.view(-1, 28*28)
        if use_gpu:
            inputs, labels = inputs.cuda(), labels.cuda()

        loss1 += Train(net1,optimizer1,inputs,labels).item()
        loss2 += Train(net2,optimizer2,inputs,labels).item()
        loss3 += Train(net3,optimizer3,inputs,labels).item()

    Plotloss1.append(loss1/num_batches)
    Plotloss2.append(loss2/num_batches)
    Plotloss3.append(loss3/num_batches)

    # Report the learning rates that were used during this epoch.
    for opt in optimizer1.param_groups:
        print('SGD Learning Rate: '+str(opt['lr']))
    for opt in optimizer2.param_groups:
        print('SGD (step decay) Learning Rate: '+str(opt['lr']))
    for opt in optimizer3.param_groups:
        print('SGD (step exp_decay) Learning Rate: '+str(opt['lr']))
    print('Epoch %d ; SGD:  %f ; SGD step_decay:  %f ; SGD exp_decay:  %f'%((epoch+1),loss1/num_batches,loss2/num_batches,loss3/num_batches))
    print('_______________________________________________________________')

    # Advance the schedules only after this epoch's optimizer steps.
    # Stepping a scheduler before optimizer.step() (as was done at the top
    # of the loop) makes PyTorch >= 1.1 skip the first value of the schedule
    # and emit a warning.
    scheduler2.step()
    scheduler3.step()
print('Finished Training')
fig = plt.figure()
# Use the configured epoch count rather than the leaked loop variable.
epoch_axis = range(iterations)
plt.plot(epoch_axis,Plotloss1,'r-',label='SGD')
plt.plot(epoch_axis,Plotloss2,'g-',label='SGD with step_decay')
plt.plot(epoch_axis,Plotloss3,'b-',label='SGD with exp_decay')
plt.legend(loc='best')
plt.xlabel('Epochs')
plt.ylabel('Training Loss')