<a href="https://colab.research.google.com/github/kamikazekartik/cs744_assignments/blob/master/project/LeNet_MNIST_ksreenivasan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Baseline performance and timings for training LeNet on MNIST/EMNIST.

Before running the notebook, go to Runtime -> Change Runtime Type in the menu and set Hardware Accelerator to GPU. **Make sure you change it back** when finished to avoid being penalized by Colab.

You can use the dataset variable to decide if to run on MNIST or EMNIST (EMNIST will be slightly slower since the training data is significantly larger)

Each cell runs with different precision settings

Strange behavior right now:
1. Full precision is the fastest.
2. Half precision is about half a second slower per epoch.
3. AMP is still slower by another whole second.

Code is based on [here](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html).

In [6]:
import time
import os, random
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import torch
from torch.utils.data import random_split
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torchvision import models
import logging

logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def seed_experiment(seed=0):
    # seed = 1234
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    #TODO: Do we need deterministic in cudnn ? Double check
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print("Seeded everything with seed: {}".format(seed))

# seed experiment
seed_experiment(42)

# Options:
use_amp = False
use_half_all = True
use_half_conv = False
use_half_lin = False
dataset = 'EMNIST'

# Make sure we're using a GPU, and report what GPU it is.
# (Otherwise this would run **forever**)
if torch.cuda.is_available():
  print("using "+torch.cuda.get_device_name(0))
else:
  print('No GPU available (enable it?), quitting.')
  exit()
device = torch.device("cuda:0")

# Set up dataset:
batch_size = 1024
test_batch_size = 1000

if dataset == 'MNIST':
    train_set = torchvision.datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
    test_set = torchvision.datasets.MNIST('./data', train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
elif dataset == 'EMNIST':
  train_set = torchvision.datasets.EMNIST('./data', split="digits", train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))
  test_set = torchvision.datasets.EMNIST('./data', split="digits", train=False, transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ]))



train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=test_batch_size, shuffle=False, num_workers=2)
classes = ('0','1','2','3','4','5','6','7','8','9')

# Set up and load LeNet model:
# this version seems to be significantly slower when we use half precision
# TODO: perhaps because of all the conversions?
class LeNet(nn.Module):
  def __init__(self):
    global use_half_all, use_half_conv, use_half_lin
    super(LeNet, self).__init__()
    self.conv1 = nn.Conv2d(1, 6, 5).half() if use_half_all or use_half_conv else nn.Conv2d(1, 6, 5)
    self.conv2 = nn.Conv2d(6, 16, 5).half() if use_half_all or use_half_conv else nn.Conv2d(6, 16, 5)
    self.fc1 = nn.Linear(256, 120).half() if use_half_all or use_half_lin else nn.Linear(256, 120)
    self.fc2 = nn.Linear(120, 84).half() if use_half_all or use_half_lin else nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, 10).half() if use_half_all or use_half_lin else nn.Linear(84, 10)
    self.relu1 = nn.ReLU().half() if use_half_all else nn.ReLU()
    self.pool1 = nn.MaxPool2d(2).half() if use_half_all else nn.MaxPool2d(2)
    self.relu2 = nn.ReLU().half() if use_half_all else nn.ReLU()
    self.pool2 = nn.MaxPool2d(2).half() if use_half_all else nn.MaxPool2d(2)
    self.relu3 = nn.ReLU().half() if use_half_all else nn.ReLU()
    self.relu4 = nn.ReLU().half() if use_half_all else nn.ReLU()
    self.relu5 = nn.ReLU().half() if use_half_all else nn.ReLU()

  def forward(self, x):
    global use_half_all, use_half_conv, use_half_lin
    y = self.conv1(x.half()).float() if use_half_all or use_half_conv else self.conv1(x)
    y = self.relu1(y.half()).float() if use_half_all else self.relu1(y)
    y = self.pool1(y.half()).float() if use_half_all else self.pool1(y)
    y = self.conv2(y.half()).float() if use_half_all or use_half_conv else self.conv2(y)
    y = self.relu2(y.half()).float() if use_half_all else self.relu2(y)
    y = self.pool2(y.half()).float() if use_half_all else self.pool1(y)
    y = y.view(y.shape[0], -1)
    y = self.fc1(y.half()).float() if use_half_all or use_half_lin else self.fc1(y)
    y = self.relu3(y.half()).float() if use_half_all else self.relu3(y)
    y = self.fc2(y.half()).float() if use_half_all or use_half_lin else self.fc2(y)
    y = self.relu4(y.half()).float() if use_half_all else self.relu4(y)
    y = self.fc3(y.half()).float() if use_half_all or use_half_lin else self.fc3(y)
    y = self.relu5(y.half()).float() if use_half_all else self.relu5(y)
    return y

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 5)
        self.conv2 = nn.Conv2d(6, 16, 5)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 4 * 4, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

# test function:
def test(dataset, model, device, test_loader, criterion):
  class_correct = list(0. for i in range(10))
  class_total = list(0. for i in range(10))
  if dataset in ["EMNIST", "MNIST"]:
      classes = [str(i) for i in range(10)]

  model.eval()
  test_loss = 0
  correct = 0
  with torch.no_grad():
      for data, target in test_loader:
          if use_half_all:
            data = data.half()
          data, target = data.to(device), target.to(device)
          output = model(data)
          _, predicted = torch.max(output, 1)
          c = (predicted == target).squeeze()

          test_loss = criterion(output, target).item()
          pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
          correct += pred.eq(target.view_as(pred)).sum().item()

          for image_index in range(test_batch_size):
              label = target[image_index]
              class_correct[label] += c[image_index].item()
              class_total[label] += 1

  test_loss /= len(test_loader.dataset)

  for i in range(10):
      logger.info('Accuracy of %5s : %2d %%' % (
          classes[i], 100 * class_correct[i] / class_total[i]))

  logger.info('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
      test_loss, correct, len(test_loader.dataset),
      100. * correct / len(test_loader.dataset)))
  
  return 100.0 * correct/len(test_loader.dataset)

# train method
def train(model, optimizer, criterion, scaler, train_loader, use_amp, epoch=0):

  for batch_idx, (inputs, labels) in enumerate(train_loader): # Iterating through the train loader
    if use_half_all:
      inputs = inputs.half()
    inputs, labels = inputs.to(device), labels.to(device)
    optimizer.zero_grad()            # Reset the gradient in every iteration
    with torch.cuda.amp.autocast(enabled=use_amp):
      outputs = model(inputs)
      loss = criterion(outputs,labels) # Loss forward pass
    scaler.scale(loss).backward()      # Loss backward pass
    scaler.step(optimizer)
    scaler.update()                    # Update all the parameters by the given learning rule
    # optimizer.zero_grad()              # set_to_none=True here can modestly improve performance
  
    if batch_idx % 500 == 0:
      logger.info('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
          epoch, batch_idx * len(inputs), len(train_loader.dataset),
          100. * batch_idx / len(train_loader), loss.item()))


  return loss.item()

def run_experiment(MAX_EPOCHS=3):
  epoch_list = [0]
  loss_epoch_list = [-1]
  epoch_train_time_list = [-1]
  total_train_time_list = [-1]
  lr_list = [-1]
  test_acc_list = []

  model = Net().to(device)
  if use_half_all:
    model.half()
  criterion = nn.CrossEntropyLoss()
  curr_lr = 0.01
  optimizer = optim.SGD(model.parameters(), lr=curr_lr, momentum=0.9)
  scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
  total_training_time = 0

  # check accuracy before training
  test_acc = test(dataset, model, device, test_loader, criterion)
  test_acc_list.append(test_acc)

  for epoch in range(1, MAX_EPOCHS):
    start_time = time.time()
    last_epoch_loss = train(model, optimizer, criterion, scaler, train_loader, use_amp, epoch)
    end_time = time.time()
    epoch_training_time = end_time - start_time
    total_training_time += epoch_training_time
    epoch_list.append(epoch)
    epoch_train_time_list.append(epoch_training_time)
    total_train_time_list.append(total_training_time)
    lr_list.append(curr_lr)
    test_acc = test("EMNIST", model, device, test_loader, criterion)
    test_acc_list.append(test_acc)
    loss_epoch_list.append(last_epoch_loss)

    # cut learning rate in half every 20 epochs
    if epoch % 20 == 19:
      curr_lr = 0.5 * curr_lr
      for g in optimizer.param_groups:
        g['lr'] = curr_lr


  # (OPTIONAL) Save trained model:
  #PATH = './cifar_net.pt'
  #torch.save(net.state_dict(), PATH)

  # (OPTIONAL) Load saved model
  #net.load_state_dict(torch.load(PATH))
  #net.to(device)

  results_df = pd.DataFrame({"epoch": epoch_list, "training_loss": loss_epoch_list, "test_acc": test_acc_list, "epoch_train_time": epoch_train_time_list, "total_train_time": total_train_time_list, "lr": lr_list, })
  return results_df

seed_experiment(42)
use_half_all=False
results_df = run_experiment(MAX_EPOCHS=3)
results_df

Seeded everything with seed: 42
using Tesla T4
Downloading and extracting zip archive
Downloading http://www.itl.nist.gov/iaui/vip/cs_links/EMNIST/gzip.zip to ./data/EMNIST/raw/emnist.zip


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting ./data/EMNIST/raw/emnist.zip to ./data/EMNIST/raw
Processing byclass
Processing bymerge
Processing balanced
Processing letters
Processing digits
Processing mnist
Done!
Seeded everything with seed: 42


INFO:root:Accuracy of     0 : 82 %
INFO:root:Accuracy of     1 :  0 %
INFO:root:Accuracy of     2 :  0 %
INFO:root:Accuracy of     3 : 59 %
INFO:root:Accuracy of     4 :  0 %
INFO:root:Accuracy of     5 :  0 %
INFO:root:Accuracy of     6 :  0 %
INFO:root:Accuracy of     7 :  0 %
INFO:root:Accuracy of     8 :  0 %
INFO:root:Accuracy of     9 :  0 %
INFO:root:
Test set: Average loss: 0.0001, Accuracy: 5682/40000 (14%)

INFO:root:Accuracy of     0 : 98 %
INFO:root:Accuracy of     1 : 98 %
INFO:root:Accuracy of     2 : 94 %
INFO:root:Accuracy of     3 : 94 %
INFO:root:Accuracy of     4 : 95 %
INFO:root:Accuracy of     5 : 95 %
INFO:root:Accuracy of     6 : 97 %
INFO:root:Accuracy of     7 : 97 %
INFO:root:Accuracy of     8 : 94 %
INFO:root:Accuracy of     9 : 94 %
INFO:root:
Test set: Average loss: 0.0000, Accuracy: 38515/40000 (96%)

INFO:root:Accuracy of     0 : 98 %
INFO:root:Accuracy of     1 : 98 %
INFO:root:Accuracy of     2 : 98 %
INFO:root:Accuracy of     3 : 97 %
INFO:root:Accurac

Unnamed: 0,epoch,training_loss,test_acc,epoch_train_time,total_train_time,lr
0,0,-1.0,14.205,-1.0,-1.0,-1.0
1,1,0.133681,96.2875,38.841566,38.841566,0.01
2,2,0.068642,97.9125,39.276011,78.117577,0.01


In [7]:
seed_experiment(42)
use_half_all=True
results_df = run_experiment(MAX_EPOCHS=3)
use_half_all=False
results_df

Seeded everything with seed: 42


INFO:root:Accuracy of     0 : 82 %
INFO:root:Accuracy of     1 :  0 %
INFO:root:Accuracy of     2 :  0 %
INFO:root:Accuracy of     3 : 59 %
INFO:root:Accuracy of     4 :  0 %
INFO:root:Accuracy of     5 :  0 %
INFO:root:Accuracy of     6 :  0 %
INFO:root:Accuracy of     7 :  0 %
INFO:root:Accuracy of     8 :  0 %
INFO:root:Accuracy of     9 :  0 %
INFO:root:
Test set: Average loss: 0.0001, Accuracy: 5682/40000 (14%)

INFO:root:Accuracy of     0 : 98 %
INFO:root:Accuracy of     1 : 97 %
INFO:root:Accuracy of     2 : 96 %
INFO:root:Accuracy of     3 : 91 %
INFO:root:Accuracy of     4 : 98 %
INFO:root:Accuracy of     5 : 94 %
INFO:root:Accuracy of     6 : 98 %
INFO:root:Accuracy of     7 : 96 %
INFO:root:Accuracy of     8 : 95 %
INFO:root:Accuracy of     9 : 91 %
INFO:root:
Test set: Average loss: 0.0000, Accuracy: 38332/40000 (96%)

INFO:root:Accuracy of     0 : 98 %
INFO:root:Accuracy of     1 : 98 %
INFO:root:Accuracy of     2 : 98 %
INFO:root:Accuracy of     3 : 96 %
INFO:root:Accurac

Unnamed: 0,epoch,training_loss,test_acc,epoch_train_time,total_train_time,lr
0,0,-1.0,14.205,-1.0,-1.0,-1.0
1,1,0.123901,95.83,39.849759,39.849759,0.01
2,2,0.055481,97.865,39.819161,79.66892,0.01


In [8]:
seed_experiment(42)
use_amp=True
results_df = run_experiment(MAX_EPOCHS=3)
use_amp=False
results_df

Seeded everything with seed: 42


INFO:root:Accuracy of     0 : 82 %
INFO:root:Accuracy of     1 :  0 %
INFO:root:Accuracy of     2 :  0 %
INFO:root:Accuracy of     3 : 59 %
INFO:root:Accuracy of     4 :  0 %
INFO:root:Accuracy of     5 :  0 %
INFO:root:Accuracy of     6 :  0 %
INFO:root:Accuracy of     7 :  0 %
INFO:root:Accuracy of     8 :  0 %
INFO:root:Accuracy of     9 :  0 %
INFO:root:
Test set: Average loss: 0.0001, Accuracy: 5682/40000 (14%)

INFO:root:Accuracy of     0 : 97 %
INFO:root:Accuracy of     1 : 97 %
INFO:root:Accuracy of     2 : 96 %
INFO:root:Accuracy of     3 : 90 %
INFO:root:Accuracy of     4 : 97 %
INFO:root:Accuracy of     5 : 94 %
INFO:root:Accuracy of     6 : 98 %
INFO:root:Accuracy of     7 : 93 %
INFO:root:Accuracy of     8 : 97 %
INFO:root:Accuracy of     9 : 94 %
INFO:root:
Test set: Average loss: 0.0000, Accuracy: 38275/40000 (96%)

INFO:root:Accuracy of     0 : 98 %
INFO:root:Accuracy of     1 : 98 %
INFO:root:Accuracy of     2 : 98 %
INFO:root:Accuracy of     3 : 97 %
INFO:root:Accurac

Unnamed: 0,epoch,training_loss,test_acc,epoch_train_time,total_train_time,lr
0,0,-1.0,14.205,-1.0,-1.0,-1.0
1,1,0.139991,95.6875,41.133075,41.133075,0.01
2,2,0.06561,97.8725,41.429856,82.562932,0.01
