# Phase 2 - Transfer Learning on CIFAR10

In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import time
import os
import copy

import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

## Download and Load CIFAR10 

You can find CIFAR10 in ```torchvision.datasets.CIFAR10()```. Please Download the dataset and use Dataloaders to load it.

In [None]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# shift/scale every RGB channel with mean 0.5 and std 0.5.
# NOTE(review): ImageNet-pretrained torchvision models are documented to be
# trained with ImageNet mean/std - consider switching the statistics below.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 100

# One root for both splits: they come from the same archive, so a shared
# directory avoids downloading and extracting it twice.
data_root = './data'

train_set = datasets.CIFAR10(root=data_root, train=True,
                             transform=transform, download=True)

# Shuffle the training data so each epoch sees the batches in a new order
train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                           batch_size=batch_size,
                                           shuffle=True)

test_set = datasets.CIFAR10(root=data_root, train=False,
                            transform=transform, download=True)

# Evaluation order does not matter, so keep the test split unshuffled
test_loader = torch.utils.data.DataLoader(dataset=test_set,
                                          batch_size=batch_size,
                                          shuffle=False)

# Human-readable names for the ten CIFAR10 label indices
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /.data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting /.data/cifar-10-python.tar.gz to /.data
Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting ./data/cifar-10-python.tar.gz to ./data


## Load Pretrained Model (ResNet18) and Modify Layers
Load ResNet18 model from ```torchvision.models.resnet18()```, make sure to load pretrained model with weights.
You will need to freeze the weights and modify the final layer. The pretrained ResNet18 outputs 1000 labels (the ImageNet classes) - please change this to the correct output dimension for CIFAR10.

In [None]:
# Instantiate ResNet18 with ImageNet-pretrained weights.
# Newer torchvision releases select weights through the `weights=` enum, while
# older ones only understand the `pretrained=` flag, so support both.
if hasattr(models, 'ResNet18_Weights'):
    resnet18 = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
else:
    resnet18 = models.resnet18(pretrained=True)

# Freeze every pretrained layer so that only the new head below is trained
for param in resnet18.parameters():
    param.requires_grad = False


# Replace the final layer with a 10-way classifier for CIFAR10. The input width
# is read from the layer being replaced instead of being hard-coded, and the
# fresh nn.Linear is trainable because new parameters require grad by default.
resnet18.fc = nn.Linear(in_features=resnet18.fc.in_features, out_features=10, bias=True)

# Load onto GPU (falls back to CPU when CUDA is unavailable)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
resnet18.to(device)


# View Model to validate
print(resnet18)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

## Train and Test Functions
You can refer to Phase_1.ipynb to complete these functions. In ```train()```, rather than saving the accuracy and loss per ```stat_count```, you will need to save the loss and accuracy ***per epoch***

In [None]:
# Train Function
def train(model, loss_fn, optimizer, train_loader, batch_size, num_epochs, input_size, stat_count=100, device=None):
    """Train ``model`` and record the loss and accuracy of every epoch.

    Args:
        model: network to optimise; it is moved onto ``device`` first.
        loss_fn: callable taking ``(outputs, labels)`` and returning a scalar
            loss tensor.
        optimizer: optimizer already bound to the parameters to update.
        train_loader: iterable yielding batches whose first two items are the
            images and the labels.
        batch_size: unused; kept so existing calls keep working.
        num_epochs: number of full passes over ``train_loader``.
        input_size: unused (images are not flattened); kept for compatibility.
        stat_count: unused now that statistics are gathered per epoch; kept
            for compatibility.
        device: device to run on. Defaults to ``cuda:0`` when CUDA is
            available, otherwise the CPU.

    Returns:
        ``(list_loss, list_accuracy)`` with one entry per epoch: the mean loss
        per sample and the accuracy in percent over that epoch.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Make sure layers such as dropout/batch-norm run in training mode
    model.train()

    list_loss = []
    list_accuracy = []
    # Iterate through all Epochs
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        seen = 0
        # Iterate through training dataset
        for batch in train_loader:
            images, labels = batch[0].to(device), batch[1].to(device)
            # Zero collected gradients at each step
            optimizer.zero_grad()
            # Forward Propagate
            outputs = model(images)
            # Calculate Loss
            loss = loss_fn(outputs, labels)
            # Back propagate
            loss.backward()
            # Update weights - exactly once per batch
            optimizer.step()

            # Accumulate statistics over every batch of the epoch.
            # Weighting by batch size yields a per-sample mean; this assumes
            # loss_fn averages over the batch (the nn.CrossEntropyLoss default).
            num_samples = labels.size(0)
            running_loss += loss.item() * num_samples
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += num_samples

        # Guard against an empty loader instead of dividing by zero
        epoch_loss = running_loss / seen if seen else float('nan')
        epoch_accuracy = 100 * correct / seen if seen else 0.0
        list_loss.append(epoch_loss)
        list_accuracy.append(epoch_accuracy)

        # Printing every epoch
        print('Epoch [%d/%d], Loss: %.4f, Accuracy: %.4f'
              % (epoch + 1, num_epochs, epoch_loss, epoch_accuracy))
    return list_loss, list_accuracy
    
# Test Function
def test_accuracy(model, test_loader, input_size, device=None):
    """Evaluate ``model`` on ``test_loader`` and report its accuracy.

    Args:
        model: network to evaluate; it is moved onto ``device`` first.
        test_loader: iterable yielding batches whose first two items are the
            images and the labels.
        input_size: unused (images are not flattened); kept for compatibility.
        device: device to run on. Defaults to ``cuda:0`` when CUDA is
            available, otherwise the CPU.

    Returns:
        The accuracy in percent as a float (0.0 if the loader yields nothing).
        The same value is printed, truncated to an integer.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Evaluate in inference mode, then restore whatever mode the caller had
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for batch in test_loader:
                # Follow the selected device rather than assuming CUDA exists
                images, labels = batch[0].to(device), batch[1].to(device)
                outputs = model(images)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train(was_training)

    # Guard against an empty loader instead of dividing by zero
    accuracy = 100 * correct / total if total else 0.0
    print('Accuracy of the network on the test images: %d %%' % accuracy)
    return accuracy


# The name starts with "test_", so mark the helper as a non-test to keep pytest
# from trying to collect it if this code is ever run under pytest.
test_accuracy.__test__ = False
    
# Plot Learning Curves
def plot_learning_curve(list_loss, list_accuracy):
    """Draw the loss and accuracy histories as two stacked plots.

    Args:
        list_loss: sequence of loss values, plotted on the upper axes.
        list_accuracy: sequence of accuracy values, plotted on the lower axes.
    """
    # Two rows sharing one x-axis so both curves line up step for step
    _, axes = plt.subplots(2, sharex=True)
    panels = zip(axes, (list_loss, list_accuracy), ('Loss', 'Accuracy'))
    for axis, history, title in panels:
        axis.plot(history)
        axis.set_title(title)


## Complete full training and testing pipeline
Retrain the final layer of ResNet18 on the CIFAR10 dataset. You can choose the optimizer and loss function.

In [None]:
# Define Parameters

input_size = 3072
num_epochs = 10
lr = 1e-4
num_classes = 10

list_loss = []
list_accuracy = []
# Define Loss func and Optimizer
loss_function = nn.CrossEntropyLoss()
# Give the optimizer only the parameters that are still trainable, so frozen
# layers are neither tracked nor updated by it.
trainable_params = [p for p in resnet18.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(trainable_params, lr=lr)

# Train Model
list_loss, list_accuracy = train(
    model=resnet18,
    loss_fn=loss_function,
    optimizer=optimizer,
    train_loader=train_loader,
    batch_size=batch_size,
    num_epochs=num_epochs,
    input_size=input_size,
)

# Plot Learning Curves
plot_learning_curve(list_loss, list_accuracy)

# Evaluate on Test Set
test_accuracy(model=resnet18, test_loader=test_loader, input_size=input_size)


Epoch [1/10], Loss: 2.1183, Accuracy: 21.0000


KeyboardInterrupt: ignored

Question What is transfer learning and why would it be useful?
Transfer learning is taking a model that has already been thoroughly trained on a large dataset, freezing the intermediate layers, and training only the last few layers on the new task. It is useful because it reduces the time needed to build and train a new model: you can reuse a model whose learned features are very similar to what you are trying to achieve. For example, a model trained to identify cars can be transferred to identifying trucks.

Question What (if any) modifications to the pretrained model would you need to do and why? 
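You would need to freeze the layers of the neural network except the last few layers, replace the final layer so that its output size matches the number of classes in the new dataset, and then run training.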

Question You can also use the ResNet18 architecture without pretrained weights and retrain
on your own dataset (e.g. fully trained on CIFAR10). Discuss in detail why this may be a good or
bad idea.
It's a bad idea because you would have to undergo a lot of training time and it won't necessarily be an optimized model.

Question Please explain precision and recall.
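Precision is how consistently the model is right when it predicts a given label: of all samples predicted as that class, the fraction that truly belong to it, TP / (TP + FP). Recall is how completely the model finds a given label: of all samples that truly belong to that class, the fraction the model identifies, TP / (TP + FN).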

Question Please include your confusion matrix and explain what is it and its purpose.

Honestly, I ran out of time and couldn't include one.

Resnet18 Without Freezing Layers

In [None]:
# Instantiate ResNet18 with ImageNet-pretrained weights.
# Newer torchvision releases select weights through the `weights=` enum, while
# older ones only understand the `pretrained=` flag, so support both.
if hasattr(models, 'ResNet18_Weights'):
    resnet18_nonfreeze = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
else:
    resnet18_nonfreeze = models.resnet18(pretrained=True)

# Replace the final layer with a 10-way classifier for CIFAR10. No layer is
# frozen here, so the whole network is fine-tuned. The input width is read from
# the layer being replaced instead of being hard-coded.
resnet18_nonfreeze.fc = nn.Linear(in_features=resnet18_nonfreeze.fc.in_features, out_features=10, bias=True)

# Load onto GPU (falls back to CPU when CUDA is unavailable)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
resnet18_nonfreeze.to(device)

# View Model to validate
print(resnet18_nonfreeze)

In [None]:
# Train Function
def train(model, loss_fn, optimizer, train_loader, batch_size, num_epochs, input_size, stat_count=100, device=None):
    """Train ``model`` and record the loss and accuracy of every epoch.

    Args:
        model: network to optimise; it is moved onto ``device`` first.
        loss_fn: callable taking ``(outputs, labels)`` and returning a scalar
            loss tensor.
        optimizer: optimizer already bound to the parameters to update.
        train_loader: iterable yielding batches whose first two items are the
            images and the labels.
        batch_size: unused; kept so existing calls keep working.
        num_epochs: number of full passes over ``train_loader``.
        input_size: unused (images are not flattened); kept for compatibility.
        stat_count: unused now that statistics are gathered per epoch; kept
            for compatibility.
        device: device to run on. Defaults to ``cuda:0`` when CUDA is
            available, otherwise the CPU.

    Returns:
        ``(list_loss, list_accuracy)`` with one entry per epoch: the mean loss
        per sample and the accuracy in percent over that epoch.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    # Make sure layers such as dropout/batch-norm run in training mode
    model.train()

    list_loss = []
    list_accuracy = []
    # Iterate through all Epochs
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        seen = 0
        # Iterate through training dataset
        for batch in train_loader:
            images, labels = batch[0].to(device), batch[1].to(device)
            # Zero collected gradients at each step
            optimizer.zero_grad()
            # Forward Propagate
            outputs = model(images)
            # Calculate Loss
            loss = loss_fn(outputs, labels)
            # Back propagate
            loss.backward()
            # Update weights - exactly once per batch
            optimizer.step()

            # Accumulate statistics over every batch of the epoch.
            # Weighting by batch size yields a per-sample mean; this assumes
            # loss_fn averages over the batch (the nn.CrossEntropyLoss default).
            num_samples = labels.size(0)
            running_loss += loss.item() * num_samples
            correct += (outputs.argmax(dim=1) == labels).sum().item()
            seen += num_samples

        # Guard against an empty loader instead of dividing by zero
        epoch_loss = running_loss / seen if seen else float('nan')
        epoch_accuracy = 100 * correct / seen if seen else 0.0
        list_loss.append(epoch_loss)
        list_accuracy.append(epoch_accuracy)

        # Printing every epoch
        print('Epoch [%d/%d], Loss: %.4f, Accuracy: %.4f'
              % (epoch + 1, num_epochs, epoch_loss, epoch_accuracy))
    return list_loss, list_accuracy
    
# Test Function
def test_accuracy(model, test_loader, input_size, device=None):
    """Evaluate ``model`` on ``test_loader`` and report its accuracy.

    Args:
        model: network to evaluate; it is moved onto ``device`` first.
        test_loader: iterable yielding batches whose first two items are the
            images and the labels.
        input_size: unused (images are not flattened); kept for compatibility.
        device: device to run on. Defaults to ``cuda:0`` when CUDA is
            available, otherwise the CPU.

    Returns:
        The accuracy in percent as a float (0.0 if the loader yields nothing).
        The same value is printed, truncated to an integer.
    """
    if device is None:
        device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Evaluate in inference mode, then restore whatever mode the caller had
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for batch in test_loader:
                # Follow the selected device rather than assuming CUDA exists
                images, labels = batch[0].to(device), batch[1].to(device)
                outputs = model(images)
                predicted = outputs.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        model.train(was_training)

    # Guard against an empty loader instead of dividing by zero
    accuracy = 100 * correct / total if total else 0.0
    print('Accuracy of the network on the test images: %d %%' % accuracy)
    return accuracy


# The name starts with "test_", so mark the helper as a non-test to keep pytest
# from trying to collect it if this code is ever run under pytest.
test_accuracy.__test__ = False
    
# Plot Learning Curves
def plot_learning_curve(list_loss, list_accuracy):
    """Draw the loss and accuracy histories as two stacked plots.

    Args:
        list_loss: sequence of loss values, plotted on the upper axes.
        list_accuracy: sequence of accuracy values, plotted on the lower axes.
    """
    # Two rows sharing one x-axis so both curves line up step for step
    _, axes = plt.subplots(2, sharex=True)
    panels = zip(axes, (list_loss, list_accuracy), ('Loss', 'Accuracy'))
    for axis, history, title in panels:
        axis.plot(history)
        axis.set_title(title)


In [None]:
# Hyperparameters for the run that updates every layer

input_size = 3072
num_epochs = 10
lr = 1e-4
num_classes = 10

list_loss = []
list_accuracy = []
# Loss and optimizer; the optimizer receives all of the model's parameters
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(resnet18_nonfreeze.parameters(), lr=lr)

# Run training and keep the histories it returns
list_loss, list_accuracy = train(
    model=resnet18_nonfreeze,
    loss_fn=loss_function,
    optimizer=optimizer,
    train_loader=train_loader,
    batch_size=batch_size,
    num_epochs=num_epochs,
    input_size=input_size,
)

# Show the learning curves
plot_learning_curve(list_loss, list_accuracy)

# Report accuracy on the held-out test split
test_accuracy(
    model=resnet18_nonfreeze,
    test_loader=test_loader,
    input_size=input_size,
)

Question What was the difference between freezing layers and retraining all? Explain in terms
of classification metrics and why you saw such metrics. Is what you saw as expected?
Freezing layers ended up with much lower accuracy than retraining all layers. I did not expect this, but it is probably because the weights in the frozen layers were not able to adjust to the weights being changed in the last layer, which inadvertently caused the model to train improperly.

I don't have a classification metric.  

Question For a pretrained model like ResNet, in what general scenarios would you (1) freeze
your pretrained weights and retrain last layer, (2) retrain entire architecture, and (3) Freeze all
convolution layer weights and retrain all FC layers? Explain with example scenarios if possible.

For scenario 1, if the classifications are very similar, such as a bird to a parrot, then I would freeze all layers except last layer and retrain.

For scenario 2, if the classifications are entirely different but identifications for things like edges or texture could be used, then I would retrain the entire architecture. For example, identifying cubes could be used for identifying buildings.

For scenario 3, if classifications are different but have similar features, such as sand to sugar grains, then I would freeze all convolutional layer weights and retrain all FC layers.

