## Transfer learning

In the context of computer vision (CV), we take a ResNet image classifier pretrained on 1000 classes and fine-tune it for our two-class use case: classifying images of bees vs. ants using the hymenoptera dataset.

In addition, we will also learn how to use datasets.ImageFolder and a learning-rate scheduler.

In [1]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time 
import os
import copy

In [3]:
## Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [4]:
## Per-channel normalization statistics, one value per colour channel
## (presumably the ImageNet statistics the pretrained network expects - confirm)
mean = np.asarray((0.485, 0.456, 0.406), dtype=np.float64)
std = np.asarray((0.229, 0.224, 0.225), dtype=np.float64)

## 3 values each, because normalization is applied separately to each of the 3 colour channels

## Define data transformations : slightly different transforms for train and val

In [5]:
## Preprocessing pipeline for each split:
##   'train' - random resized crop + random horizontal flip (augmentation)
##   'val'   - deterministic resize followed by a centre crop
## Both pipelines end with tensor conversion and per-channel normalization.
data_transforms = {}

data_transforms['train'] = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

data_transforms['val'] = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

## Import data. use datasets.ImageFolder

In [6]:
## Each split is read from datadir/<split> and paired with that split's transform pipeline
datadir = 'data/hymenoptera_data'
sets = ['train', 'val']
image_datasets = {
    split: datasets.ImageFolder(os.path.join(datadir, split), data_transforms[split])
    for split in sets
}

In [8]:
## Number of images in each split; used later to turn per-epoch sums into averages
dataset_sizes = {
    'train': len(image_datasets['train']),
    'val': len(image_datasets['val']),
}

In [9]:
dataset_sizes

{'train': 244, 'val': 153}

In [13]:
class_names = image_datasets['train'].classes ## ImageFolder derives the class names from the sub-folder names of the split directory (here 'ants' and 'bees')

In [14]:
class_names

['ants', 'bees']

## data loaders

In [16]:
## One DataLoader per split: mini-batches of 4 images, shuffled,
## loaded in the main process (num_workers=0)
dataloaders = {
    split: torch.utils.data.DataLoader(
        image_datasets[split],
        batch_size=4,
        shuffle=True,
        num_workers=0,
    )
    for split in ['train', 'val']
}

## function for training and validation
Unlike earlier, we use one common function for both training and validation

In [17]:
def train_model(model, criterion, optimizer, scheduler, num_epochs):
    """Fine-tune ``model`` and return it loaded with its best validation weights.

    Every epoch runs a training pass followed by a validation pass over the
    module-level ``dataloaders`` (keys ``'train'`` and ``'val'``). Batches are
    moved to the module-level ``device`` and per-epoch averages are computed
    with the module-level ``dataset_sizes``.

    Args:
        model: network to train; it is updated in place.
        criterion: loss called as ``criterion(output, labels)``; assumed to
            return the mean loss over the batch.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per epoch, right
            after the training pass.
        num_epochs: number of epochs to run.

    Returns:
        ``model`` loaded with the weights of the epoch that reached the
        highest validation accuracy. If no epoch scores above 0.0 (or
        ``num_epochs`` is 0) the weights the model started with are restored.
    """
    since = time.time()

    ## state_dict() hands back references to the live parameter tensors, so a
    ## deep copy is needed to keep a snapshot that later optimizer steps
    ## cannot overwrite.
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch + 1}/{num_epochs}')
        print('-' * 10)

        ## each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            is_train = phase == 'train'

            ## train()/eval() switch layers such as batch norm between their
            ## training and inference behaviour
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            ## iterate over the batches of this phase
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                ## track gradients only while training; during validation this
                ## behaves like torch.no_grad()
                with torch.set_grad_enabled(is_train):
                    output = model(inputs)
                    loss = criterion(output, labels)
                    _, preds = torch.max(output, 1)

                    ## backward pass + parameter update only in training
                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                ## the loss is a batch average; multiplying by the batch size
                ## gives the batch total so the epoch average is exact even
                ## when the last batch is smaller
                running_loss += loss.item() * inputs.shape[0]
                running_corrects += torch.sum(preds == labels).item()

            ## the scheduler works per epoch, not per batch, so it is stepped
            ## once after the training pass (outside the batch loop)
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            ## remember the weights of the best validation epoch so far
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    ## load best model weights
    model.load_state_dict(best_model_wts)
    return model
            
            
                
                    
            
    

## Load pretrained resnet model

In [19]:
## ResNet-18 with pretrained weights; the checkpoint is downloaded into the local torch hub cache on first use
## NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of a `weights=` argument - confirm against the installed version
model = models.resnet18(pretrained=True)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\kkiit/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|█████████████████████████████████████████████████████████████████████████████| 44.7M/44.7M [00:14<00:00, 3.29MB/s]


## Two methods : Method 1 : Finetune all weights of pretrained model for current classification
## Method 2 : Finetune just last layer

## Method 1 : All weight finetuning

We now want to replace the last linear layer of model which outputs logits with a new
linear layer with 2 class output

In [23]:
num_ftrs = model.fc.in_features ## number of inputs of the last linear layer; the replacement layer must accept the same number

In [24]:
print(model.fc.out_features) ## note that currently, last layer has 1000 outputs/1000 classes in resnet

1000


In [25]:
## replace last linear layer with custom linear layer with 2 outputs 

In [26]:
## swap the 1000-output head for a new linear layer with 2 outputs (ants / bees)
model.fc = nn.Linear(num_ftrs, 2)

In [27]:
model.fc.out_features

2

In [28]:
## Loss for the 2-class problem, network moved to the selected device, and
## plain SGD over *all* parameters (Method 1 fine-tunes the whole network)
criterion = nn.CrossEntropyLoss()
model = model.to(device)
optimizer = optim.SGD(params=model.parameters(), lr=0.001)


In [None]:
## define a step learning-rate scheduler (StepLR)

In [30]:
## StepLR on the Method 1 optimizer: every step_size=7 scheduler steps the learning rate is multiplied by gamma=0.1
step_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

In [None]:
## means that every 7 epochs, learning rate new = gamma * learning rate old

## Actual running !!

In [None]:
## Method 1: fine-tune every weight for 25 epochs; train_model returns the model loaded with its best validation weights
model = train_model(model, criterion, optimizer, step_lr_scheduler, num_epochs=25)

## Method 2 : Only last layer finetuning

In [31]:
## Fresh pretrained ResNet-18 for Method 2, independent of the network fine-tuned in Method 1
## NOTE(review): newer torchvision releases deprecate `pretrained=True` in favour of a `weights=` argument - confirm against the installed version
model_conv = torchvision.models.resnet18(pretrained=True)



In [None]:
## first freeze every pretrained layer: with requires_grad=False no gradients
## are computed for these parameters, so backprop leaves them untouched
for param in model_conv.parameters():
    param.requires_grad = False

## now replace the last layer; a freshly constructed nn.Linear has
## requires_grad=True, so it is the only trainable part of the network
num_ftrs = model_conv.fc.in_features ## number of inputs of the last linear layer
model_conv.fc = nn.Linear(num_ftrs, 2)

model_conv = model_conv.to(device)

## only the parameters of the new last layer are handed to the optimizer;
## hyperparameters differ slightly from the full-network run (momentum=0.9)
optimizer_conv = optim.SGD(model_conv.fc.parameters(), lr=0.001, momentum=0.9)




In [None]:
## Method 2 : Only last layer finetuning

In [None]:
## The scheduler must wrap optimizer_conv (a scheduler built for another
## optimizer would not change this optimizer's learning rate).
## Decay policy: multiply the learning rate by 0.1 every 7 epochs.
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer_conv, step_size=7, gamma=0.1)

## Method 2: train only the new last layer for 25 epochs
model_conv = train_model(model_conv, criterion, optimizer_conv,
                         exp_lr_scheduler, num_epochs=25)

## References

https://www.youtube.com/watch?v=K0lWSB2QoIQ&list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4&index=16