# Prequisites

In [1]:
import torchvision
from torchvision import transforms
import torch
import os
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import shutil
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from PIL import Image

# Data

In [2]:
train_data_path = "train"

transformations = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor(),
    transforms.Normalize(mean = [0.485, 0.456, 0.406],
                          std = [0.229, 0.224, 0.225])
])

#Normalizing values between 0 and 1 makes the model a bit easier to learn the features since they are standardized or capped within a smaller range
#Also prevents the values from getting too large during training phase - exploding gradient problem

train_data = torchvision.datasets.ImageFolder(train_data_path, transform=transformations)

val_data_path = "val"
val_data = torchvision.datasets.ImageFolder(val_data_path, transform=transformations)

test_data_path = "test"
test_data = torchvision.datasets.ImageFolder(test_data_path, transform=transformations)

#Training set - for training pass to update the model 
#Val set - Evaluate how model is generalizing to problem domain rather than fitting to training data not used directly
#Test set - To get final evaluation of model  

# Dataloader

batch_size = 64 #Num of images sent to the network once before updating it

train_data_loader = DataLoader(train_data, batch_size=batch_size)
val_data_loader = DataLoader(val_data, batch_size=batch_size)
test_data_loader = DataLoader(test_data, batch_size=batch_size)

num_classes = 151

# Model

* Conv layers - Extract features from images. From general/high level to lower level/intricate specialized features 
* ReLU - Adds non linearlity to let network learn complex features 
* Pooling - Shrinks spatial size (downsampling), reducing computation and makes features more invariant
* Dropout - Randomly dropping weights to increase generalization of the model 
* FC - Combines/Flattens feature maps to help make predictions 

In [17]:
class CNN(nn.Module): #CNN is the child class inheriting all functionalities from torch.nn.Module
    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, 3, 1, 1)
        self.conv2 = nn.Conv2d(16, 32, 3, 1, 1)
        self.conv3 = nn.Conv2d(32, 64, 3, 1, 1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 8 * 8, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x))) # Conv -> Relu -> Pool 
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [18]:
model = CNN(num_classes)

criterion = nn.CrossEntropyLoss() 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

device = ''
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = model.to(device)

In [15]:
def train(model, optimizer, loss_func, train_data_loader, val_data_loader, epochs, device):
    for epoch in tqdm(range(epochs)):
        training_loss = 0.0
        training_iterator = 0
        valid_loss = 0.0 
        model.train() 
        for batch in train_data_loader:
            optimizer.zero_grad()  #Refresh the optimizer for the next batch everytime 
            inputs, targets = batch 
            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            loss = loss_func(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item()
            training_iterator += 1
        training_loss /= training_iterator
        model.eval()
        valid_iterator = 0
        num_correct = 0 
        num_examples = 0
        for batch in val_data_loader:
            inputs, targets = batch 
            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            loss = loss_func(output, targets)
            valid_loss += loss
            valid_iterator += 1 
            preds = torch.max(output, dim=1)[1] # --> Could be a more idomatic approach
            correct = torch.eq(preds, targets)
            num_correct += torch.sum(correct).item() #Summing how many predictions were true
            num_examples += correct.shape[0] #EQuating number of samples in each batch
        valid_loss /= valid_iterator
        accuracy = num_correct / num_examples
        print(f"Epoch [{epoch}] : Training Loss = {training_loss:.2f}  Validation Loss = {valid_loss:.2f}")
        print(f"Validation Accuracy -> {accuracy:2f}")

In [20]:
train(model, optimizer, criterion, train_data_loader, val_data_loader, 3, device)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch [0] : Training Loss = 5.10  Validation Loss = 5.02
Epoch [1] : Training Loss = 5.02  Validation Loss = 5.02
Epoch [2] : Training Loss = 5.02  Validation Loss = 5.02


One thing I noticed was the book talking about using the torchvision ImageFolder functionality to load the dataset, could be a bit complicated since I have folders now instead of a single one, but maybe next time would like to check with the "Datasets" class

Also *Differential Learning Rate* was an important concept I got to read about. 

Need to check if its still happening or used.

So in FT we don't need to impact the weights of the layers that were previoulsy trained, except maybe BatchNorm since we want the normalization to our dataset. So what about a general learning rate for the layers we want to finetune properly (the last and/or the classification ones) and a very very low learning rate for the layers we do not want to affect directly.

# Finetuning

https://cs231n.github.io/transfer-learning

In [12]:
#Loading the pretrained model
model = torchvision.models.vgg16(pretrained=True)



In [13]:
# Using the pretrained model as a feature extractor part only

for layer in model.parameters():
    layer.requires_grad = False #Freezing every layer 

if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model.classifier

Sequential(
  (0): Linear(in_features=25088, out_features=4096, bias=True)
  (1): ReLU(inplace=True)
  (2): Dropout(p=0.5, inplace=False)
  (3): Linear(in_features=4096, out_features=4096, bias=True)
  (4): ReLU(inplace=True)
  (5): Dropout(p=0.5, inplace=False)
  (6): Linear(in_features=4096, out_features=1000, bias=True)
)

In [14]:
in_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_features, num_classes)
model.classifier[6].requires_grad_ = True
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.classifier[6].parameters(), lr=0.001) #Only passing the parameters of the classification layers / Can also try with the last modified layer only

exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1) #Decay lr by a factor of 0.1 every 7 epochs

In [15]:
def train(model, optimizer, loss_func, train_data_loader, val_data_loader, epochs, device):
    for epoch in tqdm(range(epochs)):
        training_loss = 0.0
        training_iterator = 0
        valid_loss = 0.0 
        model.train() 
        for batch in train_data_loader:
            optimizer.zero_grad()  #Refresh the optimizer for the next batch everytime 
            inputs, targets = batch 
            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            loss = loss_func(output, targets)
            loss.backward()
            optimizer.step()
            training_loss += loss.data.item()
            training_iterator += 1
        training_loss /= training_iterator
        model.eval()
        valid_iterator = 0
        num_correct = 0 
        num_examples = 0
        for batch in val_data_loader:
            inputs, targets = batch 
            inputs = inputs.to(device)
            targets = targets.to(device)
            output = model(inputs)
            loss = loss_func(output, targets)
            valid_loss += loss
            valid_iterator += 1 
            preds = torch.max(output, dim=1)[1] # --> Could be a more idomatic approach
            correct = torch.eq(preds, targets)
            num_correct += torch.sum(correct).item() #Summing how many predictions were true
            num_examples += correct.shape[0] #EQuating number of samples in each batch
        valid_loss /= valid_iterator
        accuracy = num_correct / num_examples
        print(f"Epoch [{epoch}] : Training Loss = {training_loss:.2f}  Validation Loss = {valid_loss:.2f}")
        print(f"Validation Accuracy -> {accuracy:2f}")

In [16]:
train(model, optimizer, criterion, train_data_loader, val_data_loader, 4, device)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch [0] : Training Loss = 5.72  Validation Loss = 4.87
Validation Accuracy -> 0.027226
Epoch [1] : Training Loss = 5.36  Validation Loss = 4.61
Validation Accuracy -> 0.060338
Epoch [2] : Training Loss = 5.07  Validation Loss = 4.39
Validation Accuracy -> 0.100074
Epoch [3] : Training Loss = 4.81  Validation Loss = 4.22
Validation Accuracy -> 0.132450


In [55]:
#Create only the last fc layer
model = torchvision.models.vgg16(pretrained=True)

in_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_features, num_classes)
model.classifier[6].requires_grad_ = True
model = model.to(device)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.classifier[6].parameters(), lr=0.001) #Only passing the parameters of the classification layers / Can also try with the last modified layer only

exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1) #Decay lr by a factor of 0.1 every 7 epochs



In [None]:
train(model, optimizer, criterion, train_data_loader, val_data_loader, 4, device)

  0%|          | 0/4 [00:00<?, ?it/s]

In [26]:
#Finetune only clasifer layers
model = torchvision.models.vgg16(pretrained=True)

In [None]:
for layer in model.parameters():
    layer.requires_grad = False #Freezing all layers first

for name, param in model.named_parameters():
    if "classifier" in name:
        param.requires_grad = True #Keeping only the classification layers trainable

in_features = model.classifier[6].in_features
model.classifier[6] = nn.Linear(in_features, num_classes)
model.classifier[6].requires_grad_ = True 
model = model.to(device)

optimizer = torch.optim.SGD(model.classifier.parameters(), lr=0.001) #Not to forget to set the parameters for the layers we are finetuning, cause when i ignored that my accuracy went down heavily

In [29]:
train(model, optimizer, criterion, train_data_loader, val_data_loader, 4, device)

  0%|          | 0/4 [00:00<?, ?it/s]

Epoch [0] : Training Loss = 5.60  Validation Loss = 4.80
Validation Accuracy -> 0.045622
Epoch [1] : Training Loss = 5.12  Validation Loss = 4.60
Validation Accuracy -> 0.089772
Epoch [2] : Training Loss = 4.84  Validation Loss = 4.43
Validation Accuracy -> 0.126564
Epoch [3] : Training Loss = 4.61  Validation Loss = 4.28
Validation Accuracy -> 0.141280
