In [1]:
%%capture
from IPython.display import Image as ImageIpython
from IPython.core.display import HTML 


## Table of contents
* [Initial Setup](#section-one)
* [LR Schedulers & Question](#section-two)
* [Mixup & Question](#section-three)
* [Architectures & Question](#section-four)
* [Mixed Precision & Question](#section-five)
* [Label Smoothing & Question](#section-six)

In [2]:
print("Imagenet improvement on Resnet-50 using tricks")
ImageIpython(url= "https://miro.medium.com/max/890/1*4y2Rdl0i-kvNh2SCMZZBuw.png")

Imagenet improvement on Resnet-50 using tricks


<a id="section-one"></a>
## Initial Set up - just run through these steps

In [3]:
import time
from tqdm import tqdm_notebook as tqdm
#import tqdm.notebook import tqdm
!pip install opencv-python
import cv2
from PIL import Image
import numpy as np
import pandas as pd
import torch
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, Dataset
from torch.optim import lr_scheduler
import timm
import warnings
warnings.filterwarnings('ignore')



In [None]:
# !pip install kaggle
# !kaggle competitions download -c midsw251birds



In [4]:
#!ls -l ../input/midsw251birds

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
class args:
    lr = 0.0001
    epochs = 5
    batch_size = 32
    num_workers = 8
    folds = 5

In [9]:
alldf = pd.read_csv('train.csv')
# Split the training dataset into a training and a validation
valdf = alldf[::args.folds]
trndf = alldf[~alldf.filename.isin(valdf.filename)]
# Load our test data
tstdf = pd.read_csv('test.csv')
metadf = pd.read_csv('metadata.csv')
metadf = metadf.set_index('label')
print(f'File shapes -- train : {trndf.shape}, valid : {valdf.shape}, test : {tstdf.shape}')
trndf.head()

File shapes -- train : (26379, 2), valid : (6595, 2), test : (8244, 1)


Unnamed: 0,filename,label
1,train/bb99f4bea973.jpg,147
2,train/50923ceb3ffd.jpg,147
3,train/f9fc3c6da5d7.jpg,147
4,train/dfe8cb1855fe.jpg,147
6,train/4fe53a096533.jpg,147


In [10]:
imgnetmeans = [0.22363983, 0.18190407, 0.2523437 ]
imgnetstds = [0.32451536, 0.2956294,  0.31335256]
# Using albumentations, check some examples here : https://albumentations.readthedocs.io/en/latest/examples.html 
def trntransforms():
    return A.Compose([
        A.HorizontalFlip(p=0.5),
        A.Transpose(p=0.5),
        ToTensorV2(),
        ])

def tsttransforms():
    return A.Compose([
        ToTensorV2(),
    ])

class BirdDataset(Dataset):
    def __init__(self, df, mode, transform=None):
        self.data = df
        self.img_dir = '../input/midsw251birds/'
        self.transform = transform
        self.mode = mode

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        
        fname = self.data.iloc[idx]['filename']
        image = cv2.imread(f'{self.img_dir}/{fname}')
        if self.transform is not None:
            image = self.transform(image = image)['image']
        image = image.float() / 255.
        label = -1 if self.mode=='test' else self.data.iloc[idx]['label']
        
        return image, label

In [11]:
# Define our dataset
trndataset = BirdDataset(trndf, 'train', trntransforms())
valdataset = BirdDataset(valdf, 'valid', tsttransforms())
tstdataset = BirdDataset(tstdf, 'test', tsttransforms())
loaderargs = {'num_workers' : args.num_workers, 'batch_size':args.batch_size, 'pin_memory': False, 'drop_last': False}
trnloader = DataLoader(trndataset, shuffle = True, **loaderargs)
valloader = DataLoader(valdataset, shuffle = False, **loaderargs)
tstloader = DataLoader(tstdataset, shuffle = False, **loaderargs)

In [12]:
# creates efficientnet-b0 architecture
device = torch.device("cuda:0")
model = timm.create_model('efficientnet_b2', pretrained = True)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
criterion = torch.nn.CrossEntropyLoss()

Downloading: "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth" to C:\Users\John/.cache\torch\hub\checkpoints\efficientnet_b2_ra-bcdf34b7.pth


AssertionError: Torch not compiled with CUDA enabled

<a id="section-two"></a>
# LR Schedulers

In [13]:
# Create a scheduler which will warmup and cooldown over 20 epochs.
from timm.scheduler.cosine_lr import CosineLRScheduler

n_epochs = 20
n_warmup_epochs = 2
n_steps = len(trnloader)

scheduler = CosineLRScheduler(
            optimizer,
            t_initial= n_steps * n_epochs + 1,
            lr_min=0.00001,
            warmup_lr_init=0.00001,
            warmup_t= n_steps * n_warmup_epochs + 1)

NameError: name 'optimizer' is not defined

In [None]:
# Let's visualise how this changes the LR
lrls = []
global_step = 0
for epoch in range(n_epochs):
    for step in range(len(trnloader)):
        #train_step(...)
        scheduler.step(global_step)
        global_step+=1
        lrls.append(optimizer.param_groups[0]['lr']) 
    #validate_epoch(...)
# Plot
ax = pd.Series(lrls).plot(logy=True, figsize = (10, 4))
for i in range(0,n_epochs*n_steps+1,n_steps) : ax.axvline(i, linewidth=0.2, color='r', linestyle='--')
ax.set_xlabel("steps")
ax.set_ylabel("LR (log scale)")

##### Cosine Annealing is a type of learning rate schedule that has the effect of starting with a large learning rate that is relatively rapidly decreased to a minimum value before being increased rapidly again. The resetting of the learning rate acts like a simulated restart of the learning process and the re-use of good weights as the starting point of the restart is referred to as a "warm restart" in contrast to a "cold restart" where a new set of small random numbers may be used as a starting point.

In [None]:
lrls = []
global_step = 0
for epoch in range(n_epochs * 6):
    for step in range(len(trnloader)):
        #train_step(...)
        scheduler.step(global_step)
        global_step+=1
        lrls.append(optimizer.param_groups[0]['lr']) 
    #validate_epoch(...)
# Plot
ax = pd.Series(lrls).plot(logy=True, figsize = (10, 4))
for i in range(0,n_epochs * 6 *n_steps+1,n_steps) : ax.axvline(i, linewidth=0.2, color='r', linestyle='--')
ax.set_xlabel("steps")
ax.set_ylabel("LR (log scale)")

<a id="#subsection-two-one"></a>
### Question : Can you implement a cosine learning rate schedule which has 4 epochs warmup and reaches a minimum at 15 epochs, using the n_steps from `trnloader`. 

<a id="section-three"></a>
## Mixup

In [None]:
ImageIpython(url= "https://forums.fast.ai/uploads/default/original/3X/4/b/4b00023c65aa58fbe58b02271de08949e53c64b9.png")

In [None]:
# Lets try this very simply. 
img1 = cv2.imread(f'../input/midsw251birds/{trndf.iloc[0].filename}')
img2 = cv2.imread(f'../input/midsw251birds/{trndf.iloc[1].filename}')
mixup_alpha = 0.6
img_mixed = (img1 * mixup_alpha + img2 * (1-mixup_alpha)).astype(np.uint8)
Image.fromarray(np.concatenate([img1,img2,img_mixed], 1))

In [None]:
beta = 1.0

for step, batch in enumerate(trnloader):
    if step > 20: 
        break
    inputs = batch[0].to(device, dtype=torch.float)
    labels = batch[1].to(device).long()
    
    # Get out a random value form a distribution    
    lam = np.random.beta(beta, beta)
    rand_index = torch.randperm(inputs.size()[0]).to(device) # make an index which reorders the batch
    
    # Reorder the labels
    labels_a = labels
    labels_b = labels[rand_index]
    
    # Partially mixup up the batch
    inputs_mixed = lam * inputs + (1 - lam) * inputs[rand_index]
    
    optimizer.zero_grad()
    output = model(inputs_mixed)
    
    # Partial loss against original labels, partial loss against mixed up labels
    loss = criterion(output, labels_a) * lam + criterion(output, labels_b) * (1. - lam)
    loss.backward()
    optimizer.step()
    
    # Note, do not mixup your validation or test data !! Just train, make the model sweat....

    

### Questions 
Can you plot the different distributions of `lam`. (Something like ... `pd.Series([np.random.beta(beta, beta) for i in range(1000)]).hist(bins=100)`).   
Why would we us larger or smaller values of `lam` ?   
Can you understand how the loss is calculated above, and why ?   
Can you guess how you would implement cutmix in the example above ?

<a id="section-four"></a>
# Different Architectures

#### Above we chose to work with efficientnet-b0, can you initialise a pretrained `mixnet-xl` model ? Tip, check the results table [here](https://github.com/rwightman/pytorch-image-models/blob/master/results/results-imagenet.csv)

In [None]:
model = timm.create_model('efficientnet_b2', pretrained = True)

#### Our model still has 1000 classes outputted. How would you initialise it with 10 classes in one line of code. Tip, try checking https://fastai.github.io/timmdocs/

<a id="section-five"></a>
# Mixed Precision Training

In [None]:
ImageIpython(url= "https://developer-blogs.nvidia.com/wp-content/uploads/2019/10/Screen-Shot-2019-10-18-at-7.31.09-AM-624x328.png")

### Can you implement mixed precision in the below training loop. Tip : Check [this](https://pytorch.org/docs/stable/notes/amp_examples.html#typical-mixed-precision-training) example. You will need to comment out some of the below lines, its not too tough.
### See how large you can make your batchsize with mixed precision, and without mixed precision (you may need ot restart the kernels a few times.)

In [None]:
from torch.cuda.amp import autocast, GradScaler

model.to(device)
model.train()

for step, batch in enumerate(trnloader):
    if step > 20:
        break
    inputs = batch[0].to(device, dtype=torch.float)
    labels = batch[1].to(device).long()
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

<a id="section-six"></a>
# Label Smoothing
The generalization and learning speed of a multi-class neural network can often
be significantly improved by using soft targets that are a weighted average of the
hard targets and the uniform distribution over labels. Smoothing the labels in this
way prevents the network from becoming over-confident and label smoothing has
been used in many state-of-the-art models,

In [None]:
ImageIpython(url= "https://paperswithcode.com/media/methods/image3_1_oTiwmLN.png")

### Can you implement label smoothing in the below training loop. Tip : Check [this](https://github.com/pytorch/pytorch/issues/7455#issuecomment-513062631) or [this](https://github.com/pytorch/pytorch/issues/7455#issuecomment-759175034) example. 

In [None]:
from torch.cuda.amp import autocast, GradScaler

model.to(device)
model.train()
criterion = torch.nn.CrossEntropyLoss()

for step, batch in enumerate(trnloader):
    if step > 20:
        break
    inputs = batch[0].to(device, dtype=torch.float)
    labels = batch[1].to(device).long()
    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()