In [1]:
# Install notebook widget support. `%pip` (rather than `!pip`) installs into the
# environment of the running kernel, not whichever `pip` is first on the shell PATH.
%pip install -q ipywidgets

In [2]:
# Show the attached GPU, driver / CUDA versions, and current memory and utilization.
!nvidia-smi

Sun Apr 11 22:16:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.36.06    Driver Version: 450.36.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro M4000        On   | 00000000:00:05.0 Off |                  N/A |
| 46%   31C    P8    11W / 120W |      1MiB /  8126MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
# Query only the POWER section of the GPU report (power readings and power samples).
!nvidia-smi -q -d POWER



Timestamp                                 : Sun Apr 11 22:16:51 2021
Driver Version                            : 450.36.06
CUDA Version                              : 11.0

Attached GPUs                             : 1
GPU 00000000:00:05.0
    Power Readings
        Power Management                  : Supported
        Power Draw                        : 11.74 W
        Power Limit                       : 120.00 W
        Default Power Limit               : 120.00 W
        Enforced Power Limit              : 120.00 W
        Min Power Limit                   : 10.00 W
        Max Power Limit                   : 120.00 W
    Power Samples
        Duration                          : 81.01 sec
        Number of Samples                 : 119
        Max                               : 44.89 W
        Min                               : 11.55 W
        Avg                               : 13.22 W



In [4]:
import time
import uuid
import torch.nn.functional as F
import torchvision.transforms.functional as FT
from functools import partial
from torch import nn
from dataset import CocoDataset
from utils   import *
from model   import *
from metric  import *
from loss    import *
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
# Always build a torch.device so `device` has the same type on either path
# (previously the CPU fallback was the plain string "cpu").
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

# Setup

### Data transformations, dataset, & dataloader:

In [5]:
# --- data pipeline configuration -------------------------------------------
img_sz = 300                 # every sample is resized to (img_sz, img_sz)
BS = 16                      # batch size used by both dataloaders
DATA_DIR = '/datasets/coco'  # root folder of the COCO data; adjust per machine

# training transforms with data augmentations
train_tfms = transforms.Compose(
    [PhotometricDistort(1.),
     Flip(0.5),
     ImageToTensor(), CategoryToTensor(), BoxToTensor(),
     Zoomout(0.5, max_scale=2.5),
     Normalize(), 
     Resize((img_sz, img_sz))]
)

# validation transforms without data augmentations
tfms = transforms.Compose(
    [ImageToTensor(), CategoryToTensor(), BoxToTensor(),
     Normalize(), 
     Resize((img_sz, img_sz))]
)

# instantiate the dataset objects (train and validation splits of COCO 2014)
ds_train = CocoDataset(data_dir=DATA_DIR, dataset='train2014', anno_type='instances', transforms=train_tfms)
ds_valid = CocoDataset(data_dir=DATA_DIR, dataset='val2014', anno_type='instances', transforms=tfms)

# create dataloaders
# img_resized=True tells collate_fn that all image samples have been resized to the same shape
dl_train = DataLoader(ds_train, batch_size=BS, shuffle=True,
                      collate_fn=partial(ds_train.collate_fn, img_resized=True))

# The validation loader is NOT shuffled: the metric is computed per batch and then
# averaged, so a fixed batch composition keeps evaluation results comparable between runs.
dl_valid = DataLoader(ds_valid, batch_size=BS, shuffle=False,
                      collate_fn=partial(ds_valid.collate_fn, img_resized=True))

loading annotations into memory...
Done (t=13.08s)
creating index...
index created!
loading annotations into memory...
Done (t=5.87s)
creating index...
index created!


### SSD Model

In [6]:
# create the SSD model
# The first argument is the number of entries in the training set's id2cat mapping
# (presumably the class count — confirm against SSD300 in model.py); the second is
# the compute device selected in the imports cell.
ssd = SSD300(len(ds_train.id2cat), device)

### Cost function

In [7]:
# multi-loss criteria
# NOTE(review): the leading 300 presumably is the input image size and would then
# duplicate img_sz from the data cell — confirm against MultiBoxLoss in loss.py
# before replacing the literal.
criterion = MultiBoxLoss(300, ssd.prior_boxes, threshold=0.5, neg_pos_ratio=3, alpha=1., device=device)

### Metric

In [8]:
# metric object built from the model's n_classes and the compute device;
# Exp calls it with detections and ground truth and unpacks two return values
metric = mAP(ssd.n_classes, device)

### Tracker

In [9]:
class Tracker(object):
    """
    Keeps track of most recent, average, min, max, sum, and count of a metric.

    Call the instance to record an observation:
        tracker(val)      # a single observation
        tracker(val, n)   # an observation that stands for n samples (e.g. a batch mean)
    """

    def __init__(self, name=None):
        """
        :param name: label used by __repr__
        """
        self.reset()
        self.name = name
        
    def reset(self):
        """Clears all statistics; every field reads 0 until the first update."""
        self.val = 0
        self.avg = 0
        self.min = 0
        self.max = 0
        self.sum = 0
        self.cnt = 0
        
    def __call__(self, val, n=1):
        """
        Records a new observation and refreshes the running statistics.
        :param val: observed value
        :param n: number of samples the value represents (weight in the average)
        """
        # The 0 placeholders written by reset() are not real observations, so the
        # first recorded value must seed min/max; otherwise min could never rise
        # above 0 and max could never drop below 0.
        first_update = (self.cnt == 0)
        self.val = val
        self.sum += val * n
        self.cnt += n
        # guard against n=0 leaving the count at zero
        if self.cnt != 0:
            self.avg = self.sum / self.cnt
        if first_update or val < self.min: self.min = val
        if first_update or val > self.max: self.max = val
            
    def __repr__(self):
        return f'{self.name}_tracker'

### Gradient Clipping

In [10]:
def clip_gradient(optimizer, grad_clip):
    """
    Clamps every available gradient element-wise into [-grad_clip, grad_clip], in place,
    to keep gradients from exploding during backpropagation.
    :param optimizer: optimizer whose param_groups hold the parameters to clip
    :param grad_clip: clip value (symmetric bound)
    """
    lower, upper = -grad_clip, grad_clip
    for param_group in optimizer.param_groups:
        # parameters without a gradient are skipped
        with_grad = (p for p in param_group['params'] if p.grad is not None)
        for p in with_grad:
            p.grad.data.clamp_(lower, upper)

# Training

In [11]:
class Exp():
    """
    A training experiment: bundles dataloaders, model, criterion, and metric, and
    provides the training loop, evaluation, and checkpoint save/load.

    Each instance gets a unique ``exp_id`` (uuid based); checkpoints are written to
    ``./<exp_id>/checkpoints``.

    Batches yielded by the dataloaders are read as dicts with the keys
    'images', 'boxes', and 'cats'.
    """

    # Positional arguments passed to model.detect_objects() after the predictions.
    # NOTE(review): presumably (min_score, max_overlap, top_k) — confirm in model.py.
    _DETECT_ARGS = (0.5, 0.5, 10)
    
    def __init__(self, train_dl, eval_dl, model, criterion, metric, 
                 name=None, desc=None, display_every_n_batches=None, device=None):
        """
        :param train_dl: dataloader iterated by train_one_epoch()
        :param eval_dl: dataloader passed to eval() from train()
        :param model: model to train; moved to `device`
        :param criterion: loss module called as criterion(pred_boxes, pred_scores, boxes, labels)
        :param metric: callable returning a pair whose first item is the mean AP
        :param name: experiment name, used in checkpoint file names (default "model")
        :param desc: free-text description shown by __repr__
        :param display_every_n_batches: print progress every n batches; None or 0 disables it
        :param device: compute device (default "cpu")
        """
        self.device = "cpu" if device is None else device
        # meta fields
        self.epoch = 0
        self.exp_id = f"exp_{uuid.uuid4()}"
        self.name = name if name is not None else "model"
        self.desc = desc if desc is not None else ""
        # dataloader
        self.train_dl = train_dl
        self.eval_dl  = eval_dl
        # model, criterion; put to appropriate device
        self.m = model.to(self.device)
        self.criterion = criterion.to(self.device)
        self.display_every_n_batches = display_every_n_batches
        self.metric = metric
        # training state normally supplied by train(); defined up front so that
        # train_one_epoch() / save_checkpoint() never hit a missing attribute
        self.optimizer = None
        self.lr = None
        self.gradient_clip = None

    
    def __repr__(self):
        return f'{self.name} expID: {self.exp_id} @ epoch: {self.epoch}\n{self.desc}'


    def _ensure_checkpoint_dir(self):
        """Creates the directory that holds this experiment's checkpoints if it is missing."""
        import os  # local import keeps this class self-contained within its cell
        os.makedirs(f'./{self.exp_id}/checkpoints', exist_ok=True)
        
        
    def setup(self):
        """
        Prepares the experiment: creates the artifact directory, (re)creates all
        trackers, and splits the trainable parameters of the model into
        self.weights and self.biases (by the '.bias' name suffix).
        """
        # create directory to hold experiment artifacts
        self._ensure_checkpoint_dir()

        # trackers to keep track of time
        self.epoch_time = Tracker('epoch_time')
        self.batch_time = Tracker('batch_time')

        # tracker for loss & metric
        self.loss_tracker = Tracker('loss')
        self.metric_tracker = Tracker('mAP')
                
        # gather model params to optimize
        self.weights, self.biases = list(), list()
        for param_name, param in self.m.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    self.biases.append(param)
                else:
                    self.weights.append(param)
        
                 
    def save_checkpoint(self):
        """
        Writes exp_id, epoch, the model object, and the optimizer object to
        <exp_id>/checkpoints/<name>_<epoch>epoch.pth.tar.
        """
        # make sure the target directory exists even if setup() was never called
        self._ensure_checkpoint_dir()
        state = {'exp_id': self.exp_id,
                 'epoch': self.epoch,
                 'model': self.m,
                 'optimizer': self.optimizer}
        filename = f'{self.exp_id}/checkpoints/{self.name}_{self.epoch}epoch.pth.tar'
        torch.save(state, filename)

        
    def load_checkpoint(self, path):
        """
        Restores model, optimizer, and epoch counter from a file written by
        save_checkpoint(), then re-runs setup() so trackers and the weight/bias
        lists refer to the loaded model. The current exp_id is kept, so new
        checkpoints go to this experiment's own folder.

        Note that train() replaces self.optimizer with the optimizer it is given.

        :param path: path of the checkpoint file
        """
        # SECURITY: the checkpoint contains pickled objects; only load trusted files.
        # NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
        # which rejects pickled module objects — confirm against the installed version.
        # map_location places the stored tensors on this experiment's device, so a
        # checkpoint saved on GPU can be resumed on a machine without one.
        chkpt = torch.load(path, map_location=self.device)
        self.epoch = chkpt['epoch'] + 1
        self.m = chkpt['model'].to(self.device)
        self.optimizer = chkpt['optimizer']
        print(f"checkpoint loaded; resume training from epoch {self.epoch}")
        self.setup()

                 
    def train_one_epoch(self, epoch):
        """
        Runs one pass over the training dataloader: forward, loss, backward,
        optional gradient clipping, optimizer step, and periodic progress output.

        :param epoch: epoch number used in the progress messages
        """
        epoch_start_time = time.time()
        print(f"Epoch {epoch} started at {epoch_start_time}")
        self.bs = self.train_dl.batch_size
        
        # iterate over batches
        self.m.train()
        for i, batch in enumerate(self.train_dl):
            batch_start_time = time.time()
            
            # get ground truth information
            images = batch['images'].to(self.device)
            boxes  = [b.to(self.device) for b in batch['boxes']]
            labels = [c.to(self.device) for c in batch['cats']]
            
            # forward pass to model
            pred_boxes, pred_scores = self.m(images)
            
            # compute loss & backprop
            loss = self.criterion(pred_boxes, pred_scores, boxes, labels)
            self.optimizer.zero_grad()
            loss.backward()
            # clip gradient
            if self.gradient_clip is not None:
                clip_gradient(self.optimizer, self.gradient_clip)
            self.optimizer.step()
            
            # update trackers (the loss is recorded divided by 1e3, weighted by batch size)
            self.loss_tracker(loss.item()/1e3, self.bs)

            # time batch time
            self.batch_time(time.time() - batch_start_time)
            
            # Print status (a falsy interval — None or 0 — disables reporting)
            if self.display_every_n_batches and i % self.display_every_n_batches == 0:
                # compute mAP on the current batch; no autograd graph is needed for reporting
                with torch.no_grad():
                    detected_boxes, detected_labels, detected_scores = self.m.detect_objects(
                        pred_boxes, pred_scores, *self._DETECT_ARGS)
                    mean_AP, aps = self.metric(detected_boxes, detected_labels, detected_scores, boxes, labels)
                self.metric_tracker(mean_AP)
                # display progress
                print(f"Epoch: [{epoch}][{i}/{len(self.train_dl)}]\t"
                      f"Avg.time: {self.batch_time.avg:.1f}\t"
                      f"Loss: {self.loss_tracker.val:.3f} (Avg: {self.loss_tracker.avg:.3f})\t"
                      f"mAP: {self.metric_tracker.val:.3f} (Avg: {self.metric_tracker.avg:.3f})")
        
        # No explicit `del` of the batch tensors: they are locals released when this
        # method returns, and deleting them raised an error when the loader was empty.
        # display duration
        print(f"Epoch {epoch} completed; duration {(time.time() - epoch_start_time)/60.} minutes")

    
    def eval(self, dl, name=None):
        """
        Evaluates the model on a dataloader and prints min / max / average of the
        per-batch mean AP. Statistics cover this call only; the training
        metric tracker is left untouched.

        :param dl: dataloader to evaluate on
        :param name: label for the evaluation (defaults to 'eval'; currently not used in the output)
        """
        if not name: name = 'eval'
        eval_start_time = time.time()
        batch_scores = []
        
        # set model to eval mode
        self.m.eval()
        with torch.no_grad():
            # iterate over dataloader
            for batch in dl: 
                # get ground truth information
                images = batch['images'].to(self.device)
                boxes  = [b.to(self.device) for b in batch['boxes']]
                labels = [c.to(self.device) for c in batch['cats']]
                # get prediction from this experiment's own model (not a notebook global)
                pred_boxes, pred_scores = self.m(images)
                detected_boxes, detected_labels, detected_scores = self.m.detect_objects(
                    pred_boxes, pred_scores, *self._DETECT_ARGS)
                # compute metric
                mean_AP, aps = self.metric(detected_boxes, detected_labels, detected_scores, boxes, labels)
                batch_scores.append(float(mean_AP))
        
        if batch_scores:
            min_ap, max_ap = min(batch_scores), max(batch_scores)
            avg_ap = sum(batch_scores) / len(batch_scores)
        else:
            # empty dataloader: nothing to summarize
            min_ap = max_ap = avg_ap = float('nan')

        # display metric
        print(f"Min mAP: {min_ap}\t"
              f"Max mAP: {max_ap}\t"
              f"Avg mAP: {avg_ap}")
        print(f"Evaluation took {time.time() - eval_start_time} seconds")

                 
    def train(self, n_epochs, optimizer, lr, gradient_clip=None, eval_every_n_epochs=10, save_every_n_epoch=1):
        """
        Trains for n_epochs, evaluating and checkpointing periodically.

        :param n_epochs: number of epochs to run in this call
        :param optimizer: optimizer to use (replaces any previously stored optimizer)
        :param lr: learning rate; stored on the experiment for reference
        :param gradient_clip: clip value for clip_gradient(); None disables clipping
        :param eval_every_n_epochs: evaluate on eval_dl before every n-th epoch of this call (never before the first)
        :param save_every_n_epoch: save a checkpoint after every n-th epoch of this call
        """
        # setup experiment
        self.optimizer = optimizer
        self.lr = lr
        self.gradient_clip = gradient_clip
        self.save_every_n_epoch = save_every_n_epoch
        # make sure trackers exist even if setup() was not called explicitly
        if not hasattr(self, 'loss_tracker'):
            self.setup()
        
        # iterate over epochs
        for n in range(n_epochs):
            
            # evaluate on validation dataset every n epochs
            if (n > 0) and (n % eval_every_n_epochs == 0):
                self.eval(self.eval_dl)
                
            # run one epoch training; label it with the experiment-wide epoch counter so
            # log lines match checkpoint names when resuming or calling train() repeatedly
            self.train_one_epoch(self.epoch)
            
            # save every n epoch
            if n % self.save_every_n_epoch == 0:
                self.save_checkpoint()
            
            self.epoch += 1

# Setup Experiment & Optimizer

In [12]:
# collect the constructor arguments for the experiment in one place
ingredients = dict(
    train_dl=dl_train,
    eval_dl=dl_valid,
    model=ssd,
    criterion=criterion,
    metric=metric,
    name='ssd',
    desc='training ssd on coco2014 dataset',
    display_every_n_batches=50,
    device=device,
)
# init experiment (exp.setup() is run in the next cell)
exp = Exp(**ingredients)

In [13]:
# train from scratch
# setup() creates the checkpoint directory and the trackers, and fills
# exp.weights / exp.biases, which the optimizer cell below reads
exp.setup()

In [14]:
# display the experiment summary (name, exp_id, current epoch, description)
exp

ssd expID: exp_ab068a56-6bb2-4501-8f1c-d8dc41c0c211 @ epoch: 0
training ssd on coco2014 dataset

In [None]:
# load from checkpoint
# NOTE(review): the path points into the folder of an earlier run (every Exp gets a
# fresh uuid-based exp_id), so it only works if that folder exists on disk.
# Skip this cell when training from scratch.
chkpt_path = 'exp_95788307-930e-4c0d-9c93-f95f49872adb/checkpoints/ssd_0epoch.pth.tar'
exp.load_checkpoint(chkpt_path)

In [None]:
# SGD hyperparameters
lr = 1e-3
momentum = 0.9
weight_decay = 5e-4

# build the optimizer with two parameter groups: biases are updated at 2x the
# base learning rate, weights fall back to the optimizer-wide `lr`
optimizer = torch.optim.SGD(
    params=[
        {'params': exp.biases, 'lr': 2 * lr},
        {'params': exp.weights},
    ],
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)

In [None]:
# train for 1 epoch
# gradient_clip=0.1 clamps every gradient element to [-0.1, 0.1] before each optimizer step
exp.train(1, optimizer, lr, gradient_clip=0.1)

In [None]:
# train for x epochs (currently x = 1; change the first argument to train longer)
# exp.epoch keeps counting across calls, so checkpoints continue from the previous cell
exp.train(1, optimizer, lr, gradient_clip=0.1)

In [None]:
# manually set the epoch counter, then write a checkpoint named ssd_2epoch.pth.tar
# NOTE(review): train() already advances exp.epoch; when starting from scratch it is
# already 2 after the two train cells above, so this override only matters if cells
# were run in a different order — consider removing it.
exp.epoch = 2
exp.save_checkpoint()