In [1]:
import os
import time
import uuid
from functools import partial

import torch
import torch.nn.functional as F
import torchvision.transforms.functional as FT
from torch import nn

from dataset import CocoDataset
from utils   import *
from model   import *
from metric  import *
from loss    import *
# pick the compute device once: the CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Setup

### Data transformations, dataset, & dataloader:

In [2]:
# Per-sample transformation pipeline, applied in the order listed.
# Every sample ends up resized to img_sz x img_sz.
img_sz = 300
basic_tfs = [
    PhotometricDistort(1.),
    Flip(0.5),
    ImageToTensor(),
    CategoryToTensor(),
    BoxToTensor(),
    Zoomout(0.5, max_scale=2.5),
    Normalize(),
    Resize((img_sz, img_sz)),
]
tfms = transforms.Compose(basic_tfs)

# Dataset over the 'val2017' split with 'instances' annotations, read from the current directory.
ds = CocoDataset(data_dir='./', dataset='val2017', anno_type='instances', transforms=tfms)

# Shuffled, batched loader. img_resized=True tells the dataset's collate function that
# all image samples have already been resized to the same shape.
BS = 8
dl = DataLoader(
    ds,
    batch_size=BS,
    shuffle=True,
    collate_fn=partial(ds.collate_fn, img_resized=True),
)

loading annotations into memory...
Done (t=0.49s)
creating index...
index created!


### SSD Model

In [4]:
# create the SSD model; its constructor receives the number of entries in the dataset's id2cat mapping
# (presumably the number of object categories -- confirm whether a background class is included)
ssd = SSD300(len(ds.id2cat))

### Cost function

In [5]:
# multi-loss criteria, built from the model's prior boxes
# NOTE(review): the literal 300 presumably is the input image size and duplicates `img_sz`
# from the data cell -- confirm against MultiBoxLoss and keep the two in sync.
criterion = MultiBoxLoss(300, ssd.prior_boxes, threshold=0.5, neg_pos_ratio=3, alpha=1.)

### Metric

In [6]:
# evaluation metric, configured with the model's class count
# NOTE(review): `metric` is not used by any cell visible in this notebook (evaluation inside Exp.train is a placeholder).
metric = mAP(n_classes=ssd.n_classes)

### Tracker

In [7]:
class Tracker(object):
    """
    Keeps track of most recent, average, sum, and count of a metric.

    Call the instance with every new observation; the running statistics are exposed as
    the attributes `val` (latest value), `sum` (weighted sum), `cnt` (total weight) and
    `avg` (weighted mean, i.e. `sum / cnt`).
    """

    def __init__(self, name=None):
        """
        :param name: optional label identifying the tracked quantity
        """
        self.name = name
        self.reset()

    def reset(self):
        """
        Sets all running statistics back to zero; the name is left untouched.
        """
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def __call__(self, val, n=1):
        """
        Folds a new observation into the running statistics.
        :param val: the most recent value of the metric
        :param n: weight of the observation (e.g. number of samples `val` was averaged over)
        """
        self.val = val
        self.sum += val * n
        self.cnt += n
        # A zero total weight (e.g. the first observation has n=0) would divide by zero;
        # report an average of 0 in that case, the same value reset() uses.
        self.avg = self.sum / self.cnt if self.cnt else 0

### Gradient Clipping

In [8]:
def clip_gradient(optimizer, grad_clip):
    """
    Clamps every gradient element held by the optimizer's parameters to the range
    [-grad_clip, grad_clip], in place, to keep gradients from exploding.
    :param optimizer: optimizer whose parameter groups hold the parameters to clip
    :param grad_clip: clip value (absolute bound applied element-wise)
    """
    all_params = (p for pg in optimizer.param_groups for p in pg['params'])
    for p in all_params:
        # parameters that did not take part in the backward pass have no gradient
        if p.grad is None:
            continue
        p.grad.data.clamp_(-grad_clip, grad_clip)

# Training

In [17]:
class Exp():
    
    def __init__(self, dataloader, model, criterion, name=None, desc=None, verbose=None):
        global device 
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # meta fields
        self.name = name if name is not None else "model"
        self.desc = desc if desc is not None else "..."
        # dataloader, model, criterion; put to appropriate device
        self.dl = dataloader
        self.bs = dataloader.batch_size
        self.m = model.to(device)
        self.criterion = criterion.to(device)
        self.verbose = verbose

        
    def setup(self):
        # init training process params
        self.epoch = 0        
        # create directory to hold experiment artifacts
        import os
        self.exp_id = uuid.uuid4()
        os.makedirs(f'./{self.exp_id}/checkpoints', exist_ok=True)        

        # create running trackers to keep track of time, loss, metrics
        self.data_time = Tracker('data_time')   
        self.fwd_time = Tracker('fwd_time')
        self.criterion_time = Tracker('criterion_time')
        self.bkwd_time = Tracker('bkwd_time')
        self.batch_time = Tracker('batch_time')
        self.loss = Tracker('loss')
    
        # gather model params to optimize
        self.weights, self.biases = list(), list()
        for param_name, param in self.m.named_parameters():
            if param.requires_grad:
                if param_name.endswith('.bias'):
                    self.biases.append(param)
                else:
                    self.weights.append(param)
        
                 
    def save_checkpoint(epoch, model, optimizer):
        state = {'epoch': epoch,
                 'model': model,
                 'optimizer': optimizer}
        filename = f'{self.exp_id}/checkpoints/{self.name}_{epoch}epoch.pth.tar'
        torch.save(state, filename)

        
    def load_checkpoint(self, path):        
        chkpt = torch.load(path)
        self.epoch = chkpt['epoch'] + 1; print(f"checkpoint loaded; resume training from epoch {self.epoch}")
        self.m = chkpt['model']
        self.optimizer = chkpt['optimizer']
    
                 
    def train_one_epoch(self, epoch):
        start_time = time.time()
        # iterate over batches
        for i, batch in enumerate(self.dl):            
            batch_start_time = time.time()
            
            # update data time to record how long it takes to load one batch of data
            self.data_time(time.time() - start_time)
            # get ground truth information
            images = batch['images'].to(device)
            boxes  = [b.to(device) for b in batch['boxes']]
            labels = [c.to(device) for c in batch['cats']]
            
            # forward pass to model & track time taken
            t = time.time()
            pred_boxes, pred_scores = self.m(images)
            self.fwd_time(time.time() - t)
            
            # compute loss & track time taken
            t = time.time()
            loss = self.criterion(pred_boxes, pred_scores, boxes, labels)
            self.criterion_time(time.time() - t)
            
            # update params
            t = time.time()
            self.optimizer.zero_grad()
            # back-prop
            loss.backward()
            # clip gradient
            if self.gradient_clip is not None:
                clip_gradient(self.optimizer, self.gradient_clip)
            # update trainable params
            self.optimizer.step()
            self.bkwd_time(time.time() - t)
            
            # update trackers
            self.loss(loss.item(), images.size(0))
            self.batch_time(time.time() - batch_start_time)

            # Print status
            if self.verbose is not None:
                if i % self.verbose == 0:
                    print(f"Epoch: [{epoch}][{i}/{len(self.dl)}]\t"
                          f"Batch time: {self.batch_time.val:.3f} ({self.batch_time.avg:.3f})\t"
                          f"Loss: {self.loss.val:.4f} ({self.loss.avg:.4f})"
                         )
        # free some memory since their histories may be stored
        del predicted_locs, predicted_scores, images, boxes, labels  

                 
    def train(self, n_epochs, optimizer, lr, gradient_clip=None, eval_every_n_epoch=1, save_every_n_epoch=1):
        # setup experiment
        self.setup()
        self.optimizer = optimizer
        self.lr = lr
        self.gradient_clip = gradient_clip
        self.save_every_n_epoch = save_every_n_epoch
        
        # iterate over epochs
        for n in range(n_epochs):
            # run one epoch training
            self.train_one_epoch(n)
            # evaluate every n epoch
            pass
            # save every n epoch
            if n % self.save_every_n_epoch == 0:
                self.save_checkpoint(n, self.m, self.optimizer)
            
                

# Setup Experiment & Optimizer

In [14]:
# init experiment object: named "SSD", printing a status line every 10 batches (verbose=10)
exp = Exp(dl, ssd, criterion, "SSD", verbose=10)

In [15]:
# define & init optimizer
momentum = 0.9
weight_decay = 5e-4
lr = 1e-3
# Collect the trainable parameters straight from the experiment's model, so this cell
# does not depend on Exp.setup() having run (Restart & Run All safe).
bias_params, weight_params = [], []
for param_name, param in exp.m.named_parameters():
    if param.requires_grad:
        (bias_params if param_name.endswith('.bias') else weight_params).append(param)
optimizer = torch.optim.SGD(params=[{'params': bias_params, 'lr': 2 * lr}, {'params': weight_params}], # update biases at 2x LR over weights
                            lr=lr, momentum=momentum, weight_decay=weight_decay)

In [None]:
# train for 5 epochs with the optimizer and learning rate defined above;
# gradient_clip, eval_every_n_epoch and save_every_n_epoch are left at their defaults
exp.train(5, optimizer, lr)