In [2]:
import torch.nn.functional as F
import torchvision.transforms.functional as FT
from functools import partial
from torch import nn
from dataset import CocoDataset
from utils   import *
from model   import SSD300

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): no bare `import torch` is visible in this cell (`import torch.nn.functional as F`
# binds only `F`), so the `torch` name presumably comes from `from utils import *` or an
# earlier cell — confirm.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Transformations applied, in order, to every dataset sample. The final Resize((300,300))
# gives every image the same shape, which the dataloader below relies on (img_resized=True).
# NOTE(review): the transform classes, `transforms` and `DataLoader` are not imported by name
# in the visible import cell — presumably they arrive via `from utils import *`; confirm.
basic_tfs = [PhotometricDistort(1.),
             Flip(0.5),
             ImageToTensor(), CategoryToTensor(), BoxToTensor(),
             Zoomout(0.5, max_scale=2.5),
             Normalize(), 
             Resize((300,300))]
tfms = transforms.Compose(basic_tfs)

# instantiate the dataset object (COCO val2017 instance annotations, read from the current directory)
ds = CocoDataset(data_dir='./', dataset='val2017', anno_type='instances', transforms=tfms)

# create a shuffled dataloader with batches of BS samples
BS = 8
dl = DataLoader(ds, batch_size=BS, shuffle=True, 
                collate_fn=partial(ds.collate_fn, img_resized=True)) # img_resized=True indicates all image samples have been resized to the same shape

# create the model object, sized by the number of dataset categories
# NOTE(review): the class-score output printed below has 80 channels; confirm whether SSD300
# expects a background class to be counted in this argument.
ssd = SSD300(len(ds.allcats))

# smoke test: push a single batch through the model and print the output shapes
# NOTE(review): `device` is defined earlier but neither `ssd` nor the batch is moved to it
# here, and the forward pass is not wrapped in torch.no_grad().
for batch in dl:
    image_batch = batch['images']
    print(f"image batch tensor shape: {image_batch.size()}")
    # forward pass through SSD300: returns box location predictions and class scores
    locs, cls_scores = ssd(image_batch)
    print(f"bounding box location prediction shape: {locs.size()}")
    print(f"object class prediction shape: {cls_scores.size()}")
    # only the first batch is needed for the shape check
    break

loading annotations into memory...
Done (t=0.50s)
creating index...
index created!
image batch tensor shape: torch.Size([8, 3, 300, 300])
bounding box location prediction shape: torch.Size([8, 8732, 4])
object class prediction shape: torch.Size([8, 8732, 80])


---
# Coordinate Transformations

The COCO dataset natively expresses bounding boxes as $(x, y, w, h)$, where $(x, y)$ is the top-left corner of the box, measured from the top-left image corner $(0, 0)$, and $(w, h)$ are the box width and height. We introduce three coordinate systems that are used throughout the model prediction / optimization process.

## COCO coordinates to center coordinates
This transformation encodes/decodes the original COCO bounding box coordinates $(x, y, w, h)$ (where $(x, y)$ represents the top-left corner of the bounding box) to center coordinates $(x_c, y_c, w_c, h_c)$, where $(x_c, y_c)$ represents the center of the bounding box. Both $(x_c, y_c)$ and $(w_c, h_c)$ are normalized with respect to the image size (the width for $x_c$ and $w_c$, the height for $y_c$ and $h_c$).

In [4]:
class Coco2CenterCoord():
    """
    Converts bounding boxes between COCO coordinates and normalized center coordinates.

    COCO format is (x, y, w, h), where (x, y) is the top-left corner of the bounding box
    in the image coordinate frame. Center format is (x_c, y_c, w_c, h_c), where (x_c, y_c)
    is the center of the bounding box; x_c and w_c are divided by the image width and
    y_c and h_c by the image height.

    Both methods index only the last dimension, so they accept a single box of shape (4,),
    a set of boxes of shape (N, 4), or a batched tensor of shape (..., 4).
    """
    def __init__(self, w, h):
        """
        w: image width used for normalization
        h: image height used for normalization
        """
        self.w = w
        self.h = h

    def encode(self, boxes):
        """
        boxes: bounding boxes tensor with coordinates in original COCO (x, y, w, h) format
        returns: tensor of shape (..., 4) in normalized center (x_c, y_c, w_c, h_c) format
        """
        # `...` selects along the last dimension regardless of how many leading
        # dimensions there are; `[:, k]` would pick the wrong axis for batched input.
        x_c = (boxes[..., 0] + boxes[..., 2] / 2.0) / self.w
        y_c = (boxes[..., 1] + boxes[..., 3] / 2.0) / self.h
        w_c = boxes[..., 2] / self.w
        h_c = boxes[..., 3] / self.h
        return torch.stack([x_c, y_c, w_c, h_c], dim=-1)

    def decode(self, boxes_c):
        """
        boxes_c: bounding boxes tensor with coordinates in center coordinates (x_c, y_c, w_c, h_c) format
        returns: tensor of shape (..., 4) in COCO (x, y, w, h) format, scaled back by the image size
        """
        # shift from the center back to the top-left corner, then undo the normalization
        x = (boxes_c[..., 0] - boxes_c[..., 2] / 2.0) * self.w
        y = (boxes_c[..., 1] - boxes_c[..., 3] / 2.0) * self.h
        width = boxes_c[..., 2] * self.w
        height = boxes_c[..., 3] * self.h
        return torch.stack([x, y, width, height], dim=-1)

In [5]:
# get a single dataset sample (already passed through the dataset's transform pipeline)
sample = ds[0]
# the image size unpacks into three dimensions; the first is discarded
# (presumably (channels, height, width) — TODO confirm against ImageToTensor)
_, h, w = sample['image'].size()
boxes_before = sample['boxes']

# instantiate the transform with the image width and height used for normalization
ccoord = Coco2CenterCoord(w, h)

# transform box coordinates from COCO (x, y, w, h) to normalized center coordinates
boxes_after = ccoord.encode(boxes_before)

# inverse transform: should reproduce boxes_before
boxes_inverse = ccoord.decode(boxes_after)

In [6]:
# Show the first three boxes at each stage of the round trip: the dataset boxes,
# their center-coordinate encoding, and the decoded result.
print("box cooridnates before transformation:\n", boxes_before[:3,:])
print("\nbox coordinates after transformation:\n", boxes_after[:3,:])
print("\nbox coordinates apply inverse transformation:\n", boxes_inverse[:3,:])

box cooridnates before transformation:
 tensor([[156.8133, 126.4061,   6.0146,  25.4268],
        [100.8190, 135.6439,  36.3604,  34.7085],
        [234.7914, 150.8012,  19.8093,  28.8037]])

box coordinates after transformation:
 tensor([[0.5327, 0.4637, 0.0200, 0.0848],
        [0.3967, 0.5100, 0.1212, 0.1157],
        [0.8157, 0.5507, 0.0660, 0.0960]])

box coordinates apply inverse transformation:
 tensor([[156.8133, 126.4061,   6.0146,  25.4268],
        [100.8190, 135.6439,  36.3604,  34.7085],
        [234.7914, 150.8012,  19.8093,  28.8037]])


## Center coordinates to Bounding Box coordinate offsets

For the localization aspect of SSD prediction, the model **predicts** the *"offsets relative to the default box shapes in the cell"* at each of the feature map grid locations.

- For the bounding box center coordinates $(x_c, y_c)$ of $(x_c, y_c, w_c, h_c)$ relative to the prior box coordinates $(x_p, y_p, w_p, h_p)$, the center offset is $({\Delta}x_c, {\Delta}y_c) = \left(\frac{x_c - x_p}{w_p / 10}, \frac{y_c - y_p}{h_p / 10}\right)$; and

- For the bounding box shape coordinates $(w_c, h_c)$, the shape offset is $({\Delta}w_c, {\Delta}h_c) = \left(5\log{\left(\frac{w_c}{w_p}\right)}, 5\log{\left(\frac{h_c}{h_p}\right)}\right)$. The factors 10 and 5 are the scaling constants used by `OffsetCoord` below.

In [7]:
class OffsetCoord():
    """
    Encodes/decodes the center coordinates (x_c, y_c, w_c, h_c) of bounding boxes relative to the prior
    boxes (x_p, y_p, w_p, h_p) (also expressed in center coordinates) in terms of offset coordinates.
    With the default scales the offset coordinates have the following relation:
    (dx, dy) = ((x_c - x_p)/(w_p/10), (y_c - y_p)/(h_p/10)); and
    (dw, dh) = (5*log(w_c/w_p), 5*log(h_c/h_p))

    Both methods index only the last dimension, so inputs of shape (4,), (N, 4) or (..., 4)
    are accepted and the boxes and priors are broadcast against each other.
    """
    def __init__(self, xy_scale=10, wh_scale=5):
        """
        xy_scale: factor applied to the center offsets (default 10)
        wh_scale: factor applied to the log width/height offsets (default 5)
        """
        self.xy_scale = xy_scale
        self.wh_scale = wh_scale

    def encode(self, cxcy, priors_cxcy):
        """
        cxcy: bounding boxes in center-coordinate format (x_c, y_c, w_c, h_c)
        priors_cxcy: prior boxes in center-coordinate format (x_p, y_p, w_p, h_p)
        returns: tensor of offsets (dx, dy, dw, dh) relative to the priors
        """
        # center offsets are measured in units of the prior's width/height
        dxdy = (cxcy[..., :2] - priors_cxcy[..., :2]) / (priors_cxcy[..., 2:] / self.xy_scale)
        # size offsets are the log-ratio of box size to prior size
        dwdh = torch.log(cxcy[..., 2:] / priors_cxcy[..., 2:]) * self.wh_scale
        return torch.cat([dxdy, dwdh], dim=-1)

    def decode(self, dxdy, priors_cxcy):
        """
        dxdy: bounding boxes in offset-coordinate format (dx, dy, dw, dh) wrt the prior bounding boxes
        priors_cxcy: prior boxes in center-coordinate format (x_p, y_p, w_p, h_p)
        returns: tensor of boxes in center-coordinate format (x_c, y_c, w_c, h_c)
        """
        # exact inverse of encode(): undo the scaling, then shift/scale by the prior
        cxcy = dxdy[..., :2] * priors_cxcy[..., 2:] / self.xy_scale + priors_cxcy[..., :2]
        cwch = torch.exp(dxdy[..., 2:] / self.wh_scale) * priors_cxcy[..., 2:]
        return torch.cat([cxcy, cwch], dim=-1)

In [8]:
# pass the SSD prior boxes through the COCO -> center-coordinate encoder
# NOTE(review): the recorded output of this cell shows center offsets on the order of 1e4
# (15975.7, 13905.6), which implies the encoded prior has a width/height of roughly 0.1/300.
# That suggests `ssd.prior_boxes` is already normalized (and possibly already in center
# format), in which case `ccoord.encode` (which divides by the image size) shrinks it a
# second time. Confirm the format of `ssd.prior_boxes` in the model definition.
prior_boxes = ccoord.encode(ssd.prior_boxes)

# init offset coord object
ocoord = OffsetCoord()

# select one bounding box from the dataset sample (already encoded in center coordinate format)
bbox = boxes_after[:1,:]
# select one prior box (taken from the encoded priors above)
pbox = prior_boxes[:1,:]

# encode the selected box as offsets relative to the selected prior, then decode it
# back to center coordinates as a round-trip check
bbox_offset = ocoord.encode(bbox, pbox)
bbox_offset_inv = ocoord.decode(bbox_offset, pbox)

print(f"bounding box in center coordinates:\n{bbox}\n")
print(f"bounding box in offset coordinates:\n{bbox_offset}\n")
print(f"bounding box converted back to center coordinates:\n{bbox_offset_inv}")

bounding box in center coordinates:
tensor([[0.5327, 0.4637, 0.0200, 0.0848]])

bounding box in offset coordinates:
tensor([[15975.7461, 13905.6348,    20.4839,    27.6920]])

bounding box converted back to center coordinates:
tensor([[0.5327, 0.4637, 0.0200, 0.0848]])


# Intersection Over Union (IoU)

The IoU is a simple concept: it computes the "intersection over union" of two bounding box regions `b1` and `b2`. The union is computed as the sum of the areas of the two boxes minus their overlapping area (the intersection). The IoU is then simply the ratio of the two areas, $\frac{|b1 \cap b2|}{|b1 \cup b2|}$.