In [1]:
from functools import partial

import torch
import torch.nn.functional as F
import torchvision.transforms.functional as FT
from torch import nn

from dataset import CocoDataset
from utils   import *
from model   import SSD300

# Pick the compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Transformations applied, in this order, to every image sample
basic_tfs = [
    PhotometricDistort(1.),
    Flip(0.5),
    ImageToTensor(),
    CategoryToTensor(),
    BoxToTensor(),
    Zoomout(0.5, max_scale=2.5),
    Normalize(),
    Resize((300,300)),
]
tfms = transforms.Compose(basic_tfs)

# Dataset over the val2017 'instances' annotations, using the pipeline above
ds = CocoDataset(data_dir='./', dataset='val2017', anno_type='instances', transforms=tfms)

# Shuffled, batched loader. img_resized=True signals to the collate function
# that all image samples have already been resized to the same shape.
BS = 8
dl = DataLoader(
    ds,
    batch_size=BS,
    shuffle=True,
    collate_fn=partial(ds.collate_fn, img_resized=True),
)

# Model object, constructed from the number of entries in ds.id2cat
ssd = SSD300(len(ds.id2cat))

# Smoke test: run a single batch through SSD300 and report the tensor shapes
for batch in dl:
    image_batch = batch['images']
    print(f"image batch tensor shape: {image_batch.shape}")
    pred_boxes, pred_scores = ssd(image_batch)
    print(f"bounding box location prediction shape: {pred_boxes.shape}")
    print(f"object class prediction shape: {pred_scores.shape}")
    break  # one batch is enough for the shape check

loading annotations into memory...
Done (t=0.67s)
creating index...
index created!
image batch tensor shape: torch.Size([8, 3, 300, 300])
bounding box location prediction shape: torch.Size([8, 8732, 4])
object class prediction shape: torch.Size([8, 8732, 81])


# Coordinate Systems

The COCO dataset natively expresses bounding boxes as $(x, y, w, h)$, where the $(x, y)$ coordinates are measured from the top-left image corner $(0, 0)$. We introduce three coordinate systems that are used throughout the model prediction / optimization process.

## COCO coordinates to center coordinates
This transformation encodes/decodes the original COCO bounding box coordinates $(x, y, w, h)$ (where $(x, y)$ represents the top-left corner of the bounding box) to center coordinates $(x_c, y_c, w_c, h_c)$, where $(x_c, y_c)$ represents the center of the bounding box. Both $(x_c, y_c)$ and $(w_c, h_c)$ are normalized with respect to the original size of the image.

In [10]:
class Coco2CenterCoord():
    """
    Converts bounding boxes between COCO coordinates and normalized center coordinates.

    COCO coordinates are (x, y, w, h), where (x, y) is the top-left corner of the
    bounding box in the image coordinate frame. Center coordinates are
    (x_c, y_c, w_c, h_c), where (x_c, y_c) is the center of the bounding box; x_c and
    w_c are divided by the image width and y_c and h_c by the image height, so all four
    values are expressed as fractions of the image size.
    """
    def __init__(self, w, h):
        """
        w: image width used to normalize x-coordinates and box widths
        h: image height used to normalize y-coordinates and box heights
        """
        self.w = w
        self.h = h

    def encode(self, boxes):
        """
        boxes: bounding boxes tensor of shape (..., 4) in original COCO (x, y, w, h) format
        return: tensor of the same shape in center coordinates (x_c, y_c, w_c, h_c) format
        """
        # Ellipsis indexing lets a single box (4,), a list of boxes (N, 4) or any
        # batched layout (..., 4) go through the same code path.
        x_c = (boxes[..., 0] + boxes[..., 2] / 2.0) / self.w
        y_c = (boxes[..., 1] + boxes[..., 3] / 2.0) / self.h
        w_c = boxes[..., 2] / self.w
        h_c = boxes[..., 3] / self.h
        return torch.stack([x_c, y_c, w_c, h_c], dim=-1)

    def decode(self, boxes_c):
        """
        boxes_c: bounding boxes tensor of shape (..., 4) in center coordinates
                 (x_c, y_c, w_c, h_c) format
        return: tensor of the same shape in COCO (x, y, w, h) format
        """
        # Move from the box center back to the top-left corner, then undo the normalization.
        x = (boxes_c[..., 0] - boxes_c[..., 2] / 2.0) * self.w
        y = (boxes_c[..., 1] - boxes_c[..., 3] / 2.0) * self.h
        width  = boxes_c[..., 2] * self.w
        height = boxes_c[..., 3] * self.h
        return torch.stack([x, y, width, height], dim=-1)

In [11]:
# Take the first dataset sample: its boxes and the image height/width
sample = ds[0]
boxes_before = sample['boxes']
_, h, w = sample['image'].shape

# Converter normalizing by this image's width and height
ccoord = Coco2CenterCoord(w, h)

# Round trip: COCO coordinates -> center coordinates -> COCO coordinates
boxes_after = ccoord.encode(boxes_before)
boxes_inverse = ccoord.decode(boxes_after)

In [12]:
# Show the first three boxes at each stage; the inverse transform should reproduce the input.
print("box cooridnates before transformation:\n", boxes_before[:3])
print("\nbox coordinates after transformation:\n", boxes_after[:3])
print("\nbox coordinates apply inverse transformation:\n", boxes_inverse[:3])

box cooridnates before transformation:
 tensor([[111.0844, 100.3592,  11.5781,  48.9437],
        [  3.2953, 118.1408,  69.9938,  66.8099],
        [261.1922, 147.3169,  38.1328,  55.4437]])

box coordinates after transformation:
 tensor([[0.3896, 0.4161, 0.0386, 0.1631],
        [0.1276, 0.5052, 0.2333, 0.2227],
        [0.9342, 0.5835, 0.1271, 0.1848]])

box coordinates apply inverse transformation:
 tensor([[111.0844, 100.3592,  11.5781,  48.9437],
        [  3.2953, 118.1408,  69.9938,  66.8099],
        [261.1922, 147.3169,  38.1328,  55.4437]])


## Center coordinates to Boundary coordinates

Encodes/decodes the bounding box center coordinates $(x_{c}, y_{c}, w_{c}, h_{c})$ to/from boundary coordinates $(x_{1}, y_{1}, x_{2}, y_2)$ where $(x_1, y_1)$ specifies the upper-left corner and $(x_2, y_2)$ the lower-right corner of the boundary of bounding boxes.

In [13]:
class BoundaryCoord():
    """
    Converts bounding boxes between center coordinates and boundary coordinates.

    Center coordinates are (x_c, y_c, w_c, h_c). Boundary coordinates are
    (x_1, y_1, x_2, y_2), where (x_1, y_1) specifies the upper-left corner and
    (x_2, y_2) the lower-right corner of the bounding box. No scaling is applied, so
    the output is in the same units as the input.
    """
    def encode(self, boxes):
        """
        boxes: bounding boxes tensor of shape (..., 4) in center coordinates (x_c, y_c, w_c, h_c) format
        return: bounding boxes tensor of the same shape in boundary coordinates (x_1, y_1, x_2, y_2) format
        """
        # Ellipsis indexing accepts a single box (4,), (N, 4) or any batched (..., 4) layout.
        half_w = boxes[..., 2] / 2.0
        half_h = boxes[..., 3] / 2.0
        x1 = boxes[..., 0] - half_w
        y1 = boxes[..., 1] - half_h
        x2 = boxes[..., 0] + half_w
        y2 = boxes[..., 1] + half_h
        return torch.stack([x1, y1, x2, y2], dim=-1)

    def decode(self, boxes):
        """
        boxes: bounding boxes tensor of shape (..., 4) in boundary coordinates (x_1, y_1, x_2, y_2) format
        return: bounding boxes tensor of the same shape in center coordinates (x_c, y_c, w_c, h_c) format
        """
        # Width/height are the corner differences; the center is the upper-left corner
        # shifted by half of them.
        w_c = boxes[..., 2] - boxes[..., 0]
        h_c = boxes[..., 3] - boxes[..., 1]
        x_c = boxes[..., 0] + w_c / 2.0
        y_c = boxes[..., 1] + h_c / 2.0
        return torch.stack([x_c, y_c, w_c, h_c], dim=-1)

In [15]:
# Fetch one sample: its boxes (COCO format) and the image height/width
sample = ds[0]
_, h, w = sample['image'].shape
boxes = sample['boxes']

# Converters used in the chain below
bcoord = BoundaryCoord()
ccoord = Coco2CenterCoord(w, h)

# Coco-coordinates -> center-coordinates
boxes_center = ccoord.encode(boxes)
# center-coordinates -> boundary-coordinates
boxes_boundary = bcoord.encode(boxes_center)
# boundary-coordinates -> center-coordinates (should match boxes_center)
boxes_center_  = bcoord.decode(boxes_boundary)

In [19]:
# Print the first three boxes after every conversion step
first3 = slice(0, 3)
print(f"Coco coordinates:\n{boxes[first3]}")
print(f"\nCoco coordinates -> center coordinates:\n{boxes_center[first3]}")
print(f"\ncenter coordinates -> boundary coordinates:\n{boxes_boundary[first3]}")
print(f"\nboundary coordinates -> center coordinates:\n{boxes_center_[first3]}")

Coco coordinates:
tensor([[195.2894, 101.5071,   8.7176,  36.8375],
        [232.4647, 114.8905,  52.7012,  50.2845],
        [ 62.2729, 136.8498,  28.7118,  41.7297]])

Coco coordinates -> center coordinates:
tensor([[0.6655, 0.3998, 0.0291, 0.1228],
        [0.8627, 0.4668, 0.1757, 0.1676],
        [0.2554, 0.5257, 0.0957, 0.1391]])

center coordinates -> boundary coordinates:
tensor([[0.6510, 0.3384, 0.6800, 0.4611],
        [0.7749, 0.3830, 0.9506, 0.5506],
        [0.2076, 0.4562, 0.3033, 0.5953]])

boundary coordinates -> center coordinates:
tensor([[0.6655, 0.3998, 0.0291, 0.1228],
        [0.8627, 0.4668, 0.1757, 0.1676],
        [0.2554, 0.5257, 0.0957, 0.1391]])


## Center coordinates to Prior Box coordinate offsets

For the localization aspect of SSD prediction, the model **predicts** the *"offsets relative to the default box shapes in the cell"* at each of the feature map grid locations.

- For the bounding box center coordinates $(x_c, y_c)$ of $(x_c, y_c, w_c, h_c)$ relative to prior box coordinates $(x_p, y_p, w_p, h_p)$, express offset $({\Delta}x_c, {\Delta}y_c) = (\frac{(x_c - x_p)}{w_p}, \frac{(y_c - y_p)}{h_p})$; and

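- For the bounding box shape coordinates $(w_c, h_c)$, express the shape offset $({\Delta}w_c, {\Delta}h_c) = (\log{(\frac{w_c}{w_p})}, \log{(\frac{h_c}{h_p})})$

In the `OffsetCoord` implementation below, the center offsets are additionally multiplied by 10 and the shape offsets by 5.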

In [20]:
class OffsetCoord():
    """
    Encodes/decodes the center coordinates (x_c, y_c, w_c, h_c) of bounding boxes relative to the
    prior boxes (x_p, y_p, w_p, h_p) (from SSD, also expressed in center coordinates) in terms of
    offset coordinates. The offset coordinates have the following relation:
    (dx, dy) = ((x_c - x_p)/(w_p/center_scale), (y_c - y_p)/(h_p/center_scale)); and
    (dw, dh) = (log(w_c/w_p)*size_scale, log(h_c/h_p)*size_scale)
    With the default arguments, center_scale = 10 and size_scale = 5.
    """
    def __init__(self, center_scale=10, size_scale=5):
        """
        center_scale: factor applied to the center offsets (dx, dy); defaults to 10
        size_scale: factor applied to the log-size offsets (dw, dh); defaults to 5
        """
        self.center_scale = center_scale
        self.size_scale = size_scale

    def encode(self, cxcy, priors_cxcy):
        """
        cxcy: bounding boxes tensor of shape (..., 4) in center-coordinate format
        priors_cxcy: prior boxes tensor in center-coordinate format, matched element-wise
                     (or broadcastable) against cxcy
        return: tensor of offsets (dx, dy, dw, dh) relative to the prior boxes
        """
        # Center displacement is measured in units of the prior's width/height.
        dxdy = (cxcy[..., :2] - priors_cxcy[..., :2]) / (priors_cxcy[..., 2:] / self.center_scale)
        # Size change is measured as a log-ratio against the prior's width/height.
        dwdh = torch.log(cxcy[..., 2:] / priors_cxcy[..., 2:]) * self.size_scale
        return torch.cat([dxdy, dwdh], dim=-1)


    def decode(self, dxdy, priors_cxcy):
        """
        dxdy: bounding boxes tensor of shape (..., 4) in offset-coordinate format
              (dx, dy, dw, dh) wrt SSD's prior bounding boxes; despite the name it carries
              all four offsets, not only the center part
        priors_cxcy: prior boxes tensor in center-coordinate format
        return: tensor of bounding boxes in center coordinates (x_c, y_c, w_c, h_c) format
        """
        # Exact inverse of encode(): rescale the center offsets and exponentiate the log-ratios.
        cxcy = dxdy[..., :2] * priors_cxcy[..., 2:] / self.center_scale + priors_cxcy[..., :2]
        cwch = torch.exp(dxdy[..., 2:] / self.size_scale) * priors_cxcy[..., 2:]
        return torch.cat([cxcy, cwch], dim=-1)

In [22]:
# SSD prior boxes taken from the model (already in center coords)
prior_boxes = ssd.prior_boxes

# converter between center coordinates and prior-relative offsets
ocoord = OffsetCoord()

# first three bounding boxes (already encoded in center coordinate format)
bbox = boxes_after[:3]
# first three prior boxes (also in center coordinate format), paired row-by-row with bbox
pbox = prior_boxes[:3]

# for demonstration purposes: center coordinates -> offsets relative to the SSD prior
# boxes, then decode the offsets back to center coordinates as a round-trip check
bbox_offset = ocoord.encode(bbox, pbox)
bbox_offset_inv = ocoord.decode(bbox_offset, pbox)

print(f"bounding box in center coordinates:\n{bbox}\n")
print(f"prior box in center coordinates:\n{pbox}\n")
print(f"bounding box in offset coordinates:\n{bbox_offset}\n")
print(f"bounding box converted back to center coordinates:\n{bbox_offset_inv}")

bounding box in center coordinates:
tensor([[0.3896, 0.4161, 0.0386, 0.1631],
        [0.1276, 0.5052, 0.2333, 0.2227],
        [0.9342, 0.5835, 0.1271, 0.1848]])

prior box in center coordinates:
tensor([[0.0132, 0.0132, 0.1000, 0.1000],
        [0.0132, 0.0132, 0.1414, 0.1414],
        [0.0132, 0.0132, 0.1414, 0.0707]])

bounding box in offset coordinates:
tensor([[37.6420, 40.2945, -4.7604,  2.4474],
        [ 8.0952, 34.7893,  2.5032,  2.2704],
        [65.1272, 80.6532, -0.5335,  4.8037]])

bounding box converted back to center coordinates:
tensor([[0.3896, 0.4161, 0.0386, 0.1631],
        [0.1276, 0.5052, 0.2333, 0.2227],
        [0.9342, 0.5835, 0.1271, 0.1848]])
