In [1]:
import torch
import torch.nn.functional as F
import torchvision.transforms.functional as FT
from functools import partial
from torch import nn
from dataset import CocoDataset
from utils   import *
from model   import SSD300

# compute device used by the notebook: CUDA when torch reports it as available, CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# define the sequence of transformations to apply to each image sample
# NOTE(review): PhotometricDistort/Flip/Zoomout take what look like probabilities, so the
# pipeline is presumably random and samples differ between runs -- confirm in utils
basic_tfs = [PhotometricDistort(1.),
             Flip(0.5),
             ImageToTensor(), CategoryToTensor(), BoxToTensor(),
             Zoomout(0.5, max_scale=2.5),
             Normalize(), 
             Resize((300,300))]
tfms = transforms.Compose(basic_tfs)

# instantiate the dataset object (COCO val2017 instance annotations, read from the current directory)
ds = CocoDataset(data_dir='./', dataset='val2017', anno_type='instances', transforms=tfms)

# create dataloader
BS = 8  # batch size
dl = DataLoader(ds, batch_size=BS, shuffle=True, 
                collate_fn=partial(ds.collate_fn, img_resized=True)) # img_resized=True indicates that all image samples have been resized to the same shape

# create model object; the number of classes is taken from the dataset's id2cat mapping
# NOTE(review): the model is not moved to `device` here -- confirm whether that is intended
ssd = SSD300(len(ds.id2cat))

# test forward pass for one batch
# NOTE: `batch` deliberately stays bound after the loop; later cells read batch['boxes']
# and batch['cats'] from it.
# NOTE(review): this is only a shape check, yet it runs with autograd enabled; wrapping the
# forward pass in torch.no_grad() would avoid building a graph.
for batch in dl:
    image_batch = batch['images']
    print(f"image batch tensor shape: {image_batch.size()}")
    # forward pass through SSD300 returns box-location and class-score predictions
    pred_boxes, pred_scores = ssd(image_batch)
    print(f"bounding box location prediction shape: {pred_boxes.size()}")
    print(f"object class prediction shape: {pred_scores.size()}")
    # a single batch is enough for the check
    break

loading annotations into memory...
Done (t=0.49s)
creating index...
index created!
image batch tensor shape: torch.Size([8, 3, 300, 300])
bounding box location prediction shape: torch.Size([8, 8732, 4])
object class prediction shape: torch.Size([8, 8732, 81])


# Intersection Over Union (IoU)

The IoU is a simple concept that computes the "intersection over union" of two bounding box regions $b1$ and $b2$. The union is computed as the sum of the areas of the two boxes minus their overlapping area (the intersection). The IoU is then simply the ratio $\frac{A_{b1 \cap b2}}{A_{b1 \cup b2}}$. This is also known as the **Jaccard overlap**.


This requires two separate computations given two sets of center coordinates $(x_{c1}, y_{c1}, w_{c1}, h_{c1})$ and $(x_{c2}, y_{c2}, w_{c2}, h_{c2})$: 

1) the area of the intersection: $A_{b1 \cap b2} = \text{area}(b1 \cap b2)$; and 

2) the area of the union: $A_{b1 \cup b2} = A_{b1} + A_{b2} - A_{b1 \cap b2}$

## Intersection

In [3]:
def find_intersection(set_1, set_2):
    """
    Compute the pairwise intersection area between two collections of boxes that are
    given in boundary coordinates (x_1, y_1, x_2, y_2).

    :param set_1: first collection, a tensor of dimensions (n1, 4) in boundary coordinates
    :param set_2: second collection, a tensor of dimensions (n2, 4) in boundary coordinates
    :return: a tensor of dimensions (n1, n2) whose entry (i, j) is the intersection area of
             box i of set 1 with box j of set 2 (0 where the two boxes do not overlap)
    """
    # Adapted from: https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py
    # Split each box into its (x_1, y_1) and (x_2, y_2) corners and insert a singleton
    # axis so that the two sets broadcast against each other to (n1, n2, 2).
    min_corner_1, max_corner_1 = set_1[:, None, :2], set_1[:, None, 2:]  # (n1, 1, 2)
    min_corner_2, max_corner_2 = set_2[None, :, :2], set_2[None, :, 2:]  # (1, n2, 2)

    overlap_min = torch.max(min_corner_1, min_corner_2)  # (n1, n2, 2)
    overlap_max = torch.min(max_corner_1, max_corner_2)  # (n1, n2, 2)

    # a negative extent along either axis means the boxes are disjoint -> clamp to 0
    extent = (overlap_max - overlap_min).clamp(min=0)  # (n1, n2, 2)
    overlap_w, overlap_h = extent.unbind(dim=-1)
    return overlap_w * overlap_h  # (n1, n2)

In [4]:
def find_jaccard_overlap(set_1, set_2):
    """
    Compute the Jaccard overlap (IoU) of every box of one set with every box of another set.

    Both inputs are first passed through ``BoundaryCoord().encode``; per the conversion
    comment below, that maps center coordinates to boundary coordinates, so callers are
    expected to supply boxes in center coordinates.

    :param set_1: set 1, a tensor of dimensions (n1, 4)
    :param set_2: set 2, a tensor of dimensions (n2, 4)
    :return: a tensor of dimensions (n1, n2) whose entry (i, j) is the Jaccard overlap of
             box i of set 1 with box j of set 2

    Code from: https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection/blob/master/utils.py
    """
    def _box_areas(boxes):
        # area of each (x_1, y_1, x_2, y_2) box: width * height
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        return widths * heights

    # convert from center coordinates to boundary coordinates (one converter for both sets)
    to_boundary = BoundaryCoord().encode
    boxes_1, boxes_2 = to_boundary(set_1), to_boundary(set_2)

    overlap = find_intersection(boxes_1, boxes_2)  # (n1, n2)

    # union = area_1 + area_2 - intersection, broadcast over every (i, j) pair
    combined = _box_areas(boxes_1)[:, None] + _box_areas(boxes_2)[None, :] - overlap  # (n1, n2)

    return overlap / combined  # (n1, n2)

In [5]:
# get a single dataset sample
# NOTE(review): the dataset applies the transform pipeline on access; if those transforms
# are random (presumably), re-running this cell yields a different sample -- confirm
sample = ds[0]
# unpack the last two image dimensions as height and width; the leading one is discarded
_, h, w = sample['image'].size()
bboxes = sample['boxes']

# instantiate transform (parameterised by the image width and height)
ccoord = Coco2CenterCoord(w, h)

# encode bounding boxes in center coords
# NOTE: `bboxes` is rebound here, so from this line on it holds the encoded boxes
bboxes = ccoord.encode(bboxes)
# get prior bounding boxes from SSD (which is already constructed in center coordinates)
prior_boxes = ssd.prior_boxes

# compute Jaccard overlap of every sample box against every prior box
jOverlap = find_jaccard_overlap(bboxes, prior_boxes)

In [6]:
threshold = 0.5
# report how many entries of the overlap matrix exceed the threshold
# NOTE(review): this counts (box, prior) pairs; a prior overlapping two objects above the
# threshold would be counted twice
print(
    f"{torch.sum(jOverlap > threshold)} out of {prior_boxes.shape[0]} "
    f"prior bounding boxes have overlap region with threshold > {threshold}"
)

59 out of 8732 prior bounding boxes have overlap region with threshold > 0.5


# Mean Average Precision

In [7]:
class mAP():
    """
    Mean average precision (mAP) of object detections over one batch of images.

    For every foreground class, detections are ranked by confidence and greedily matched
    against the ground-truth boxes of the same image and class using the module-level
    ``find_jaccard_overlap``. Each ground-truth object can be claimed by at most one
    detection; every other detection is a false positive. The per-class average precision
    is the mean of the best precision reachable at recall levels 0.0, 0.1, ..., 1.0, and
    the mAP is the mean of the per-class values.
    """

    def __init__(self, n_classes, iou_threshold=0.5):
        """
        :param n_classes: number of class objects to compute mean AP over; note that 0
                          should be reserved as background class
        :param iou_threshold: Jaccard overlap a detection must exceed to count as having
                              found a ground-truth object
        """
        self.n_classes = n_classes
        self.iou_threshold = iou_threshold

    @staticmethod
    def concat_batch_tensors(boxes, labels, scores=None):
        """
        Each batch contains M images, and each image contains N_i objects (the number of
        objects differs from image to image). As a result, boxes, labels and scores each come
        as a list of tensors. This helper concatenates every list into a single tensor for
        the batch and records which image each row came from.

        :param boxes: list of M tensors each of size (N_i, 4) for bounding boxes of each image within the batch
        :param labels: list of M tensors each of size (N_i) for labels of objects within each image
        :param scores: optional list of M tensors each of size (N_i) for the confidence score of each object

        :return: dictionary with keys
                 'img'   : 1-D long tensor (n_total_objects_in_batch), index of the image each object belongs to
                 'boxes' : 2-D tensor with size (n_total_objects_in_batch, 4)
                 'labels': 1-D tensor with size (n_total_objects_in_batch)
                 'scores': 1-D tensor with size (n_total_objects_in_batch), or None when not given
        """
        # keep track of the image each concatenated row corresponds to
        img_idx = []
        for idx, img_boxes in enumerate(boxes):
            img_idx.extend([idx] * img_boxes.size(0))
        boxes = torch.cat(boxes, dim=0)
        labels = torch.cat(labels, dim=0)
        # the index lives on the same device as the data it indexes, so the boolean masks
        # built from the labels can be applied to it directly
        img_idx = torch.tensor(img_idx, dtype=torch.long, device=boxes.device)
        assert img_idx.size(0) == boxes.size(0) == labels.size(0), "tensor size mismatch"
        if scores is not None:
            scores = torch.cat(scores, dim=0)
        return {'img'   : img_idx,
                'boxes' : boxes,
                'labels': labels,
                'scores': scores}

    @staticmethod
    def class_specific_mAP(truths, preds, category, iou_threshold=0.5):
        """
        :param truths: dictionary containing ground truth information with keys 'img', 'boxes', 'labels', 'scores'
        :param preds:  dictionary containing predicted information with keys 'img', 'boxes', 'labels', 'scores'
        :param category: integer representing the category of interest
        :param iou_threshold: Jaccard overlap a detection must exceed to match a ground-truth object

        :return: average precision (python float) of the detections related to category;
                 0.0 when the category has no detections or no ground-truth objects
        """
        # get predictions related to this category
        is_pred_class = preds['labels'] == category
        pred_class_images = preds['img'][is_pred_class]
        pred_class_boxes  = preds['boxes'][is_pred_class]
        pred_class_scores = preds['scores'][is_pred_class]
        n_detections      = pred_class_boxes.size(0)
        # AP is simply 0 if there's nothing detected to be in this class
        if n_detections == 0:
            return 0.0

        # get ground truths related to this category
        is_true_class = truths['labels'] == category
        true_class_images = truths['img'][is_true_class]
        true_class_boxes  = truths['boxes'][is_true_class]
        n_true_objects    = true_class_boxes.size(0)
        # without ground truth every detection is a false positive and recall is undefined;
        # return 0 explicitly instead of dividing by zero further down
        if n_true_objects == 0:
            return 0.0

        # re-order images/boxes by descending confidence score
        _, sort_idx = torch.sort(pred_class_scores, dim=0, descending=True)
        pred_class_images = pred_class_images[sort_idx]
        pred_class_boxes  = pred_class_boxes[sort_idx]

        # initialize tensors to keep track of:
        # a) which true objects with this class have already been 'detected'
        true_class_boxes_detected = torch.zeros(n_true_objects, dtype=torch.bool,
                                                device=true_class_boxes.device)
        # b) which detected boxes are true positives
        tp = torch.zeros(n_detections, dtype=torch.float, device=pred_class_boxes.device)
        # c) which detected boxes are false positives
        fp = torch.zeros_like(tp)
        # position of every ground-truth object of this class, used to map a per-image
        # match back to its slot in true_class_boxes_detected
        true_positions = torch.arange(n_true_objects, device=true_class_images.device)

        # iterate through each detection & check whether it is true-positive or false-positive
        for d in range(n_detections):
            # image this detection was made on (python int, so the comparison below does
            # not depend on which device the prediction tensors live on)
            this_img = pred_class_images[d].item()
            this_box = pred_class_boxes[d].unsqueeze(0)
            # get ground truth boxes of this class for this image
            in_this_img = true_class_images == this_img
            true_boxes  = true_class_boxes[in_this_img]
            # if there are no boxes in this image matching this category, then mark as false-positive
            if true_boxes.size(0) == 0:
                fp[d] = 1
                continue

            # compute Jaccard overlaps between the current (single) detected bounding box
            # and the (possibly multiple) ground truth boxes, and keep the best match
            overlaps = find_jaccard_overlap(this_box, true_boxes)  # (1, n_true_objects_in_img)
            max_overlap, idx = torch.max(overlaps.squeeze(0), dim=0)
            # position of the best-matching object within true_class_boxes_detected
            origin_idx = true_positions[in_this_img][idx]

            # a detection is a true positive only if it overlaps enough with an object that
            # no higher-scoring detection has claimed yet; everything else (weak overlap or
            # duplicate detection) is a false positive
            if max_overlap.item() > iou_threshold and not true_class_boxes_detected[origin_idx]:
                true_class_boxes_detected[origin_idx] = True
                tp[d] = 1
            else:
                fp[d] = 1

        # consolidate how many true-positive & false-positive detections there were
        cumsum_tp = torch.cumsum(tp, dim=0)  # (n_class_detections) cumulative sums
        cumsum_fp = torch.cumsum(fp, dim=0)  # (n_class_detections) cumulative sums
        cumsum_precision = cumsum_tp / (cumsum_tp + cumsum_fp + 1e-10)
        cumsum_recall    = cumsum_tp / n_true_objects  # note: we ignored difficulties

        # recall thresholds 0.0, 0.1, ..., 1.0; built from integers so that the number of
        # thresholds cannot be affected by floating point step rounding
        recall_thresholds = [i / 10 for i in range(11)]
        precisions = torch.zeros(len(recall_thresholds), dtype=torch.float,
                                 device=pred_class_boxes.device)
        for i, t in enumerate(recall_thresholds):
            recalls_above_t = cumsum_recall >= t
            # thresholds that are never reached keep a precision of 0
            if recalls_above_t.any():
                precisions[i] = cumsum_precision[recalls_above_t].max()
        return precisions.mean().item()

    def __call__(self, pred_boxes, pred_labels, pred_scores, true_boxes, true_labels):
        """
        Takes in both prediction & ground truth labeling for both bounding boxes and object class labels
        to compute the mean average precision (mAP). This function operates on a batch of images at a
        time, and because each image contains a different number of objects, each input should be
        provided as a list of tensors (where each entry of the list is for one particular image).

        :param pred_boxes: predicted bounding boxes for each image, list of M tensors each of size (N, 4)
        :param pred_labels: predicted class label for each image, list of M tensors each of size (N)
        :param pred_scores: predicted class score for each image, list of M tensors each of size (N)
        :param true_boxes: ground truths bounding boxes, list of M tensors each of size (N, 4)
        :param true_labels: ground truths class label for each image, list of M tensors each of size (N)

        :return: tuple (mAP, AP): the mean average precision as a python float, and a dictionary
                 mapping each class index in 1..n_classes-1 to its average precision (0-dim tensor)
        """
        # check length of list of each input is consistent
        assert len(pred_boxes) == len(pred_labels) == len(pred_scores) == len(true_boxes) == len(true_labels),\
        "input tensor length mismatch"

        # we want to concatenate the list of tensors together within each list, to do that, we first need
        # to track which object belongs to which image
        truths = self.concat_batch_tensors(true_boxes, true_labels)

        # because the number of predicted objects may not necessarily match the actual number of objects, we
        # also need to do the same for predicted tensors separately
        preds  = self.concat_batch_tensors(pred_boxes, pred_labels, pred_scores)

        # iterate over each category (background class 0 excluded) to compute the average
        # precision of the detections for that category
        avg_precisions = torch.zeros((self.n_classes - 1), dtype=torch.float)
        AP = {}
        for c in range(1, self.n_classes):
            avg_precisions[c-1] = self.class_specific_mAP(truths, preds, c, self.iou_threshold)
            AP[c] = avg_precisions[c-1]
        # further compute the mean over the average precisions over all categories
        mean_ap = avg_precisions.mean().item()

        return mean_ap, AP

In [8]:
# build the metric with one class per entry of the dataset's id2cat mapping
mAP_metric = mAP(n_classes=len(ds.id2cat))

In [9]:
# ground truth for the batch left over from the forward-pass check: one entry per image
# (`batch` is the loop variable that stayed bound after that loop's `break`)
true_boxes  = batch['boxes']
true_labels = batch['cats']

In [10]:
# inspect the ground-truth category ids of every image in the batch
true_labels

[tensor([40, 40, 40, 46, 46, 46, 46, 72, 40, 40, 46, 40, 62, 40, 40, 40, 40]),
 tensor([ 1, 34]),
 tensor([ 1,  1,  1,  1,  1, 31, 31, 31, 31]),
 tensor([59, 59, 63, 58, 58, 58, 74, 74, 76, 57, 57, 57, 57, 61, 74, 74, 74, 74,
         74, 74, 76, 74, 74, 74, 57, 58]),
 tensor([16, 57, 66, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 74, 78, 66, 74]),
 tensor([28,  1,  1,  1, 40, 40, 40]),
 tensor([ 1,  1,  1, 14, 34, 14, 30, 14]),
 tensor([ 3, 13,  1,  3])]

In [11]:
pred_boxes  = true_boxes
pred_labels = true_labels