In [1]:
# TODO: BBoxes assignment
import numpy as np
import tensorflow as tf
import pickle

In [24]:
# (total, num_box + num_variance)
# pre-calculated anchor (prior) boxes for each of the conv layers
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# prior-box file that comes from a trusted source.
# The `with` block closes the file handle as soon as the priors are read
# instead of leaving it open until garbage collection.
with open('prior_boxes_ssd300.pkl', 'rb') as prior_file:
    priors = pickle.load(prior_file)
NUM_CLASSES = 20
# for bboxes usage: one extra class slot on top of NUM_CLASSES
# (column 4 of the assignment is used as the background class below)
num_classes = NUM_CLASSES + 1
num_priors = len(priors)
# IoU a prior must exceed to be matched to a ground-truth box
overlap_threshold = 0.5

## Assign encoded boxes (everything is expressed as an offset relative to the priors)
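At the beginning, I was wondering what "encoded boxes" meant.<br/>
Here is my understanding:
- We keep only the priors whose IoU with the ground-truth box is high enough (or the single best prior if none is), take the offset between each of those priors and the ground-truth box center, and scale each result by the prior's variance.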

In [13]:
# IoU of one ground-truth box against every prior box
# iou = intersection_area / union_area
def compute_iou(box):
    """Return the intersection-over-union of `box` with each row of `priors`.

    `box` is read as [x_min, y_min, x_max, y_max]; the first four columns of
    the module-level `priors` array are read the same way. The result holds
    one IoU value per prior.
    """
    # Overlap rectangle: the larger of the two top-left corners and the
    # smaller of the two bottom-right corners.
    top_left = np.maximum(priors[:, :2], box[:2])
    bottom_right = np.minimum(priors[:, 2:4], box[2:])
    # Disjoint boxes give a negative extent; clamp it so their area is zero.
    overlap_wh = np.maximum(bottom_right - top_left, 0)
    intersection = overlap_wh[:, 0] * overlap_wh[:, 1]

    gt_area = (box[2] - box[0]) * (box[3] - box[1])
    prior_area = (priors[:, 2] - priors[:, 0]) * (priors[:, 3] - priors[:, 1])
    # union = both areas minus the part counted twice
    return intersection / (prior_area + gt_area - intersection)

In [36]:
def encode_box(box, return_iou=True):
    """Encode one ground-truth box against every prior and flatten the result.

    A prior is matched when its IoU with `box` exceeds `overlap_threshold`;
    if no prior does, the single prior with the highest IoU is matched
    instead. Matched rows receive the center offset and the log size ratio
    between `box` and the prior, each divided by the corresponding entries of
    the prior's last four columns (its variances, per the prior layout noted
    in the config cell). Unmatched rows stay zero.

    Returns a 1-D array of length num_priors * (4 + return_iou); when
    `return_iou` is set, the last value of each row is the IoU of that prior.
    """
    overlaps = compute_iou(box)
    # one row per prior: four offsets, plus an IoU column when requested
    encoded = np.zeros((num_priors, 4 + return_iou))
    matched = overlaps > overlap_threshold
    # nothing clears the threshold: fall back to the best-overlapping prior
    if not matched.any():
        matched[overlaps.argmax()] = True
    if return_iou:
        encoded[matched, -1] = overlaps[matched]

    # geometry of the matched (positive) priors
    matched_priors = priors[matched]
    prior_min = matched_priors[:, :2]
    prior_max = matched_priors[:, 2:4]
    prior_center = 0.5 * (prior_min + prior_max)
    prior_wh = prior_max - prior_min

    # geometry of the ground-truth box: [x, y] center and [w, h] size
    gt_center = 0.5 * (box[:2] + box[2:])
    gt_wh = box[2:] - box[:2]

    # center offset, normalized by prior size and the first variance pair
    center_variance = matched_priors[:, -4:-2]
    encoded[matched, :2] = (gt_center - prior_center) / (prior_wh * center_variance)
    # log ratio of ground-truth size to prior size, over the second variance pair
    size_variance = matched_priors[:, -2:]
    encoded[matched, 2:4] = np.log(gt_wh / prior_wh) / size_variance
    return encoded.ravel()

In [47]:
def assign_boxes(boxes):
    """Build the per-prior training target from a set of ground-truth boxes.

    Each row of `boxes` holds four box coordinates followed by the class
    columns. Every box is encoded against all priors with `encode_box`; each
    prior is then given the box with which it has the highest IoU, provided
    that IoU is greater than zero.

    The returned array has shape (num_priors, 4 + num_classes + 8):
      - columns 0-3: encoded offsets of the assigned box
      - column 4: background flag (1 for unmatched priors, 0 for matched)
      - columns 5 to -8: class columns copied from the assigned box
      - column -8: set to 1 on matched priors (per the original notes, this
        marks whether the prior should be penalized in the loss function)
    The remaining trailing columns are left at zero. With no boxes, every
    prior is returned as background.
    """
    targets = np.zeros((num_priors, 4 + num_classes + 8))
    # every prior starts out as background
    targets[:, 4] = 1.0
    if len(boxes) == 0:
        return targets

    # one flattened encoding per ground-truth box, unpacked to
    # (num_boxes, num_priors, 4 offsets + iou)
    per_box = np.apply_along_axis(encode_box, 1, boxes[:, :4]).reshape(-1, num_priors, 5)
    iou_table = per_box[:, :, -1]

    # for each prior: which box overlaps it most, and whether it overlaps at all
    winner = iou_table.argmax(axis=0)
    matched = iou_table.max(axis=0) > 0
    matched_rows = np.arange(num_priors)[matched]
    matched_gt = winner[matched]

    targets[matched, :4] = per_box[matched_gt, matched_rows, :4]
    # mark as non-background
    targets[matched, 4] = 0
    # classes of the winning ground-truth box
    targets[matched, 5:-8] = boxes[matched_gt, 4:]
    targets[matched, -8] = 1
    return targets

In [49]:
# test box: one ground-truth row = 4 corner coordinates (divided by 300)
# followed by a 20-way one-hot class vector with class index 1 set
classes = np.zeros((20, 1))
classes[1, 0] = 1.0
boxes = np.concatenate([
    np.array([50, 30, 200, 230], dtype='float32') / 300.,
    classes.ravel(),
])[np.newaxis, :]

# a single box still yields one assignment row per prior
assign_boxes(boxes).shape

(7308, 33)