**Feature Extraction**

We begin with an image and a set of bounding boxes along with its label as defined below.

In [5]:
import torch
image = torch.zeros((1, 3, 800, 800)).float()

bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]]) # [y1, x1, y2, x2] format
labels = torch.LongTensor([6, 8]) # 0 represents background
sub_sample = 16

**Generate anchor boxes**

Create anchor boxes at all the locations in the feature map.

In [6]:
import numpy as np
ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), dtype=np.float32)

print(anchor_base)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [7]:
anchor_base.shape

(9, 4)

In [8]:
ctr_y = sub_sample /2.
ctr_x = sub_sample /2.

print(ctr_y, ctr_x)

8.0 8.0


In [9]:
for i in range(len(ratios)):
    for j in range(len(anchor_scales)):
        h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
        w = sub_sample * anchor_scales[j] * np.sqrt(1./ ratios[i])

        index = i * len(anchor_scales) + j

        anchor_base[index, 0] = ctr_y - h / 2.
        anchor_base[index, 1] = ctr_x - w / 2.
        anchor_base[index, 2] = ctr_y + h / 2.
        anchor_base[index, 3] = ctr_x + w / 2.

In [10]:
anchor_base

array([[ -37.254833,  -82.50967 ,   53.254833,   98.50967 ],
       [ -82.50967 , -173.01933 ,   98.50967 ,  189.01933 ],
       [-173.01933 , -354.03867 ,  189.01933 ,  370.03867 ],
       [ -56.      ,  -56.      ,   72.      ,   72.      ],
       [-120.      , -120.      ,  136.      ,  136.      ],
       [-248.      , -248.      ,  264.      ,  264.      ],
       [ -82.50967 ,  -37.254833,   98.50967 ,   53.254833],
       [-173.01933 ,  -82.50967 ,  189.01933 ,   98.50967 ],
       [-354.03867 , -173.01933 ,  370.03867 ,  189.01933 ]],
      dtype=float32)

In [11]:
fe_size = (800//16)
ctr_x = np.arange(16, (fe_size+1) * 16, 16)
ctr_y = np.arange(16, (fe_size+1) * 16, 16)

In [14]:
ctr = np.empty((len(ctr_x) * len(ctr_y), 2))

In [15]:
index = 0
for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index, 1] = ctr_x[x] - 8
        ctr[index, 0] = ctr_y[y] - 8
        index +=1

In [17]:
anchors = np.zeros(((fe_size * fe_size * 9), 4))
index = 0
for c in ctr:
    ctr_y, ctr_x = c
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1./ ratios[i])
            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1
print(anchors.shape)

(22500, 4)


**Assign the labels and the location of the object with respect to the anchor**

In [18]:
bbox = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32) # [y1, x1, y2, x2] format
labels = np.asarray([6, 8], dtype=np.int8)

Indices of the valid anchor boxes

In [19]:
index_inside = np.where(
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= 800) &
        (anchors[:, 3] <= 800)
    )[0]
print(index_inside.shape)

(8940,)


Empty label with all fill -1

In [21]:
label = np.empty((len(index_inside), ), dtype=np.int32)
label.fill(-1)
print(label.shape)

(8940,)


Array of valid anchor boxes

In [23]:
valid_anchor_boxes = anchors[index_inside]
print(valid_anchor_boxes.shape)

(8940, 4)


For every box calculate the iou with the every ground truth object. 

- Find the max of x1 and y1 in both the boxes (xn1, yn1)
- Find the min of x2 and y2 in both the boxes (xn2, yn2)
- Now both the boxes are intersecting only
 if (xn1 < xn2) and (yn2 < yn1)
      - iou_area will be (xn2 - xn1) * (yn2 - yn1)
 else
      - iuo_area will be 0
- similarly calculate area for anchor box and ground truth object
- iou = iou_area/(anchor_box_area + ground_truth_area - iou_area)


In [25]:
ious = np.empty((len(valid_anchor_boxes), 2), dtype=np.float32)
ious.fill(0)
print(bbox)
for num1, i in enumerate(valid_anchor_boxes):
    ya1, xa1, ya2, xa2 = i  
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2- yb1) * (xb2 - xb1)
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * \
(inter_x2 - inter_x1)
            iou = iter_area / \
(anchor_area+ box_area - iter_area)            
        else:
            iou = 0.
        ious[num1, num2] = iou
print(ious.shape)

[[ 20.  30. 400. 500.]
 [300. 400. 500. 600.]]
(8940, 2)


Considering the scenarios of a and b, we need to find two things here
- the highest iou for each gt_box and its corresponding anchor box
- the highest iou for each anchor box and its corresponding ground truth box

**case 1:** the highest iou for every ground truth box and the corresponding anchor box.

In [26]:
gt_argmax_ious = ious.argmax(axis=0)
print(gt_argmax_ious)
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
print(gt_max_ious)

[2262 5620]
[0.68130493 0.61035156]


**case 2:** the highest iou for every anchor box and the corresponding ground truth box.

In [28]:
argmax_ious = ious.argmax(axis=1)
print(argmax_ious.shape)
print(argmax_ious)
max_ious = ious[np.arange(len(index_inside)), argmax_ious]
print(max_ious)

(8940,)
[0 0 0 ... 0 0 0]
[0.06811669 0.07083762 0.07083762 ... 0.         0.         0.        ]


Now we have three arrays
- argmax_ious — Tells which ground truth object has max iou with each anchor.
- max_ious — Tells the max_iou with ground truth object with each anchor.
- gt_argmax_ious — Tells the anchors with the highest Intersection-over-Union (IoU) overlap with a ground-truth box.

Lets put thresholds to some variables

In [29]:
pos_iou_threshold  = 0.7
neg_iou_threshold = 0.3

case 1: Assign a negative class to the anchor box which has a max iou as less than 0.3 for that class.

In [30]:
label[max_ious < neg_iou_threshold] = 0

case 2: Assign a positive class to the anchor boxes that has the highest ious for the ground truth objects.

In [31]:
label[gt_argmax_ious] = 1

case 3: Assign positive class to the anchor boxes that have iou values as greater than positive threshold as 0.7

In [32]:
label[max_ious > pos_iou_threshold] = 1

**Training RPN** 

The Faster_R-CNN paper phrases as follows 

*Each mini-batch arises from a single image that contains many positive and negitive example anchors, but this will bias towards negitive samples as they are dominate. Instead, we randomly sample 256 anchors in an image to compute the loss function of a mini-batch, where the sampled positive and negative anchors have a ratio of up to 1:1. If there are fewer than 128 positive samples in an image, we pad the mini-batch with negitive ones..*

From this we can derive two variable as follows:

In [33]:
pos_ratio = 0.5
n_sample = 256

In [34]:
n_pos = pos_ratio * n_sample

- postive boxes sampling

In [35]:
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
    label[disable_index] = -1

- negative sampling

In [36]:
n_neg = n_sample * np.sum(label == 1)
neg_index = np.where(label == 0)[0]
if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace = False)
    label[disable_index] = -1

**Assigning locations to anchor boxes**

Now lets assign the locations to each anchor box with the ground truth object which has maximum iou. Note, we will assign anchor locs to all the valid anchor boxes irrespective of its label, later when we are calculating the losses, we can remove them with simple filters.

We already know which ground truth object has high iou with each anchor box, Now we need to find the locations of ground truth with respect to the anchor box location. Faster_R-CNN uses the following parametrizion for this.

$$t_{x} = (x - x_{a})/w_{a}$$
$$t_{y} = (y - y_{a})/h_{a}$$
$$t_{w} = log(w/ w_a)$$
$$t_{h} = log(h/ h_a)$$

For each anchor box, find the groundtruth object which has the maximum iou.

In [37]:
max_iou_bbox = bbox[argmax_ious]
print(max_iou_bbox)

[[ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 ...
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]
 [ 20.  30. 400. 500.]]


- Inorder to find t_{x}, t_{y}, t_{w}, t_{h}, we need to convert the y1, x1, y2, x2 format of valid anchor boxes and associated ground truth boxes with max iou to ctr_y, ctr_x , h, w format.

In [39]:
height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width

base_height = max_iou_bbox[:, 2] - max_iou_bbox[:, 0]
base_width = max_iou_bbox[:, 3] - max_iou_bbox[:, 1]
base_ctr_y = max_iou_bbox[:, 0] + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1] + 0.5 * base_width

use the formulaes to find the locs

In [40]:
eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)
anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(anchor_locs)


[[ 0.5855728   2.30914558  0.7415674   1.64727602]
 [ 0.49718446  2.30914558  0.7415674   1.64727602]
 [ 0.40879611  2.30914558  0.7415674   1.64727602]
 ...
 [-2.50801936 -5.29225232  0.7415674   1.64727602]
 [-2.59640771 -5.29225232  0.7415674   1.64727602]
 [-2.68479606 -5.29225232  0.7415674   1.64727602]]


In [45]:
anchor_locs.shape

(8940, 4)

- final labels

In [48]:
anchor_labels = np.empty((len(anchors),), dtype=label.dtype)
anchor_labels.fill(-1)
anchor_labels[index_inside] = label

- final locations

In [52]:
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[index_inside, :] = anchor_locs

The final two matrices are
- anchor_locations [N, 4] — [22500, 4]
- anchor_labels [N,] — [22500]

These are used as targets to the RPN network. We will see how this RPN network is designed in the next section.

**Region Proposal Network**

In [53]:
import torch.nn as nn
mid_channels = 512
in_channels = 512 # depends on the output feature map. in vgg 16 it is equal to 512
n_anchor = 9 # Number of anchors at each location
conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor *4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor *2, 1, 1, 0) 
## I will be going to use softmax here. you can equally use sigmoid if u replace 2 with 1

The paper tells that they initialized these layers with zero mean and 0.01 standard deviation for weights and zeros for base. Lets do that

In [54]:
# conv sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()
# Regression layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()
# classification layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

Now the outputs we got in the feature extraction state should be sent to this network to predict locations of objects with repect to the anchor and the objectness score assoiciated with it.

In [102]:
out_map = torch.rand(1, 512, 50, 50)

In [103]:
x = conv1(out_map) # out_map is obtained in section 1
pred_anchor_locs = reg_layer(x)
pred_cls_scores = cls_layer(x)
print(pred_cls_scores.shape, pred_anchor_locs.shape)


torch.Size([1, 18, 50, 50]) torch.Size([1, 36, 50, 50])


In [104]:
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)
#Out: torch.Size([1, 22500, 4])
pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores)
#Out torch.Size([1, 50, 50, 18])
objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)
#Out torch.Size([1, 22500])
pred_cls_scores  = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)
# Out torch.size([1, 22500, 2])

torch.Size([1, 22500, 4])
tensor([[[[-9.5024e-02,  2.6759e-02, -3.9740e-02,  ...,  9.7388e-02,
           -1.8628e-02,  1.6635e-02],
          [-4.4591e-02,  1.9843e-02,  8.6184e-02,  ...,  4.7052e-02,
            5.3647e-02, -6.3169e-02],
          [-1.0754e-02,  6.6458e-02,  7.8701e-02,  ...,  5.9399e-02,
           -6.6282e-03, -6.6840e-02],
          ...,
          [-1.7168e-02,  7.0088e-02,  9.8154e-02,  ...,  2.7229e-03,
           -2.5998e-02, -2.3750e-02],
          [ 1.3076e-02,  1.2077e-01,  4.2411e-02,  ..., -2.2580e-02,
           -4.0996e-02, -5.0846e-02],
          [-4.6171e-03, -6.5945e-02,  4.9031e-02,  ..., -3.8430e-02,
           -1.6058e-02, -5.7551e-02]],

         [[-5.4386e-02,  2.0342e-02,  3.5286e-02,  ...,  7.9171e-02,
           -8.2211e-03, -6.5353e-02],
          [ 1.2870e-01,  3.8972e-02, -5.9430e-02,  ...,  3.3565e-02,
            3.0402e-02, -6.4990e-02],
          [-3.9643e-02,  6.9098e-02,  1.3554e-03,  ..., -5.3457e-02,
            7.3035e-03, -1.5339e

**Generating region proposals from the RPN to feed to the Fast RCNN**

This is the non-max suppression step, where we choose the best non-overlapping anchor boxes.

In [105]:
nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

1. convert the loc predictions from the rpn network to bbox [y1, x1, y2, x2] format.

This is the reverse operations of what we have done while assigning ground truth to anchor boxes .This operation decodes predictions by un-parameterizing them and offseting to image. the formulas are as follows
$$x = (w_{a} * ctrx_{p}) + ctrx_{a}$$
$$y = (h_{a} * ctrx_{p}) + ctrx_{a}$$
$$h = np.exp(h_{p}) * h_{a}$$
$$w = np.exp(w_{p}) * w_{a}$$
and later convert to y1, x1, y2, x2 format

convert all the anchors possible from the feature map, from y1, x1, y2, x2 into ctr_x, ctr_y, h, w format.

In [106]:
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

2. Convert predictions locs using above formulas. before that convert the pred_anchor_locs and objectness_score to numpy array


In [107]:
pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()

In [108]:
pred_anchor_locs_numpy.shape

(22500, 4)

pred_anchor_locs has x, y, h, w normalized by anchor box. 

In [109]:
dy = pred_anchor_locs_numpy[:, 0::4]
dx = pred_anchor_locs_numpy[:, 1::4]
dh = pred_anchor_locs_numpy[:, 2::4]
dw = pred_anchor_locs_numpy[:, 3::4]

convert this normalized pred_anchor_locs back into real coords.

In [110]:
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

convert [ctr_x, ctr_y, h, w] to [y1, x1, y2, x2] format

In [111]:
roi = np.zeros(pred_anchor_locs_numpy.shape)
roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w

In [112]:
roi.shape

(22500, 4)

In [113]:
roi

array([[ -28.92410141,  -89.62584035,   51.93897971,  101.67660377],
       [ -94.47754683, -177.5039333 ,   82.77957668,  165.72836979],
       [-167.19772599, -437.67890807,  204.49474029,  381.83549529],
       ...,
       [ 709.55375141,  757.09631736,  879.15302588,  846.27818025],
       [ 597.94354323,  700.97313758,  983.30521569,  869.78544069],
       [ 417.03526588,  625.57564561, 1103.40453522,  979.89805367]])

3. clip the predicted boxes to the image,

In [114]:
img_size = (800, 800) #Image size
roi[:, slice(0, 4, 2)] = np.clip(
            roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(
    roi[:, slice(1, 4, 2)], 0, img_size[1])
print(roi)

[[  0.           0.          51.93897971 101.67660377]
 [  0.           0.          82.77957668 165.72836979]
 [  0.           0.         204.49474029 381.83549529]
 ...
 [709.55375141 757.09631736 800.         800.        ]
 [597.94354323 700.97313758 800.         800.        ]
 [417.03526588 625.57564561 800.         800.        ]]


Remove predicted boxes with either height or width < threshold.

In [115]:
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
score = objectness_score_numpy[keep]
print(score.shape)

(22500,)


now all the chosen boxes have the min size of 16.

In [117]:
order = score.ravel().argsort()[::-1]
print(order)

[14079  3342 12474 ... 10441 21412 13168]


Take top pre_nms_topN (e.g. 12000 while training and 300 while testing)

In [119]:
order = order[:n_train_pre_nms]
#roi = roi[order, :]
print(roi.shape)
print(roi)

(22500, 4)
[[  0.           0.          51.93897971 101.67660377]
 [  0.           0.          82.77957668 165.72836979]
 [  0.           0.         204.49474029 381.83549529]
 ...
 [709.55375141 757.09631736 800.         800.        ]
 [597.94354323 700.97313758 800.         800.        ]
 [417.03526588 625.57564561 800.         800.        ]]


**Non-max suppression**

Pseudo Code

- Take all the roi boxes [roi_array]
- Find the areas of all the boxes [roi_area]
- Take the indexes of order the probability score in descending order [order_array]
keep = []
while order_array.size > 0:
  - take the first element in order_array and append that to keep  
  - Find the area with all other boxes
  - Find the index of all the boxes which have high overlap with this box
  - Remove them from order array
  - Iterate this till we get the order_size to zero (while loop)
- Ouput the keep variable which tells what indexes to consider.

In [121]:
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]
area = (x2 - x1 + 1) * (y2 - y1 + 1)
keep = []

In [122]:
order.shape

(22500,)

In [123]:
order

array([14079,  3342, 12474, ..., 10441, 21412, 13168])

In [125]:
while order.size > 0:
    i = order[0]
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (area[i] + area[order[1:]] - inter)
    inds = np.where(ovr <= thresh)[0]
    order = order[inds + 1]
keep = keep[:n_train_post_nms] # while training/testing , use accordingly
roi = roi[keep] # the final region proposals

NameError: name 'thresh' is not defined