In [1]:
import torch
import torchvision
import torch.nn as nn

image = torch.zeros((1, 3, 800, 800)).float()
bbox = torch.FloatTensor([[20, 30, 400, 500], [300, 400, 500, 600]])
labels = torch.LongTensor([6, 8])
sub_sample = 16

dummy_img = torch.zeros((1, 3, 800, 800)).float()
print(dummy_img.shape)

torch.Size([1, 3, 800, 800])


In [2]:
model = torchvision.models.vgg16(pretrained=True)
print(model)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

In [3]:
fe = list(model.features)

In [4]:
req_features = []
k = dummy_img.clone()
for i in fe:
    k = i(k)
    if k.size()[2] < 800//16:
        break
    req_features.append(i)
    out_channels = k.size()[1]
    
print(len(req_features))
print(out_channels)

30
512


In [5]:
faster_rcnn_fe_extractor = nn.Sequential(*req_features)

In [6]:
print(faster_rcnn_fe_extractor)

Sequential(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (17): Conv2d(256, 512, kernel_si

In [7]:
out_map = faster_rcnn_fe_extractor(image)
print(out_map.shape)

torch.Size([1, 512, 50, 50])


In [8]:
import numpy as np

ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]

anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4))
print(anchor_base.shape)
print(anchor_base)

(9, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [9]:
ctr_x = sub_sample / 2
ctr_y = sub_sample / 2

print(ctr_x, ctr_y)

8.0 8.0


In [10]:
for i in range(len(ratios)):
    for j in range(len(anchor_scales)):
        h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
        w = sub_sample * anchor_scales[j] * np.sqrt(1./ratios[i])
        
        print(h, w)
        
        index = i * len(anchor_scales) + j
        
        anchor_base[index, 0] = ctr_y - h/2
        anchor_base[index, 1] = ctr_x - w/2
        anchor_base[index, 2] = ctr_y + h/2
        anchor_base[index, 3] = ctr_x + w/2

print(anchor_base)

90.50966799187809 181.01933598375618
181.01933598375618 362.03867196751236
362.03867196751236 724.0773439350247
128.0 128.0
256.0 256.0
512.0 512.0
181.01933598375618 90.50966799187809
362.03867196751236 181.01933598375618
724.0773439350247 362.03867196751236
[[ -37.254834    -82.50966799   53.254834     98.50966799]
 [ -82.50966799 -173.01933598   98.50966799  189.01933598]
 [-173.01933598 -354.03867197  189.01933598  370.03867197]
 [ -56.          -56.           72.           72.        ]
 [-120.         -120.          136.          136.        ]
 [-248.         -248.          264.          264.        ]
 [ -82.50966799  -37.254834     98.50966799   53.254834  ]
 [-173.01933598  -82.50966799  189.01933598   98.50966799]
 [-354.03867197 -173.01933598  370.03867197  189.01933598]]


In [11]:
fe_size = 800 // 16
print(fe_size)
ctr_x = np.arange(16, (fe_size+1) * 16, 16)
ctr_y = np.arange(16, (fe_size+1) * 16, 16)
print(ctr_y)
print(len(ctr_y))

50
[ 16  32  48  64  80  96 112 128 144 160 176 192 208 224 240 256 272 288
 304 320 336 352 368 384 400 416 432 448 464 480 496 512 528 544 560 576
 592 608 624 640 656 672 688 704 720 736 752 768 784 800]
50


In [12]:
ctr = np.zeros((2500, 2))

In [13]:
ctr.shape

(2500, 2)

In [14]:
index = 0
for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index, 1] = ctr_x[x] - 8
        ctr[index, 0] = ctr_y[y] - 8
        index +=1

In [15]:
anchors = np.zeros((fe_size * fe_size * 9, 4))
index = 0
for c in ctr:
    ctr_y, ctr_x = c
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1./ ratios[i])
            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1
print(anchors.shape)

(22500, 4)


In [16]:
index_inside = np.where(
(anchors[:, 0] >= 0) &
(anchors[:,1] >=0) &
(anchors[:, 2] <= 800) &
(anchors[:, 3] <= 800))[0]

print(index_inside.shape)

(8940,)


In [17]:
label = np.empty((len(index_inside), ), dtype = np.int32)
label.fill(-1)
print(label.shape)

(8940,)


In [18]:
valid_anchor_boxes = anchors[index_inside]
print(valid_anchor_boxes.shape)

(8940, 4)


In [19]:
ious = np.empty((len(valid_anchor_boxes), 2), dtype = np.float32)
ious.fill(0)
for num1, i in enumerate(valid_anchor_boxes):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)
        
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        
        if(inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)
        else:
            iou = 0
        
        ious[num1, num2] = iou

In [20]:
print(ious.shape)

(8940, 2)


## Highest IOU of bounding box-1 and bounding box-2

In [21]:
gt_argmax_ious = np.argmax(ious, axis = 0)
print(gt_argmax_ious)

gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
print(gt_max_ious)

[2262 5620]
[0.68130493 0.61035156]


## Finding maximum IOU in bounding box-1, box-2 for every 8940 instances

In [22]:
argmax_ious = ious.argmax(axis=1)
print(argmax_ious.shape)
print(argmax_ious)

max_ious = ious[np.arange(len(index_inside)), argmax_ious]
print(max_ious)

(8940,)
[0 0 0 ... 0 0 0]
[0.06811669 0.07083762 0.07083762 ... 0.         0.         0.        ]


In [23]:
gt_argmax_ious = np.where(ious == gt_max_ious)
print(gt_argmax_ious)

(array([2262, 2508, 5620, 5628, 5636, 5644, 5866, 5874, 5882, 5890, 6112,
       6120, 6128, 6136, 6358, 6366, 6374, 6382]), array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))


In [24]:
pos_iou_threshold = 0.7
neg_iou_threshold = 0.3

In [25]:
max_ious.shape

(8940,)

In [26]:
label[max_ious < neg_iou_threshold] = 0

In [27]:
for x in gt_argmax_ious[0]:
    label[x] = 1

In [28]:
label[max_ious >= pos_iou_threshold] = 1

In [29]:
print(label.shape)

(8940,)


In [30]:
pos_ratio = 0.5
n_sample = 256

In [31]:
n_pos = pos_ratio * n_sample

In [32]:
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size = (len(pos_index) - n_pos), replace = False)
    label[disable_index] = -1

In [33]:
n_neg = n_sample - np.sum(label == 1)
neg_index = np.where(label == 0)[0]
if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size = (len(neg_index) - n_neg), replace = False)
    label[disable_index] = -1

In [34]:
bbox

tensor([[ 20.,  30., 400., 500.],
        [300., 400., 500., 600.]])

In [35]:
max_iou_bbox = bbox[argmax_ious]

In [36]:
max_iou_bbox

tensor([[ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.],
        ...,
        [ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.],
        [ 20.,  30., 400., 500.]])

In [37]:
valid_anchor_boxes.shape

(8940, 4)

In [38]:
valid_anchor_boxes[0]

array([ 13.49033201,  10.745166  , 194.50966799, 101.254834  ])

In [39]:
height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width

base_height = (max_iou_bbox[:, 2] - max_iou_bbox[:, 0]).cpu().numpy()
base_width = (max_iou_bbox[:, 3] - max_iou_bbox[:, 1]).cpu().numpy()
base_ctr_y = max_iou_bbox[:, 0].cpu().numpy() + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1].cpu().numpy() + 0.5 * base_width

eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

In [40]:
anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(anchor_locs)

[[ 0.5855728   2.30914558  0.7415674   1.64727602]
 [ 0.49718446  2.30914558  0.7415674   1.64727602]
 [ 0.40879611  2.30914558  0.7415674   1.64727602]
 ...
 [-2.50801936 -5.29225232  0.7415674   1.64727602]
 [-2.59640771 -5.29225232  0.7415674   1.64727602]
 [-2.68479606 -5.29225232  0.7415674   1.64727602]]


In [41]:
anchor_labels = np.empty((len(anchors),), dtype = label.dtype)

In [42]:
anchor_labels.fill(-1)

In [43]:
anchor_labels[index_inside] = label

In [44]:
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype = anchor_locs.dtype)

In [45]:
mid_channels = 512
in_channels = 512
n_anchor = 9
conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor*4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor*2, 1, 1, 0)

In [46]:
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [47]:
out_map.shape

torch.Size([1, 512, 50, 50])

In [48]:
x = conv1(out_map)
pred_anchor_locs = reg_layer(x)
pred_cls_scores = cls_layer(x)

In [49]:
print(pred_cls_scores.shape, pred_anchor_locs.shape)

torch.Size([1, 18, 50, 50]) torch.Size([1, 36, 50, 50])


In [50]:
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)

torch.Size([1, 22500, 4])


In [51]:
pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores.shape)

torch.Size([1, 50, 50, 18])


In [52]:
objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)

torch.Size([1, 22500])


In [53]:
pred_cls_scores = pred_cls_scores.view(1, -1, 2)
pred_cls_scores.shape

torch.Size([1, 22500, 2])

In [54]:
nms_thresh = 0.7
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

In [55]:
anchors.shape

(22500, 4)

In [56]:
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

In [57]:
pred_anchor_locs[0].shape

torch.Size([22500, 4])

In [58]:
pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()
dy = pred_anchor_locs_numpy[:, 0::4]
dx = pred_anchor_locs_numpy[:, 1::4]
dh = pred_anchor_locs_numpy[:, 2::4]
dw = pred_anchor_locs_numpy[:, 3::4]
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

In [59]:
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=anchor_locs.dtype)
roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w

In [60]:
img_size = (800, 800) #Image size
roi[:, slice(0, 4, 2)] = np.clip(
            roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(
    roi[:, slice(1, 4, 2)], 0, img_size[1])
print(roi)

[[  0.           0.          55.55356459  97.95980537]
 [  0.           0.          93.62134298 201.07869439]
 [  0.           0.         191.03594569 361.40322306]
 ...
 [701.84662553 749.25349388 800.         800.        ]
 [616.12116492 704.74719399 800.         800.        ]
 [415.84887997 611.21031646 800.         800.        ]]


In [61]:

# %% remove small ones
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
scores = objectness_score_numpy[keep]
print(scores.shape)
print(roi.shape)

# %% sort and pick
ordered_scores = scores.ravel().argsort()[::-1]
print(ordered_scores)
ordered_scores = ordered_scores[:n_train_pre_nms]
roi = roi[ordered_scores, :]
print(roi.shape)
print(roi)

(22500,)
(22500, 4)
[461 447 875 ... 446 455  17]
(12000, 4)
[[  0.           0.         206.33910605 411.37331389]
 [693.80619277   0.         800.          55.66087923]
 [584.40189007   0.         800.         394.748768  ]
 ...
 [  0.         417.29678278 579.72181682 784.85981279]
 [149.47247598 401.29678278 800.         768.85981279]
 [133.47247598 401.29678278 800.         768.85981279]]


In [62]:
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)

order = ordered_scores.argsort()[::-1]
keep = []

while order.size > 0:
    i = order[0]
    keep.append(i)
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])
    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)
    inds = np.where(ovr <= nms_thresh)[0]
    order = order[inds + 1]
    
keep = keep[:n_train_post_nms]  # while training/testing , use accordingly
roi = roi[keep]  # the final region proposals for training

In [63]:
roi.shape

(1482, 4)

In [117]:
n_samples = 128
pos_ratio = 0.25
pos_iou_thresh = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

In [118]:
ious = np.empty((len(roi), 2), dtype = np.float32)
ious.fill(0)

for num1, i in enumerate(roi):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num1, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        bbox_area = (yb2 - yb1) * (xb2 - xb1)
        
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)            
        else:
            iou = 0
            
        ious[num1, num2] = iou
print(ious.shape)

(1482, 2)


In [119]:
gt_assignment = ious.argmax(axis=1)
max_ious = ious.max(axis=1)
print(gt_assignment)
print(max_ious)

gt_roi_label = labels[gt_assignment]
print(gt_roi_label)

[1 0 0 ... 0 0 0]
[1.0825591 0.        0.        ... 0.        0.        0.       ]
tensor([8, 6, 6,  ..., 6, 6, 6])


In [120]:
pos_roi_per_image = int(n_samples * pos_ratio)
pos_index = np.where(max_ious >= pos_iou_thresh)[0]
pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
if pos_index.size > 0:
    pos_index = np.random.choice(
        pos_index, size=pos_roi_per_this_image, replace=False)
print(pos_roi_per_this_image)
print(pos_index)

1
[0]


In [121]:
neg_index = np.where((max_ious < neg_iou_thresh_hi) &
                     (max_ious >= neg_iou_thresh_lo))[0]
neg_roi_per_this_image = n_sample - pos_roi_per_this_image
neg_roi_per_this_image = int(min(neg_roi_per_this_image,
                                 neg_index.size))
if neg_index.size > 0:
    neg_index = np.random.choice(
        neg_index, size=neg_roi_per_this_image, replace=False)
print(neg_roi_per_this_image)
print(neg_index)

124
[1083 1185  612  534 1045  794 1330  497   47  477  647  440  594 1274
  217 1354 1215  370  369 1338 1047 1363 1289  673 1062  768 1360  220
   79  840 1426  660 1411 1032  335  764  767 1380  178  416  831  568
  303  909 1057  324  244  971  684 1407  637  331  412  575 1447  674
 1395  921   99  266  325 1154   67 1080  959 1142  283  339  661  531
   17  528  775   34  640  163 1009  734  676  233  843  525  762  122
  782 1073 1242 1222 1042  250 1287  961  472  393  986 1359  666  541
  655  626  352  357  238  104  502 1068   27 1349  374  879  474 1480
   18  216  641  995  857 1283 1146  707 1390  675 1223  842]


In [122]:
keep_index = np.append(pos_index, neg_index)
gt_roi_labels = gt_roi_label[keep_index]
gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
sample_roi = roi[keep_index]
print(sample_roi.shape)

(125, 4)


In [130]:
bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]

In [132]:
print(bbox_for_sampled_roi.shape)

torch.Size([125, 4])


In [133]:
height = sample_roi[:, 2] - sample_roi[:, 0]
width = sample_roi[:, 3] - sample_roi[:, 1]
ctr_y = sample_roi[:, 0] + 0.5 * height
ctr_x = sample_roi[:, 1] + 0.5 * width

In [134]:
sample_roi.shape

(125, 4)

In [135]:
base_height = bbox_for_sampled_roi[:, 2] - bbox_for_sampled_roi[:, 0]
base_width = bbox_for_sampled_roi[:, 3] - bbox_for_sampled_roi[:, 1]
base_ctr_y = (bbox_for_sampled_roi[:, 0] + 0.5 * base_height).cpu().numpy()
base_ctr_x = (bbox_for_sampled_roi[:, 1] + 0.5 * base_width).cpu().numpy()

In [138]:
eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height.cpu().numpy() / height)
dw = np.log(base_width.cpu().numpy() / width)
gt_roi_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(gt_roi_locs.shape)

(125, 4)


In [142]:
sample_roi.shape

(125, 4)

In [143]:
rois = torch.from_numpy(sample_roi).float()

In [148]:
rois.size()

torch.Size([125, 4])

In [153]:
roi_indices = np.zeros((len(rois), ), dtype = np.int32)

In [154]:
roi_indices

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [155]:
roi_indices = torch.from_numpy(roi_indices).float()

In [156]:
print(roi_indices.shape, rois.shape)

torch.Size([125]) torch.Size([125, 4])


In [164]:
indices_and_roi = torch.cat([roi_indices[:, None], rois], dim = 1)
print(indices_and_roi.shape)

torch.Size([125, 5])


In [166]:
xy_indices_and_rois = indices_and_roi[:, [0, 2, 1, 4, 3]]
indices_and_roi = xy_indices_and_rois.contiguous()
print(xy_indices_and_rois.shape)

torch.Size([125, 5])


In [168]:
xy_indices_and_rois[1]

tensor([  0.0000, 154.5457, 109.9299, 245.1085, 290.5577])

In [169]:
import torch.nn as nn

In [187]:
size = 7  # max pool 7x7
adaptive_max_pool = nn.AdaptiveMaxPool2d(size)
output = []
rois = indices_and_roi.data.float()
rois[:, 1:].mul_(1 / 16.0)  # Subsampling ratio skipping the index
rois = rois.long()
num_rois = rois.size(0)

In [188]:
num_rois

125

In [218]:
output = []
for roi in rois:
    roi_feature = out_map[..., roi[0]:roi[2]+1, roi[1]:roi[3]+1]
    output.append(adaptive_max_pool(roi_feature))

In [219]:
output = torch.cat(output, 0)
print(output.shape)

torch.Size([125, 512, 7, 7])


In [220]:
k = output.view(output.size(0), -1)

In [221]:
k.size()

torch.Size([125, 25088])

In [222]:
roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096), nn.Linear(4096, 4096)])
cls_loc = nn.Linear(4096, 21*4)

cls_loc.weight.data.normal_(0, 0.01)
cls_loc.bias.data.zero_()

score = nn.Linear(4096, 21)

In [223]:
k.shape

torch.Size([125, 25088])

In [224]:
k = roi_head_classifier(k)

In [225]:
roi_cls_loc = cls_loc(k)
roi_cls_score = score(k)

In [226]:
print(roi_cls_loc.shape, roi_cls_score.shape)

torch.Size([125, 84]) torch.Size([125, 21])


In [227]:
print(pred_anchor_locs.shape)
print(pred_cls_scores.shape)
print(anchor_locations.shape)
print(anchor_labels.shape)

torch.Size([1, 22500, 4])
torch.Size([1, 22500, 2])
(22500, 4)
(22500,)


In [237]:
rpn_loc = pred_anchor_locs[0]
rpn_score = pred_cls_scores[0]

gt_rpn_loc = torch.from_numpy(anchor_locations)
gt_rpn_score = torch.from_numpy(anchor_labels)

print(rpn_loc.shape, rpn_score.shape, gt_rpn_loc.shape, gt_rpn_score.shape)

torch.Size([22500, 4]) torch.Size([22500, 2]) torch.Size([22500, 4]) torch.Size([22500])


In [238]:
import torch.nn.functional as F
rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_score.long(), ignore_index=-1)

In [239]:
print(rpn_cls_loss)

tensor(0.6914, grad_fn=<NllLossBackward>)


In [240]:
pos = gt_rpn_score > 0

In [241]:
pos.shape

torch.Size([22500])

In [247]:
mask = pos.unsqueeze(1).expand_as(rpn_loc)
print(mask.shape)

torch.Size([22500, 4])


In [248]:
mask_loc_preds = rpn_loc[mask].view(-1, 4)
mask_loc_targets = gt_rpn_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_preds.shape)

torch.Size([18, 4]) torch.Size([18, 4])


In [255]:
mask_loc_preds = mask_loc_preds.float()
mask_loc_targets = mask_loc_targets.float()

In [256]:
x = torch.abs(mask_loc_targets - mask_loc_preds)

In [257]:
rpn_loc_loss = ((x < 1).float() * 0.5 * x**2) + ((x >= 1).float() * (x-0.5))
print(rpn_loc_loss.sum())

tensor(0.0004, grad_fn=<SumBackward0>)


In [258]:
rpn_lambda = 10
N_reg = (gt_rpn_score > 0).float().sum()

In [260]:
rpn_loc_loss = rpn_loc_loss.sum() / N_reg

In [261]:
rpn_loc_loss

tensor(1.9661e-05, grad_fn=<DivBackward0>)

In [262]:
rpn_loss = rpn_cls_loss + (rpn_lambda * rpn_loc_loss)
print(rpn_loss)

tensor(0.6916, grad_fn=<AddBackward0>)


In [263]:
print(roi_cls_loc.shape)
print(roi_cls_score.shape)

torch.Size([125, 84])
torch.Size([125, 21])


In [264]:
print(gt_roi_locs.shape)
print(gt_roi_labels.shape)

(125, 4)
torch.Size([125])


In [265]:
gt_roi_loc = torch.from_numpy(gt_roi_locs)
gt_roi_label = torch.from_numpy(np.float32(gt_roi_labels)).long()
print(gt_roi_loc.shape, gt_roi_label.shape)

torch.Size([125, 4]) torch.Size([125])


In [270]:
roi_cls_loss = F.cross_entropy(roi_cls_score, gt_roi_label, ignore_index=-1)
print(roi_cls_loss)

tensor(3.0490, grad_fn=<NllLossBackward>)


In [271]:
n_sample = roi_cls_loc.shape[0]
roi_loc = roi_cls_loc.view(n_sample, -1, 4)
print(roi_loc.shape)

torch.Size([125, 21, 4])


In [272]:
print(n_sample)

125


In [273]:
roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]
print(roi_loc.shape)

torch.Size([125, 4])


In [275]:
gt_roi_loc = gt_roi_loc.float()
roi_loc = roi_loc.float()

In [276]:
x_roi = torch.abs(gt_roi_loc - roi_loc)
roi_loc_loss = ((x_roi < 1).float() * 0.5 * x_roi ** 2) + ((x_roi >= 1).float() * (x_roi - 0.5))
print(roi_loc_loss.sum())

tensor(437.5732, grad_fn=<SumBackward0>)


In [280]:
roi_lambda = 10.
N_reg_roi = (gt_rpn_score > 0).float().sum()
roi_loc_loss = roi_loc_loss.sum() / N_reg_roi
roi_loss = roi_cls_loss + (roi_lambda * roi_loc_loss)
print(roi_loss)

tensor(246.1453, grad_fn=<AddBackward0>)


In [281]:
total_loss = rpn_loss + roi_loss
print(total_loss)

tensor(246.8369, grad_fn=<AddBackward0>)
