# Single Shot Multibox Detection(SSD)
배웠던 것을 활용하여 object detection model SSD를 만들어보자.  
SSD는 널리 이용되고 있으며 다른 object detection model에도 적용 가능하다.  
읽어볼 것 : :cite:`Liu.Anguelov.Erhan.ea.2016`

### Model
SSD 모델의 주 요소는 base network와 연속적으로 연결된 다규모의 feature block이다.  
Base network 블록은 원본 이미지의 feature를 추출하는 데 사용되며, 일반적으로 deep convolutional neural network의 형태를 갖는다. SSD 논문에서는 classification layer 앞부분까지 잘라낸 VGG를 base network로 사용했지만, 최근에는 ResNet으로 대체하는 경우가 많다.

### Category Prediction Layer
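Object category 개수가 $q$면 anchor box category 개수는 $q+1$이 된다. Category 0은 background만을 포함하는 anchor box를 의미한다. Feature map의 높이와 너비가 $h$, $w$이고 각 위치를 중심으로 $a$개의 anchor box를 생성하면 총 $hwa$개의 anchor box를 분류해야 한다. 이때 output에 fully connected layer를 사용하면 parameter 수가 지나치게 많아지므로, SSD는 convolutional layer의 channel로 category를 예측하여 model complexity를 줄인다.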

Category prediction layer에서는 input의 높이와 너비를 유지하는 convolutional layer를 사용한다. 즉 output과 input feature map의 공간 좌표가 일대일로 대응한다.
위치당 anchor box 개수 $a$와 category 개수 $q$가 주어지면, output channel이 $a(q+1)$개이고 padding이 1인 $3\times3$ convolutional layer를 사용한다. 따라서 input과 output의 해상도는 변하지 않는다.

In [28]:
%matplotlib inline
import sys
sys.path.insert(0, '..')
import d2l
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import json
import time
from tqdm import tqdm
from PIL import Image
def cls_predictor(input_channels, num_anchors, num_classes):
    """Build the category prediction layer for one feature map.

    The layer is a 3x3 convolution with padding 1, so the output keeps the
    input's height and width. For every spatial position it emits
    ``num_classes + 1`` scores for each of the ``num_anchors`` anchor boxes.

    Args:
        input_channels: Channel count of the incoming feature map.
        num_anchors: Anchor boxes generated per spatial position.
        num_classes: Number of object classes; one extra output per anchor
            is added on top of this.

    Returns:
        An ``nn.Conv2d`` with ``num_anchors * (num_classes + 1)`` output
        channels.
    """
    scores_per_position = num_anchors * (num_classes + 1)
    return nn.Conv2d(input_channels, scores_per_position, kernel_size=3, padding=1)

### Bounding Box Prediction Layer
Bounding box prediction layer 설계는 category prediction layer의 그것과 유사하다. 차이점은 anchor box 당 $q+1$이 아닌 4개의 offset이 필요하다는 점이다.

In [29]:
def bbox_predictor(input_channels, num_anchors):
    """Build the bounding-box prediction layer for one feature map.

    The layer is a 3x3 convolution with padding 1, so the output keeps the
    input's height and width. It emits four values per anchor box at every
    spatial position.

    Args:
        input_channels: Channel count of the incoming feature map.
        num_anchors: Anchor boxes generated per spatial position.

    Returns:
        An ``nn.Conv2d`` with ``num_anchors * 4`` output channels.
    """
    offsets_per_position = num_anchors * 4
    return nn.Conv2d(input_channels, offsets_per_position, kernel_size=3, padding=1)

### Concatenating Predictions for Multiple Scales


In [30]:
def forward(x, block):
    """Call ``block`` on ``x`` and return whatever it produces."""
    output = block(x)
    return output
# Class-prediction outputs for two feature maps of different scale: a 20x20
# map (8 channels, 5 anchors per position) and a 10x10 map (16 channels,
# 3 anchors per position), both with 10 object classes.
Y1 = forward(torch.zeros((2, 8, 20, 20)), cls_predictor(8, 5, 10))
Y2 = forward(torch.zeros((2, 16, 10, 10)), cls_predictor(16, 3, 10))
# Only the batch dimension agrees; channel count and spatial size differ.
(Y1.shape, Y2.shape)

(torch.Size([2, 55, 20, 20]), torch.Size([2, 33, 10, 10]))

In [31]:
def flatten_pred(pred):
    """Flatten a 4-D prediction map into one row per example.

    Axis 1 is moved to the last position before flattening, so the values
    that belong to one spatial position end up next to each other in the row.

    Args:
        pred: A 4-D tensor; axis 0 is kept as the row axis.

    Returns:
        A 2-D tensor with ``pred.size(0)`` rows.
    """
    batch_size = pred.size(0)
    channels_last = pred.permute(0, 2, 3, 1)
    return channels_last.reshape(batch_size, -1)

def concat_preds(preds):
    """Flatten each prediction map and join the results along dim 1.

    Args:
        preds: Iterable of prediction tensors accepted by ``flatten_pred``.

    Returns:
        One 2-D tensor holding every flattened prediction side by side.
    """
    flattened = tuple(map(flatten_pred, preds))
    return torch.cat(flattened, dim=1)

In [32]:
# Per example: 55*20*20 + 33*10*10 = 25300 values after flattening and joining.
concat_preds([Y1, Y2]).shape

torch.Size([2, 25300])

### Height and Width Downsample Block

In [33]:
def down_sample_blk(input_channels, num_channels):
    """Build a block that halves the height and width of its input.

    The block is two (3x3 conv with padding 1 -> batch norm -> ReLU) groups
    followed by a 2x2 max-pool with stride 2. Only the pooling changes the
    spatial size.

    Args:
        input_channels: Channel count of the incoming feature map.
        num_channels: Channel count produced by both convolutions.

    Returns:
        An ``nn.Sequential`` holding the seven layers in order.
    """
    layers = []
    # The first conv maps input_channels -> num_channels; the second keeps num_channels.
    for in_ch in (input_channels, num_channels):
        layers += [
            nn.Conv2d(in_ch, num_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(num_channels),
            nn.ReLU(),
        ]
    layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*layers)

In [34]:
# The convolutions keep 20x20 and set 10 channels; the final max-pool halves it to 10x10.
forward(torch.zeros((2, 3, 20, 20)), down_sample_blk(3, 10)).shape

torch.Size([2, 10, 10, 10])

### Base Network Block

In [35]:
def base_net():
    """Build the base feature extractor.

    Three down-sampling blocks are chained with channel widths
    3 -> 16 -> 32 -> 64, so the spatial size is halved three times.

    Returns:
        An ``nn.Sequential`` of the three blocks.
    """
    channels = (3, 16, 32, 64)
    stages = [
        down_sample_blk(c_in, c_out)
        for c_in, c_out in zip(channels, channels[1:])
    ]
    return nn.Sequential(*stages)

# Three halvings take a 256x256 input down to 32x32, with 64 output channels.
forward(torch.zeros((2, 3, 256, 256)), base_net()).shape

torch.Size([2, 64, 32, 32])

### The Complete Model

In [36]:
def get_blk(i):
    """Return the feature block for stage ``i`` of the detector.

    Stage 0 is the base network, stage 1 widens 64 -> 128 channels while
    down-sampling, stage 4 pools the map to 1x1, and every other index gets
    a 128 -> 128 down-sampling block.
    """
    if i == 0:
        return base_net()
    if i == 1:
        return down_sample_blk(64, 128)
    if i == 4:
        # Global max-pool: collapses whatever spatial size is left to 1x1.
        return nn.AdaptiveMaxPool2d((1, 1))
    return down_sample_blk(128, 128)

In [43]:
import itertools
import math
def create_anchors(feature_map_sizes, steps, sizes, aspect_ratios=None):
    """Generate anchor boxes for one or more square feature maps.

    ``steps`` and ``sizes`` are divided by 256 before use, so they are
    expected in the pixel units of a 256-pixel-wide image and the returned
    coordinates are fractions of that image.

    For every cell of feature map ``i`` the following boxes are emitted, in
    this order: a square of side ``sizes[i]``, a square of side
    ``sizes[i + 1]``, and for each aspect ratio ``ar`` two boxes of base size
    ``sizes[i]`` with sides ``(s * sqrt(ar), s / sqrt(ar))`` and the swapped
    pair.

    Args:
        feature_map_sizes: Side length (in cells) of each feature map.
        steps: Cell stride of each feature map, in pixels.
        sizes: Anchor sizes in pixels; needs one more entry than there are
            feature maps because layer ``i`` also reads ``sizes[i + 1]``.
        aspect_ratios: Optional per-layer sequences of aspect ratios. When
            omitted, every layer uses ``(2,)``. This matches the previous
            hard-coded value, which only covered a single layer and raised
            ``IndexError`` for a second feature map.

    Returns:
        A float tensor with one row per anchor; columns are presumably
        (center x, center y, width, height) - confirm against the consumer.
    """
    scale = 256.
    steps = [s / scale for s in steps]
    sizes = [s / scale for s in sizes]

    num_layers = len(feature_map_sizes)
    if aspect_ratios is None:
        aspect_ratios = ((2,),) * num_layers

    boxes = []
    for i, fmsize in enumerate(feature_map_sizes):
        step = steps[i]
        small, large = sizes[i], sizes[i + 1]
        for h, w in itertools.product(range(fmsize), repeat=2):
            # Box centers sit in the middle of each feature-map cell.
            cx = (w + 0.5) * step
            cy = (h + 0.5) * step

            boxes.append((cx, cy, small, small))
            boxes.append((cx, cy, large, large))

            for ar in aspect_ratios[i]:
                root = math.sqrt(ar)
                boxes.append((cx, cy, small * root, small / root))
                boxes.append((cx, cy, small / root, small * root))

    return torch.Tensor(boxes)

In [44]:
def blk_forward(X, blk, size, ratio, cls_predictor, bbox_predictor):
    """Run one detector stage and produce its anchors and predictions.

    Args:
        X: Input feature map for this stage.
        blk: Feature block applied to ``X``.
        size: Anchor sizes forwarded to ``create_anchors`` for this stage.
        ratio: Accepted but not read by this function.
        cls_predictor: Layer producing class predictions from the stage output.
        bbox_predictor: Layer producing box predictions from the stage output.

    Returns:
        Tuple ``(Y, anchors, cls_preds, bbox_preds)`` where ``Y`` is the
        output of ``blk``.
    """
    feature_map = blk(X)
    # Anchors are laid out on a square grid whose side is taken from dim 2;
    # the stride is expressed relative to a 256-pixel image.
    side = feature_map.size(2)
    anchors = create_anchors((side,), (256 / side,), size)
    return (
        feature_map,
        anchors,
        cls_predictor(feature_map),
        bbox_predictor(feature_map),
    )

In [45]:
# Per-stage anchor sizes, written as a fraction of the image side times 256.
# Each stage has a smaller and a larger size.
sizes = [
    [0.2 * 256, 0.272 * 256],
    [0.37 * 256, 0.446 * 256],
    [0.54 * 256, 0.619 * 256],
    [0.71 * 256, 0.79 * 256],
    [0.88 * 256, 0.961 * 256],
]
# The same ratio list object is repeated for all five stages.
ratios = [[1, 2, 0.5]] * 5
# Anchors per position: every size at the first ratio, plus the first size
# at each remaining ratio.
num_anchors = len(sizes[0]) + len(ratios[0]) - 1

In [46]:
class TinySSD(nn.Module):
    """Five-stage single-shot detector built from ``get_blk`` stages.

    Stage ``i`` owns a feature block ``blk_i``, a class predictor ``cls_i``
    and a box predictor ``bbox_i``. These attribute names, and the order in
    which the sub-modules are created, are kept stable so parameter names
    and ordering do not change.

    Anchor sizes, ratios and the anchors-per-position count are read from the
    module-level ``sizes``, ``ratios`` and ``num_anchors``.

    Args:
        input_channels: Accepted for interface compatibility; not read.
        num_classes: Number of object classes; each anchor gets
            ``num_classes + 1`` class scores.
    """

    # Number of feature stages built through get_blk(0..4).
    num_stages = 5
    # Output channels of each stage's feature block, i.e. the input channels
    # of that stage's predictors.
    stage_channels = (64, 128, 128, 128, 128)

    def __init__(self, input_channels, num_classes):
        super().__init__()
        self.num_classes = num_classes

        # Never populated; kept only so existing attribute access still works.
        self.blk = []
        self.cls = []
        self.bbox = []

        # All feature blocks first, then all class heads, then all box heads,
        # which is the same construction order as the unrolled version.
        for i in range(self.num_stages):
            setattr(self, 'blk_%d' % i, get_blk(i))
        for i, channels in enumerate(self.stage_channels):
            setattr(self, 'cls_%d' % i,
                    cls_predictor(channels, num_anchors, num_classes))
        for i, channels in enumerate(self.stage_channels):
            setattr(self, 'bbox_%d' % i, bbox_predictor(channels, num_anchors))

    def forward(self, X):
        """Run every stage and gather anchors and predictions.

        Returns:
            Tuple ``(anchors, cls_preds, bbox_preds)``: anchors from all
            stages concatenated along dim 0, class predictions shaped
            ``(batch, total_anchors, num_classes + 1)``, and flattened box
            predictions shaped ``(batch, -1)``.
        """
        anchors, cls_preds, bbox_preds = [], [], []

        for i in range(self.num_stages):
            X, stage_anchors, stage_cls, stage_bbox = blk_forward(
                X,
                getattr(self, 'blk_%d' % i),
                sizes[i],
                ratios[i],
                getattr(self, 'cls_%d' % i),
                getattr(self, 'bbox_%d' % i),
            )
            anchors.append(stage_anchors)
            cls_preds.append(stage_cls)
            bbox_preds.append(stage_bbox)

        cls_flat = concat_preds(cls_preds)
        # Infer the anchor count from the batch size instead of hard-coding
        # 5444, which is only correct for 256x256 inputs.
        cls_out = cls_flat.reshape(cls_flat.size(0), -1, self.num_classes + 1)

        return (torch.cat(anchors, dim=0), cls_out, concat_preds(bbox_preds))

In [47]:
# Placeholder lists with five empty slots each; the same names are rebound
# to the network outputs in the next cell.
anchors, cls_preds, bbox_preds = [None] * 5, [None] * 5, [None] * 5

In [48]:
def init_weights(m):
    """Apply Xavier-uniform initialisation to linear and conv weights.

    Written to be passed to ``nn.Module.apply``, which calls it once per
    sub-module. Only ``m.weight`` is re-initialised; biases and all other
    module types are left untouched.

    Args:
        m: The module to (possibly) initialise. Subclasses of ``nn.Linear``
            and ``nn.Conv2d`` are matched as well.
    """
    if isinstance(m, (nn.Linear, nn.Conv2d)):
        torch.nn.init.xavier_uniform_(m.weight)
        
# Build a single-class detector and re-initialise its weights via init_weights.
net = TinySSD(3, num_classes = 1)
net.apply(init_weights)

# Dummy batch of 32 three-channel 256x256 inputs, used to check output shapes.
X = torch.zeros((32, 3, 256, 256))
anchors, cls_preds, bbox_preds = net(X)

# Anchors have no batch dimension; both prediction tensors have one row per input.
print('output anchors:', anchors.shape)
print('output class preds:', cls_preds.shape)
print('output cbbox preds:', bbox_preds.shape)

output anchors: torch.Size([5444, 4])
output class preds: torch.Size([32, 5444, 2])
output cbbox preds: torch.Size([32, 21776])


## Training

### Data Reading and Initialization

In [49]:
# Fetch and preprocess the dataset through the d2l helper (implementation not
# shown here; presumably it populates ../data/pikachu - confirm).
d2l.download_and_preprocess_data()

In [51]:
batch_size = 32
data_dir = '../data/pikachu'
# d2l.PIKACHU is defined outside this file; presumably a torch Dataset selected
# by split name - confirm.
train_dataset = d2l.PIKACHU(data_dir, 'train')
val_dataset = d2l.PIKACHU(data_dir, 'val')

# Training batches are reshuffled every epoch; 4 loader workers are used.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = batch_size, shuffle = True, num_workers = 4)

# Validation keeps the dataset order.
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = batch_size, shuffle = False, num_workers = 4)

In [52]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(device)

cuda:0


In [53]:
# Fresh single-class model for training, initialised and moved to the chosen device.
net = TinySSD(3, num_classes = 1)
net.apply(init_weights)
net = net.to(device)

# SGD with weight decay; no momentum argument is passed.
learning_rate = 1e-3
weight_decay = 5e-4
optimizer = optim.SGD(net.parameters(), lr = learning_rate, weight_decay = weight_decay)