In [30]:
import pandas as pd
from itertools import product as product
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function

In [2]:
# Build the 35-module VGG-based backbone used by SSD
def make_vgg():
    """Build the VGG-16 style feature extractor as a flat list of modules.

    The first 30 modules come from ``cfg`` (13 conv+ReLU pairs and 4 max-pool
    layers); they are followed by pool5, conv6+ReLU and conv7+ReLU, for a
    total of 35 modules.

    Returns:
        nn.ModuleList: the modules in forward order. Callers index into this
        list directly, so the order and count must not change.
    """
    layers = []
    in_channels = 3  # RGB input

    # Integers are conv output channels. 'M' is a 2x2 max-pool; 'MC' is the
    # same pool with ceil_mode=True, so odd spatial sizes are rounded up
    # instead of down.
    cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'MC',
           512, 512, 512, 'M', 512, 512, 512]

    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        elif v == 'MC':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v

    # pool5 keeps the spatial size (3x3 window, stride 1, padding 1).
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    # SSD's conv6 is a dilated (atrous) 3x3 convolution. dilation=6 together
    # with padding=6 preserves the spatial size while enlarging the receptive
    # field; the weight shape is the same as for an undilated 3x3 conv.
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers += [pool5, conv6, nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
    return nn.ModuleList(layers)

In [3]:
# Sanity check: build the VGG backbone and print its module list
vgg_test = make_vgg()
print(vgg_test)

ModuleList(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
  (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1

In [6]:
# Build the 8 extra convolution layers that follow the VGG backbone
def make_extras():
    """Create the eight extra ``nn.Conv2d`` layers as an ``nn.ModuleList``.

    Layers alternate between a 1x1 convolution and a 3x3 convolution. The
    first two 3x3 convolutions use stride 2 with padding 1; the last two use
    the default stride and no padding. No activation modules are included.

    Returns:
        nn.ModuleList: the eight convolutions in forward order.
    """
    # Channel widths along the chain: entry k feeds layer k, entry k+1 is
    # that layer's output width.
    widths = [1024, 256, 512, 128, 256, 128, 256, 128, 256]

    # (kernel_size, stride, padding) for each of the eight layers.
    geometry = [
        (1, 1, 0), (3, 2, 1),
        (1, 1, 0), (3, 2, 1),
        (1, 1, 0), (3, 1, 0),
        (1, 1, 0), (3, 1, 0),
    ]

    convs = [
        nn.Conv2d(c_in, c_out, kernel_size=kernel, stride=stride, padding=pad)
        for (c_in, c_out), (kernel, stride, pad)
        in zip(zip(widths, widths[1:]), geometry)
    ]
    return nn.ModuleList(convs)

In [8]:
# Sanity check: build the extra layers and display them
extras_test = make_extras()
extras_test

ModuleList(
  (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (2): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
  (3): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (4): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (6): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (7): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
)

In [11]:
# loc_layers output the offsets for each default box
# conf_layers output the per-class confidence for each default box
def make_loc_conf(num_classes=21, bbox_aspect_num=[4, 6, 6, 6, 4, 4]):
    """Build the localization and confidence heads, one pair per source.

    Args:
        num_classes: number of classes each confidence head scores per box.
        bbox_aspect_num: number of default boxes per feature-map cell for
            each of the six sources; indices 0..5 are read.

    Returns:
        tuple: ``(loc_layers, conf_layers)``, each an ``nn.ModuleList`` of six
        3x3 convolutions with padding 1. The k-th loc layer has
        ``bbox_aspect_num[k] * 4`` output channels (4 offsets per box) and the
        k-th conf layer has ``bbox_aspect_num[k] * num_classes``.
    """
    # Input channel count of source1..source6, in order:
    # VGG layer 22, VGG final layer, then extras 2, 4, 6 and 8.
    source_channels = (512, 1024, 512, 256, 256, 256)

    loc_layers, conf_layers = [], []
    for k, channels in enumerate(source_channels):
        boxes_per_cell = bbox_aspect_num[k]
        loc_layers.append(
            nn.Conv2d(channels, boxes_per_cell * 4, kernel_size=3, padding=1))
        conf_layers.append(
            nn.Conv2d(channels, boxes_per_cell * num_classes, kernel_size=3, padding=1))

    return nn.ModuleList(loc_layers), nn.ModuleList(conf_layers)
    

In [12]:
# Sanity check: build the loc/conf heads and print both module lists
loc_test, conf_test = make_loc_conf()
print(loc_test)
print(conf_test)

ModuleList(
  (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
ModuleList(
  (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)


In [14]:
# L2Norm layer
class L2Norm(nn.Module):
    """Channel-wise L2 normalization with a learnable per-channel scale.

    For every spatial position the input is divided by the L2 norm taken
    across the channel dimension (dim 1), then multiplied by a per-channel
    weight that is initialised to ``scale``.

    Args:
        input_channels: number of channels of the input; size of ``weight``.
        scale: initial value assigned to every element of ``weight``.
    """

    def __init__(self, input_channels=512, scale=20):
        super(L2Norm, self).__init__()
        self.weight = nn.Parameter(torch.Tensor(input_channels))
        self.scale = scale  # initial value for the learnable weight
        self.eps = 1e-10    # added to the norm to avoid division by zero
        self.reset_parameters()

    def reset_parameters(self):
        """Fill ``weight`` with the constant ``scale``."""
        init.constant_(self.weight, self.scale)

    def forward(self, x):
        """Normalize ``x`` over dim 1 and apply the per-channel weight.

        Args:
            x: 4-D tensor whose dim 1 has ``input_channels`` entries.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # sqrt must be *called*; the bare attribute is a bound method and
        # cannot be added to a float.
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
        x = torch.div(x, norm)

        # Reshape weight from (C,) to (1, C, 1, 1) and expand to x's shape.
        weights = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x)
        out = weights * x

        return out

In [15]:
# Default box (DBox) generator
# With the configuration used in this notebook the result is 8732 x 4
class DBox(object):
    """Generate the default boxes for every feature-map cell.

    Args:
        cfg: mapping providing 'input_size', 'feature_maps', 'steps',
            'min_sizes', 'max_sizes' and 'aspect_ratios'.
    """

    def __init__(self, cfg):
        # The parent is plain `object`, so this call does nothing beyond
        # cooperating with multiple inheritance.
        super().__init__()

        self.image_size = cfg['input_size']        # input image size in pixels
        self.feature_maps = cfg['feature_maps']    # side length of each source's feature map
        self.num_priors = len(cfg['feature_maps'])  # number of sources
        self.steps = cfg['steps']                  # pixel stride of one cell in each feature map
        self.min_sizes = cfg['min_sizes']          # pixel size of the small square box per source
        self.max_sizes = cfg['max_sizes']          # pixel size used for the large square box per source
        self.aspect_ratios = cfg['aspect_ratios']  # aspect ratios of the rectangular boxes per source

    def make_dbox_list(self):
        """Return all default boxes as a tensor of shape (N, 4).

        Each row is ``[cx, cy, width, height]`` expressed as a fraction of the
        image size, clamped into ``[0, 1]``.
        """
        coords = []
        for level, fmap_size in enumerate(self.feature_maps):
            # Visit every (row, col) cell of the square feature map.
            for row, col in product(range(fmap_size), repeat=2):
                # Number of cells along one side implied by the step size.
                cells = self.image_size / self.steps[level]
                center_x = (col + 0.5) / cells
                center_y = (row + 0.5) / cells

                # Small square box (aspect ratio 1).
                small = self.min_sizes[level] / self.image_size
                coords.extend([center_x, center_y, small, small])

                # Large square box: geometric mean of the min and max scales.
                large = sqrt(small * (self.max_sizes[level] / self.image_size))
                coords.extend([center_x, center_y, large, large])

                # Rectangular boxes: one wide and one tall box per ratio.
                for ratio in self.aspect_ratios[level]:
                    root = sqrt(ratio)
                    coords.extend([center_x, center_y, small * root, small / root])
                    coords.extend([center_x, center_y, small / root, small * root])

        boxes = torch.Tensor(coords).view(-1, 4)
        # Keep every value inside the image by clamping to [0, 1].
        boxes.clamp_(max=1, min=0)

        return boxes
                

In [21]:
# Sanity check: build the default boxes for the SSD300 configuration
ssd_cfg = {
    'num_classes': 21, # total number of classes, including the background class
    'input_size': 300,
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4],
    'feature_maps': [38, 19, 10, 5, 3, 1],
    'steps': [8, 16, 32, 64, 100, 300],
    'min_sizes': [30, 60, 111, 162, 213, 264],
    'max_sizes': [60, 111, 162, 213, 264, 315],
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
}

dbox = DBox(ssd_cfg)
dbox_list = dbox.make_dbox_list()

# Display the boxes as a table (one row per default box)
pd.DataFrame(dbox_list.numpy())

Unnamed: 0,0,1,2,3
0,0.013333,0.013333,0.100000,0.100000
1,0.013333,0.013333,0.141421,0.141421
2,0.013333,0.013333,0.141421,0.070711
3,0.013333,0.013333,0.070711,0.141421
4,0.040000,0.013333,0.100000,0.100000
5,0.040000,0.013333,0.141421,0.141421
6,0.040000,0.013333,0.141421,0.070711
7,0.040000,0.013333,0.070711,0.141421
8,0.066667,0.013333,0.100000,0.100000
9,0.066667,0.013333,0.141421,0.141421


In [22]:
# SSD model skeleton (constructor only; no forward pass is defined here)
class SSD(nn.Module):
    """Assemble the SSD sub-networks and default boxes.

    Args:
        phase: 'inference' additionally creates a ``Detect`` instance; any
            other value (e.g. 'train') skips it.
        cfg: mapping with at least 'num_classes' and 'bbox_aspect_num', plus
            the keys that ``DBox`` reads.
    """

    def __init__(self, phase, cfg):
        super().__init__()

        self.phase = phase  # 'train' or 'inference'
        self.num_classes = cfg['num_classes']

        # Sub-networks, registered in this order.
        self.vgg = make_vgg()
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        loc_head, conf_head = make_loc_conf(cfg['num_classes'], cfg['bbox_aspect_num'])
        self.loc = loc_head
        self.conf = conf_head

        # Default boxes are computed once at construction time.
        self.dbox_list = DBox(cfg).make_dbox_list()

        # Detect is only looked up (and therefore only needs to exist) when
        # running in inference mode.
        if phase == 'inference':
            self.detect = Detect()
        

In [23]:
# Sanity check: build the SSD model in training mode and print its structure
ssd_test = SSD(phase='train', cfg=ssd_cfg)
print(ssd_test)

SSD(
  (vgg): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    (17): Conv2d(256, 

## 2-5 順伝播関数の実装

In [24]:
# decode: build bounding boxes from the default boxes and predicted offsets
def decode(loc, dbox_list):
    """Convert offsets relative to default boxes into corner-form boxes.

    Args:
        loc: tensor of shape (N, 4) with offsets ``[dcx, dcy, dw, dh]``.
        dbox_list: tensor of shape (N, 4) with default boxes
            ``[cx, cy, width, height]``.

    Returns:
        New tensor of shape (N, 4) holding ``[xmin, ymin, xmax, ymax]``.
        Neither input is modified.
    """
    anchor_center = dbox_list[:, :2]
    anchor_size = dbox_list[:, 2:]

    # Centre offsets are scaled by 0.1 and by the default box size;
    # size offsets are scaled by 0.2 and applied through exp().
    center = anchor_center + loc[:, :2] * 0.1 * anchor_size
    size = anchor_size * torch.exp(loc[:, 2:] * 0.2)

    # Convert [cx, cy, width, height] into [xmin, ymin, xmax, ymax].
    top_left = center - size / 2
    bottom_right = size + top_left

    return torch.cat((top_left, bottom_right), dim=1)

In [25]:
# Non-Maximum Suppression
# When boxes overlap by more than the threshold, keep only the one with the
# highest confidence and discard the others. Intended to be run per class.
def num_suppression(boxes, scores, overlap=0.45, top_k=200):
    """Greedy non-maximum suppression.

    Args:
        boxes: tensor of shape (N, 4) in ``[xmin, ymin, xmax, ymax]`` form.
        scores: tensor of shape (N,) with the confidence of each box.
        overlap: IoU threshold; a box whose IoU with an already kept box is
            strictly greater than this value is discarded.
        top_k: only the ``top_k`` highest-scoring boxes are considered.

    Returns:
        tuple ``(keep, count)``: ``keep`` is a LongTensor of length N whose
        first ``count`` entries are the indices of the kept boxes in
        descending score order (the remaining entries are 0); ``count`` is
        an int.
    """
    count = 0
    # zero_() is the in-place fill; tensors have no zero() method.
    keep = scores.new(scores.size(0)).zero_().long()

    # Area of every box.
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)

    # Sort scores in ascending order, so the best candidate is always last.
    _, idx = scores.sort(0)

    # Restrict to the indices of the top_k highest-scoring boxes.
    idx = idx[-top_k:]

    while idx.numel() > 0:
        i = idx[-1]  # index of the highest remaining confidence

        keep[count] = i
        count += 1

        if idx.size(0) == 1:  # that was the last box
            break

        idx = idx[:-1]  # drop the box that was just kept

        # Coordinates of the remaining candidates.
        rest_x1 = torch.index_select(x1, 0, idx)
        rest_y1 = torch.index_select(y1, 0, idx)
        rest_x2 = torch.index_select(x2, 0, idx)
        rest_y2 = torch.index_select(y2, 0, idx)

        # Clamp each candidate to the extent of box i: the result is the
        # intersection rectangle (possibly with negative width/height).
        rest_x1 = torch.clamp(rest_x1, min=x1[i])
        rest_y1 = torch.clamp(rest_y1, min=y1[i])
        rest_x2 = torch.clamp(rest_x2, max=x2[i])
        rest_y2 = torch.clamp(rest_y2, max=y2[i])

        # Negative width or height means no overlap; treat it as 0.
        inter_w = torch.clamp(rest_x2 - rest_x1, min=0.0)
        inter_h = torch.clamp(rest_y2 - rest_y1, min=0.0)
        inter = inter_w * inter_h

        # IoU = intersection / (area(a) + area(b) - intersection)
        rem_areas = torch.index_select(area, 0, idx)  # original area of each candidate
        union = (rem_areas - inter) + area[i]
        IoU = inter / union

        # Keep only candidates whose IoU is less than or equal to `overlap`;
        # the rest are assumed to cover the same object as box i.
        idx = idx[IoU.le(overlap)]

    return keep, count

In [28]:
# Detect: turns the raw SSD outputs into per-class detections
# Output tensor: (batch_num, class_num, top_k, 5); the last axis holds
# [conf, xmin, ymin, xmax, ymax]
class Detect(Function):
    """Post-processing step: softmax, box decoding, thresholding and NMS.

    Args:
        conf_thresh: only boxes whose class confidence is strictly greater
            than this value are considered.
        top_k: maximum number of boxes per class passed through NMS and
            stored in the output.
        nms_thresh: IoU threshold handed to ``num_suppression``.
    """
    # NOTE(review): this instantiates an autograd Function and defines a
    # non-static forward. Recent PyTorch releases are believed to reject that
    # legacy style - confirm against the PyTorch version this notebook targets.

    def __init__(self, conf_thresh=0.01, top_k=200, nms_thresh=0.45):
        self.softmax = nn.Softmax(dim=-1)  # normalizes conf over the class axis
        self.conf_thresh = conf_thresh
        self.top_k = top_k
        self.nms_thresh = nms_thresh

    def forward(self, loc_data, conf_data, dbox_list):
        '''
        Parameters
        ----------
        loc_data: [batch_num, num_dbox, 4] offset info
        conf_data: [batch_num, num_dbox, num_classes] raw class scores
        dbox_list: [num_dbox, 4] default box info

        Returns
        ------
        output: torch.Size([batch_num, num_classes, top_k, 5]);
            rows not filled by a detection stay zero, and class 0 is never
            filled.
        '''

        num_batch = loc_data.size(0)     # mini-batch size
        num_classes = conf_data.size(2)  # number of classes

        conf_data = self.softmax(conf_data)

        output = torch.zeros(num_batch, num_classes, self.top_k, 5)

        # Reorder conf_data from [batch_num, num_dbox, num_classes]
        # to [batch_num, num_classes, num_dbox].
        conf_preds = conf_data.transpose(2, 1)

        # Loop over the images of the mini-batch.
        for i in range(num_batch):
            # Boxes in [xmin, ymin, xmax, ymax] form from offsets + default boxes.
            decoded_boxes = decode(loc_data[i], dbox_list)

            # Work on a copy of the confidences.
            conf_scores = conf_preds[i].clone()

            # Loop over classes; index 0 is skipped.
            for cl in range(1, num_classes):
                # Select the boxes whose confidence exceeds the threshold.
                c_mask = conf_scores[cl].gt(self.conf_thresh)  # shape [num_dbox]
                scores = conf_scores[cl][c_mask]

                if scores.nelement() == 0:
                    continue

                # Row-wise mask selection keeps the (num_selected, 4) shape.
                boxes = decoded_boxes[c_mask]

                ids, count = num_suppression(boxes, scores, self.nms_thresh, self.top_k)

                # Store [conf, xmin, ymin, xmax, ymax] for the kept boxes.
                output[i, cl, :count] = torch.cat(
                    (scores[ids[:count]].unsqueeze(1), boxes[ids[:count]]), 1)

        return output

In [31]:
# Full SSD model, including the forward pass
class SSD(nn.Module):
    """SSD detector: VGG backbone, extra layers, loc/conf heads, default boxes.

    Args:
        phase: 'inference' runs ``Detect`` on the raw outputs; any other
            value (e.g. 'train') returns the raw outputs.
        cfg: mapping with 'num_classes' and 'bbox_aspect_num', plus the keys
            that ``DBox`` reads.
    """

    def __init__(self, phase, cfg):
        super(SSD, self).__init__()

        self.phase = phase
        self.num_classes = cfg['num_classes']

        self.vgg = make_vgg()
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.loc, self.conf = make_loc_conf(cfg['num_classes'], cfg['bbox_aspect_num'])

        # DBox reads its settings by key, so it must receive the config
        # mapping, not the module itself.
        dbox = DBox(cfg)
        self.dbox_list = dbox.make_dbox_list()

        if phase == 'inference':
            self.detect = Detect()

    def forward(self, x):
        """Run the network on a batch of images.

        Args:
            x: input batch fed to the first VGG layer (3 input channels).

        Returns:
            In 'inference' phase, the result of ``self.detect``. Otherwise the
            tuple ``(loc, conf, dbox_list)`` where ``loc`` has shape
            [batch_num, num_dbox, 4] and ``conf`` has shape
            [batch_num, num_dbox, num_classes].
        """
        sources = []
        loc = []
        conf = []

        # Run VGG modules 0..22, i.e. up to the ReLU after conv4_3.
        for k in range(23):
            x = self.vgg[k](x)

        # source1: the conv4_3 output passed through L2Norm.
        source1 = self.L2Norm(x)
        sources.append(source1)

        # Run the remaining VGG modules.
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)

        sources.append(x)  # source2

        # Extra layers: apply conv + ReLU, and record a source after every
        # second layer (source3..source6).
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace=True)
            if k % 2 == 1:
                sources.append(x)

        # Apply the matching loc and conf convolution once to each source.
        for (x, l, c) in zip(sources, self.loc, self.conf):
            # The conv outputs are [batch, channels, height, width], and the
            # channel count differs between sources. Moving channels last
            # gives [batch, height, width, channels] so all sources can be
            # flattened the same way. contiguous() is required because
            # view() below needs contiguous memory.
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        # Flatten each source per image and concatenate them:
        # loc -> [batch_num, num_dbox * 4], conf -> [batch_num, num_dbox * num_classes]
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        # Reshape to one row per default box:
        # loc -> [batch_num, num_dbox, 4], conf -> [batch_num, num_dbox, num_classes]
        loc = loc.view(loc.size(0), -1, 4)
        conf = conf.view(conf.size(0), -1, self.num_classes)

        output = (loc, conf, self.dbox_list)

        if self.phase == 'inference':
            return self.detect(output[0], output[1], output[2])

        else:
            return output