# Single Shot Detector

In [1]:
import random
import numpy as np
from tqdm import tqdm
import cv2

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchvision
from torchvision import models, transforms
#torch.backends.cudnn.deterministic = True
#torch.backends.cudnn.benchmark = False

In [2]:
# 乱数シード (共通)
torch.manual_seed(1234)
np.random.seed(1234)
random.seed(1234)

In [5]:
import itertools

for i, j in itertools.product(range(38), repeat=2):
    print(i, j)

0 0
0 1
0 2
0 3
0 4
0 5
0 6
0 7
0 8
0 9
0 10
0 11
0 12
0 13
0 14
0 15
0 16
0 17
0 18
0 19
0 20
0 21
0 22
0 23
0 24
0 25
0 26
0 27
0 28
0 29
0 30
0 31
0 32
0 33
0 34
0 35
0 36
0 37
1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
1 21
1 22
1 23
1 24
1 25
1 26
1 27
1 28
1 29
1 30
1 31
1 32
1 33
1 34
1 35
1 36
1 37
2 0
2 1
2 2
2 3
2 4
2 5
2 6
2 7
2 8
2 9
2 10
2 11
2 12
2 13
2 14
2 15
2 16
2 17
2 18
2 19
2 20
2 21
2 22
2 23
2 24
2 25
2 26
2 27
2 28
2 29
2 30
2 31
2 32
2 33
2 34
2 35
2 36
2 37
3 0
3 1
3 2
3 3
3 4
3 5
3 6
3 7
3 8
3 9
3 10
3 11
3 12
3 13
3 14
3 15
3 16
3 17
3 18
3 19
3 20
3 21
3 22
3 23
3 24
3 25
3 26
3 27
3 28
3 29
3 30
3 31
3 32
3 33
3 34
3 35
3 36
3 37
4 0
4 1
4 2
4 3
4 4
4 5
4 6
4 7
4 8
4 9
4 10
4 11
4 12
4 13
4 14
4 15
4 16
4 17
4 18
4 19
4 20
4 21
4 22
4 23
4 24
4 25
4 26
4 27
4 28
4 29
4 30
4 31
4 32
4 33
4 34
4 35
4 36
4 37
5 0
5 1
5 2
5 3
5 4
5 5
5 6
5 7
5 8
5 9
5 10
5 11
5 12
5 13
5 14
5 15
5 16
5 17
5 18
5 19
5 20
5 21


### ネットワークモデル

VGG, Extra, Loc, Confの４つがメインネットワーク

### モジュールvgg

In [35]:
# 34層にわたるvggモジュールを作成
def make_vgg():
    layers = []
    in_channels = 3
    
    # vggモジュールで使用する畳み込み層やMaxPooling層のチャンネル数
    cfg = [64, 64, 'M', 
           128, 128, 'M', 
           256, 256, 256, 'MC', 
           512, 512, 512,'M',
           512, 512, 512]
    
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
        elif v == 'MC':
            # ceilは出力サイズを、計算結果(float)に対して、切り上げで整数にするモード
            # デフォルト(floor)では出力サイズを計算結果(float)に対して、切り下げで整数にするモード
            layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)]
        else:
            conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
            layers += [conv2d, nn.ReLU(inplace=True)]
            in_channels = v
            
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers += [pool5, conv6, nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)]
    return nn.ModuleList(layers)

# 動作確認
vgg_test = make_vgg()
print(vgg_test)

ModuleList(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): ReLU(inplace=True)
  (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): ReLU(inplace=True)
  (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (6): ReLU(inplace=True)
  (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (8): ReLU(inplace=True)
  (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (11): ReLU(inplace=True)
  (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (13): ReLU(inplace=True)
  (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (15): ReLU(inplace=True)
  (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
  (17): Conv2d(256, 512, kernel_siz

### モジュールextra

In [36]:
# 8層に渡るextraモジュールを作成
def make_extra():
    layers = []
    in_channels = 1024 # vggの出力のチャネル数
    
    # extraモジュールの畳み込み層のチャネル数を設定する
    cfg = [256, 512, 128, 256, 128, 256, 128, 256]
    
    layers += [nn.Conv2d(in_channels, cfg[0], kernel_size=(1))]
    layers += [nn.Conv2d(cfg[0], cfg[1], kernel_size=(3), stride=2, padding=1)]
    layers += [nn.Conv2d(cfg[1], cfg[2], kernel_size=(1))]
    layers += [nn.Conv2d(cfg[2], cfg[3], kernel_size=(3), stride=2, padding=1)]
    layers += [nn.Conv2d(cfg[3], cfg[4], kernel_size=(1))]
    layers += [nn.Conv2d(cfg[4], cfg[5], kernel_size=(3))]
    layers += [nn.Conv2d(cfg[5], cfg[6], kernel_size=(1))]
    layers += [nn.Conv2d(cfg[6], cfg[7], kernel_size=(3))]
    
    return nn.ModuleList(layers)

動作確認

In [38]:
extras_test = make_extra()
print(extras_test)

ModuleList(
  (0): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1))
  (1): Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (2): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1))
  (3): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
  (4): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (5): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
  (6): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))
  (7): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1))
)


### モジュールlocとモジュールconf

In [39]:
# デフォルトボックスのオフセットを出力するloc_layers
# デフォルトボックスに対する各クラスの信頼度confidenceを出力するconf_layers

def make_loc_conf(num_classes=21, bbox_aspect_num=[4, 6, 6, 6, 4, 4]):
    
    loc_layers = []
    conf_layers = []
    
    # VGGの22層目, conv4_3(source1)に対する畳込み層
    loc_layers += [nn.Conv2d(512, bbox_aspect_num[0] * 4, kernel_size=3, padding=1)]
    conf_layers += [nn.Conv2d(512, bbox_aspect_num[0] * num_classes, kernel_size=3, padding=1)]
    
    # VGGの最終層(source2)に対する畳込み層
    loc_layers += [nn.Conv2d(1024, bbox_aspect_num[1] * 4, kernel_size=3, padding=1)]
    conf_layers += [nn.Conv2d(1024, bbox_aspect_num[1] * num_classes, kernel_size=3, padding=1)]
    
    # extraの(source3)に対する畳み込み
    loc_layers += [nn.Conv2d(512, bbox_aspect_num[2] * 6, kernel_size=3, padding=1)]
    conf_layers += [nn.Conv2d(512, bbox_aspect_num[2] * num_classes, kernel_size=3, padding=1)]
    
    # extraの(source4)に対する畳込み
    loc_layers += [nn.Conv2d(256, bbox_aspect_num[3] * 6, kernel_size=3, padding=1)]
    conf_layers += [nn.Conv2d(256, bbox_aspect_num[3] * num_classes, kernel_size=3, padding=1)]
    
    # extraの(source5)に対する畳込み
    loc_layers += [nn.Conv2d(256, bbox_aspect_num[4] * 4, kernel_size=3, padding=1)]
    conf_layers += [nn.Conv2d(256, bbox_aspect_num[4] * num_classes, kernel_size=3, padding=1)]
    
    # extraの(source6)に対する畳込み
    loc_layers += [nn.Conv2d(256, bbox_aspect_num[5] * 4, kernel_size=3, padding=1)]
    conf_layers += [nn.Conv2d(256, bbox_aspect_num[5] * num_classes, kernel_size=3, padding=1)]
    
    return nn.ModuleList(loc_layers), nn.ModuleList(conf_layers)

動作確認

In [40]:
loc_test, conf_test = make_loc_conf()
print(loc_test)
print(conf_test)

ModuleList(
  (0): Conv2d(512, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 36, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
ModuleList(
  (0): Conv2d(512, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (1): Conv2d(1024, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (2): Conv2d(512, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (3): Conv2d(256, 126, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (4): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (5): Conv2d(256, 84, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)


### L2ノルム層

In [41]:
# ConvC4_3からの出力をscale=20のL2normで正規化する

class L2Norm(nn.Module):
    
    def __init__(self, input_channels = 512, scale=20):
        
        super(L2Norm, self).__init__()
        self.weights = nn.Parameter(torch.Tensor(input_channels))
        self.scale = scale      # 係数weightsをscaleで初期化する
        self.reset_parameters() # パラメータの初期化
        self.eps = 1e-10
        
    def reset_parameters(self):
        nn.init.constant_(self.weights, self.scale) # 全てのweightをscale=20で初期化
        
    def forward(self, x):
        '''
        38x38の特徴量に対して、512チャネルに渡って2乗和をのルートを求めた38x38個の値を使用し、
        各特徴量を正規化してから係数を掛け算する層
        '''
        
        # normの計算
        # normのテンソルサイズはtorch.Size([batch_num, 1, 38, 38])
        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.esp
        x = torch.div(x, norm)
        
        # 係数の次元を調整
        # self.weightsのサイズはtorch.Size([512])なので、
        # torch.Size([batch_num, 512, 38, 38])まで変形する
        weights = self.weights.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x)
        
        
        # 正規化
        out = x * weights
        
        return out

### デフォルトボックス

In [58]:
# 4種類： 小さい正方形, 大きい正方形、 1:2の長方形, 2:1の長方形
# 6種類： 小さい正方形, 大きい正方形、 1:2の長方形, 2:1の長方形, 1:3の長方形, 3:1の長方形

import itertools
import math

class DBox(object):
    def __init__(self, cfg):
        super(DBox, self).__init__()
        
        # 初期設定
        self.image_size = cfg['input_size']        # 画像サイズ 300 x 300の300
        self.feature_maps = cfg['feature_maps']    # [38, 19, 10, 5, 3, 1] 各sourceの特徴量マップのサイズ
        self.num_priors = len(cfg['feature_maps']) # sourceの個数=6
        self.steps = cfg['steps']                  # [8, 16, 32, 64, 100, 300]    DBoxのピクセルサイズ
        self.min_sizes = cfg['min_sizes']          # [30, 60, 111, 162, 213, 264] 小さい正方形のDBoxのピクセルサイズ
        self.max_sizes = cfg['max_sizes']          # [45, 99, 153, 207, 261, 315] 大きい正方形のDBoxのピクセルサイズ
        self.aspect_ratios = cfg['aspect_ratios']  # 長方形のDBoxのアスペクト比
        
    def make_dbox_list(self):
        '''DBoxを作成'''
        mean = []
        
        # 'feature_maps': [38, 19, 10, 5, 3, 1]
        for k, f in enumerate(self.feature_maps):
            for i, j in itertools.product(range(f), repeat=2): # fまでの数で2ペアの組み合わせ(f * (f-1))
                
                # 特徴量の画像サイズ
                # 300 / 'steps': [8, 16, 32, 64, 100, 300]
                f_k = self.image_size / self.steps[k]
                
                # DBoxの中心座標 (x, y) ただし、0~1で規格化
                cx = (j + 0.5) / f_k
                cy = (i + 0.5) / f_k
                
                # アスペクト比1の小さいDBox[cx, cy, width, height]
                # 'min_sizes': [30, 60, 111, 162, 213, 264]
                s_k = self.min_sizes[k] / self.image_size
                mean += [cx, cy, s_k, s_k]
                
                # アスペクト比1の大きいDBox[cx, cy, width, height]
                # 'max_sizes': [45, 99, 153, 207, 261, 315]
                s_k_prime = math.sqrt(s_k * (self.max_sizes[k] / self.image_size))
                mean += [cx, cy, s_k_prime, s_k_prime]
                
                # その他のアスペクト比のdefBox [cx, cy, width, height]
                for aspect in self.aspect_ratios:
                    for ar in aspect:
                        mean += [cx, cy, s_k * math.sqrt(ar), s_k / math.sqrt(ar)] # 横長長方形
                        mean += [cx, cy, s_k / math.sqrt(ar), s_k * math.sqrt(ar)] # 縦長長方形
                    
                # DBoxをテンソルに変換 torch.Size([8732, 4])
                output = torch.Tensor(mean).view(-1, 4)
                
                # DBoxが画像の外にはみ出るのを防ぐために、大きさを最小0, 最大1にする
                output.clamp_(max=1, min=0)
                
        return output

動作の確認

In [59]:
# SSD300の設定
SSD300_cfg = {
    'num_classes': 21,  # 背景クラスを含めた合計クラス数
    'input_size' : 300, # 画像の入力サイズ
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4,],    # 出力するDBoxのアスペクト比の種類
    'feature_maps': [38, 19, 10, 5, 3, 1],     # 各sourceの画像サイズ
    'steps': [8, 16, 32, 64, 100, 300] ,       # DBoxのピクセルサイズ
    'min_sizes': [30, 60, 111, 162, 213, 264], # 小さい正方形のDBoxのピクセルサイズ
    'max_sizes': [45, 99, 153, 207, 261, 315], # 大きい正方形のDBoxのピクセルサイズ
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]] # アスペクト比の構成?
}

# DBoxの作成
dbox = DBox(SSD300_cfg)
dbox_list = dbox.make_dbox_list()

# DBoxの出力を確認
import pandas as pd
dbox_df = pd.DataFrame(dbox_list)
dbox_df

Unnamed: 0,0,1,2,3
0,tensor(0.0133),tensor(0.0133),tensor(0.1000),tensor(0.1000)
1,tensor(0.0133),tensor(0.0133),tensor(0.1225),tensor(0.1225)
2,tensor(0.0133),tensor(0.0133),tensor(0.1414),tensor(0.0707)
3,tensor(0.0133),tensor(0.0133),tensor(0.0707),tensor(0.1414)
4,tensor(0.0133),tensor(0.0133),tensor(0.1414),tensor(0.0707)
...,...,...,...,...
38795,tensor(0.5000),tensor(0.5000),tensor(0.5081),tensor(1.)
38796,tensor(0.5000),tensor(0.5000),tensor(1.),tensor(0.6223)
38797,tensor(0.5000),tensor(0.5000),tensor(0.6223),tensor(1.)
38798,tensor(0.5000),tensor(0.5000),tensor(1.),tensor(0.6223)
