In [1]:
import torch.nn as nn
import torch
from utils.ssd import make_vgg, make_extras, L2Norm, make_loc_conf, DBox

# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function

In [2]:
# Scratch check: step through two ranges from back to front in lockstep.
# Only the element drawn from Z is printed; the one from N is ignored.
N = range(10)
Z = range(10, 20)
for i, ii in zip(Z[::-1], N[::-1]):
    print(i)

19
18
17
16
15
14
13
12
11
10


In [3]:
def FPN(sources, fpnconv):
    """Merge a list of feature maps top-down, feature-pyramid style.

    The coarsest map (last element of ``sources``) is only projected by its
    lateral convolution. Every finer map is projected by its own lateral
    convolution and then added to the already-merged coarser map, which is
    upsampled (nearest neighbour) to the finer map's spatial size.

    The upsampling target is read from the lateral feature map itself, so the
    function works for any number of pyramid levels and any input resolution,
    not only for the six SSD300 levels (38/19/10/5/3/1).

    Args:
        sources: list of feature-map tensors ordered from finest to coarsest.
            The list is updated in place.
        fpnconv: indexable collection of lateral convolutions ordered from
            coarsest to finest, i.e. ``fpnconv[k]`` is applied to
            ``sources[len(sources) - 1 - k]``. Entries beyond
            ``len(sources)`` are ignored.

    Returns:
        The same ``sources`` list, with every element replaced by its merged
        feature map.

    Raises:
        ValueError: if there are fewer lateral convolutions than feature maps.
    """
    mode = "nearest"
    num_levels = len(sources)
    if num_levels == 0:
        return sources
    if len(fpnconv) < num_levels:
        raise ValueError(
            "FPN needs one lateral conv per source: got {} convs for {} sources".format(
                len(fpnconv), num_levels))

    # Coarsest level: there is nothing above it to merge, so project only.
    sources[-1] = fpnconv[0](sources[-1])

    # Walk from the second-coarsest level down to the finest one.
    for conv_idx in range(1, num_levels):
        level = num_levels - 1 - conv_idx
        lateral = fpnconv[conv_idx](sources[level])
        # Resize the merged coarser map to exactly match this level.
        top_down = nn.functional.interpolate(
            sources[level + 1], size=tuple(lateral.shape[-2:]), mode=mode)
        sources[level] = lateral + top_down

    return sources

In [4]:
def make_loc_conf(num_classes=21, bbox_aspect_num=(4, 6, 6, 6, 4, 4), in_channels=256):
    """Build the localisation and classification heads, one pair per source.

    Each head is a 3x3 convolution with padding 1, so it keeps the spatial
    size of the feature map it is applied to. For a source with ``k`` default
    boxes per location the localisation head outputs ``k * 4`` channels and
    the classification head outputs ``k * num_classes`` channels.

    Args:
        num_classes: number of classes predicted per default box.
        bbox_aspect_num: number of default boxes per location, one entry per
            source feature map. One head pair is created per entry.
        in_channels: channel count of every source feature map (256 by
            default).

    Returns:
        ``(loc_layers, conf_layers)`` as two ``nn.ModuleList`` objects of
        length ``len(bbox_aspect_num)``.
    """
    loc_layers = []
    conf_layers = []

    for num_boxes in bbox_aspect_num:
        # Create loc before conf for each source so that the order in which
        # layers draw their random initial weights stays the same as before.
        loc_layers.append(
            nn.Conv2d(in_channels, num_boxes * 4, kernel_size=3, padding=1))
        conf_layers.append(
            nn.Conv2d(in_channels, num_boxes * num_classes, kernel_size=3, padding=1))

    return nn.ModuleList(loc_layers), nn.ModuleList(conf_layers)

In [5]:
class FPNSSD(nn.Module):
    """SSD detector whose six source feature maps are merged by a top-down FPN.

    The VGG backbone, the extra layers and L2Norm come from ``utils.ssd``.
    The six source maps are passed through ``FPN`` together with the lateral
    1x1 convolutions in ``self.fpnconv`` before the localisation and
    classification heads are applied.

    Args:
        phase: ``"inference"`` runs the detection post-processing on the raw
            predictions; any other value returns the raw predictions.
        cfg: configuration dict. ``cfg["num_classes"]`` and
            ``cfg["bbox_aspect_num"]`` are read here; the whole dict is also
            handed to ``DBox``.
    """

    def __init__(self, phase, cfg):
        super(FPNSSD, self).__init__()

        self.phase = phase
        self.num_classes = cfg["num_classes"]

        # Building blocks of the plain SSD network.
        self.vgg = make_vgg()
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.loc, self.conf = make_loc_conf(self.num_classes, cfg["bbox_aspect_num"])

        # Spatial sizes of the five finer sources. Kept as an attribute only;
        # nothing in this class reads it.
        self.upsamplers = [
            [38,38], [19,19], [10,10], [5,5], [3,3]
            ]

        # Lateral 1x1 convolutions bringing every source to 256 channels.
        # They are listed from the coarsest source to the finest one because
        # FPN applies fpnconv[k] to sources[len(sources) - 1 - k].
        self.fpnconv = nn.ModuleList([
            nn.Conv2d(256, 256, kernel_size=1),
            nn.Conv2d(256, 256, kernel_size=1),
            nn.Conv2d(256, 256, kernel_size=1),
            nn.Conv2d(512, 256, kernel_size=1),
            nn.Conv2d(1024, 256, kernel_size=1),
            nn.Conv2d(512, 256, kernel_size=1),
        ])

        # Default boxes. dbox_list is a plain attribute (not a registered
        # buffer), so callers must move it to the right device themselves.
        dbox = DBox(cfg)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.dbox_list = dbox.make_dbox_list()

        # Detection post-processing is only needed for inference.
        # NOTE(review): Detect is not imported in any cell visible here, so
        # constructing the model with phase="inference" raises NameError
        # unless Detect is brought into scope first - confirm where it lives.
        if phase == "inference":
            self.detect = Detect()

    def forward(self, x):
        """Run the network on a batch of images.

        Returns:
            In inference phase, whatever ``self.detect`` returns. Otherwise a
            tuple ``(loc, conf, dbox_list)`` where ``loc`` is reshaped to
            ``(batch, -1, 4)`` and ``conf`` to ``(batch, -1, num_classes)``.
        """
        sources = []
        loc = []
        conf = []

        # Run VGG up to conv4_3 (the first 23 layers).
        for k in range(23):
            x = self.vgg[k](x)

        # The L2-normalised conv4_3 output is source1.
        source1 = self.L2Norm(x)
        sources.append(source1)

        # Run the rest of VGG; its output is source2.
        for k in range(23, len(self.vgg)):
            x = self.vgg[k](x)

        sources.append(x)

        # Extra layers: the output of every second layer becomes source3-6.
        for k, v in enumerate(self.extras):
            x = F.relu(v(x), inplace = True)
            if k % 2 == 1:
                sources.append(x)

        # Spatial sizes with the SSD300 configuration:
        # source1 38x38
        # source2 19x19
        # source3 10x10
        # source4 5x5
        # source5 3x3
        # source6 1x1

        # Merge the sources into a feature pyramid.
        sources = FPN(sources, self.fpnconv)

        # Apply the matching head to each source. A head outputs
        # [batch, boxes * k, fh, fw]; permute moves the channel axis last so
        # that the values of one default box are contiguous when flattened.
        for (x, l, c) in zip(sources, self.loc, self.conf):
            loc.append(l(x).permute(0, 2, 3, 1).contiguous())
            conf.append(c(x).permute(0, 2, 3, 1).contiguous())

        # Flatten every source and concatenate along dim 1.
        # With the SSD300 configuration:
        #   loc  -> torch.Size([batch_num, 34928])
        #   conf -> torch.Size([batch_num, 183372])
        loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1)
        conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1)

        # Split the flat axis into (default box, value).
        # With the SSD300 configuration:
        #   loc  -> torch.Size([batch_num, 8732, 4])
        #   conf -> torch.Size([batch_num, 8732, 21])
        loc = loc.view(loc.size(0), -1, 4)
        conf = conf.view(conf.size(0), -1, self.num_classes)

        output = (loc, conf, self.dbox_list)

        if self.phase == "inference":
            # Post-process the raw predictions with Detect.
            return self.detect(output[0], output[1], output[2].to(self.device))
        else:
            return output


In [6]:
# SSD300 configuration, read by FPNSSD and passed on to DBox.
ssd_cfg = dict(
    num_classes=21,  # total number of classes, background included
    input_size=300,  # input image size
    bbox_aspect_num=[4, 6, 6, 6, 4, 4],  # number of DBox aspect-ratio variants per source
    feature_maps=[38, 19, 10, 5, 3, 1],  # spatial size of each source
    steps=[8, 16, 32, 64, 100, 300],  # used to size the DBoxes
    min_sizes=[30, 60, 111, 162, 213, 264],  # used to size the DBoxes
    max_sizes=[60, 111, 162, 213, 264, 315],  # used to size the DBoxes
    aspect_ratios=[[2], [2, 3], [2, 3], [2, 3], [2], [2]],
)

# Build the network in training mode.
net = FPNSSD(cfg=ssd_cfg, phase="train")

In [7]:
# Print the module hierarchy of the freshly built network.
print(net)

FPNSSD(
  (vgg): ModuleList(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
    (17): Conv2d(25

In [8]:
# Load the pretrained VGG backbone weights.
# map_location="cpu" lets a checkpoint that was saved from a GPU process load
# on a CPU-only machine too; the network is moved to its device later, inside
# train_model.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
print("using vgg weights")
vgg_weights = torch.load("./weights/vgg16_reducedfc.pth", map_location="cpu")
net.vgg.load_state_dict(vgg_weights)

def weights_init(m):
    """Re-initialise a ``nn.Conv2d`` in place; any other module is left as is.

    The weight is drawn with Kaiming-normal initialisation and the bias, when
    the layer has one, is set to zero. Meant to be used with ``Module.apply``.
    """
    if not isinstance(m, nn.Conv2d):
        return
    nn.init.kaiming_normal_(m.weight.data)
    if m.bias is None:
        return
    nn.init.constant_(m.bias, 0.0)

import torch.nn.init as init

# Apply the custom initialisation to the extra layers and both prediction heads.
# NOTE(review): net.fpnconv is not passed through weights_init and keeps
# PyTorch's default initialisation - confirm that this is intended.
for _head in (net.extras, net.loc, net.conf):
    _head.apply(weights_init)

# Use the first GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("using:", device)

print("set weights!")

using vgg weights
using: cuda:0
set weights!


In [9]:
from utils.focalloss import FocalLoss
from utils.ssd_model import match
from utils.ssd_model import MultiBoxLoss

# Loss function (implemented in utils.ssd_model).
criterion = MultiBoxLoss(
    jaccard_thresh=0.5,
    neg_pos=3,
    device=device,
)

# Optimiser: SGD with momentum and weight decay over every network parameter.
import torch.optim as optim
optimizer = optim.SGD(
    net.parameters(),
    lr=1e-3,
    momentum=0.9,
    weight_decay=5e-4,
)

In [10]:
def get_current_lr(epoch, base_lr=1e-3, milestones=(120, 180), gamma=0.1):
    """Return the step-decayed learning rate for ``epoch``.

    The rate starts at ``base_lr`` and is multiplied by ``gamma`` once for
    every milestone that ``epoch`` has reached (``epoch >= milestone``).

    Args:
        epoch: zero-based epoch index.
        base_lr: learning rate before any decay.
        milestones: epochs at which the rate is decayed.
        gamma: multiplicative decay applied at each milestone.

    Returns:
        The learning rate to use during ``epoch``.
    """
    lr = base_lr
    for milestone in milestones:
        if epoch >= milestone:
            lr *= gamma
    return lr

def adjust_learning_rate(optimizer, epoch):
    """Set every parameter group of ``optimizer`` to the rate scheduled for ``epoch``.

    The rate comes from ``get_current_lr(epoch)`` and is printed before it is
    written into the optimizer.
    """
    scheduled_lr = get_current_lr(epoch)
    print("lr is:", scheduled_lr)
    for group in optimizer.param_groups:
        group['lr'] = scheduled_lr

In [11]:
# Dataset helpers
from utils.dataset import (
    VOCDataset,
    DatasetTransform,
    make_datapath_list,
    Anno_xml2list,
    od_collate_fn,
)

# Collect the image / annotation file lists.
# Point these paths at your own VOCdevkit directory!
vocpath = "../VOCdevkit/VOC2007"
train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(vocpath)

# VOC2012 only contributes training files; its validation lists are discarded.
vocpath = "../VOCdevkit/VOC2012"
train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)
train_img_list.extend(train_img_list2)
train_anno_list.extend(train_anno_list2)

print("trainlist: ", len(train_img_list))
print("vallist: ", len(val_img_list))

# Class names handed to the annotation converter.
voc_classes = [
    'aeroplane', 'bicycle', 'bird', 'boat',
    'bottle', 'bus', 'car', 'cat', 'chair',
    'cow', 'diningtable', 'dog', 'horse',
    'motorbike', 'person', 'pottedplant',
    'sheep', 'sofa', 'train', 'tvmonitor',
]

color_mean = (104, 117, 123)  # mean colour values, (BGR)
input_size = 300  # images are fed to the network at 300x300

# Image and annotation preprocessing for the training set.
transform = DatasetTransform(input_size, color_mean)
transform_anno = Anno_xml2list(voc_classes)

# Datasets that go into the DataLoaders. Per the original note, fetching an
# item returns the preprocessed image together with its ground truth.
train_dataset = VOCDataset(
    train_img_list,
    train_anno_list,
    phase = "train",
    transform=transform,
    transform_anno = transform_anno,
)
# The validation set gets its own, separately constructed transform objects.
val_dataset = VOCDataset(
    val_img_list,
    val_anno_list,
    phase="val",
    transform=DatasetTransform(input_size, color_mean),
    transform_anno=Anno_xml2list(voc_classes),
)

batch_size = 32

# Only the training loader shuffles.
train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=od_collate_fn,
    num_workers=8,
)

val_dataloader = data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=od_collate_fn,
    num_workers=8,
)

# Bundle both loaders in a dict keyed by phase name.
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

trainlist:  16551
vallist:  4952


In [12]:
def save_checkpoint(state, epoch, save_dir="weights"):
    """Serialise ``state`` to ``<save_dir>/ssd_fpn_300_<epoch + 1>.pth``.

    The target directory is created when it does not exist yet, so a long
    training run cannot fail at its first checkpoint because of a missing
    folder.

    Args:
        state: object to store with ``torch.save``.
        epoch: zero-based epoch index; the file name uses ``epoch + 1``.
        save_dir: directory that receives the checkpoint file.
    """
    os.makedirs(save_dir, exist_ok=True)
    filename = os.path.join(save_dir, 'ssd_fpn_300_' + str(epoch + 1) + '.pth')
    torch.save(state, filename)

In [15]:
# Function that trains the model


def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` for ``num_epochs`` epochs, validating every 10th epoch.

    Each epoch the learning rate is set via ``adjust_learning_rate``, the
    training loader is consumed once, and on every 10th epoch the validation
    loader is consumed as well. The per-epoch summed losses are appended to
    ``log_output.csv`` (the file is rewritten each epoch) and a checkpoint is
    written through ``save_checkpoint`` every 10th epoch.

    Exactly ``num_epochs`` epochs are run. The per-sample default-box matching
    that used to be computed here is not performed any more: its results were
    never used, because ``criterion`` is called with the raw network outputs
    and targets.

    Args:
        net: network to train; moved to the selected device in place.
        dataloaders_dict: dict with the keys ``"train"`` and ``"val"``, each
            yielding ``(images, targets)`` batches where ``targets`` is a list
            of tensors.
        criterion: callable returning ``(loss_l, loss_c)`` for
            ``(outputs, targets)``.
        optimizer: optimizer updating the parameters of ``net``.
        num_epochs: number of epochs to run.
    """

    # Use the GPU when one is available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device：", device)

    # Move the network to the device.
    net.to(device)

    # Let cuDNN pick the fastest algorithms.
    torch.backends.cudnn.benchmark = True

    # Counters
    iteration = 1
    epoch_train_loss = 0.0  # summed training loss of the current epoch
    epoch_val_loss = 0.0  # summed validation loss of the current epoch
    logs = []

    # Epoch loop
    for epoch in range(num_epochs):

        adjust_learning_rate(optimizer, epoch)

        # Remember the start times
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # Training, then (optionally) validation
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # training mode
                print('（train）')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # evaluation mode
                    print('-------------')
                    print('（val）')
                else:
                    # Validation only runs once every 10 epochs
                    continue

            # Mini-batch loop
            for images, targets in dataloaders_dict[phase]:

                # Send the batch to the device
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]  # every tensor in the list

                # Reset the gradients
                optimizer.zero_grad()

                # Gradients are only tracked while training
                with torch.set_grad_enabled(phase == 'train'):
                    # Forward pass
                    outputs = net(images)

                    # Loss: the criterion receives the raw outputs and targets
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        loss.backward()  # back-propagation

                        # Clamp every gradient value to at most 2.0 in
                        # magnitude to keep the update stable
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # parameter update

                        if (iteration % 10 == 0):  # report every 10 iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('イテレーション {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    # Validation
                    else:
                        epoch_val_loss += loss.item()

        # Per-epoch summary
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # Persist the log
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        epoch_train_loss = 0.0  # reset for the next epoch
        epoch_val_loss = 0.0  # reset for the next epoch

        # Save the network every 10 epochs
        if ((epoch+1) % 10 == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'state_dict': net.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, epoch)
            


In [None]:
# Launch the full training run.
num_epochs = 200
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs)

used device： cuda:0
lr is: 0.001
-------------
Epoch 1/200
-------------
（train）
イテレーション 10 || Loss: 3.9452 || 10iter: 7.1056 sec.
イテレーション 20 || Loss: 4.0003 || 10iter: 5.5501 sec.
イテレーション 30 || Loss: 3.3053 || 10iter: 5.6181 sec.
イテレーション 40 || Loss: 3.7027 || 10iter: 5.6020 sec.
イテレーション 50 || Loss: 4.1493 || 10iter: 5.6336 sec.
イテレーション 60 || Loss: 3.9259 || 10iter: 5.7089 sec.
イテレーション 70 || Loss: 4.0375 || 10iter: 5.5935 sec.
イテレーション 80 || Loss: 3.9287 || 10iter: 5.6231 sec.
イテレーション 90 || Loss: 4.8472 || 10iter: 5.6302 sec.
イテレーション 100 || Loss: 4.1358 || 10iter: 5.5893 sec.
イテレーション 110 || Loss: 3.8377 || 10iter: 5.6391 sec.
イテレーション 120 || Loss: 3.8065 || 10iter: 5.6155 sec.
イテレーション 130 || Loss: 4.3254 || 10iter: 5.6417 sec.
イテレーション 140 || Loss: 4.1040 || 10iter: 5.6162 sec.
イテレーション 150 || Loss: 4.2332 || 10iter: 5.6097 sec.
イテレーション 160 || Loss: 4.0288 || 10iter: 5.6263 sec.
イテレーション 170 || Loss: 3.9026 || 10iter: 5.6099 sec.
イテレーション 180 || Loss: 4.2253 || 10iter: 5.5858 sec.
イテレーション 19

イテレーション 1530 || Loss: 4.3608 || 10iter: 4.5054 sec.
イテレーション 1540 || Loss: 3.4786 || 10iter: 4.5117 sec.
イテレーション 1550 || Loss: 4.2487 || 10iter: 4.4609 sec.
-------------
epoch 3 || Epoch_TRAIN_Loss:1973.5958 ||Epoch_VAL_Loss:0.0000
timer:  253.4304 sec.
lr is: 0.001
-------------
Epoch 4/200
-------------
（train）
イテレーション 1560 || Loss: 3.8075 || 10iter: 4.0654 sec.
イテレーション 1570 || Loss: 4.4827 || 10iter: 4.5052 sec.
イテレーション 1580 || Loss: 3.4280 || 10iter: 4.5915 sec.
イテレーション 1590 || Loss: 3.7224 || 10iter: 4.5320 sec.
イテレーション 1600 || Loss: 3.8781 || 10iter: 4.5095 sec.
イテレーション 1610 || Loss: 4.3110 || 10iter: 4.5112 sec.
イテレーション 1620 || Loss: 3.5319 || 10iter: 4.5485 sec.
イテレーション 1630 || Loss: 3.8772 || 10iter: 4.5120 sec.
イテレーション 1640 || Loss: 3.5218 || 10iter: 4.5068 sec.
イテレーション 1650 || Loss: 3.7584 || 10iter: 4.5127 sec.
イテレーション 1660 || Loss: 4.3046 || 10iter: 4.5398 sec.
イテレーション 1670 || Loss: 3.9909 || 10iter: 4.4924 sec.
イテレーション 1680 || Loss: 3.6094 || 10iter: 4.5337 sec.
イテレーション 1