# setup dataset

In [1]:
# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function

In [2]:
# import dataset
from utils.dataset import VOCDataset, DatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn


In [3]:
backbone = "resnet50"
scale = 2 
# scale==1: resolution 300
# scale==2: resolution 600

## make data.Dataset for training

In [4]:
# load files
# set your VOCdevkit path!
vocpath = "../VOCdevkit/VOC2007"
train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(vocpath)

vocpath = "../VOCdevkit/VOC2012"
train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)

train_img_list.extend(train_img_list2)
train_anno_list.extend(train_anno_list2)

print("trainlist: ", len(train_img_list))
print("vallist: ", len(val_img_list))

# make Dataset
voc_classes = ['aeroplane', 'bicycle', 'bird', 'boat',
               'bottle', 'bus', 'car', 'cat', 'chair',
               'cow', 'diningtable', 'dog', 'horse',
               'motorbike', 'person', 'pottedplant',
               'sheep', 'sofa', 'train', 'tvmonitor']

color_mean = (104, 117, 123)  # (BGR)の色の平均値
input_size = 300*scale  # 画像のinputサイズを300×300にする

## DatasetTransformを適応
transform = DatasetTransform(input_size, color_mean)
transform_anno = Anno_xml2list(voc_classes)

# Dataloaderに入れるデータセットファイル。
# ゲットで叩くと画像とGTを前処理して出力してくれる。
train_dataset = VOCDataset(train_img_list, train_anno_list, phase = "train", transform=transform, transform_anno = transform_anno)
val_dataset = VOCDataset(val_img_list, val_anno_list, phase="val", transform=DatasetTransform(
    input_size, color_mean), transform_anno=Anno_xml2list(voc_classes))

batch_size = 16

train_dataloader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=od_collate_fn, num_workers=8)

val_dataloader = data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=od_collate_fn, num_workers=8)

# 辞書型変数にまとめる
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

trainlist:  16551
vallist:  4952


In [5]:
# 動作の確認
batch_iterator = iter(dataloaders_dict["val"])  # イタレータに変換
images, targets = next(batch_iterator)  # 1番目の要素を取り出す
print(images.size())  # torch.Size([4, 3, 300, 300])
print(len(targets))
print(targets[1].shape)  # ミニバッチのサイズのリスト、各要素は[n, 5]、nは物体数

torch.Size([16, 3, 600, 600])
16
torch.Size([1, 5])


# define SSD model

In [6]:
from utils.retinanet import RetinaFPN as SSD
from utils.retinanet import Bottleneck

In [7]:
if scale==1:
    ssd_cfg = {
        'num_classes': 21,  # 背景クラスを含めた合計クラス数
        'input_size': 300*scale,  # 画像の入力サイズ
        'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # 出力するDBoxのアスペクト比の種類
        'feature_maps': [38, 19, 10, 5, 3, 1],  # 各sourceの画像サイズ
        'steps': [8, 16, 32, 64, 100, 300],  # DBOXの大きさを決める
        'min_sizes': [30, 60, 111, 162, 213, 264],  # DBOXの大きさを決める
        'max_sizes': [60, 111, 162, 213, 264, 315],  # DBOXの大きさを決める
        'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    }
elif scale==2:
    ssd_cfg = {
        'num_classes': 21,  # 背景クラスを含めた合計クラス数
        'input_size': 300*scale,  # 画像の入力サイズ
        'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # 出力するDBoxのアスペクト比の種類
        'feature_maps': [75, 38, 19, 10, 5, 3],  # 各sourceの画像サイズ
        'steps': [8, 16, 32, 64, 100, 300],  # DBOXの大きさを決める
        'min_sizes': [30, 60, 111, 162, 213, 264]*scale,  # DBOXの大きさを決める
        'max_sizes': [60, 111, 162, 213, 264, 315]*scale,  # DBOXの大きさを決める
        'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    }

# test if net works
net = SSD(phase="train", cfg=ssd_cfg, model=backbone, verbose=True)
net(torch.rand([1,3,600,600]))

layer size: torch.Size([1, 256, 75, 75])
layer size: torch.Size([1, 256, 38, 38])
layer size: torch.Size([1, 256, 19, 19])
layer size: torch.Size([1, 256, 10, 10])
layer size: torch.Size([1, 256, 5, 5])
layer size: torch.Size([1, 256, 3, 3])


  "See the documentation of nn.Upsample for details.".format(mode))


(tensor([[[ 0.1804, -0.1486,  0.1303,  0.2200],
          [-0.0645, -0.0586, -0.3193,  0.0754],
          [-0.0402, -0.2396, -0.0114,  0.0661],
          ...,
          [ 0.0448, -0.0307, -0.0462,  0.0433],
          [-0.0200,  0.0512,  0.0387,  0.0380],
          [-0.0041,  0.0390,  0.0319,  0.0450]]], grad_fn=<ViewBackward>),
 tensor([[[ 0.1156, -0.3100,  0.1033,  ...,  0.2048,  0.2719,  0.1458],
          [-0.1009,  0.0542, -0.2932,  ..., -0.0154, -0.2245,  0.3272],
          [-0.1375,  0.0777,  0.0208,  ...,  0.2686, -0.3814,  0.1226],
          ...,
          [ 0.0064,  0.0448,  0.0303,  ..., -0.0235,  0.0023, -0.0137],
          [-0.0238,  0.0041, -0.0016,  ...,  0.0302,  0.0225,  0.0652],
          [ 0.0637, -0.0090,  0.0159,  ...,  0.0209,  0.0354, -0.0230]]],
        grad_fn=<ViewBackward>),
 tensor([[0.0067, 0.0067, 0.0500, 0.0500],
         [0.0067, 0.0067, 0.0707, 0.0707],
         [0.0067, 0.0067, 0.0707, 0.0354],
         ...,
         [1.0000, 1.0000, 0.4806, 0.4806],
  

In [8]:
net = SSD(phase="train", cfg=ssd_cfg, model=backbone, verbose=False)

# GPUが使えるか確認
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using:", device)

print("set weights!")

using: cuda:0
set weights!


In [9]:
print(net)

RetinaFPN(
  (layer0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (layer2): Sequential(
    (0): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (downsample): Sequential(
       

In [10]:
from utils.ssd_model import MultiBoxLoss

# define loss
criterion = MultiBoxLoss(jaccard_thresh=0.5,neg_pos=3, device=device)

# optim
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)

In [11]:
def get_current_lr(epoch):
    lr = 1e-3
    for i,lr_decay_epoch in enumerate([120,180]):
        if epoch >= lr_decay_epoch:
            lr *= 0.1
    return lr

def adjust_learning_rate(optimizer, epoch):
    lr = get_current_lr(epoch)
    print("lr is:", lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [12]:
# モデルを学習させる関数を作成


def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):

    # GPUが使えるかを確認
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device:", device)

    # ネットワークをGPUへ
    net.to(device)

    # ネットワークがある程度固定であれば、高速化させる
    torch.backends.cudnn.benchmark = True

    # イテレーションカウンタをセット
    iteration = 1
    epoch_train_loss = 0.0  # epochの損失和
    epoch_val_loss = 0.0  # epochの損失和
    logs = []

    # epochのループ
    for epoch in range(num_epochs+1):
        
        adjust_learning_rate(optimizer, epoch)
        
        # 開始時刻を保存
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # epochごとの訓練と検証のループ
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # モデルを訓練モードに
                print('(train)')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # モデルを検証モードに
                    print('-------------')
                    print('(val)')
                else:
                    # 検証は10回に1回だけ行う
                    continue

            # データローダーからminibatchずつ取り出すループ
            for images, targets in dataloaders_dict[phase]:

                # GPUが使えるならGPUにデータを送る
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]  # リストの各要素のテンソルをGPUへ

                # optimizerを初期化
                optimizer.zero_grad()

                # 順伝搬（forward）計算
                with torch.set_grad_enabled(phase == 'train'):
                    # 順伝搬（forward）計算
                    outputs = net(images)

                    # 損失の計算
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    # 訓練時はバックプロパゲーション
                    if phase == 'train':
                        loss.backward()  # 勾配の計算

                        # 勾配が大きくなりすぎると計算が不安定になるので、clipで最大でも勾配2.0に留める
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # パラメータ更新

                        if (iteration % 10 == 0):  # 10iterに1度、lossを表示
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    # 検証時
                    else:
                        epoch_val_loss += loss.item()

        # epochのphaseごとのlossと正解率
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        # ログを保存
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        epoch_train_loss = 0.0  # epochの損失和
        epoch_val_loss = 0.0  # epochの損失和

        # ネットワークを保存する
        if ((epoch+1) % 10 == 0):
            torch.save(net.state_dict(), 'weights/retinanet' + str(300*scale) + "_" + 
                       str(epoch+1) + '.pth')


In [None]:
num_epochs = 200
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

used device: cuda:0
lr is: 0.001
-------------
Epoch 1/200
-------------
(train)


  "See the documentation of nn.Upsample for details.".format(mode))


Iter 10 || Loss: 15.1184 || 10iter: 8.8231 sec.
Iter 20 || Loss: 11.7034 || 10iter: 4.5247 sec.
Iter 30 || Loss: 8.7539 || 10iter: 4.5405 sec.
Iter 40 || Loss: 9.1248 || 10iter: 4.5388 sec.
Iter 50 || Loss: 8.7309 || 10iter: 4.5423 sec.
Iter 60 || Loss: 7.9078 || 10iter: 4.5409 sec.
Iter 70 || Loss: 7.8319 || 10iter: 5.3253 sec.
Iter 80 || Loss: 8.4322 || 10iter: 4.5846 sec.
Iter 90 || Loss: 8.3300 || 10iter: 4.6009 sec.
Iter 100 || Loss: 7.9276 || 10iter: 4.5954 sec.
Iter 110 || Loss: 8.3744 || 10iter: 4.5519 sec.
Iter 120 || Loss: 8.6325 || 10iter: 4.5450 sec.
Iter 130 || Loss: 7.9593 || 10iter: 4.5543 sec.
Iter 140 || Loss: 7.8965 || 10iter: 4.5513 sec.
Iter 150 || Loss: 7.1805 || 10iter: 4.5505 sec.
Iter 160 || Loss: 8.0453 || 10iter: 4.5604 sec.
Iter 170 || Loss: 8.0388 || 10iter: 4.5619 sec.
Iter 180 || Loss: 7.5218 || 10iter: 4.7218 sec.
Iter 190 || Loss: 6.8632 || 10iter: 4.8312 sec.
Iter 200 || Loss: 7.5156 || 10iter: 4.5648 sec.
Iter 210 || Loss: 7.0827 || 10iter: 4.5783 sec.

  "See the documentation of nn.Upsample for details.".format(mode))


Iter 1040 || Loss: 5.0385 || 10iter: 3.5844 sec.
Iter 1050 || Loss: 5.3435 || 10iter: 4.5392 sec.
Iter 1060 || Loss: 4.7791 || 10iter: 4.5535 sec.
Iter 1070 || Loss: 5.2882 || 10iter: 4.5566 sec.
Iter 1080 || Loss: 5.8744 || 10iter: 4.5702 sec.
Iter 1090 || Loss: 5.4367 || 10iter: 4.5655 sec.
Iter 1100 || Loss: 5.8032 || 10iter: 4.5827 sec.
Iter 1110 || Loss: 5.5832 || 10iter: 4.5603 sec.
Iter 1120 || Loss: 5.7064 || 10iter: 4.5701 sec.
Iter 1130 || Loss: 5.9161 || 10iter: 4.5509 sec.
Iter 1140 || Loss: 5.9194 || 10iter: 4.5597 sec.
Iter 1150 || Loss: 5.2934 || 10iter: 4.5629 sec.
Iter 1160 || Loss: 4.6196 || 10iter: 4.5565 sec.
Iter 1170 || Loss: 6.4706 || 10iter: 4.5541 sec.
Iter 1180 || Loss: 5.2216 || 10iter: 4.5696 sec.
Iter 1190 || Loss: 5.7202 || 10iter: 4.6049 sec.
Iter 1200 || Loss: 5.1343 || 10iter: 4.5696 sec.
Iter 1210 || Loss: 5.1706 || 10iter: 4.5643 sec.
Iter 1220 || Loss: 5.4595 || 10iter: 4.5586 sec.
Iter 1230 || Loss: 5.6616 || 10iter: 4.5607 sec.
Iter 1240 || Loss: 5