# Set up the dataset

In [1]:
# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function

In [2]:
# import dataset
from utils.dataset import VOCDataset, DatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

In [3]:
# Backbone network: choose an EfficientNet backbone or a ResNet backbone.
backbone = "efficientnet-b0"
# Input-resolution multiplier:
#   scale == 1 -> resolution 300
#   scale == 2 -> resolution 600
scale = 1
# Forwarded to EfficientDet as its `useBiFPN` argument.
useBiFPN = True

## make data.Dataset for training

In [4]:
# Collect image / annotation file paths.
# Point the paths below at your local VOCdevkit checkout.
vocpath = "../VOCdevkit/VOC2007"
(train_img_list, train_anno_list,
 val_img_list, val_anno_list) = make_datapath_list(vocpath)

# Only the first two lists returned for VOC2012 are kept; they are appended
# to the VOC2007 training lists. The remaining two return values are dropped.
vocpath = "../VOCdevkit/VOC2012"
train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)
train_img_list += train_img_list2
train_anno_list += train_anno_list2

print("trainlist: ", len(train_img_list))
print("vallist: ", len(val_img_list))

# Class names used to convert the XML annotations.
voc_classes = [
    'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
    'bus', 'car', 'cat', 'chair', 'cow',
    'diningtable', 'dog', 'horse', 'motorbike', 'person',
    'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
]

color_mean = (104, 117, 123)  # mean colour values, in (BGR) order
input_size = 300 * scale      # square network input size (300 when scale == 1)

# Pre-processing applied to images and to annotations.
transform = DatasetTransform(input_size, color_mean)
transform_anno = Anno_xml2list(voc_classes)

# Dataset objects fed to the DataLoaders.
# Indexing them returns a pre-processed image together with its ground truth.
train_dataset = VOCDataset(
    train_img_list,
    train_anno_list,
    phase="train",
    transform=transform,
    transform_anno=transform_anno,
)
val_dataset = VOCDataset(
    val_img_list,
    val_anno_list,
    phase="val",
    transform=DatasetTransform(input_size, color_mean),
    transform_anno=Anno_xml2list(voc_classes),
)

batch_size = 32

# Only the training loader shuffles; both use the detection-specific collate function.
train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=od_collate_fn,
    num_workers=8,
)
val_dataloader = data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=od_collate_fn,
    num_workers=8,
)

# Bundle both loaders in a dict keyed by phase.
dataloaders_dict = dict(train=train_dataloader, val=val_dataloader)

trainlist:  16551
vallist:  4952


In [5]:
# Sanity check: pull a single mini-batch from the validation loader.
batch_iterator = iter(dataloaders_dict["val"])  # turn the DataLoader into an iterator
images, targets = next(batch_iterator)  # take the first mini-batch
print(images.size())  # e.g. torch.Size([32, 3, 300, 300]) with batch_size=32 and scale=1
print(len(targets))
print(targets[1].shape)  # targets is a list with one entry per image in the batch; each entry is [n, 5], n = number of objects

torch.Size([32, 3, 300, 300])
32
torch.Size([1, 5])


# define EfficientDet model

In [6]:
from utils.efficientdet import EfficientDet

In [7]:
# Default-box (DBox) sizes defined for the 300px base resolution.
# They are scaled element-wise with `scale`. Note that `list * int` in Python
# repeats the list instead of multiplying its elements, so the scaling has to
# be done per element.
_base_min_sizes = [30, 60, 111, 162, 213, 264]
_base_max_sizes = [60, 111, 162, 213, 264, 315]

# Spatial size of each source feature map, per supported scale.
_feature_maps_by_scale = {
    1: [37, 18, 9, 5, 3, 1],
    2: [75, 38, 19, 10, 5, 3],
}

if scale not in _feature_maps_by_scale:
    # Fail here with a clear message instead of a NameError on `ssd_cfg` later.
    raise ValueError(
        "unsupported scale {!r}; expected one of {}".format(
            scale, sorted(_feature_maps_by_scale)))

ssd_cfg = {
    'num_classes': 21,  # total number of classes, including the background class
    'input_size': 300*scale,  # input image size
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # number of DBox aspect-ratio variants per source
    'feature_maps': _feature_maps_by_scale[scale],  # feature-map size of each source
    'steps': [8, 16, 32, 64, 100, 300],  # determines the DBox size
    'min_sizes': [size * scale for size in _base_min_sizes],  # determines the DBox size
    'max_sizes': [size * scale for size in _base_max_sizes],  # determines the DBox size
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
}

# Smoke test: run one random image of the configured resolution through the net.
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=True, backbone=backbone, useBiFPN=useBiFPN)
out = net(torch.rand([1, 3, ssd_cfg['input_size'], ssd_cfg['input_size']]))
print(out[0].size())

Loaded pretrained weights for efficientnet-b0
layerc3: torch.Size([1, 40, 37, 37])
layerc4: torch.Size([1, 80, 18, 18])
layerc5: torch.Size([1, 320, 9, 9])
layer size: torch.Size([1, 256, 37, 37])
layer size: torch.Size([1, 256, 18, 18])
layer size: torch.Size([1, 256, 9, 9])
layer size: torch.Size([1, 256, 5, 5])
layer size: torch.Size([1, 256, 3, 3])
layer size: torch.Size([1, 256, 1, 1])
torch.Size([1, 8096, 4])


  "See the documentation of nn.Upsample for details.".format(mode))


In [8]:
# Build a fresh model instance for training (verbose=False this time).
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=False, backbone=backbone, useBiFPN=useBiFPN)

# Check whether a GPU is available and pick the device accordingly.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using:", device)

# NOTE(review): this cell performs no explicit weight loading or initialisation;
# the message below is only a progress marker.
print("set weights!")

Loaded pretrained weights for efficientnet-b0
using: cuda:0
set weights!


In [9]:
# Dump the module hierarchy of the model for inspection.
print(net)

EfficientDet(
  (layer0): Sequential(
    (0): Conv2dStaticSamePadding(
      3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
      (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
    )
    (1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  )
  (layer2): Sequential(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
   

In [10]:
from utils.ssd_model import MultiBoxLoss

# Define the loss.
# NOTE(review): the meaning of jaccard_thresh / neg_pos is defined in
# utils.ssd_model — presumably the matching IoU threshold and the
# negative:positive sampling ratio; confirm there.
criterion = MultiBoxLoss(jaccard_thresh=0.5,neg_pos=3, device=device)

# Optimizer: SGD with momentum and weight decay.
# The lr given here is overwritten at the start of every epoch by
# adjust_learning_rate(), which train_model() calls.
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)

In [11]:
def get_current_lr(epoch, base_lr=1e-3, decay_epochs=(120, 180), gamma=0.1):
    """Return the step-decayed learning rate for a given epoch.

    The rate starts at ``base_lr`` and is multiplied by ``gamma`` once for
    every milestone in ``decay_epochs`` that ``epoch`` has reached.

    Args:
        epoch: zero-based epoch index.
        base_lr: learning rate before any decay (default 1e-3).
        decay_epochs: iterable of epoch indices at which the rate decays
            (default ``(120, 180)``).
        gamma: multiplicative decay factor applied per reached milestone
            (default 0.1).

    Returns:
        The learning rate (float) to use during ``epoch``.
    """
    lr = base_lr
    for milestone in decay_epochs:
        if epoch >= milestone:
            # Repeated multiplication (rather than a power) keeps the default
            # results bit-identical to the previous hard-coded schedule.
            lr *= gamma
    return lr

def adjust_learning_rate(optimizer, epoch):
    """Set every parameter group of ``optimizer`` to the rate scheduled for ``epoch``.

    The rate is looked up with ``get_current_lr(epoch)``, printed, and then
    written into the ``'lr'`` entry of each entry in ``optimizer.param_groups``.
    """
    scheduled_lr = get_current_lr(epoch)
    print("lr is:", scheduled_lr)
    for group in optimizer.param_groups:
        group.update({'lr': scheduled_lr})

In [12]:
# Training / validation loop for the model.
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` for ``num_epochs`` epochs.

    Every epoch runs the training phase; the validation phase runs only on
    every 10th epoch. Epoch losses are the *sum* of the per-batch losses (not
    an average), so ``val_loss`` is 0.0 for epochs without validation.

    Args:
        net: model to train; it is moved to the selected device.
        dataloaders_dict: dict with ``'train'`` and ``'val'`` entries, each an
            iterable yielding ``(images, targets)`` mini-batches where
            ``targets`` is a list of tensors.
        criterion: callable mapping ``(outputs, targets)`` to
            ``(loss_l, loss_c)``; the two terms are added to form the loss.
        optimizer: optimizer for ``net``; its learning rate is reset at the
            start of each epoch through ``adjust_learning_rate``.
        num_epochs: number of epochs to run.

    Side effects:
        * Enables ``torch.backends.cudnn.benchmark``.
        * Rewrites ``log_output.csv`` with the accumulated log after each epoch.
        * Saves ``net.state_dict()`` under ``weights/`` every 10 epochs. The
          file name is built from the notebook-level globals ``backbone`` and
          ``scale``.
    """

    # Check whether a GPU is available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device:", device)

    # Move the network to the selected device.
    net.to(device)

    # Speeds things up when the network structure is more or less fixed.
    torch.backends.cudnn.benchmark = True

    # Make sure the checkpoint directory exists before training starts, so the
    # first torch.save() cannot fail after hours of training.
    os.makedirs('weights', exist_ok=True)

    # Iteration counter and per-epoch loss accumulators.
    iteration = 1
    epoch_train_loss = 0.0  # sum of training losses within the epoch
    epoch_val_loss = 0.0  # sum of validation losses within the epoch
    logs = []

    # Epoch loop: exactly num_epochs iterations (epoch = 0 .. num_epochs-1),
    # so the banner below ends at "Epoch num_epochs/num_epochs".
    for epoch in range(num_epochs):

        adjust_learning_rate(optimizer, epoch)

        # Record start times for the epoch timer and the 10-iteration timer.
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # Training and validation phases of this epoch.
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # put the model in training mode
                print('(train)')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # put the model in evaluation mode
                    print('-------------')
                    print('(val)')
                else:
                    # Validation is performed only once every 10 epochs.
                    continue

            # Loop over the mini-batches of the current phase.
            for images, targets in dataloaders_dict[phase]:

                # Send the data to the selected device.
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]  # move every tensor of the list

                # Reset the accumulated gradients.
                optimizer.zero_grad()

                # Forward pass; gradients are tracked only while training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # Loss computation.
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    # Back-propagation during training.
                    if phase == 'train':
                        loss.backward()  # compute gradients

                        # Overly large gradients make training unstable, so
                        # gradient values are clipped with clip_value=2.0.
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # update the parameters

                        if (iteration % 10 == 0):  # report the loss every 10 iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    # Validation.
                    else:
                        epoch_val_loss += loss.item()

        # Per-epoch summary of the loss of each phase.
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # Save the log (the whole CSV is rewritten every epoch).
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        # Reset the accumulators for the next epoch.
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        # Save the network every 10 epochs.
        if ((epoch+1) % 10 == 0):
            torch.save(net.state_dict(), 'weights/'+backbone+"_" + str(300*scale) + "_" +
                       str(epoch+1) + '.pth')


In [None]:
# Launch training. The learning-rate schedule in get_current_lr() decays at
# epochs 120 and 180.
num_epochs = 200
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

used device: cuda:0
lr is: 0.001
-------------
Epoch 1/200
-------------
(train)


  "See the documentation of nn.Upsample for details.".format(mode))


Iter 10 || Loss: 29.6469 || 10iter: 5.7856 sec.
Iter 20 || Loss: 27.7266 || 10iter: 2.5724 sec.
Iter 30 || Loss: 22.7666 || 10iter: 2.3900 sec.
Iter 40 || Loss: 20.2654 || 10iter: 2.4015 sec.
Iter 50 || Loss: 19.1678 || 10iter: 2.5208 sec.
Iter 60 || Loss: 17.8210 || 10iter: 2.4121 sec.
Iter 70 || Loss: 16.9245 || 10iter: 2.4418 sec.
Iter 80 || Loss: 16.1897 || 10iter: 2.4140 sec.
Iter 90 || Loss: 15.3084 || 10iter: 2.4101 sec.
Iter 100 || Loss: 15.1703 || 10iter: 2.4014 sec.
Iter 110 || Loss: 13.9548 || 10iter: 2.3977 sec.
Iter 120 || Loss: 13.1043 || 10iter: 2.4734 sec.
Iter 130 || Loss: 12.4970 || 10iter: 2.4313 sec.
Iter 140 || Loss: 11.9180 || 10iter: 2.4973 sec.
Iter 150 || Loss: 10.7936 || 10iter: 2.4723 sec.
Iter 160 || Loss: 10.2956 || 10iter: 2.4339 sec.
Iter 170 || Loss: 11.9987 || 10iter: 2.4240 sec.
Iter 180 || Loss: 10.1595 || 10iter: 2.4878 sec.
Iter 190 || Loss: 9.0999 || 10iter: 2.4149 sec.
Iter 200 || Loss: 9.3735 || 10iter: 2.4926 sec.
Iter 210 || Loss: 7.7816 || 10i

  "See the documentation of nn.Upsample for details.".format(mode))


Iter 520 || Loss: 5.7972 || 10iter: 2.2280 sec.
Iter 530 || Loss: 5.9533 || 10iter: 2.6162 sec.
Iter 540 || Loss: 5.7764 || 10iter: 2.4010 sec.
Iter 550 || Loss: 5.6730 || 10iter: 2.4691 sec.
Iter 560 || Loss: 6.1579 || 10iter: 2.4377 sec.
Iter 570 || Loss: 6.1104 || 10iter: 2.5114 sec.
Iter 580 || Loss: 6.3605 || 10iter: 2.5322 sec.
Iter 590 || Loss: 5.7485 || 10iter: 2.4497 sec.
Iter 600 || Loss: 5.4334 || 10iter: 2.4007 sec.
Iter 610 || Loss: 6.1932 || 10iter: 2.4010 sec.
Iter 620 || Loss: 6.2750 || 10iter: 2.5245 sec.
Iter 630 || Loss: 5.5166 || 10iter: 2.4488 sec.
Iter 640 || Loss: 5.9516 || 10iter: 2.4754 sec.
Iter 650 || Loss: 6.2545 || 10iter: 2.4722 sec.
Iter 660 || Loss: 5.5212 || 10iter: 2.5185 sec.
Iter 670 || Loss: 6.3648 || 10iter: 2.5049 sec.
Iter 680 || Loss: 6.2307 || 10iter: 2.4118 sec.
Iter 690 || Loss: 6.1977 || 10iter: 2.3956 sec.
Iter 700 || Loss: 5.8070 || 10iter: 2.4754 sec.
Iter 710 || Loss: 6.1094 || 10iter: 2.5198 sec.
Iter 720 || Loss: 6.0915 || 10iter: 2.41

Iter 2110 || Loss: 4.5929 || 10iter: 2.4400 sec.
Iter 2120 || Loss: 4.5427 || 10iter: 2.4375 sec.
Iter 2130 || Loss: 4.3950 || 10iter: 2.4435 sec.
Iter 2140 || Loss: 4.6723 || 10iter: 2.4848 sec.
Iter 2150 || Loss: 4.8161 || 10iter: 2.5091 sec.
Iter 2160 || Loss: 4.5712 || 10iter: 2.4049 sec.
Iter 2170 || Loss: 5.4162 || 10iter: 2.4287 sec.
Iter 2180 || Loss: 4.9540 || 10iter: 2.4788 sec.
Iter 2190 || Loss: 4.6401 || 10iter: 2.4451 sec.
Iter 2200 || Loss: 5.4198 || 10iter: 2.4532 sec.
Iter 2210 || Loss: 4.6213 || 10iter: 2.5303 sec.
Iter 2220 || Loss: 4.5708 || 10iter: 2.4666 sec.
Iter 2230 || Loss: 4.9106 || 10iter: 2.4277 sec.
Iter 2240 || Loss: 4.5620 || 10iter: 2.4446 sec.
Iter 2250 || Loss: 4.4419 || 10iter: 2.4708 sec.
Iter 2260 || Loss: 4.9993 || 10iter: 2.4447 sec.
Iter 2270 || Loss: 4.4786 || 10iter: 2.4677 sec.
Iter 2280 || Loss: 4.8669 || 10iter: 2.4061 sec.
Iter 2290 || Loss: 4.7898 || 10iter: 2.4336 sec.
Iter 2300 || Loss: 4.1915 || 10iter: 2.4760 sec.
Iter 2310 || Loss: 4

Iter 3690 || Loss: 3.8857 || 10iter: 2.4995 sec.
Iter 3700 || Loss: 4.5967 || 10iter: 2.4365 sec.
Iter 3710 || Loss: 4.3485 || 10iter: 2.3928 sec.
Iter 3720 || Loss: 3.4526 || 10iter: 2.4347 sec.
Iter 3730 || Loss: 4.5667 || 10iter: 2.4212 sec.
Iter 3740 || Loss: 4.9599 || 10iter: 2.4614 sec.
Iter 3750 || Loss: 4.0177 || 10iter: 2.4907 sec.
Iter 3760 || Loss: 4.4663 || 10iter: 2.4096 sec.
Iter 3770 || Loss: 4.4304 || 10iter: 2.4432 sec.
Iter 3780 || Loss: 4.1937 || 10iter: 2.4066 sec.
Iter 3790 || Loss: 4.2452 || 10iter: 2.4045 sec.
Iter 3800 || Loss: 4.7228 || 10iter: 2.4305 sec.
Iter 3810 || Loss: 4.3935 || 10iter: 2.4974 sec.
Iter 3820 || Loss: 4.1687 || 10iter: 2.4436 sec.
Iter 3830 || Loss: 4.1716 || 10iter: 2.4119 sec.
Iter 3840 || Loss: 4.7033 || 10iter: 2.4222 sec.
Iter 3850 || Loss: 4.6345 || 10iter: 2.4044 sec.
Iter 3860 || Loss: 4.1382 || 10iter: 2.3899 sec.
Iter 3870 || Loss: 4.1831 || 10iter: 2.4243 sec.
Iter 3880 || Loss: 4.4006 || 10iter: 2.5000 sec.
Iter 3890 || Loss: 4