# setup dataset

In [1]:
# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function
from utils.to_fp16 import network_to_half

In [2]:
# import dataset
from utils.dataset import VOCDataset, DatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

In [3]:
## meta settings

# select from efficientnet backbone or resnet backbone
backbone = "efficientnet-b0"
scale = 2
# scale==1: resolution 300
# scale==2: resolution 600
useBiFPN = True
HALF = True # enable FP16
DATASET = "VOC"
retina = False # for trying retinanets

## make data.Dataset for training

In [4]:
if not DATASET == "COCO":
    # load files
    # set your VOCdevkit path here.
    vocpath = "../VOCdevkit/VOC2007"
    train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(vocpath)

    vocpath = "../VOCdevkit/VOC2012"
    train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)

    train_img_list.extend(train_img_list2)
    train_anno_list.extend(train_anno_list2)

    print("trainlist: ", len(train_img_list))
    print("vallist: ", len(val_img_list))

    # make Dataset
    voc_classes = ['aeroplane', 'bicycle', 'bird', 'boat',
                   'bottle', 'bus', 'car', 'cat', 'chair',
                   'cow', 'diningtable', 'dog', 'horse',
                   'motorbike', 'person', 'pottedplant',
                   'sheep', 'sofa', 'train', 'tvmonitor']

    color_mean = (104, 117, 123)  # (BGR)の色の平均値
    if scale == 1:
        input_size = 300  # 画像のinputサイズを300×300にする
    else:
        input_size = 512

    ## DatasetTransformを適応
    transform = DatasetTransform(input_size, color_mean)
    transform_anno = Anno_xml2list(voc_classes)

    # Dataloaderに入れるデータセットファイル。
    # ゲットで叩くと画像とGTを前処理して出力してくれる。
    train_dataset = VOCDataset(train_img_list, train_anno_list, phase = "train", transform=transform, transform_anno = transform_anno)
    val_dataset = VOCDataset(val_img_list, val_anno_list, phase="val", transform=DatasetTransform(
        input_size, color_mean), transform_anno=Anno_xml2list(voc_classes))

else:
    from dataset.coco import COCODetection
    import torch.utils.data as data
    from utils.dataset import VOCDataset, COCODatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

    color_mean = (104, 117, 123)  # (BGR)の色の平均値
    if scale == 1:
        input_size = 300  # 画像のinputサイズを300×300にする
    else:
        input_size = 512

    ## DatasetTransformを適応
    transform = COCODatasetTransform(input_size, color_mean)
    train_dataset = COCODetection("../data/coco/", image_set="train2014", phase="train", transform=transform)
    val_dataset = COCODetection("../data/coco/", image_set="val2014", phase="val", transform=transform)
    
batch_size = 32

train_dataloader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=od_collate_fn, num_workers=8)

val_dataloader = data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=od_collate_fn, num_workers=8)

# 辞書型変数にまとめる
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

trainlist:  16551
vallist:  4952


In [5]:
# 動作の確認
batch_iterator = iter(dataloaders_dict["val"])  # イタレータに変換
images, targets = next(batch_iterator)  # 1番目の要素を取り出す
print(images.size())  # torch.Size([4, 3, 300, 300])
print(len(targets))
print(targets[1].shape)  # ミニバッチのサイズのリスト、各要素は[n, 5]、nは物体数

torch.Size([32, 3, 512, 512])
32
torch.Size([1, 5])


# define EfficientDet model

In [6]:
from utils.efficientdet import EfficientDet

In [7]:
if not DATASET == "COCO":
    num_class = 21
else:
    num_class = 81

if scale==1:
    ssd_cfg = {
        'num_classes': num_class,  # 背景クラスを含めた合計クラス数
        'input_size': 300*scale,  # 画像の入力サイズ
        'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # 出力するDBoxのアスペクト比の種類
        'feature_maps': [37, 18, 9, 5, 3, 1],  # 各sourceの画像サイズ
        'steps': [8, 16, 32, 64, 100, 300],  # DBOXの大きさを決める
        'min_sizes': [30, 60, 111, 162, 213, 264],  # DBOXの大きさを決める
        'max_sizes': [60, 111, 162, 213, 264, 315],  # DBOXの大きさを決める
        'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    }
elif scale==2:
    ssd_cfg = {
        'num_classes': num_class,  # 背景クラスを含めた合計クラス数
        'input_size': 512,  # 画像の入力サイズ
        'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # 出力するDBoxのアスペクト比の種類
        'feature_maps': [64, 32, 16, 8, 4, 2],  # 各sourceの画像サイズ
        'steps': [8, 16, 32, 64, 100, 300],  # DBOXの大きさを決める
        'min_sizes': [30, 60, 111, 162, 213, 264]*scale,  # DBOXの大きさを決める
        'max_sizes': [60, 111, 162, 213, 264, 315]*scale,  # DBOXの大きさを決める
        'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    }

# test if net works
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=True, backbone=backbone, useBiFPN=useBiFPN)
out = net(torch.rand([1,3,300,300]))
print(out[0].size())

Loaded pretrained weights for efficientnet-b0
use BiFPN
layerc3: torch.Size([1, 40, 37, 37])
layerc4: torch.Size([1, 80, 18, 18])
layerc5: torch.Size([1, 320, 9, 9])
layer size: torch.Size([1, 256, 37, 37])
layer size: torch.Size([1, 256, 18, 18])
layer size: torch.Size([1, 256, 9, 9])
layer size: torch.Size([1, 256, 5, 5])
layer size: torch.Size([1, 256, 3, 3])
layer size: torch.Size([1, 256, 1, 1])
torch.Size([1, 8096, 4])


  "See the documentation of nn.Upsample for details.".format(mode))


In [8]:
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=False, backbone=backbone, useBiFPN=useBiFPN)

if retina:
    from utils.retinanet import RetinaFPN
    ssd_cfg = {
        'num_classes': num_class,  # 背景クラスを含めた合計クラス数
        'input_size': 300*scale,  # 画像の入力サイズ
        'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # 出力するDBoxのアスペクト比の種類
        'feature_maps': [38, 19, 10, 5, 3, 1],  # 各sourceの画像サイズ
        'steps': [8, 16, 32, 64, 100, 300],  # DBOXの大きさを決める
        'min_sizes': [30, 60, 111, 162, 213, 264],  # DBOXの大きさを決める
        'max_sizes': [60, 111, 162, 213, 264, 315],  # DBOXの大きさを決める
        'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    }
    net = RetinaFPN("train", ssd_cfg)

# GPUが使えるか確認
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using:", device)
net = net.to(device)
print("set weights!")

Loaded pretrained weights for efficientnet-b0
use BiFPN
using: cuda:0
set weights!


In [9]:
print(net)

EfficientDet(
  (layer0): Sequential(
    (0): Conv2dStaticSamePadding(
      3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
      (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
    )
    (1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  )
  (layer2): Sequential(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
   

In [10]:
from utils.ssd_model import MultiBoxLoss

# define loss
criterion = MultiBoxLoss(jaccard_thresh=0.5,neg_pos=3, device=device, half=HALF)

# optim
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)

In [11]:
if HALF:
    from apex import amp, optimizers
    # Initialization
    opt_level = 'O1'
    net, optimizer = amp.initialize(net, optimizer, opt_level=opt_level)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [12]:
def get_current_lr(epoch):
    
    if DATASET == "COCO":
        reduce = [20, 40]
        # warmup
        if epoch < 1:
            lr = 1e-4
        else:
            lr = 1e-3
    else:
        reduce = [120,180]
        lr = 1e-3
        
    for i,lr_decay_epoch in enumerate(reduce):
        if epoch >= lr_decay_epoch:
            lr *= 0.1
    return lr

def adjust_learning_rate(optimizer, epoch):
    lr = get_current_lr(epoch)
    print("lr is:", lr)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

In [13]:
# モデルを学習させる関数を作成
batcht=[]
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):

    # GPUが使えるかを確認
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device:", device)

    # ネットワークをGPUへ
    net.to(device)

    # ネットワークがある程度固定であれば、高速化させる
    torch.backends.cudnn.benchmark = True

    # イテレーションカウンタをセット
    iteration = 1
    epoch_train_loss = 0.0  # epochの損失和
    epoch_val_loss = 0.0  # epochの損失和
    logs = []

    # epochのループ
    for epoch in range(num_epochs+1):
        
        adjust_learning_rate(optimizer, epoch)
        
        # 開始時刻を保存
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # epochごとの訓練と検証のループ
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # モデルを訓練モードに
                print('(train)')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # モデルを検証モードに
                    print('-------------')
                    print('(val)')
                else:
                    # 検証は10回に1回だけ行う
                    continue

            # データローダーからminibatchずつ取り出すループ
            for images, targets in dataloaders_dict[phase]:

                # GPUが使えるならGPUにデータを送る
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]  # リストの各要素のテンソルをGPUへ
                if HALF:
                    images = images.half()
                    targets = [ann.half() for ann in targets]
                # optimizerを初期化
                optimizer.zero_grad()

                # 順伝搬（forward）計算
                with torch.set_grad_enabled(phase == 'train'):
                    # 順伝搬（forward）計算
                    tick = time.time()
                    outputs = net(images)
                    tock = time.time()
                    batcht.append(tock-tick)
                    print("batch time:", np.mean(batcht))
                    #print(outputs[0].type())
                    # 損失の計算
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    # 訓練時はバックプロパゲーション
                    if phase == 'train':
                        if HALF:
                            with amp.scale_loss(loss, optimizer) as scaled_loss:
                                scaled_loss.backward()
                        else:
                            loss.backward()  # 勾配の計算

                        # 勾配が大きくなりすぎると計算が不安定になるので、clipで最大でも勾配2.0に留める
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # パラメータ更新

                        if (iteration % 10 == 0):  # 10iterに1度、lossを表示
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    # 検証時
                    else:
                        epoch_val_loss += loss.item()

        # epochのphaseごとのlossと正解率
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))
        t_epoch_start = time.time()

        # ログを保存
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        epoch_train_loss = 0.0  # epochの損失和
        epoch_val_loss = 0.0  # epochの損失和

        # ネットワークを保存する
        if ((epoch+1) % 10 == 0):
            if useBiFPN:
                word="BiFPN"
            else:
                word="FPN"
            torch.save(net.state_dict(), 'weights/'+DATASET+"_"+backbone+"_" + str(300*scale) + "_" + word + "_" + 
                       str(epoch+1) + '.pth')


In [None]:
if DATASET == "COCO":
    num_epochs = 50
else:
    num_epochs = 200
    
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

used device: cuda:0
lr is: 0.001
-------------
Epoch 1/200
-------------
(train)


  "See the documentation of nn.Upsample for details.".format(mode))


batch time: 1.4208438396453857
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
batch time: 0.749925971031189
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0
batch time: 0.5173380374908447
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 8192.0
batch time: 0.400715708732605
batch time: 0.3310044288635254
batch time: 0.288063923517863
batch time: 0.2540460654667446
batch time: 0.22861766815185547
batch time: 0.2091119024488661
batch time: 0.19361252784729005
Iter 10 || Loss: 16.3804 || 10iter: 10.8001 sec.
batch time: 0.18068920482288708
batch time: 0.16980804999669394
batch time: 0.16061062079209548
batch time: 0.15274827820914133
batch time: 0.1459245522816976
batch time: 0.1399286687374115
batch time: 0.13468425414141486
batch time: 0.13013850318060982
batch time: 0.12593797633522436
batch time: 0.12223780155181885
Iter 20 || Loss: 14.7553 || 10iter: 5.4631 sec.
batch time: 0.11879262470063709
batch

batch time: 0.0593738775648829
batch time: 0.059330021569488245
batch time: 0.05928614912512096
batch time: 0.059240774674849075
Iter 220 || Loss: 7.7707 || 10iter: 5.5319 sec.
batch time: 0.05919642362119925
batch time: 0.05915580461691092
batch time: 0.05911224305362445
batch time: 0.05906997301748821
batch time: 0.05903566784328885
batch time: 0.059006193042856404
batch time: 0.0589738795410694
batch time: 0.059020756629475375
batch time: 0.05899643377445671
batch time: 0.058968152170595915
Iter 230 || Loss: 7.5936 || 10iter: 5.5751 sec.
batch time: 0.05893709442832253
batch time: 0.05889900491155427
batch time: 0.05886560345923952
batch time: 0.05882938091571514
batch time: 0.058795064560910486
batch time: 0.05876249074935913
batch time: 0.05874526349804069
batch time: 0.058712213981051406
batch time: 0.05867828484858429
batch time: 0.058654058972994486
Iter 240 || Loss: 7.0318 || 10iter: 5.5524 sec.
batch time: 0.05862698020776772
batch time: 0.0585996602192398
batch time: 0.05857

batch time: 0.056036209462715446
batch time: 0.056060953573747116
Iter 440 || Loss: 6.4678 || 10iter: 5.6182 sec.
batch time: 0.05606276810574694
batch time: 0.05605098294996028
batch time: 0.05604398869499396
batch time: 0.05603116804415041
batch time: 0.056020092160514234
batch time: 0.056007610308215224
batch time: 0.05599432290267091
batch time: 0.05598111982856478
batch time: 0.05596917934035405
batch time: 0.05596178160773383
Iter 450 || Loss: 6.9035 || 10iter: 5.5495 sec.
batch time: 0.0559555400501598
batch time: 0.05594354234965502
batch time: 0.055934589693351563
batch time: 0.05592159132600356
batch time: 0.05591596718672868
batch time: 0.05590790301038508
batch time: 0.055900924837041475
batch time: 0.05589237015320224
batch time: 0.055939920068046885
batch time: 0.055930867402449895
Iter 460 || Loss: 6.8679 || 10iter: 5.6673 sec.
batch time: 0.05592306989392593
batch time: 0.055913625857530735
batch time: 0.05590536218482526
batch time: 0.055895702078424654
batch time: 0.0

  "See the documentation of nn.Upsample for details.".format(mode))


batch time: 0.057033233789579946
batch time: 0.05709133881788987
Iter 520 || Loss: 6.7143 || 10iter: 4.0647 sec.
batch time: 0.057128293958140425
batch time: 0.057145640767853834
batch time: 0.05713414416486401
batch time: 0.057121746867667626
batch time: 0.05711540040515718
batch time: 0.05710609829471139
batch time: 0.057096617045393476
batch time: 0.05709120134512583
batch time: 0.057081042255480934
batch time: 0.05707931248646862
Iter 530 || Loss: 6.9429 || 10iter: 5.8179 sec.
batch time: 0.05707111825601754
batch time: 0.05705815643296206
batch time: 0.057050528714178206
batch time: 0.0570403721448634
batch time: 0.05705347061157227
batch time: 0.05704433482084701
batch time: 0.05703371374744721
batch time: 0.05702323541322162
batch time: 0.057029415813580515
batch time: 0.057016751059779415
Iter 540 || Loss: 7.1472 || 10iter: 5.5745 sec.
batch time: 0.05700360164184006
batch time: 0.05699129122209725
batch time: 0.056980832505621304
batch time: 0.05701239187927807
batch time: 0.0

Iter 740 || Loss: 6.8518 || 10iter: 5.5964 sec.
batch time: 0.05581476524291251
batch time: 0.05581074487166906
batch time: 0.05580368850625861
batch time: 0.05579592335608698
batch time: 0.055789068081234926
batch time: 0.05578140950394699
batch time: 0.05577423805533004
batch time: 0.05576580539744168
batch time: 0.05575773267147538
batch time: 0.0557602596282959
Iter 750 || Loss: 6.2037 || 10iter: 5.6235 sec.
batch time: 0.055751520530202894
batch time: 0.05574588223974755
batch time: 0.05573946324635945
batch time: 0.05573354665417254
batch time: 0.055744785662518434
batch time: 0.05573679845799845
batch time: 0.05572903455484811
batch time: 0.055720491270905746
batch time: 0.055715232655621956
batch time: 0.05570763129937022
Iter 760 || Loss: 6.4748 || 10iter: 5.6492 sec.
batch time: 0.055700014517905676
batch time: 0.055693958687970016
