# setup dataset

In [1]:
# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function

In [2]:
# import dataset
from utils.dataset import VOCDataset, COCODatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

In [3]:
# Experiment configuration read by the cells below.
# select from efficientnet backbone or resnet backbone
# (the string is forwarded to EfficientDet(backbone=...) and used in checkpoint file names)
backbone = "efficientnet-b0"
# Resolution multiplier: the model config uses 300*scale as its input size.
scale = 1
# scale==1: resolution 300
# scale==2: resolution 600
# Forwarded to EfficientDet(useBiFPN=...); also selects the "BiFPN"/"FPN" tag in checkpoint file names.
useBiFPN = True
# "COCO" selects the COCO pipeline; any other value selects the VOC2007+VOC2012 pipeline.
DATASET = "COCO"

## make data.Dataset for training

In [4]:
# Build the train/val datasets for the dataset selected by DATASET and wrap
# them in DataLoaders.

color_mean = (104, 117, 123)  # per-channel colour mean, in BGR order
# Side length of the square network input. It follows `scale` for BOTH
# datasets so that it always agrees with ssd_cfg['input_size'] (= 300*scale);
# the COCO branch previously hard-coded 300 and silently ignored `scale`.
input_size = 300 * scale

# Preprocessing applied by the dataset classes.
transform = COCODatasetTransform(input_size, color_mean)

if DATASET != "COCO":
    # load files
    # set your VOCdevkit path here.
    vocpath = "../VOCdevkit/VOC2007"
    train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(vocpath)

    # Only the training split of VOC2012 is added; validation stays VOC2007.
    vocpath = "../VOCdevkit/VOC2012"
    train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)

    train_img_list.extend(train_img_list2)
    train_anno_list.extend(train_anno_list2)

    print("trainlist: ", len(train_img_list))
    print("vallist: ", len(val_img_list))

    # make Dataset
    voc_classes = ['aeroplane', 'bicycle', 'bird', 'boat',
                   'bottle', 'bus', 'car', 'cat', 'chair',
                   'cow', 'diningtable', 'dog', 'horse',
                   'motorbike', 'person', 'pottedplant',
                   'sheep', 'sofa', 'train', 'tvmonitor']

    # Converts the XML annotations into lists using the class names above.
    transform_anno = Anno_xml2list(voc_classes)

    # Datasets handed to the DataLoaders: indexing them returns the
    # preprocessed image together with its ground truth.
    train_dataset = VOCDataset(train_img_list, train_anno_list, phase="train",
                               transform=transform, transform_anno=transform_anno)
    # The validation set keeps its own transform instances, as before.
    val_dataset = VOCDataset(val_img_list, val_anno_list, phase="val",
                             transform=COCODatasetTransform(input_size, color_mean),
                             transform_anno=Anno_xml2list(voc_classes))

else:
    # Imported lazily so that VOC-only runs do not need the COCO loader.
    from dataset.coco import COCODetection

    train_dataset = COCODetection("../data/coco/", image_set="train2014", phase="train", transform=transform)
    val_dataset = COCODetection("../data/coco/", image_set="val2014", phase="val", transform=transform)

batch_size = 32

train_dataloader = data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=od_collate_fn, num_workers=8)

val_dataloader = data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False, collate_fn=od_collate_fn, num_workers=8)

# Collect both loaders in a dict keyed by phase name.
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

loading annotations into memory...
Done (t=10.21s)
creating index...
index created!
loading annotations into memory...
Done (t=5.83s)
creating index...
index created!


In [5]:
# Sanity check: pull one mini-batch from the validation loader and inspect it.
batch_iterator = iter(dataloaders_dict["val"])  # turn the DataLoader into an iterator
images, targets = next(batch_iterator)  # take the first mini-batch
print(images.size())  # torch.Size([32, 3, 300, 300]) with batch_size=32 and a 300-pixel input
print(len(targets))
print(targets[1].shape)  # targets is a list with one entry per image; each entry is [n, 5], n = number of objects

torch.Size([32, 3, 300, 300])
32
torch.Size([9, 5])


# define EfficientDet model

In [6]:
from utils.efficientdet import EfficientDet

In [7]:
# Detection-head configuration, followed by a one-off forward pass that
# checks the network builds and produces output of the expected shape.

# Total number of classes including the background class.
num_class = 81 if DATASET == "COCO" else 21

# Feature-map side lengths per supported resolution multiplier
# (scale 1 -> 300 px input, scale 2 -> 600 px input).
feature_maps_by_scale = {
    1: [37, 18, 9, 5, 3, 1],
    2: [75, 38, 19, 10, 5, 3],
}
if scale not in feature_maps_by_scale:
    # Fail loudly instead of leaving ssd_cfg undefined (or stale from an
    # earlier run of this cell).
    raise ValueError("unsupported scale: {!r} (expected 1 or 2)".format(scale))

ssd_cfg = {
    'num_classes': num_class,  # total number of classes including background
    'input_size': 300*scale,  # side length of the input image
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # number of default-box aspect variants per source
    'feature_maps': feature_maps_by_scale[scale],  # feature-map size of each source
    # NOTE(review): steps are identical for both scales, as in the original
    # cell; confirm the last two values are appropriate for the 600 px input.
    'steps': [8, 16, 32, 64, 100, 300],  # determines the default-box size
    # Scale every size element-wise. `list * scale` would REPEAT the list
    # (giving 12 unscaled entries for scale == 2) rather than multiply it.
    'min_sizes': [size*scale for size in [30, 60, 111, 162, 213, 264]],  # determines the default-box size
    'max_sizes': [size*scale for size in [60, 111, 162, 213, 264, 315]],  # determines the default-box size
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
}

# test if net works
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=True, backbone=backbone, useBiFPN=useBiFPN)
# Feed a random image at the configured resolution, not a fixed 300x300 one.
out = net(torch.rand([1, 3, 300*scale, 300*scale]))
print(out[0].size())

Loaded pretrained weights for efficientnet-b0
use BiFPN
layerc3: torch.Size([1, 40, 37, 37])
layerc4: torch.Size([1, 80, 18, 18])
layerc5: torch.Size([1, 320, 9, 9])
layer size: torch.Size([1, 256, 37, 37])
layer size: torch.Size([1, 256, 18, 18])
layer size: torch.Size([1, 256, 9, 9])
layer size: torch.Size([1, 256, 5, 5])
layer size: torch.Size([1, 256, 3, 3])
layer size: torch.Size([1, 256, 1, 1])
torch.Size([1, 8096, 4])


  "See the documentation of nn.Upsample for details.".format(mode))


In [8]:
# Build the network that will actually be trained (verbose=False this time).
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=False, backbone=backbone, useBiFPN=useBiFPN)

# Check whether a GPU is available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using:", device)

print("set weights!")

Loaded pretrained weights for efficientnet-b0
use BiFPN
using: cuda:0
set weights!


In [9]:
# Print the module structure of the network for inspection.
print(net)

EfficientDet(
  (layer0): Sequential(
    (0): Conv2dStaticSamePadding(
      3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
      (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
    )
    (1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  )
  (layer2): Sequential(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dStaticSamePadding(
   

In [10]:
from utils.ssd_model import MultiBoxLoss

# define loss
# NOTE(review): jaccard_thresh is presumably the IoU threshold used to match
# default boxes and neg_pos the negative:positive mining ratio - confirm in
# utils.ssd_model.
criterion = MultiBoxLoss(jaccard_thresh=0.5,neg_pos=3, device=device)

# optim
# SGD with momentum and weight decay. The lr given here is only an initial
# value: train_model() calls adjust_learning_rate() at the start of every
# epoch, which overwrites the lr of every parameter group.
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=1e-3, momentum=0.9, weight_decay=5e-4)

In [14]:
def get_current_lr(epoch):
    """Return the learning rate scheduled for ``epoch``.

    The rate starts at 1e-3 and is multiplied by 0.1 once for every
    milestone epoch that ``epoch`` has reached. The milestones are
    (10, 15) when the global ``DATASET`` is "COCO" and (120, 180) otherwise.

    Args:
        epoch: zero-based epoch index.

    Returns:
        The learning rate as a float.
    """
    milestones = (10, 15) if DATASET == "COCO" else (120, 180)

    # Both dataset settings share the same starting rate.
    lr = 1e-3
    for milestone in milestones:
        if epoch < milestone:
            continue
        lr *= 0.1
    return lr

def adjust_learning_rate(optimizer, epoch):
    """Apply the scheduled learning rate for ``epoch`` to ``optimizer``.

    Looks the rate up with ``get_current_lr``, prints it, and writes it
    into the ``'lr'`` entry of every parameter group.

    Args:
        optimizer: object exposing ``param_groups`` (a sequence of dicts).
        epoch: zero-based epoch index.
    """
    scheduled_lr = get_current_lr(epoch)
    print("lr is:", scheduled_lr)
    for group in optimizer.param_groups:
        group['lr'] = scheduled_lr

In [15]:
# Define the function that trains the model
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` for exactly ``num_epochs`` epochs.

    Every epoch runs one pass over ``dataloaders_dict['train']``; every 10th
    epoch additionally runs a validation pass over ``dataloaders_dict['val']``
    and saves a checkpoint under ``weights/``. The learning rate is reset at
    the start of each epoch via ``adjust_learning_rate``. After each epoch the
    accumulated losses are appended to ``log_output.csv``.

    Args:
        net: the network to train; moved to the GPU when one is available.
        dataloaders_dict: dict with ``'train'`` and ``'val'`` iterables that
            yield ``(images, targets)`` where ``targets`` is a list of tensors.
        criterion: callable returning ``(loss_l, loss_c)`` for
            ``(outputs, targets)``.
        optimizer: optimizer updating ``net``'s parameters.
        num_epochs: number of epochs to run.

    Returns:
        None. Results are written to ``log_output.csv`` and ``weights/*.pth``.

    Notes:
        The checkpoint file name is built from the notebook globals
        ``DATASET``, ``backbone``, ``scale`` and ``useBiFPN``.
        ``val_loss`` is logged as 0.0 for epochs in which validation is skipped.
    """
    # Use the GPU when available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device:", device)

    # Move the network to the selected device.
    net.to(device)

    # Let cuDNN pick the fastest kernels; pays off when input shapes are fixed.
    torch.backends.cudnn.benchmark = True

    # Global iteration counter (across epochs) and the per-epoch log records.
    iteration = 1
    logs = []

    # Epoch loop. range(num_epochs) runs exactly num_epochs epochs; the
    # previous range(num_epochs+1) ran one extra ("Epoch 31/30").
    for epoch in range(num_epochs):

        adjust_learning_rate(optimizer, epoch)

        # Sums of the per-batch losses for this epoch.
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        # Start the epoch timer and the 10-iteration timer.
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # Training phase, then (every 10th epoch) validation phase.
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # training mode
                print('(train)')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # evaluation mode
                    print('-------------')
                    print('(val)')
                else:
                    # Validation is only run once every 10 epochs.
                    continue

            # Iterate over the mini-batches of the current phase.
            for images, targets in dataloaders_dict[phase]:

                # Send the batch to the selected device.
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]  # move each tensor in the list

                # Reset accumulated gradients.
                optimizer.zero_grad()

                # Forward pass; gradients are tracked only while training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # Total loss is the sum of the two terms returned by criterion.
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c

                    if phase == 'train':
                        loss.backward()  # compute gradients

                        # Clamp every gradient value to [-2.0, 2.0] to keep
                        # the update numerically stable.
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()  # update parameters

                        if (iteration % 10 == 0):  # report the loss every 10 iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss.item(), duration))
                            t_iter_start = time.time()

                        epoch_train_loss += loss.item()
                        iteration += 1

                    else:
                        # Validation: only accumulate the loss.
                        epoch_val_loss += loss.item()

        # Per-epoch summary.
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # Rewrite the CSV log with all epochs seen so far.
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        # Save a checkpoint every 10 epochs.
        if ((epoch+1) % 10 == 0):
            word = "BiFPN" if useBiFPN else "FPN"
            # torch.save does not create missing directories.
            os.makedirs('weights', exist_ok=True)
            torch.save(net.state_dict(), 'weights/'+DATASET+"_"+backbone+"_" + str(300*scale) + "_" + word + "_" +
                       str(epoch+1) + '.pth')


In [None]:
# Epoch budget depends on the dataset: 30 for COCO, 200 for anything else.
num_epochs = 30 if DATASET == "COCO" else 200

train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

used device: cuda:0
lr is: 0.001
-------------
Epoch 1/30
-------------
(train)


  "See the documentation of nn.Upsample for details.".format(mode))


Iter 10 || Loss: 19.4255 || 10iter: 5.3582 sec.
Iter 20 || Loss: 17.9685 || 10iter: 3.0559 sec.
Iter 30 || Loss: 17.3752 || 10iter: 3.0660 sec.
Iter 40 || Loss: 16.3284 || 10iter: 3.1459 sec.
Iter 50 || Loss: 15.9392 || 10iter: 3.1156 sec.
Iter 60 || Loss: 15.1132 || 10iter: 3.1178 sec.
Iter 70 || Loss: 14.5142 || 10iter: 3.0875 sec.
Iter 80 || Loss: 13.6458 || 10iter: 3.0677 sec.
Iter 90 || Loss: 13.3723 || 10iter: 3.0514 sec.
Iter 100 || Loss: 12.3718 || 10iter: 3.0492 sec.
Iter 110 || Loss: 12.2682 || 10iter: 3.0454 sec.
Iter 120 || Loss: 11.4667 || 10iter: 3.0619 sec.
Iter 130 || Loss: 11.5692 || 10iter: 3.0726 sec.
Iter 140 || Loss: 10.6132 || 10iter: 3.0621 sec.
Iter 150 || Loss: 11.0858 || 10iter: 3.0746 sec.
Iter 160 || Loss: 10.3657 || 10iter: 3.0941 sec.
Iter 170 || Loss: 9.9537 || 10iter: 3.1120 sec.
Iter 180 || Loss: 10.5440 || 10iter: 3.0670 sec.
Iter 190 || Loss: 10.1650 || 10iter: 3.0600 sec.
Iter 200 || Loss: 9.7239 || 10iter: 3.1529 sec.
Iter 210 || Loss: 9.7864 || 10i