# setup dataset

In [13]:
# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function

In [14]:
# import dataset
from utils.dataset import VOCDataset, DatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

In [17]:
## meta settings
import pretrainedmodels
model_name = 'resnet18' # choose from any resnets
DATASET = "VOC"
retina = False # for trying retinanets
fpn = False # try for fpns

# The names below are read by later cells (dataset setup, loss construction,
# train_model) but were never defined anywhere in the notebook, so a fresh
# "Restart & Run All" failed with NameError. They are defined here so every
# tunable lives in one place.

# Resolution multiplier: 1 selects a 300px input and batch size int(32/scale) = 32,
# any other value selects a 512px input. The recorded outputs
# (torch.Size([32, 3, 300, 300])) correspond to scale == 1.
scale = 1

# Forwarded to MultiBoxLoss(half=...) and used by train_model to cast inputs with
# .half(). NOTE(review): False is an assumed default; confirm the intended value.
HALF = False

# Tag used in the log / checkpoint file names written by train_model.
backbone = model_name

# Only selects the "BiFPN" vs "FPN" word in checkpoint file names.
# NOTE(review): False is an assumed default; confirm the intended value.
useBiFPN = False

## make data.Dataset for training

In [18]:
if DATASET != "COCO":
    # ---- Pascal VOC ----
    # Gather image / annotation paths. Set your VOCdevkit path here.
    vocpath = "../VOCdevkit/VOC2007"
    train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(vocpath)

    # VOC2012 only contributes extra training samples; its validation lists are discarded.
    vocpath = "../VOCdevkit/VOC2012"
    train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)

    train_img_list.extend(train_img_list2)
    train_anno_list.extend(train_anno_list2)

    print("trainlist: ", len(train_img_list))
    print("vallist: ", len(val_img_list))

    # Class names handed to the annotation parser.
    voc_classes = [
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ]

    # Mean colour values, in BGR order.
    color_mean = (104, 117, 123)
    # scale == 1 -> 300x300 network input, anything else -> 512x512.
    input_size = 300 if scale == 1 else 512

    # Image transform and annotation parser used by the training set.
    transform = DatasetTransform(input_size, color_mean)
    transform_anno = Anno_xml2list(voc_classes)

    # Datasets that are handed to the DataLoaders below. Per the original note,
    # indexing them yields a preprocessed image together with its ground truth.
    train_dataset = VOCDataset(
        train_img_list,
        train_anno_list,
        phase="train",
        transform=transform,
        transform_anno=transform_anno,
    )
    # The validation set gets its own freshly constructed transform objects.
    val_dataset = VOCDataset(
        val_img_list,
        val_anno_list,
        phase="val",
        transform=DatasetTransform(input_size, color_mean),
        transform_anno=Anno_xml2list(voc_classes),
    )

else:
    # ---- MS COCO ----
    from dataset.coco import COCODetection
    import torch.utils.data as data
    from utils.dataset import VOCDataset, COCODatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

    # Mean colour values, in BGR order.
    color_mean = (104, 117, 123)
    # scale == 1 -> 300x300 network input, anything else -> 512x512.
    input_size = 300 if scale == 1 else 512

    # A single transform object is shared by the train and val datasets.
    transform = COCODatasetTransform(input_size, color_mean)
    train_dataset = COCODetection(
        "../data/coco/", image_set="train2014", phase="train", transform=transform)
    val_dataset = COCODetection(
        "../data/coco/", image_set="val2014", phase="val", transform=transform)

# Batch size shrinks as the input resolution multiplier grows.
batch_size = int(32/scale)

# Only the training loader shuffles; both use the project's detection collate function.
train_dataloader = data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=od_collate_fn,
    num_workers=8,
)

val_dataloader = data.DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=od_collate_fn,
    num_workers=8,
)

# Bundle both loaders in a dict keyed by phase name.
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

trainlist:  16551
vallist:  4952


In [19]:
# Sanity check: pull a single mini-batch from the validation loader.
batch_iterator = iter(dataloaders_dict["val"])  # turn the loader into an iterator
images, targets = next(batch_iterator)  # fetch the first mini-batch
# Image batch size; the recorded run printed torch.Size([32, 3, 300, 300]).
print(images.size())
# `targets` holds one entry per image in the mini-batch.
print(len(targets))
# Per-image annotation tensor; the original note describes it as [n, 5] with n objects.
print(targets[1].shape)

torch.Size([32, 3, 300, 300])
32
torch.Size([1, 5])


# define Centernet model and test

In [29]:
class double_conv(nn.Module):
    """Two consecutive (3x3 conv -> BatchNorm -> ReLU) stages.

    Args:
        in_ch: number of channels of the incoming feature map.
        out_ch: number of channels produced by both convolutions.

    The convolutions use padding=1, so height and width are preserved.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        stages = []
        # First stage maps in_ch -> out_ch, second stage maps out_ch -> out_ch.
        for stage_in_ch in (in_ch, out_ch):
            stages.append(nn.Conv2d(stage_in_ch, out_ch, 3, padding=1))
            stages.append(nn.BatchNorm2d(out_ch))
            stages.append(nn.ReLU(inplace=True))
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through both conv/BN/ReLU stages and return the result."""
        return self.conv(x)

class up(nn.Module):
    """Upsample by a factor of 2, optionally merge a skip tensor, then apply double_conv.

    Args:
        in_ch: number of channels fed into the trailing ``double_conv``.
        out_ch: number of channels produced by the trailing ``double_conv``.
        bilinear: if True (default) use parameter-free bilinear upsampling,
            otherwise a learned ``ConvTranspose2d``.
    """

    def __init__(self, in_ch, out_ch, bilinear=True):
        super(up, self).__init__()

        # Learned upsampling would be nice, but (per the original author's note)
        # it costs more memory than bilinear interpolation.
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        else:
            # NOTE(review): this layer takes in_ch // 2 channels, which fits the
            # skip-connection path (x1 and x2 concatenated to in_ch channels).
            # With x2=None, self.conv expects in_ch channels from x1 alone, so
            # bilinear=False looks incompatible with that path. Confirm intent.
            self.up = nn.ConvTranspose2d(in_ch//2, in_ch//2, 2, stride=2)

        self.conv = double_conv(in_ch, out_ch)

    def forward(self, x1, x2=None):
        """Upsample ``x1``; if ``x2`` is given, pad ``x1`` to its size and concatenate.

        Args:
            x1: tensor to upsample; dims 2 and 3 are treated as height and width.
            x2: optional skip tensor concatenated in front of ``x1`` along dim 1.

        Returns:
            The output of ``double_conv`` applied to the (merged) tensor.
        """
        x1 = self.up(x1)
        if x2 is None:
            return self.conv(x1)

        # Align x1 with x2 spatially BEFORE concatenating. The original code
        # concatenated first and then padded a tensor that was never used, so
        # any size mismatch crashed torch.cat and the padding had no effect.
        diffY = x2.size(2) - x1.size(2)
        diffX = x2.size(3) - x1.size(3)
        # F.pad takes (left, right, top, bottom) for the last two dimensions.
        x1 = F.pad(x1, (diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2))

        merged = torch.cat([x2, x1], dim=1)
        return self.conv(merged)

In [30]:
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo

# Build the backbone: instantiate the ImageNet-pretrained network selected by
# `model_name`, then keep all of its child modules except the last two.
basemodel = nn.Sequential(
    *list(
        pretrainedmodels.__dict__[model_name](
            num_classes=1000, pretrained='imagenet'
        ).children()
    )[:-2]
)

In [31]:
class centernet(nn.Module):
    """Backbone followed by three upsampling stages and two 1x1 output heads.

    Reads two notebook-level globals at construction time:
    ``basemodel`` (the backbone module) and ``model_name`` (selects the
    backbone's output channel count).

    NOTE(review): ``basemodel`` is stored by reference, so every ``centernet``
    instance shares, and trains, the same backbone module.

    Args:
        n_classes: number of channels of the classification head; the
            regression head has ``n_classes * 4`` channels.
    """

    def __init__(self, n_classes):
        super(centernet, self).__init__()
        self.base_model = basemodel

        # resnet18 / resnet34 use 512 channels here; every other backbone is
        # assumed to use 2048.
        if model_name == "resnet34" or model_name == "resnet18":
            num_ch = 512
        else:
            num_ch = 2048

        # Each `up` stage is called without a skip tensor in forward().
        self.up1 = up(num_ch, 512)
        self.up2 = up(512, 256)
        self.up3 = up(256, 256)
        # output classification
        self.outc = nn.Conv2d(256, n_classes, 1)
        # output residue
        self.outr = nn.Conv2d(256, n_classes*4, 1)

    def forward(self, x):
        """Return ``(outc, outr)``: the classification and regression head outputs."""
        x = self.base_model(x)

        # Decoder: three successive upsampling stages, no skip connections.
        x = self.up1(x)
        x = self.up2(x)
        x = self.up3(x)

        outc = self.outc(x)
        outr = self.outr(x)
        return outc, outr

In [38]:
# Number of output classes: 21 unless the COCO dataset is selected, then 81.
num_class = 81 if DATASET == "COCO" else 21

# Smoke test: build the network and push one random image through it.
net = centernet(num_class)
dummy_shape = [1, 3, input_size, input_size]
print(torch.rand(dummy_shape).size())
outc, outr = net(torch.rand(dummy_shape))
print(outc.size())

torch.Size([1, 3, 300, 300])
torch.Size([1, 21, 80, 80])


# set up optimizers

In [40]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [42]:
from utils.ssd_model import MultiBoxLoss
# Loss function.
criterion = MultiBoxLoss(jaccard_thresh=0.5, neg_pos=3, device=device, half=HALF)
# Optimizer: Adam over the parameters that still require gradients.
import torch.optim as optim
optimizer = optim.Adam(
    (param for param in net.parameters() if param.requires_grad),
    lr=1e-3,
)

In [12]:
# Epoch-based step decay is used here for simplicity (instead of cosine annealing).
def get_current_lr(epoch):
    """Return the learning rate for ``epoch`` under a step-decay schedule.

    The base rate and decay milestones are chosen from the global ``DATASET``
    (both settings are currently identical). The rate is multiplied by 0.1
    once for every milestone that ``epoch`` has reached.
    """
    if DATASET == "COCO":
        base_lr, milestones = 1e-3, (120, 180)
    else:
        base_lr, milestones = 1e-3, (120, 180)

    current_lr = base_lr
    for milestone in milestones:
        if epoch < milestone:
            continue
        current_lr *= 0.1
    return current_lr

def adjust_learning_rate(optimizer, epoch):
    """Write the scheduled learning rate for ``epoch`` into every param group.

    The rate comes from ``get_current_lr(epoch)`` and is also printed.
    """
    scheduled_lr = get_current_lr(epoch)
    print("lr is:", scheduled_lr)
    for group in optimizer.param_groups:
        group['lr'] = scheduled_lr

# set up train and eval scripts

In [13]:
# Training / validation loop.
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net`` for exactly ``num_epochs`` epochs, validating every 10th epoch.

    Args:
        net: the model; it is moved to the selected device in place.
        dataloaders_dict: mapping with "train" and "val" iterables that yield
            ``(images, targets)`` pairs, where ``targets`` is a list of tensors.
        criterion: callable returning ``(loss_l, loss_c)`` for
            ``(outputs, targets)``; the two are summed into the training loss.
        optimizer: optimizer whose learning rate is reset at the start of each
            epoch through ``adjust_learning_rate``.
        num_epochs: number of epochs to run.

    Side effects:
        * rewrites ``log/<DATASET>_<backbone>_<300*scale>log_output.csv`` after
          every epoch with the summed train / val losses,
        * saves ``net.state_dict()`` under ``weights/`` every 5th epoch.

    Reads the notebook-level globals ``HALF``, ``DATASET``, ``backbone``,
    ``scale`` and ``useBiFPN``.
    """
    # Use the first GPU when CUDA is available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device:", device)

    # Move the network to the selected device.
    net.to(device)

    # Let cuDNN auto-tune its kernels; helps when the network is mostly fixed.
    torch.backends.cudnn.benchmark = True

    # Create the output folders up front so that neither the per-epoch log nor
    # the periodic checkpoint fails because a directory is missing.
    os.makedirs("log", exist_ok=True)
    os.makedirs("weights", exist_ok=True)

    iteration = 1           # global training-iteration counter
    epoch_train_loss = 0.0  # running sum of training losses in this epoch
    epoch_val_loss = 0.0    # running sum of validation losses in this epoch
    logs = []

    # Run exactly num_epochs epochs. The original range(num_epochs + 1) ran one
    # extra epoch and printed e.g. "Epoch 201/200".
    for epoch in range(num_epochs):

        adjust_learning_rate(optimizer, epoch)

        # Timers for the whole epoch and for each group of 10 iterations.
        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        # One training pass per epoch, plus a validation pass every 10th epoch.
        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()  # training mode
                print('(train)')
            else:
                if((epoch+1) % 10 == 0):
                    net.eval()   # evaluation mode
                    print('-------------')
                    print('(val)')
                else:
                    # Validation only runs once every 10 epochs.
                    continue

            # Iterate over the mini-batches of the current phase.
            for images, targets in dataloaders_dict[phase]:

                # Send the batch to the device; targets is a list of tensors.
                images = images.to(device)
                targets = [ann.to(device)
                           for ann in targets]
                if HALF:
                    images = images.half()
                    targets = [ann.half() for ann in targets]

                # Reset the accumulated gradients.
                optimizer.zero_grad()

                # Forward pass; gradients are only tracked while training.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)

                    # Total loss is the sum of the two criterion outputs.
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c
                    loss_value = loss.item()

                    if phase == 'train':
                        loss.backward()

                        # Clamp each gradient element to [-2.0, 2.0] to keep
                        # the updates numerically stable.
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()

                        if (iteration % 10 == 0):  # report every 10 iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss_value, duration))
                            t_iter_start = time.time()
                        # Skip +inf losses so one bad batch does not wipe out the epoch sum.
                        if not loss_value == float("inf"):
                            epoch_train_loss += loss_value
                        iteration += 1

                    else:
                        # Validation: only accumulate the loss.
                        if not loss_value == float("inf"):
                            epoch_val_loss += loss_value

        # Per-epoch summary.
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # Append this epoch to the log and rewrite the CSV file.
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log/"+DATASET+"_"+backbone+"_" + str(300*scale) +"log_output.csv")

        # Reset the running sums for the next epoch.
        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        # Save a checkpoint every 5 epochs.
        if ((epoch+1) % 5 == 0):
            if useBiFPN:
                word="BiFPN"
            else:
                word="FPN"
            torch.save(net.state_dict(), 'weights/'+DATASET+"_"+backbone+"_" + str(300*scale) + "_" + word + "_" +
                       str(epoch+1) + '.pth')


In [None]:
# Epoch budget per dataset (both are currently set to 200).
num_epochs = 200 if DATASET == "COCO" else 200

# Launch training.
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

used device: cuda:0
lr is: 0.001
-------------
Epoch 1/200
-------------
(train)


  "See the documentation of nn.Upsample for details.".format(mode))


Iter 10 || Loss: 14.6762 || 10iter: 7.9883 sec.
Iter 20 || Loss: 12.6147 || 10iter: 3.3072 sec.
Iter 30 || Loss: 10.7831 || 10iter: 3.3764 sec.
Iter 40 || Loss: 9.5539 || 10iter: 3.4383 sec.
Iter 50 || Loss: 8.7597 || 10iter: 3.3863 sec.
Iter 60 || Loss: 8.3781 || 10iter: 3.4459 sec.
Iter 70 || Loss: 8.5997 || 10iter: 3.4078 sec.
Iter 80 || Loss: 8.3461 || 10iter: 3.4021 sec.
Iter 90 || Loss: 8.3254 || 10iter: 3.3984 sec.
Iter 100 || Loss: 8.1916 || 10iter: 3.4102 sec.
Iter 110 || Loss: 8.5010 || 10iter: 3.3978 sec.
Iter 120 || Loss: 8.0312 || 10iter: 3.3895 sec.
Iter 130 || Loss: 7.5776 || 10iter: 3.4147 sec.
Iter 140 || Loss: 7.4582 || 10iter: 3.3953 sec.
Iter 150 || Loss: 7.8578 || 10iter: 3.4497 sec.
Iter 160 || Loss: 7.9628 || 10iter: 3.3826 sec.
Iter 170 || Loss: 7.7113 || 10iter: 3.4399 sec.
Iter 180 || Loss: 7.7354 || 10iter: 3.3790 sec.
Iter 190 || Loss: 7.6620 || 10iter: 3.3862 sec.
Iter 200 || Loss: 7.6940 || 10iter: 3.3755 sec.
Iter 210 || Loss: 7.4468 || 10iter: 3.4130 sec

  "See the documentation of nn.Upsample for details.".format(mode))


Iter 520 || Loss: 6.9472 || 10iter: 2.3994 sec.
Iter 530 || Loss: 6.9510 || 10iter: 3.4278 sec.
Iter 540 || Loss: 6.6578 || 10iter: 3.3295 sec.
Iter 550 || Loss: 6.7569 || 10iter: 3.3849 sec.
Iter 560 || Loss: 6.6651 || 10iter: 3.4692 sec.
Iter 570 || Loss: 6.7154 || 10iter: 3.3967 sec.
Iter 580 || Loss: 6.8194 || 10iter: 3.3336 sec.
Iter 590 || Loss: 6.6416 || 10iter: 3.4128 sec.
Iter 600 || Loss: 7.0260 || 10iter: 3.4065 sec.
Iter 610 || Loss: 6.6993 || 10iter: 3.4378 sec.
Iter 620 || Loss: 6.7130 || 10iter: 3.3429 sec.
Iter 630 || Loss: 6.6879 || 10iter: 3.3698 sec.
Iter 640 || Loss: 6.7656 || 10iter: 3.4128 sec.
Iter 650 || Loss: 6.6890 || 10iter: 3.4234 sec.
Iter 660 || Loss: 6.7733 || 10iter: 3.4324 sec.
Iter 670 || Loss: 6.4825 || 10iter: 3.5050 sec.
Iter 680 || Loss: 6.7527 || 10iter: 3.3603 sec.
Iter 690 || Loss: 6.6672 || 10iter: 3.3518 sec.
Iter 700 || Loss: 6.3197 || 10iter: 3.3612 sec.
Iter 710 || Loss: 6.6545 || 10iter: 3.3912 sec.
Iter 720 || Loss: 6.8526 || 10iter: 3.35

Iter 2110 || Loss: 5.3264 || 10iter: 3.3978 sec.
Iter 2120 || Loss: 5.3107 || 10iter: 3.3645 sec.
Iter 2130 || Loss: 5.4049 || 10iter: 3.3747 sec.
Iter 2140 || Loss: 5.6159 || 10iter: 3.4130 sec.
Iter 2150 || Loss: 5.3737 || 10iter: 3.3554 sec.
Iter 2160 || Loss: 5.6128 || 10iter: 3.3987 sec.
Iter 2170 || Loss: 5.6840 || 10iter: 3.4465 sec.
Iter 2180 || Loss: 5.4251 || 10iter: 3.4079 sec.
Iter 2190 || Loss: 5.4585 || 10iter: 3.4962 sec.
Iter 2200 || Loss: 4.8977 || 10iter: 3.3691 sec.
Iter 2210 || Loss: 5.7564 || 10iter: 3.3954 sec.
Iter 2220 || Loss: 5.2696 || 10iter: 3.3692 sec.
Iter 2230 || Loss: 5.3867 || 10iter: 3.3577 sec.
Iter 2240 || Loss: 4.7236 || 10iter: 3.3490 sec.
Iter 2250 || Loss: 5.1422 || 10iter: 3.3428 sec.
Iter 2260 || Loss: 5.4216 || 10iter: 3.3415 sec.
Iter 2270 || Loss: 5.6682 || 10iter: 3.3664 sec.
Iter 2280 || Loss: 5.5349 || 10iter: 3.3840 sec.
Iter 2290 || Loss: 5.5123 || 10iter: 3.4147 sec.
Iter 2300 || Loss: 5.8075 || 10iter: 3.3575 sec.
Iter 2310 || Loss: 5

Iter 3690 || Loss: 4.8675 || 10iter: 3.3999 sec.
Iter 3700 || Loss: 4.7384 || 10iter: 3.4340 sec.
Iter 3710 || Loss: 4.6953 || 10iter: 3.3850 sec.
Iter 3720 || Loss: 5.2084 || 10iter: 3.3636 sec.
Iter 3730 || Loss: 4.3088 || 10iter: 3.4519 sec.
Iter 3740 || Loss: 4.9228 || 10iter: 3.4815 sec.
Iter 3750 || Loss: 4.7602 || 10iter: 3.4038 sec.
Iter 3760 || Loss: 4.9588 || 10iter: 3.4107 sec.
Iter 3770 || Loss: 4.8657 || 10iter: 3.4762 sec.
Iter 3780 || Loss: 5.0428 || 10iter: 3.5091 sec.
Iter 3790 || Loss: 5.7787 || 10iter: 3.4346 sec.
Iter 3800 || Loss: 4.7304 || 10iter: 3.3827 sec.
Iter 3810 || Loss: 4.7113 || 10iter: 3.3482 sec.
Iter 3820 || Loss: 4.7078 || 10iter: 3.3526 sec.
Iter 3830 || Loss: 4.8722 || 10iter: 3.4191 sec.
Iter 3840 || Loss: 5.0261 || 10iter: 3.3982 sec.
Iter 3850 || Loss: 5.0775 || 10iter: 3.3621 sec.
Iter 3860 || Loss: 5.1993 || 10iter: 3.3771 sec.
Iter 3870 || Loss: 4.8228 || 10iter: 3.3633 sec.
Iter 3880 || Loss: 5.2044 || 10iter: 3.4146 sec.
Iter 3890 || Loss: 4

Iter 5260 || Loss: 4.4406 || 10iter: 3.4452 sec.
Iter 5270 || Loss: 4.4373 || 10iter: 3.3466 sec.
Iter 5280 || Loss: 4.5616 || 10iter: 3.4416 sec.
Iter 5290 || Loss: 4.5757 || 10iter: 3.3746 sec.
Iter 5300 || Loss: 4.7119 || 10iter: 3.3633 sec.
Iter 5310 || Loss: 4.4746 || 10iter: 3.4697 sec.
Iter 5320 || Loss: 4.2691 || 10iter: 3.4707 sec.
Iter 5330 || Loss: 4.8476 || 10iter: 3.4119 sec.
Iter 5340 || Loss: 5.0027 || 10iter: 3.3624 sec.
Iter 5350 || Loss: 4.7545 || 10iter: 3.3863 sec.
Iter 5360 || Loss: 4.8939 || 10iter: 3.4075 sec.
Iter 5370 || Loss: 4.5954 || 10iter: 3.3554 sec.
Iter 5380 || Loss: 4.3597 || 10iter: 3.3572 sec.
