# setup dataset

In [1]:
# import stuff
import os
import numpy as np
import time
import pandas as pd

import torch
import torch.utils.data as data
from itertools import product as product

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.nn.functional as F
from torch.autograd import Function
from utils.to_fp16 import network_to_half

In [2]:
# import dataset
from utils.dataset import VOCDataset, DatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

In [3]:
## meta settings

# Backbone network: an EfficientNet variant or a ResNet backbone.
backbone = "efficientnet-b0"
scale = 2
# Resolution preset read by the dataset and model-config cells:
#   scale == 1: 300x300 input
#   scale == 2: 512x512 input (checkpoint file names are still tagged with 300*scale, i.e. "600")
useBiFPN = True
HALF = False # when True, the network and the input batches are converted to FP16
DATASET = "VOC"
retina = False # when True, a RetinaFPN model replaces EfficientDet (for experiments)

## Build the training/validation `data.Dataset` objects and their data loaders

In [4]:
# Preprocessing settings shared by both dataset branches.
color_mean = (104, 117, 123)  # mean colour value, in (B, G, R) order
input_size = 300 if scale == 1 else 512  # side length the images are resized to

if DATASET == "COCO":
    from dataset.coco import COCODetection
    import torch.utils.data as data
    from utils.dataset import VOCDataset, COCODatasetTransform, make_datapath_list, Anno_xml2list, od_collate_fn

    # The same transform object is shared by the train and val datasets.
    transform = COCODatasetTransform(input_size, color_mean)
    train_dataset = COCODetection(
        "../data/coco/", image_set="train2014", phase="train", transform=transform)
    val_dataset = COCODetection(
        "../data/coco/", image_set="val2014", phase="val", transform=transform)

else:
    # Collect the file lists. Set your VOCdevkit paths here.
    vocpath = "../VOCdevkit/VOC2007"
    train_img_list, train_anno_list, val_img_list, val_anno_list = make_datapath_list(vocpath)

    # Only the training split of VOC2012 is used; it is appended to the VOC2007 one.
    vocpath = "../VOCdevkit/VOC2012"
    train_img_list2, train_anno_list2, _, _ = make_datapath_list(vocpath)
    train_img_list += train_img_list2
    train_anno_list += train_anno_list2

    print("trainlist: ", len(train_img_list))
    print("vallist: ", len(val_img_list))

    # Class names handed to the annotation converter.
    voc_classes = [
        'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
        'bus', 'car', 'cat', 'chair', 'cow',
        'diningtable', 'dog', 'horse', 'motorbike', 'person',
        'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor',
    ]

    # Image preprocessing and XML-annotation conversion for the training set.
    transform = DatasetTransform(input_size, color_mean)
    transform_anno = Anno_xml2list(voc_classes)

    # Datasets fed to the DataLoaders: indexing one returns a preprocessed
    # image together with its ground truth.
    train_dataset = VOCDataset(
        train_img_list,
        train_anno_list,
        phase="train",
        transform=transform,
        transform_anno=transform_anno,
    )
    # The validation set gets its own transform / converter instances.
    val_dataset = VOCDataset(
        val_img_list,
        val_anno_list,
        phase="val",
        transform=DatasetTransform(input_size, color_mean),
        transform_anno=Anno_xml2list(voc_classes),
    )

# The per-step batch shrinks as the input resolution grows.
batch_size = int(32/scale)

# Loader options common to both phases; only the training loader shuffles.
loader_kwargs = dict(batch_size=batch_size, collate_fn=od_collate_fn, num_workers=8)
train_dataloader = data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = data.DataLoader(val_dataset, shuffle=False, **loader_kwargs)

# Bundle both loaders in a dict keyed by phase name.
dataloaders_dict = {"train": train_dataloader, "val": val_dataloader}

trainlist:  16551
vallist:  4952


In [5]:
# Sanity check: pull one validation batch and inspect its shapes
batch_iterator = iter(dataloaders_dict["val"])  # turn the loader into an iterator
images, targets = next(batch_iterator)  # take the first mini-batch
print(images.size())  # e.g. torch.Size([16, 3, 512, 512]) with scale == 2
print(len(targets))
print(targets[1].shape)  # targets is a list with one entry per image in the mini-batch; each entry is [n, 5], n = number of objects

torch.Size([16, 3, 512, 512])
16
torch.Size([1, 5])


# define EfficientDet model

In [6]:
from utils.efficientdet import EfficientDet

In [7]:
# Total number of classes, including the background class.
num_class = 81 if DATASET == "COCO" else 21

# Resolution-dependent part of the config. An unsupported `scale` fails loudly
# here instead of leaving `ssd_cfg` undefined (or, in a live kernel, silently
# stale from an earlier run).
if scale == 1:
    scale_cfg = {
        'input_size': 300*scale,
        'feature_maps': [37, 18, 9, 5, 3, 1],
        'min_sizes': [30, 60, 111, 162, 213, 264],
        'max_sizes': [60, 111, 162, 213, 264, 315],
    }
elif scale == 2:
    scale_cfg = {
        'input_size': 512,
        'feature_maps': [64, 32, 16, 8, 4, 2],
        # NOTE(review): `list * scale` repeats the list (12 entries); it does not
        # multiply each size by `scale`. Kept as-is because changing the DBox
        # sizes would not match already-trained weights -- confirm the intent.
        'min_sizes': [30, 60, 111, 162, 213, 264]*scale,
        'max_sizes': [60, 111, 162, 213, 264, 315]*scale,
    }
else:
    raise ValueError("unsupported scale {!r}: expected 1 or 2".format(scale))

ssd_cfg = {
    'num_classes': num_class,  # total number of classes including background
    'input_size': scale_cfg['input_size'],  # input image size
    'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # number of DBox aspect-ratio variants per source
    'feature_maps': scale_cfg['feature_maps'],  # spatial size of each source feature map
    'steps': [8, 16, 32, 64, 100, 300],  # determines the DBox size
    'min_sizes': scale_cfg['min_sizes'],  # determines the DBox size
    'max_sizes': scale_cfg['max_sizes'],  # determines the DBox size
    'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
}

# Smoke test: build the network verbosely and push one random image through it.
net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=True, backbone=backbone, useBiFPN=useBiFPN)
out = net(torch.rand([1,3,input_size,input_size]))
print(out[0].size())

Loaded pretrained weights for efficientnet-b0
EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
  )
  (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dS

layerc3: torch.Size([1, 40, 64, 64])
layerc4: torch.Size([1, 80, 32, 32])
layerc5: torch.Size([1, 320, 16, 16])
layer size: torch.Size([1, 256, 64, 64])
layer size: torch.Size([1, 256, 32, 32])
layer size: torch.Size([1, 256, 16, 16])
layer size: torch.Size([1, 256, 8, 8])
layer size: torch.Size([1, 256, 4, 4])
layer size: torch.Size([1, 256, 2, 2])
torch.Size([1, 24528, 4])


  "See the documentation of nn.Upsample for details.".format(mode))


In [8]:
# Build the model that will actually be trained. Only the selected network is
# constructed, so the EfficientDet backbone is not built and immediately
# thrown away when the RetinaNet variant is requested.
if retina:
    # RetinaNet variant, for test purposes. Note that this overwrites ssd_cfg.
    from utils.retinanet import RetinaFPN
    ssd_cfg = {
        'num_classes': num_class,  # total number of classes including background
        'input_size': 300*scale,  # input image size
        'bbox_aspect_num': [4, 6, 6, 6, 4, 4],  # number of DBox aspect-ratio variants per source
        'feature_maps': [38, 19, 10, 5, 3, 1],  # spatial size of each source feature map
        'steps': [8, 16, 32, 64, 100, 300],  # determines the DBox size
        'min_sizes': [30, 60, 111, 162, 213, 264],  # determines the DBox size
        'max_sizes': [60, 111, 162, 213, 264, 315],  # determines the DBox size
        'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]],
    }
    net = RetinaFPN("train", ssd_cfg)
else:
    net = EfficientDet(phase="train", cfg=ssd_cfg, verbose=False, backbone=backbone, useBiFPN=useBiFPN)

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("using:", device)

print("set weights!")

Loaded pretrained weights for efficientnet-b0
EfficientNet(
  (_conv_stem): Conv2dStaticSamePadding(
    3, 32, kernel_size=(3, 3), stride=(2, 2), bias=False
    (static_padding): ZeroPad2d(padding=(0, 1, 0, 1), value=0.0)
  )
  (_bn0): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
  (_blocks): ModuleList(
    (0): MBConvBlock(
      (_depthwise_conv): Conv2dStaticSamePadding(
        32, 32, kernel_size=(3, 3), stride=[1, 1], groups=32, bias=False
        (static_padding): ZeroPad2d(padding=(1, 1, 1, 1), value=0.0)
      )
      (_bn1): BatchNorm2d(32, eps=0.001, momentum=0.010000000000000009, affine=True, track_running_stats=True)
      (_se_reduce): Conv2dStaticSamePadding(
        32, 8, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_se_expand): Conv2dStaticSamePadding(
        8, 32, kernel_size=(1, 1), stride=(1, 1)
        (static_padding): Identity()
      )
      (_project_conv): Conv2dS

In [9]:
# FP16: when HALF is enabled, replace the network with the result of
# network_to_half (imported from utils.to_fp16).
if HALF:
    net = network_to_half(net)

In [10]:
# Freeze backbone layers: turn off gradient tracking for every parameter of
# the listed sub-modules so they are left out of training.
# NOTE(review): layer1 is not in this list -- confirm that this is intentional.
for layer_name in ("layer0", "layer2", "layer3", "layer4", "layer5"):
    for param in getattr(net, layer_name).parameters():
        param.requires_grad = False

In [11]:
from utils.ssd_model import MultiBoxLoss
import torch.optim as optim

# Loss function.
# NOTE(review): jaccard_thresh / neg_pos are presumably the IoU matching
# threshold and the negative:positive mining ratio -- confirm in utils.ssd_model.
criterion = MultiBoxLoss(jaccard_thresh=0.5, neg_pos=3, device=device, half=HALF)

# Optimizer: SGD over only those parameters that still require gradients,
# i.e. everything except the frozen layers.
trainable_params = [p for p in net.parameters() if p.requires_grad]
optimizer = optim.SGD(trainable_params, lr=1e-3, momentum=0.9, weight_decay=5e-4)

In [12]:
# The original EfficientDet uses cosine-annealing LR scheduling; for simplicity
# this notebook uses an epoch-based step decay instead.
def get_current_lr(epoch):
    """Return the learning rate to use for ``epoch``.

    Starts from a per-dataset base rate (selected through the notebook-level
    ``DATASET`` name) and multiplies it by 0.1 once for every decay epoch that
    ``epoch`` has reached or passed.

    Args:
        epoch: epoch index, compared against the decay epochs with ``>=``.

    Returns:
        The decayed learning rate as a float.
    """
    # (base learning rate, epochs at which the rate is cut); the two datasets
    # currently share the same schedule but can be tuned independently.
    if DATASET == "COCO":
        base_lr, milestones = 1e-3, (120, 180)
    else:
        base_lr, milestones = 1e-3, (120, 180)

    reached = sum(1 for milestone in milestones if epoch >= milestone)

    current_lr = base_lr
    for _ in range(reached):
        current_lr *= 0.1
    return current_lr

def adjust_learning_rate(optimizer, epoch):
    """Set the learning rate of every parameter group for ``epoch``.

    The rate comes from ``get_current_lr(epoch)``; it is printed and then
    written to the ``'lr'`` entry of each group in ``optimizer.param_groups``.

    Args:
        optimizer: object exposing ``param_groups``, an iterable of dicts.
        epoch: epoch index forwarded to ``get_current_lr``.
    """
    new_lr = get_current_lr(epoch)
    print("lr is:", new_lr)
    for group in optimizer.param_groups:
        group['lr'] = new_lr

In [13]:
# train script. nothing special..
def train_model(net, dataloaders_dict, criterion, optimizer, num_epochs):
    """Train ``net``, periodically validating, logging and checkpointing it.

    Runs exactly ``num_epochs`` epochs. Every epoch performs a training pass;
    every 10th epoch additionally performs a validation pass. After each epoch
    the summed losses are appended to ``log_output.csv``, and every 5th epoch
    the model's ``state_dict`` is saved under ``weights/``.

    Besides its arguments, the function reads the notebook-level names
    ``HALF``, ``useBiFPN``, ``DATASET``, ``backbone`` and ``scale`` and calls
    ``adjust_learning_rate``.

    Args:
        net: model to train; it is moved to the selected device.
        dataloaders_dict: mapping with ``"train"`` and ``"val"`` entries, each
            yielding ``(images, targets)`` batches where ``targets`` is an
            iterable of per-image tensors.
        criterion: callable taking ``(outputs, targets)`` and returning the
            pair ``(loss_l, loss_c)``; the two are summed into the total loss.
        optimizer: optimizer whose learning rate is reset at each epoch start.
        num_epochs: number of epochs to run.

    Returns:
        None.
    """
    # Use the GPU when one is available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("used device:", device)

    net.to(device)

    # Let cuDNN benchmark its algorithms; pays off when the network is
    # reasonably fixed.
    torch.backends.cudnn.benchmark = True

    iteration = 1            # global training-iteration counter
    epoch_train_loss = 0.0   # sum of training losses over the current epoch
    epoch_val_loss = 0.0     # sum of validation losses over the current epoch
    logs = []

    # range(num_epochs), not num_epochs+1: the extra epoch was reported as
    # "Epoch N+1/N" and could never hit the validation/checkpoint conditions.
    for epoch in range(num_epochs):

        adjust_learning_rate(optimizer, epoch)

        t_epoch_start = time.time()
        t_iter_start = time.time()

        print('-------------')
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-------------')

        for phase in ['train', 'val']:
            if phase == 'train':
                net.train()
                print('(train)')
            elif (epoch+1) % 10 == 0:
                # Validation runs only once every 10 epochs.
                net.eval()
                print('-------------')
                print('(val)')
            else:
                continue

            for images, targets in dataloaders_dict[phase]:

                # Move the batch to the training device.
                images = images.to(device)
                targets = [ann.to(device) for ann in targets]
                if HALF:
                    images = images.half()
                    targets = [ann.half() for ann in targets]

                optimizer.zero_grad()

                # Gradients are tracked only during the training phase.
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(images)
                    loss_l, loss_c = criterion(outputs, targets)
                    loss = loss_l + loss_c
                    loss_value = loss.item()

                    if phase == 'train':
                        loss.backward()

                        # Clamp each gradient element to [-2.0, 2.0] so that
                        # large gradients do not destabilise training.
                        nn.utils.clip_grad_value_(
                            net.parameters(), clip_value=2.0)

                        optimizer.step()

                        if (iteration % 10 == 0):  # report every 10 iterations
                            t_iter_finish = time.time()
                            duration = t_iter_finish - t_iter_start
                            print('Iter {} || Loss: {:.4f} || 10iter: {:.4f} sec.'.format(
                                iteration, loss_value, duration))
                            t_iter_start = time.time()
                        # Keep inf/NaN batches out of the epoch total.
                        if np.isfinite(loss_value):
                            epoch_train_loss += loss_value
                        iteration += 1

                    else:
                        if np.isfinite(loss_value):
                            epoch_val_loss += loss_value

        # Per-epoch summary (the losses are sums over batches, not means).
        t_epoch_finish = time.time()
        print('-------------')
        print('epoch {} || Epoch_TRAIN_Loss:{:.4f} ||Epoch_VAL_Loss:{:.4f}'.format(
            epoch+1, epoch_train_loss, epoch_val_loss))
        print('timer:  {:.4f} sec.'.format(t_epoch_finish - t_epoch_start))

        # Rewrite the full log after every epoch so it survives interruption.
        log_epoch = {'epoch': epoch+1,
                     'train_loss': epoch_train_loss, 'val_loss': epoch_val_loss}
        logs.append(log_epoch)
        df = pd.DataFrame(logs)
        df.to_csv("log_output.csv")

        epoch_train_loss = 0.0
        epoch_val_loss = 0.0

        # Checkpoint every 5 epochs.
        if ((epoch+1) % 5 == 0):
            word = "BiFPN" if useBiFPN else "FPN"
            # torch.save does not create missing directories.
            os.makedirs('weights', exist_ok=True)
            # The size tag stays 300*scale so existing checkpoint names keep
            # working, even though the scale-2 input size is 512.
            torch.save(net.state_dict(), 'weights/'+DATASET+"_"+backbone+"_" + str(300*scale) + "_" + word + "_" +
                       str(epoch+1) + '.pth')


In [None]:
# Number of epochs per dataset; both branches currently use the same value
# but are kept separate so each dataset can be tuned independently.
if DATASET == "COCO":
    num_epochs = 200
else:
    num_epochs = 200
    
# Launch training with the model, loaders, loss and optimizer built above.
train_model(net, dataloaders_dict, criterion, optimizer, num_epochs=num_epochs)

used device: cuda:0
lr is: 0.001
-------------
Epoch 1/200
-------------
(train)


  "See the documentation of nn.Upsample for details.".format(mode))


Iter 10 || Loss: 15.7518 || 10iter: 4.7307 sec.
Iter 20 || Loss: 14.4122 || 10iter: 1.8921 sec.
Iter 30 || Loss: 13.8655 || 10iter: 1.9784 sec.
Iter 40 || Loss: 12.7425 || 10iter: 1.9309 sec.
Iter 50 || Loss: 11.7954 || 10iter: 1.9372 sec.
Iter 60 || Loss: 11.3030 || 10iter: 1.8859 sec.
Iter 70 || Loss: 10.5033 || 10iter: 1.8864 sec.
Iter 80 || Loss: 10.3696 || 10iter: 1.9323 sec.
Iter 90 || Loss: 9.3117 || 10iter: 1.9685 sec.
Iter 100 || Loss: 8.8864 || 10iter: 1.9152 sec.
Iter 110 || Loss: 9.4674 || 10iter: 1.9103 sec.
Iter 120 || Loss: 9.3020 || 10iter: 1.8883 sec.
Iter 130 || Loss: 8.4071 || 10iter: 1.9002 sec.
Iter 140 || Loss: 8.6193 || 10iter: 1.9468 sec.
Iter 150 || Loss: 8.3351 || 10iter: 1.9050 sec.
Iter 160 || Loss: 8.5078 || 10iter: 1.9140 sec.
Iter 170 || Loss: 7.6491 || 10iter: 1.8982 sec.
Iter 180 || Loss: 8.7354 || 10iter: 1.9168 sec.
Iter 190 || Loss: 8.0486 || 10iter: 1.8867 sec.
Iter 200 || Loss: 8.0660 || 10iter: 1.9110 sec.
Iter 210 || Loss: 8.1561 || 10iter: 1.903

  "See the documentation of nn.Upsample for details.".format(mode))


Iter 1040 || Loss: 7.3981 || 10iter: 2.7324 sec.
Iter 1050 || Loss: 7.1401 || 10iter: 2.0177 sec.
Iter 1060 || Loss: 6.3296 || 10iter: 1.9268 sec.
Iter 1070 || Loss: 6.9658 || 10iter: 1.9337 sec.
Iter 1080 || Loss: 6.6879 || 10iter: 1.9037 sec.
Iter 1090 || Loss: 6.2444 || 10iter: 1.9686 sec.
Iter 1100 || Loss: 7.0513 || 10iter: 1.9276 sec.
Iter 1110 || Loss: 6.4635 || 10iter: 1.9627 sec.
Iter 1120 || Loss: 7.0456 || 10iter: 1.9197 sec.
Iter 1130 || Loss: 7.0069 || 10iter: 1.8963 sec.
Iter 1140 || Loss: 7.0612 || 10iter: 1.9401 sec.
Iter 1150 || Loss: 5.8751 || 10iter: 1.9078 sec.
Iter 1160 || Loss: 6.3783 || 10iter: 1.9610 sec.
Iter 1170 || Loss: 7.1168 || 10iter: 1.8801 sec.
Iter 1180 || Loss: 6.9482 || 10iter: 1.8925 sec.
Iter 1190 || Loss: 6.2115 || 10iter: 1.8916 sec.
Iter 1200 || Loss: 6.2355 || 10iter: 1.9432 sec.
Iter 1210 || Loss: 7.4168 || 10iter: 1.9235 sec.
Iter 1220 || Loss: 5.5916 || 10iter: 1.9344 sec.
Iter 1230 || Loss: 6.1026 || 10iter: 1.8963 sec.
Iter 1240 || Loss: 6

Iter 2680 || Loss: 6.6553 || 10iter: 1.9460 sec.
Iter 2690 || Loss: 6.7202 || 10iter: 1.8867 sec.
Iter 2700 || Loss: 5.9778 || 10iter: 1.8967 sec.
Iter 2710 || Loss: 6.0141 || 10iter: 1.9559 sec.
Iter 2720 || Loss: 5.4180 || 10iter: 1.9364 sec.
Iter 2730 || Loss: 6.5242 || 10iter: 1.9199 sec.
Iter 2740 || Loss: 6.6022 || 10iter: 1.9561 sec.
Iter 2750 || Loss: 5.9271 || 10iter: 1.9231 sec.
Iter 2760 || Loss: 6.3372 || 10iter: 1.9598 sec.
Iter 2770 || Loss: 5.7391 || 10iter: 1.9252 sec.
Iter 2780 || Loss: 6.3043 || 10iter: 1.8984 sec.
Iter 2790 || Loss: 7.3497 || 10iter: 1.9225 sec.
Iter 2800 || Loss: 5.5991 || 10iter: 1.9239 sec.
Iter 2810 || Loss: 5.7196 || 10iter: 1.9760 sec.
Iter 2820 || Loss: 6.8417 || 10iter: 1.9412 sec.
Iter 2830 || Loss: 6.6544 || 10iter: 1.9212 sec.
Iter 2840 || Loss: 5.4433 || 10iter: 1.9410 sec.
Iter 2850 || Loss: 5.5030 || 10iter: 1.8911 sec.
Iter 2860 || Loss: 5.8773 || 10iter: 1.9169 sec.
Iter 2870 || Loss: 6.0134 || 10iter: 1.9294 sec.
Iter 2880 || Loss: 5

Iter 4290 || Loss: 5.1026 || 10iter: 1.9341 sec.
Iter 4300 || Loss: 5.7481 || 10iter: 1.9045 sec.
Iter 4310 || Loss: 5.5600 || 10iter: 1.9480 sec.
Iter 4320 || Loss: 5.7486 || 10iter: 1.9326 sec.
Iter 4330 || Loss: 6.1395 || 10iter: 1.9198 sec.
Iter 4340 || Loss: 5.8731 || 10iter: 1.9244 sec.
Iter 4350 || Loss: 5.4092 || 10iter: 2.0157 sec.
Iter 4360 || Loss: 5.3820 || 10iter: 2.0196 sec.
Iter 4370 || Loss: 5.5349 || 10iter: 2.0085 sec.
Iter 4380 || Loss: 6.0740 || 10iter: 1.9467 sec.
Iter 4390 || Loss: 5.1966 || 10iter: 1.9536 sec.
Iter 4400 || Loss: 5.6187 || 10iter: 1.9000 sec.
Iter 4410 || Loss: 5.6328 || 10iter: 1.9419 sec.
Iter 4420 || Loss: 5.4136 || 10iter: 1.9053 sec.
Iter 4430 || Loss: 5.2870 || 10iter: 1.9082 sec.
Iter 4440 || Loss: 5.1681 || 10iter: 1.8873 sec.
Iter 4450 || Loss: 5.4315 || 10iter: 1.9121 sec.
Iter 4460 || Loss: 5.6397 || 10iter: 1.9025 sec.
Iter 4470 || Loss: 5.5429 || 10iter: 1.9126 sec.
Iter 4480 || Loss: 5.7347 || 10iter: 1.9035 sec.
Iter 4490 || Loss: 4

Iter 5930 || Loss: 5.4469 || 10iter: 1.9012 sec.
Iter 5940 || Loss: 5.4560 || 10iter: 1.9149 sec.
Iter 5950 || Loss: 5.1948 || 10iter: 1.9584 sec.
Iter 5960 || Loss: 5.2100 || 10iter: 1.9072 sec.
Iter 5970 || Loss: 5.4193 || 10iter: 1.9015 sec.
Iter 5980 || Loss: 5.8716 || 10iter: 1.9516 sec.
Iter 5990 || Loss: 5.2573 || 10iter: 1.9030 sec.
Iter 6000 || Loss: 5.3483 || 10iter: 1.9348 sec.
Iter 6010 || Loss: 5.1394 || 10iter: 1.9322 sec.
Iter 6020 || Loss: 5.3814 || 10iter: 1.9413 sec.
Iter 6030 || Loss: 5.2683 || 10iter: 1.9003 sec.
Iter 6040 || Loss: 5.4217 || 10iter: 1.9310 sec.
Iter 6050 || Loss: 5.8575 || 10iter: 1.9128 sec.
Iter 6060 || Loss: 5.6449 || 10iter: 1.9036 sec.
Iter 6070 || Loss: 5.8926 || 10iter: 1.9544 sec.
Iter 6080 || Loss: 5.6739 || 10iter: 1.8953 sec.
Iter 6090 || Loss: 5.3722 || 10iter: 1.8978 sec.
Iter 6100 || Loss: 4.5837 || 10iter: 1.9282 sec.
Iter 6110 || Loss: 5.6616 || 10iter: 1.9304 sec.
Iter 6120 || Loss: 4.8514 || 10iter: 1.9411 sec.
Iter 6130 || Loss: 6

Iter 7540 || Loss: 5.4203 || 10iter: 1.9277 sec.
Iter 7550 || Loss: 5.6426 || 10iter: 1.9230 sec.
Iter 7560 || Loss: 5.5743 || 10iter: 1.9535 sec.
Iter 7570 || Loss: 4.9691 || 10iter: 1.9032 sec.
Iter 7580 || Loss: 5.3590 || 10iter: 1.8872 sec.
Iter 7590 || Loss: 5.3064 || 10iter: 1.9940 sec.
Iter 7600 || Loss: 5.4071 || 10iter: 1.9078 sec.
Iter 7610 || Loss: 5.5328 || 10iter: 1.9019 sec.
Iter 7620 || Loss: 4.5438 || 10iter: 1.9248 sec.
Iter 7630 || Loss: 4.6554 || 10iter: 1.8960 sec.
Iter 7640 || Loss: 5.7452 || 10iter: 1.8856 sec.
Iter 7650 || Loss: 5.5052 || 10iter: 2.0103 sec.
Iter 7660 || Loss: 4.3756 || 10iter: 1.9704 sec.
Iter 7670 || Loss: 5.0586 || 10iter: 2.0165 sec.
Iter 7680 || Loss: 5.7227 || 10iter: 2.0719 sec.
Iter 7690 || Loss: 5.6114 || 10iter: 2.0187 sec.
Iter 7700 || Loss: 4.8371 || 10iter: 1.8732 sec.
Iter 7710 || Loss: 5.3792 || 10iter: 1.9124 sec.
Iter 7720 || Loss: 5.2324 || 10iter: 1.9028 sec.
Iter 7730 || Loss: 5.2070 || 10iter: 1.9346 sec.
Iter 7740 || Loss: 5

Iter 9180 || Loss: 5.2094 || 10iter: 1.8939 sec.
Iter 9190 || Loss: 5.9387 || 10iter: 1.9174 sec.
Iter 9200 || Loss: 4.8921 || 10iter: 1.9552 sec.
Iter 9210 || Loss: 5.4270 || 10iter: 2.0010 sec.
Iter 9220 || Loss: 4.9059 || 10iter: 1.8728 sec.
Iter 9230 || Loss: 5.6045 || 10iter: 1.8955 sec.
Iter 9240 || Loss: 5.0029 || 10iter: 1.9200 sec.
Iter 9250 || Loss: 5.0002 || 10iter: 1.9235 sec.
Iter 9260 || Loss: 5.1014 || 10iter: 1.9489 sec.
Iter 9270 || Loss: 4.9806 || 10iter: 1.9019 sec.
Iter 9280 || Loss: 6.4522 || 10iter: 1.9158 sec.
Iter 9290 || Loss: 4.9896 || 10iter: 1.9013 sec.
Iter 9300 || Loss: 5.3658 || 10iter: 1.9329 sec.
Iter 9310 || Loss: 5.1430 || 10iter: 1.8043 sec.
-------------
epoch 9 || Epoch_TRAIN_Loss:5476.7265 ||Epoch_VAL_Loss:0.0000
timer:  204.0673 sec.
lr is: 0.001
-------------
Epoch 10/200
-------------
(train)
Iter 9320 || Loss: 4.7028 || 10iter: 2.7259 sec.
Iter 9330 || Loss: 5.4410 || 10iter: 1.9497 sec.
Iter 9340 || Loss: 4.3508 || 10iter: 1.8773 sec.
Iter 93

Iter 10770 || Loss: 5.3237 || 10iter: 1.9322 sec.
Iter 10780 || Loss: 5.6913 || 10iter: 1.9236 sec.
Iter 10790 || Loss: 5.4656 || 10iter: 1.9038 sec.
Iter 10800 || Loss: 5.8244 || 10iter: 1.9645 sec.
Iter 10810 || Loss: 5.4652 || 10iter: 1.9265 sec.
Iter 10820 || Loss: 5.6042 || 10iter: 1.8911 sec.
Iter 10830 || Loss: 4.7095 || 10iter: 1.9338 sec.
Iter 10840 || Loss: 5.3835 || 10iter: 1.9390 sec.
Iter 10850 || Loss: 4.8674 || 10iter: 1.9307 sec.
Iter 10860 || Loss: 4.8937 || 10iter: 1.9208 sec.
Iter 10870 || Loss: 5.0901 || 10iter: 1.9088 sec.
Iter 10880 || Loss: 5.0186 || 10iter: 1.9463 sec.
Iter 10890 || Loss: 4.7500 || 10iter: 1.9539 sec.
Iter 10900 || Loss: 5.4548 || 10iter: 1.9261 sec.
Iter 10910 || Loss: 4.4846 || 10iter: 1.9562 sec.
Iter 10920 || Loss: 5.3497 || 10iter: 1.9009 sec.
Iter 10930 || Loss: 5.1473 || 10iter: 1.8944 sec.
Iter 10940 || Loss: 5.4784 || 10iter: 1.9549 sec.
Iter 10950 || Loss: 5.2840 || 10iter: 1.9256 sec.
Iter 10960 || Loss: 4.5664 || 10iter: 1.9380 sec.


Iter 12380 || Loss: 5.7821 || 10iter: 1.9876 sec.
Iter 12390 || Loss: 5.6177 || 10iter: 1.9172 sec.
Iter 12400 || Loss: 4.6723 || 10iter: 1.9027 sec.
Iter 12410 || Loss: 5.2864 || 10iter: 1.8680 sec.
Iter 12420 || Loss: 5.6567 || 10iter: 1.7462 sec.
-------------
epoch 12 || Epoch_TRAIN_Loss:5323.9684 ||Epoch_VAL_Loss:0.0000
timer:  203.9996 sec.
lr is: 0.001
-------------
Epoch 13/200
-------------
(train)
Iter 12430 || Loss: 5.2301 || 10iter: 3.8338 sec.
Iter 12440 || Loss: 5.2184 || 10iter: 1.8786 sec.
Iter 12450 || Loss: 5.0316 || 10iter: 1.8974 sec.
Iter 12460 || Loss: 4.6722 || 10iter: 1.9490 sec.
Iter 12470 || Loss: 5.5091 || 10iter: 1.9512 sec.
Iter 12480 || Loss: 5.0956 || 10iter: 1.9118 sec.
Iter 12490 || Loss: 6.1115 || 10iter: 1.9043 sec.
Iter 12500 || Loss: 4.3868 || 10iter: 1.9399 sec.
Iter 12510 || Loss: 5.1611 || 10iter: 1.9371 sec.
Iter 12520 || Loss: 5.1821 || 10iter: 1.9531 sec.
Iter 12530 || Loss: 4.9484 || 10iter: 1.8945 sec.
Iter 12540 || Loss: 4.9977 || 10iter: 1

Iter 13960 || Loss: 5.2317 || 10iter: 1.9100 sec.
Iter 13970 || Loss: 5.4712 || 10iter: 1.9166 sec.
Iter 13980 || Loss: 5.2736 || 10iter: 1.9241 sec.
Iter 13990 || Loss: 5.4525 || 10iter: 1.9305 sec.
Iter 14000 || Loss: 6.0101 || 10iter: 1.9083 sec.
Iter 14010 || Loss: 5.4147 || 10iter: 1.9020 sec.
Iter 14020 || Loss: 5.1907 || 10iter: 1.9243 sec.
Iter 14030 || Loss: 5.1204 || 10iter: 1.9381 sec.
Iter 14040 || Loss: 4.4785 || 10iter: 1.9198 sec.
Iter 14050 || Loss: 4.7081 || 10iter: 1.9138 sec.
Iter 14060 || Loss: 4.9156 || 10iter: 1.9636 sec.
Iter 14070 || Loss: 4.7927 || 10iter: 1.9209 sec.
Iter 14080 || Loss: 5.0901 || 10iter: 1.9288 sec.
Iter 14090 || Loss: 5.2445 || 10iter: 1.9286 sec.
Iter 14100 || Loss: 5.1526 || 10iter: 1.9376 sec.
Iter 14110 || Loss: 5.8268 || 10iter: 1.9312 sec.
Iter 14120 || Loss: 4.9961 || 10iter: 1.9255 sec.
Iter 14130 || Loss: 5.0759 || 10iter: 1.9575 sec.
Iter 14140 || Loss: 4.2812 || 10iter: 1.9595 sec.
Iter 14150 || Loss: 5.6307 || 10iter: 1.9874 sec.


Iter 15540 || Loss: 5.8025 || 10iter: 1.9934 sec.
Iter 15550 || Loss: 5.0796 || 10iter: 1.8539 sec.
Iter 15560 || Loss: 5.5387 || 10iter: 1.9182 sec.
Iter 15570 || Loss: 4.8442 || 10iter: 1.9398 sec.
Iter 15580 || Loss: 4.8590 || 10iter: 1.9199 sec.
Iter 15590 || Loss: 5.2256 || 10iter: 1.9237 sec.
Iter 15600 || Loss: 4.4678 || 10iter: 1.8979 sec.
Iter 15610 || Loss: 5.6006 || 10iter: 1.9012 sec.
Iter 15620 || Loss: 5.3900 || 10iter: 1.9227 sec.
Iter 15630 || Loss: 5.2369 || 10iter: 1.8937 sec.
Iter 15640 || Loss: 5.2440 || 10iter: 1.8926 sec.
Iter 15650 || Loss: 5.4751 || 10iter: 1.8861 sec.
Iter 15660 || Loss: 5.3793 || 10iter: 1.9427 sec.
Iter 15670 || Loss: 5.2494 || 10iter: 1.9208 sec.
Iter 15680 || Loss: 5.7264 || 10iter: 1.9400 sec.
Iter 15690 || Loss: 4.5855 || 10iter: 1.9523 sec.
Iter 15700 || Loss: 5.5591 || 10iter: 1.9249 sec.
Iter 15710 || Loss: 5.4837 || 10iter: 1.9512 sec.
Iter 15720 || Loss: 4.9216 || 10iter: 1.9012 sec.
Iter 15730 || Loss: 4.4263 || 10iter: 1.9071 sec.
