# Train a Darknet YOLOv3 model

In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
from utils.datasets import LoadDataset
from utils import torch_utils
import time
import rtest
from tqdm import tqdm
from models import *
import os

In [2]:
# specify visible GPU
# Restrict this process to GPU index 0 via CUDA_VISIBLE_DEVICES, then ask the
# project's torch_utils helper for the device object that later cells pass to
# `.to(device)`.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
device = torch_utils.select_device()

Using CUDA device0 _CudaDeviceProperties(name='GeForce RTX 2070', total_memory=8192MB)


In [3]:
# --- Training hyper-parameters ---
IMG_SIZE = 416       # img_size handed to the dataset, the model and evaluation
BATCH_SIZE = 10
NUM_WORKERS = 5      # DataLoader worker processes
START_EPOCH = 0
EPOCHES = 270        # the training loop runs epochs START_EPOCH .. EPOCHES - 1

# --- Config files ---
CFG = 'cfg/yolov3-tiny.cfg'
DATA_CFG = 'cfg/coco-h.data'

# --- Switches ---
FROM_SCRATCH = True       # False -> load pretrained Darknet backbone weights
FREEZE_BACKBONE = False   # True -> freeze backbone layers during epoch 0

# --- Checkpoint locations ---
weights = 'F:/Dev/weights/'
latest, best = (os.path.join(weights, fname) for fname in ('latest.pt', 'best.pt'))

In [4]:
# Look up the 'train' entry of the data config and display it as the cell output.
train_path = parse_data_cfg(DATA_CFG)['train']
train_path

'C:/coco/trainvalno5kn.txt'

In [5]:
# Dataset built from the training path at the configured image size
dataset = LoadDataset(train_path, img_size=IMG_SIZE)

# Sanity check: display the shape of the first element of sample 1
dataset[1][0].shape

torch.Size([3, 416, 416])

In [6]:
# Build the Darknet model from the cfg file, then move it onto the selected device
model = Darknet(CFG, IMG_SIZE)
model = model.to(device)

In [7]:
# Dataloader: batches are assembled by the dataset's own collate_fn.
# NOTE(review): shuffle=False means samples are drawn in the same sequential
# order every epoch -- confirm this is intended for training.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=dataset.collate_fn,
    pin_memory=True,
)

In [8]:
# Smoke-test the loader: plot only the very first batch to 'train_batch0.jpg'.
for i, (imgs, targets, _, _) in enumerate(dataloader):
    plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')
    break  # one batch is enough; i is always 0 at this point

In [9]:
# Run bookkeeping.
cutoff = -1  # backbone reaches to cutoff layer (overwritten below when pretrained weights are loaded)
START_EPOCH = 0  # NOTE(review): re-assigns the value already set in the config cell
best_loss = float('inf')  # lowest test loss seen so far; any real loss beats +inf

# Optionally initialise the backbone from pretrained Darknet weights.
# The file is chosen from the cfg name; os.path.join is used (as for
# `latest` / `best`) so the path is valid whether or not `weights` ends
# with a path separator.
if not FROM_SCRATCH:
    backbone_file = 'yolov3-tiny.conv.15' if '-tiny.cfg' in CFG else 'darknet53.conv.74'
    cutoff = load_darknet_weights(model, os.path.join(weights, backbone_file))

In [10]:
# Wrap the model in DataParallel when more than one CUDA device is visible,
# then make sure it lives on the selected device.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print("Let's use", gpu_count, "GPUs!")
    model = nn.DataParallel(model)
model.to(device);

In [11]:
# Print the model summary from model_info: per the output below, one row per
# parameter tensor with its gradient flag, parameter count, shape, mu and sigma.
model_info(model)


layer                                     name  gradient   parameters                shape         mu      sigma
    0                          0.conv_0.weight      True          432        [16, 3, 3, 3]   -0.00889      0.109
    1                    0.batch_norm_0.weight      True           16                 [16]      0.539      0.365
    2                      0.batch_norm_0.bias      True           16                 [16]          0          0
    3                          2.conv_2.weight      True         4608       [32, 16, 3, 3]  -0.000259     0.0479
    4                    2.batch_norm_2.weight      True           32                 [32]       0.49      0.276
    5                      2.batch_norm_2.bias      True           32                 [32]          0          0
    6                          4.conv_4.weight      True        18432       [64, 32, 3, 3]   -0.00017     0.0341
    7                    4.batch_norm_4.weight      True           64                 [64]     

In [12]:
# Optimizer: SGD with momentum and weight decay
lr0 = 0.001  # base learning rate (also the target of the burn-in ramp)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr0,
    momentum=0.9,
    weight_decay=0.0005,
)

# Scheduler: multiply the learning rate by 0.1 at epochs 218 and 245.
# NOTE(review): an earlier comment equated these epochs with batches 400k / 450k;
# that mapping depends on the batch size and should be re-checked.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[218, 245],
    gamma=0.1,
    last_epoch=START_EPOCH - 1,
)

In [13]:
# Start training
#
# Per epoch: step the LR scheduler, optionally freeze/unfreeze the backbone,
# run one pass over `dataloader` (forward, loss, backward, optimizer step),
# evaluate with rtest.test, append the results to results.txt and write
# checkpoints (latest, best, and a backup every 10 epochs).
t = time.time()
nB = len(dataloader)  # number of batches per epoch
n_burnin = min(round(nB / 5 + 1), 1000)  # burn-in length in batches, capped at 1000
accumulate = 1  # the optimizer steps once every `accumulate` batches
multi_scale = False  # when True, dataset.img_size is re-drawn every 10 batches

for epoch in range(START_EPOCH, EPOCHES):
    model.train()
    print(('\n%8s%12s' + '%10s' * 7) % ('Epoch', 'Batch', 'xy', 'wh', 'conf', 'cls', 'total', 'nTargets', 'time'))

    # Update scheduler
    # NOTE(review): PyTorch >= 1.1 expects scheduler.step() to be called after
    # optimizer.step(); confirm the ordering against the installed version.
    scheduler.step()

    # Freeze backbone at epoch 0, unfreeze at epoch 1
    if FREEZE_BACKBONE and epoch < 2:
        for name, p in model.named_parameters():
            # The layer index is the first purely numeric component of the
            # parameter name. Scanning for it (instead of assuming position 1)
            # keeps this working when nn.DataParallel prepends 'module.'.
            layer_idx = next((int(tok) for tok in name.split('.') if tok.isdigit()), None)
            if layer_idx is not None and layer_idx < cutoff:  # layer belongs to the backbone
                p.requires_grad = epoch != 0

    mloss = torch.zeros(5).to(device)  # running mean of the 5 tracked loss items
    for i, (imgs, targets, _, _) in enumerate(dataloader):
        imgs = imgs.to(device)
        targets = targets.to(device)
        nt = len(targets)  # reported in the 'nTargets' column

        # SGD burn-in: ramp lr from 0 up to lr0 over the first n_burnin batches of epoch 0
        if epoch == 0 and i <= n_burnin:
            lr = lr0 * (i / n_burnin) ** 4
            for x in optimizer.param_groups:
                x['lr'] = lr

        # Run model
        pred = model(imgs)

        # Build targets
        target_list = build_targets(model, targets)

        # Compute loss
        loss, loss_items = compute_loss(pred, target_list)

        # Compute gradient
        loss.backward()

        # Accumulate gradient for x batches before optimizing
        # (the last batch of the epoch always triggers a step)
        if (i + 1) % accumulate == 0 or (i + 1) == nB:
            optimizer.step()
            optimizer.zero_grad()

        # Update running mean of tracked metrics
        mloss = (mloss * i + loss_items) / (i + 1)

        # Print batch results; the last column is the wall time of this batch
        s = ('%8s%12s' + '%10.3g' * 7) % (
            '%g/%g' % (epoch, EPOCHES - 1),
            '%g/%g' % (i, nB - 1), *mloss, nt, time.time() - t)
        t = time.time()
        print(s)

        # Multi-Scale training (320 - 608 pixels) every 10 batches
        if multi_scale and (i + 1) % 10 == 0:
            dataset.img_size = random.choice(range(10, 20)) * 32
            print('multi_scale img_size = %g' % dataset.img_size)

    # Calculate mAP
    with torch.no_grad():
        results = rtest.test(CFG, DATA_CFG, batch_size=BATCH_SIZE, img_size=IMG_SIZE, model=model, conf_thres=0.1)

    # Write epoch results: last batch line of the epoch followed by the 5 test metrics
    with open('results.txt', 'a') as file:
        file.write(s + ('%11.3g' * 5) % results + '\n')  # P, R, mAP, F1, test_loss

    # Update best loss
    test_loss = results[4]
    if test_loss < best_loss:
        best_loss = test_loss

    # Save training results
    save = True
    if save:
        # Unwrap DataParallel / DistributedDataParallel so the saved state_dict
        # keys carry no 'module.' prefix (the model may have been wrapped in
        # nn.DataParallel earlier in this notebook).
        is_parallel = isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel))

        # Create checkpoint
        chkpt = {'epoch': epoch,
                 'best_loss': best_loss,
                 'model': model.module.state_dict() if is_parallel else model.state_dict(),
                 'optimizer': optimizer.state_dict()}

        # Save latest checkpoint
        torch.save(chkpt, latest)

        # Save best checkpoint (this epoch just set the best test loss)
        if best_loss == test_loss:
            torch.save(chkpt, best)

        # Save backup every 10 epochs (optional)
        if epoch > 0 and epoch % 10 == 0:
            torch.save(chkpt, os.path.join(weights, 'backup%g.pt' % epoch))

        # Delete checkpoint
        del chkpt

   0/269 11618/11726      1.15     0.378      5.54      5.33      12.4        84    0.0658
   0/269 11619/11726      1.15     0.378      5.54      5.33      12.4       132    0.0658
   0/269 11620/11726      1.15     0.378      5.54      5.33      12.4        89    0.0648
   0/269 11621/11726      1.15     0.378      5.54      5.33      12.4        64    0.0608
   0/269 11622/11726      1.15     0.378      5.54      5.33      12.4        78    0.0608
   0/269 11623/11726      1.15     0.378      5.54      5.33      12.4        81    0.0618
   0/269 11624/11726      1.15     0.378      5.54      5.33      12.4        58    0.0648
   0/269 11625/11726      1.15     0.378      5.54      5.33      12.4        48    0.0618
   0/269 11626/11726      1.15     0.378      5.54      5.33      12.4        68    0.0668
   0/269 11627/11726      1.15     0.378      5.54      5.33      12.4        89    0.0598
   0/269 11628/11726      1.15     0.378      5.54      5.33      12.4        55    0.0608

Computing mAP:  10%|█         | 50/500 [00:45<04:52,  1.54it/s]

FileNotFoundError: Traceback (most recent call last):
  File "E:\Anaconda3\envs\pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 138, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "E:\Anaconda3\envs\pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 138, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "F:\github\YOLOv3\utils\datasets.py", line 35, in __getitem__
    with open(label_pth, 'r') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C:/coco/labels/val2014/COCO_val2014_000000058636.txt'
