# train

In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
from utils.datasets import LoadDataset
from utils import torch_utils
import time
import rtest
from tqdm import tqdm
from models import *
import os
import tensorboardX

In [2]:
# Expose only GPU 0 to this process, then let the project helper choose the device
os.environ.update({"CUDA_VISIBLE_DEVICES": '0'})
device = torch_utils.select_device()

Using CUDA device0 _CudaDeviceProperties(name='GeForce RTX 2070', total_memory=8192MB)


In [3]:
# --- Training hyper-parameters ---
IMG_SIZE = 416       # passed as img_size to the dataset, the model and the evaluation call
BATCH_SIZE = 10
EPOCHES = 270        # exclusive upper bound of the epoch range
START_EPOCH = 0
NUM_WORKERS = 5      # handed to the DataLoader as num_workers

# --- Model / data description files ---
CFG = 'cfg/yolov3-tiny.cfg'
DATA_CFG = 'cfg/coco-h.data'

# --- Initialisation switches ---
FROM_SCRATCH = True        # when False, pretrained Darknet weights are loaded into the model
FREEZE_BACKBONE = False    # not referenced by any cell visible in this notebook

# --- Output locations ---
LOG_PATH = 'F:/github/YOLOv3/logs/'
weights = 'F:/github/YOLOv3/weights/'
best, latest = (os.path.join(weights, name) for name in ('best.pt', 'latest.pt'))

In [4]:
# Read the 'train' entry from the data config and display it
data_cfg = parse_data_cfg(DATA_CFG)
train_path = data_cfg['train']
train_path

'C:/coco/trainvalno5kn.txt'

In [5]:
# Dataset: built from the 'train' path read out of the data config, with IMG_SIZE
# forwarded as img_size. The object also supplies the collate_fn used by the DataLoader.
dataset = LoadDataset(train_path, img_size=IMG_SIZE)

In [6]:
# Build the network from its cfg file, then move it onto the selected device
model = Darknet(CFG, IMG_SIZE)
model = model.to(device)

In [7]:
# Dataloader for training.
# shuffle=True: without a sampler, shuffle=False would feed the exact same batches in the
# exact same order every epoch, which biases SGD; reshuffle the sample order each epoch.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    shuffle=True,
    pin_memory=True,
    collate_fn=dataset.collate_fn,  # batching is delegated to the dataset's own collate function
)

In [8]:
# Sanity-check the DataLoader: render the first batch with its targets, then stop
for imgs, targets, _, _ in dataloader:
    plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')
    break

In [9]:
# Bookkeeping for a fresh run
START_EPOCH = 0
best_loss = float('inf')
cutoff = -1  # backbone reaches to cutoff layer

# Unless training from scratch, initialise from the Darknet weight file matching the cfg
if not FROM_SCRATCH:
    backbone_file = 'yolov3-tiny.conv.15' if '-tiny.cfg' in CFG else 'darknet53.conv.74'
    cutoff = load_darknet_weights(model, weights + backbone_file)

In [10]:
# Wrap the model in DataParallel when more than one CUDA device is visible
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    print("Let's use", n_gpus, "GPUs!")
    model = nn.DataParallel(model)
model.to(device);

In [11]:
# Print a per-parameter summary of the model (name, gradient flag, size, shape, mu, sigma)
model_info(model)


layer                                     name  gradient   parameters                shape         mu      sigma
    0                          0.conv_0.weight      True          432        [16, 3, 3, 3]   -0.00553       0.11
    1                    0.batch_norm_0.weight      True           16                 [16]      0.505       0.31
    2                      0.batch_norm_0.bias      True           16                 [16]          0          0
    3                          2.conv_2.weight      True         4608       [32, 16, 3, 3]  -0.000136     0.0478
    4                    2.batch_norm_2.weight      True           32                 [32]      0.514      0.291
    5                      2.batch_norm_2.bias      True           32                 [32]          0          0
    6                          4.conv_4.weight      True        18432       [64, 32, 3, 3]  -5.63e-05      0.034
    7                    4.batch_norm_4.weight      True           64                 [64]     

In [12]:
# Optimizer: SGD with momentum and weight decay over all model parameters
lr0 = 0.001  # initial learning rate
sgd_kwargs = dict(lr=lr0, momentum=0.9, weight_decay=0.0005)
optimizer = torch.optim.SGD(model.parameters(), **sgd_kwargs)

# Scheduler: multiply the learning rate by 0.1 at epochs 218 and 245
lr_milestones = [218, 245]
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=lr_milestones, gamma=0.1, last_epoch=START_EPOCH - 1)

In [13]:
# Start training
#
# Per epoch: run every batch through the model, optimise, evaluate with rtest.test,
# append one line to results.txt, log metrics to TensorBoard and write checkpoints.
t = time.time()
nB = len(dataloader)  # number of batches per epoch
n_burnin = min(round(nB / 5 + 1), 1000)  # burn-in length in batches, capped at 1000
accumulate = 1  # the optimizer steps once every `accumulate` batches
multi_scale = False  # not used in this cell
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

print('Start training with Batch_size: {}, Eopch: {}, Batches: {}'.format(BATCH_SIZE, EPOCHES, len(dataloader)))
for epoch in range(START_EPOCH, EPOCHES):
    model.train()

    # Update scheduler
    # NOTE(review): on PyTorch >= 1.1 the scheduler is expected to be stepped AFTER
    # optimizer.step(); stepping here shifts the milestones by one epoch on those versions.
    # Left in place because the correct position depends on the installed PyTorch - confirm.
    scheduler.step()

    # Running mean of the loss items over this epoch
    # (presumably xy, wh, conf, cls, total - TODO confirm against compute_loss)
    mloss = torch.zeros(5).to(device)

    # total=nB lets tqdm show a real progress bar / ETA instead of a bare iteration counter
    batches = tqdm(enumerate(dataloader), total=nB, desc='{}/{}'.format(epoch, EPOCHES - 1))
    for i, (imgs, targets, _, _) in batches:
        imgs = imgs.to(device)
        targets = targets.to(device)

        # SGD burn-in: during the first batches of epoch 0 ramp the lr from 0 up to lr0
        if epoch == 0 and i <= n_burnin:
            lr = lr0 * (i / n_burnin) ** 4
            for x in optimizer.param_groups:
                x['lr'] = lr

        # Run model
        pred = model(imgs)

        # Build targets
        target_list = build_targets(model, targets)

        # Compute loss
        loss, loss_items = compute_loss(pred, target_list)

        # Compute gradient
        loss.backward()

        # Accumulate gradient for x batches before optimizing
        if (i + 1) % accumulate == 0 or (i + 1) == nB:
            optimizer.step()
            optimizer.zero_grad()

        # Update running mean of tracked metrics
        mloss = (mloss * i + loss_items) / (i + 1)

    # Prefix of this epoch's results.txt line: epoch counter followed by the mean loss items.
    # (Previously `s` was never defined, so the write below raised NameError.)
    s = ('%8s' + '%10.3g' * 5) % ('%g/%g' % (epoch, EPOCHES - 1), *mloss.tolist())

    # Calculate mAP
    with torch.no_grad():
        results = rtest.test(CFG, DATA_CFG, batch_size=BATCH_SIZE, img_size=IMG_SIZE, model=model, conf_thres=0.1)

    # Write epoch results
    with open('results.txt', 'a') as file:
        # tuple(): the % operator only unpacks tuples, so this also works if test() returns a list
        file.write(s + '%11.3g' * 5 % tuple(results) + '\n')  # P, R, mAP, F1, test_loss

    # Update best loss
    test_loss = results[4]
    if test_loss < best_loss:
        best_loss = test_loss

    # Log metrics every epoch (previously nested under the branch above, so epochs that
    # did not improve the best loss were silently missing from TensorBoard).
    # train_loss is the loss of the last batch of the epoch.
    TBoard.add_scalar('gMetrics/train_loss', loss.item(), epoch)
    TBoard.add_scalar('gMetrics/P', results[0], epoch)
    TBoard.add_scalar('gMetrics/R', results[1], epoch)
    TBoard.add_scalar('gMetrics/mAP', results[2], epoch)
    TBoard.add_scalar('gMetrics/F1', results[3], epoch)
    TBoard.add_scalar('gMetrics/test_loss', results[4], epoch)

    # Save training results
    save = True
    if save:
        # Unwrap parallel containers so checkpoint keys carry no 'module.' prefix.
        # nn.DataParallel is the wrapper this notebook applies on multi-GPU machines.
        is_wrapped = isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel))

        # Create checkpoint
        chkpt = {'epoch': epoch,
                 'best_loss': best_loss,
                 'model': model.module.state_dict() if is_wrapped else model.state_dict(),
                 'optimizer': optimizer.state_dict()}

        # Save latest checkpoint
        torch.save(chkpt, latest)

        # Save best checkpoint (equal only when this epoch just set a new best)
        if best_loss == test_loss:
            torch.save(chkpt, best)

        # Save backup every 10 epochs (optional)
        if epoch > 0 and epoch % 10 == 0:
            torch.save(chkpt, weights + 'backup%g.pt' % epoch)

        # Delete checkpoint
        del chkpt

# Flush pending events and release the log file
TBoard.close()
end = time.time()
print('Training finished! using time: {}'.format(end - t))

Start training with Batch_size: 10, Eopch: 270, Batches: 11727


0/269: 11727it [17:14, 11.34it/s]
Computing mAP:  10%|█         | 50/500 [00:46<05:03,  1.48it/s]

FileNotFoundError: Traceback (most recent call last):
  File "E:\Anaconda3\envs\pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 138, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "E:\Anaconda3\envs\pytorch\lib\site-packages\torch\utils\data\dataloader.py", line 138, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "F:\github\YOLOv3\utils\datasets.py", line 35, in __getitem__
    with open(label_pth, 'r') as f:
FileNotFoundError: [Errno 2] No such file or directory: 'C:/coco/labels/val2014/COCO_val2014_000000058636.txt'


In [14]:
# Debug: shape of the `targets` tensor currently left in the kernel namespace
targets.shape

torch.Size([8, 6])

In [19]:
# Debug: number of outputs in the `pred` object left in the kernel namespace
len(pred)

2

In [21]:
# Debug: shape of the first model output
pred[0].shape

torch.Size([1, 3, 13, 13, 85])

In [22]:
# Debug: shape of the second model output
pred[1].shape

torch.Size([1, 3, 26, 26, 85])

In [16]:
# Debug: dump the structure returned by build_targets
target_list

([tensor([[0.2334, 0.3405],
          [0.5747, 0.0351],
          [0.2818, 0.7711],
          [0.4127, 0.7092],
          [0.4089, 0.9174],
          [0.0509, 0.8906],
          [0.6879, 0.8373],
          [0.3572, 0.3974]]), tensor([[0.8177, 0.8348],
          [0.1018, 0.7811],
          [0.3757, 0.6747],
          [0.7143, 0.7948]])], [tensor([[ 0.1446, -0.5405],
          [ 0.4300, -0.1284],
          [ 0.4204, -0.0591],
          [-0.1973, -0.2687],
          [-0.5004, -0.9974],
          [-0.7636, -0.9944],
          [-0.3942, -0.5817],
          [-0.2739, -0.5738]]), tensor([[ 0.2831, -0.6511],
          [ 0.4954,  0.1165],
          [ 0.3894, -0.2355],
          [ 0.5096, -0.2276]])], [tensor([45, 45, 50, 45, 49, 49, 49, 49]),
  tensor([49, 49, 49, 49])], [(tensor([0, 0, 0, 0, 0, 0, 0, 0]),
   tensor([2, 1, 1, 2, 0, 0, 0, 0]),
   tensor([8, 4, 8, 5, 2, 2, 3, 2]),
   tensor([ 6,  9,  8,  4,  8, 10,  8,  8])),
  (tensor([0, 0, 0, 0]),
   tensor([2, 1, 2, 2]),
   tensor([5, 5, 7, 4

In [14]:
import os

# Load the image list, one path per entry
with open('C:/coco/5kn.txt', 'r') as list_file:
    data = list_file.read().splitlines()

In [17]:
# Derive each image's label path (images -> labels, .jpg/.png -> .txt)
# and print every label file that does not exist on disk
label_paths = (
    img.replace('images', 'labels').replace('.jpg', '.txt').replace('.png', '.txt')
    for img in data
)
for txt in label_paths:
    if os.path.exists(txt):
        continue
    print(txt)

C:/coco/labels/val2014/COCO_val2014_000000058636.txt
C:/coco/labels/val2014/COCO_val2014_000000101623.txt
C:/coco/labels/val2014/COCO_val2014_000000127135.txt
C:/coco/labels/val2014/COCO_val2014_000000130712.txt
C:/coco/labels/val2014/COCO_val2014_000000176701.txt
C:/coco/labels/val2014/COCO_val2014_000000179430.txt
C:/coco/labels/val2014/COCO_val2014_000000192817.txt
C:/coco/labels/val2014/COCO_val2014_000000224742.txt
C:/coco/labels/val2014/COCO_val2014_000000228771.txt
C:/coco/labels/val2014/COCO_val2014_000000252101.txt
C:/coco/labels/val2014/COCO_val2014_000000253688.txt
C:/coco/labels/val2014/COCO_val2014_000000274957.txt
C:/coco/labels/val2014/COCO_val2014_000000297736.txt
C:/coco/labels/val2014/COCO_val2014_000000300090.txt
C:/coco/labels/val2014/COCO_val2014_000000301765.txt
C:/coco/labels/val2014/COCO_val2014_000000310622.txt
C:/coco/labels/val2014/COCO_val2014_000000328098.txt
C:/coco/labels/val2014/COCO_val2014_000000337653.txt
C:/coco/labels/val2014/COCO_val2014_0000003397