# train

In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
from utils.datasets import LoadDataset
from utils import torch_utils
import time
import rtest
from tqdm import tqdm
from models import *
import os
import tensorboardX

In [2]:
# specify visible GPU
# Expose only GPU index 0 to this process; this is set before the device is selected below.
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# select_device() comes from utils.torch_utils; the recorded run printed "Using CPU".
device = torch_utils.select_device()

Using CPU


In [3]:
# --- Model / data definition ---
CFG = 'cfg/yolov3-tiny.cfg'      # network definition passed to Darknet()
DATA_CFG = 'cfg/coco.data'       # dataset description read by parse_data_cfg()
IMG_SIZE = 416                   # input resolution handed to the dataset, model and test routine

# --- Training schedule ---
START_EPOCH = 0
EPOCHES = 270
BATCH_SIZE = 1
NUM_WORKERS = 5                  # DataLoader worker processes

# --- Switches ---
FROM_SCRATCH = True              # when False, pretrained backbone weights are loaded
FREEZE_BACKBONE = False

# --- Output locations (TensorBoard logs and checkpoints share one directory) ---
LOG_PATH = 'E:/condaDev/YOLOv3/logs/'
weights = 'E:/condaDev/YOLOv3/logs/'
latest, best = (os.path.join(weights, fname) for fname in ('latest.pt', 'best.pt'))

In [4]:
# Look up the 'train' entry of the data config, then echo it as the cell output.
train_path = parse_data_cfg(DATA_CFG)['train']
train_path

'E:/data/coco/trainvalno5k.txt'

In [5]:
# Dataset
# Training dataset constructed from train_path (presumably a text file listing images — confirm
# in utils.datasets). NOTE(review): img_size presumably sets the square input resolution; the
# recorded output torch.Size([3, 416, 416]) is consistent with that.
dataset = LoadDataset(train_path, img_size=IMG_SIZE)

torch.Size([3, 416, 416])

In [6]:
# model
# Instantiate Darknet from the CFG file with IMG_SIZE and move it to the selected device.
model = Darknet(CFG, IMG_SIZE).to(device)

In [7]:
# Dataloader: batches are assembled by the dataset's own collate_fn and served in
# dataset order (no shuffling), with pinned host memory.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True,
    collate_fn=dataset.collate_fn,
)

In [9]:
# Sanity-check the Dataloader: draw the first batch with its targets into
# 'train_batch0.jpg', then stop iterating.
for imgs, targets, _, _ in dataloader:
    plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')
    break

In [8]:
# Training state. `cutoff` is the layer the backbone reaches to; it stays -1 when
# training from scratch because no pretrained weights are loaded in that case.
cutoff = -1
START_EPOCH = 0
best_loss = float('inf')

if not FROM_SCRATCH:
    # Pick the pretrained backbone file that matches the network config.
    backbone_file = 'yolov3-tiny.conv.15' if '-tiny.cfg' in CFG else 'darknet53.conv.74'
    cutoff = load_darknet_weights(model, weights + backbone_file)

In [9]:
# Wrap the model for multi-GPU execution when more than one CUDA device is visible.
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    print("Let's use", n_gpus, "GPUs!")
    model = nn.DataParallel(model)
model.to(device);

In [10]:
# Print a per-parameter summary of the model; the recorded output lists layer index, name,
# gradient flag, parameter count, shape, mu and sigma.
model_info(model)


layer                                     name  gradient   parameters                shape         mu      sigma
    0                          0.conv_0.weight      True          432        [16, 3, 3, 3]  -0.000276       0.11
    1                    0.batch_norm_0.weight      True           16                 [16]      0.463       0.25
    2                      0.batch_norm_0.bias      True           16                 [16]          0          0
    3                          2.conv_2.weight      True         4608       [32, 16, 3, 3]   0.000397      0.048
    4                    2.batch_norm_2.weight      True           32                 [32]      0.482      0.297
    5                      2.batch_norm_2.bias      True           32                 [32]          0          0
    6                          4.conv_4.weight      True        18432       [64, 32, 3, 3]   0.000186      0.034
    7                    4.batch_norm_4.weight      True           64                 [64]     

In [11]:
# Optimizer: SGD with momentum and weight decay over all model parameters
lr0 = 0.001  # initial learning rate
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr0,
    momentum=0.9,
    weight_decay=0.0005,
)

# Scheduler: scale the lr by 0.1 at epochs 218 and 245 (i.e. batches 400k, 450k)
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[218, 245],
    gamma=0.1,
    last_epoch=START_EPOCH - 1,
)

In [None]:
# Start training
# Per epoch: optimise over every batch, advance the lr schedule, evaluate with
# rtest.test, append one summary row to results.txt, log scalars to TensorBoard
# and write checkpoints (latest / best / periodic backup).
t = time.time()
nB = len(dataloader)  # num of batches
n_burnin = min(round(nB / 5 + 1), 1000)  # number of warm-up batches in epoch 0
accumulate = 1  # optimizer steps once every `accumulate` batches
multi_scale = False  # not read anywhere in this cell
TBoard = tensorboardX.SummaryWriter(log_dir=LOG_PATH)

print('Start training with Batch_size: {}, Eopch: {}, Batches: {}'.format(BATCH_SIZE, EPOCHES, len(dataloader)))
for epoch in range(START_EPOCH, EPOCHES):
    model.train()

    mloss = torch.zeros(5).to(device)  # mean losses
    # tqdm wraps the dataloader itself (not the enumerate object) so it can read
    # len(dataloader) and render a real progress bar with an ETA.
    for i, (imgs, targets, _, _) in enumerate(tqdm(dataloader, desc='{}/{}'.format(epoch, EPOCHES - 1))):
        imgs = imgs.to(device)
        targets = targets.to(device)

        # SGD burn-in: ramp the lr from 0 up to lr0 over the first n_burnin batches
        if epoch == 0 and i <= n_burnin:
            lr = lr0 * (i / n_burnin) ** 4
            for x in optimizer.param_groups:
                x['lr'] = lr

        # Run model
        pred = model(imgs)

        # Build targets
        target_list = build_targets(model, targets)

        # Compute loss
        loss, loss_items = compute_loss(pred, target_list)

        # Compute gradient
        loss.backward()

        # Accumulate gradient for x batches before optimizing
        if (i + 1) % accumulate == 0 or (i + 1) == nB:
            optimizer.step()
            optimizer.zero_grad()

        # Update running mean of tracked metrics
        mloss = (mloss * i + loss_items) / (i + 1)

    # Update scheduler. This runs after the epoch's optimizer steps: stepping the
    # scheduler before any optimizer.step() shifts the milestones one epoch early
    # on PyTorch >= 1.1 and raises a warning.
    scheduler.step()

    # Calculate mAP
    with torch.no_grad():
        results = rtest.test(CFG, DATA_CFG, batch_size=BATCH_SIZE, img_size=IMG_SIZE, model=model, conf_thres=0.1)

    # Write epoch results: "epoch/last_epoch", the five running-mean training
    # losses, then the five test results.
    # NOTE(review): the five mloss columns are presumably xy, wh, conf, cls, total — confirm
    # against compute_loss.
    s = ('%8s' + '%10.3g' * 5) % ('%g/%g' % (epoch, EPOCHES - 1), *mloss.tolist())
    with open('results.txt', 'a') as file:
        file.write(s + '%11.3g' * 5 % tuple(results) + '\n')  # P, R, mAP, F1, test_loss

    # Log metrics for every epoch, not only for the epochs that improve the best loss
    TBoard.add_scalar('gMetrics/train_loss', loss.item(), epoch)  # loss of the epoch's last batch
    TBoard.add_scalar('gMetrics/P', results[0], epoch)
    TBoard.add_scalar('gMetrics/R', results[1], epoch)
    TBoard.add_scalar('gMetrics/mAP', results[2], epoch)
    TBoard.add_scalar('gMetrics/F1', results[3], epoch)
    TBoard.add_scalar('gMetrics/test_loss', results[4], epoch)

    # Update best loss
    test_loss = results[4]
    is_best = test_loss < best_loss
    if is_best:
        best_loss = test_loss

    # Save training results
    save = True
    if save:
        # Unwrap DataParallel / DistributedDataParallel so the saved state_dict keys
        # do not carry a 'module.' prefix and load into a bare model.
        is_wrapped = isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel))

        # Create checkpoint
        chkpt = {'epoch': epoch,
                 'best_loss': best_loss,
                 'model': model.module.state_dict() if is_wrapped else model.state_dict(),
                 'optimizer': optimizer.state_dict()}

        # Save latest checkpoint
        torch.save(chkpt, latest)

        # Save best checkpoint
        if is_best:
            torch.save(chkpt, best)

        # Save backup every 10 epochs (optional)
        if epoch > 0 and epoch % 10 == 0:
            torch.save(chkpt, os.path.join(weights, 'backup%g.pt' % epoch))

        # Delete checkpoint
        del chkpt

# Flush pending events and release the TensorBoard event file
TBoard.close()
end = time.time()
print('Training finished! using time: {}'.format(end - t))

Start training with Batch_size: 1, Eopch: 270, Batches: 117264


0/269: 212it [01:57,  1.82it/s]