# train

In [1]:
from torch.utils.data import DataLoader
import torch.nn as nn
import torchvision
from utils.datasets import LoadDataset
from utils import torch_utils
import time
import rtest
from tqdm import tnrange, tqdm_notebook, tqdm
from models import *
import os
from tqdm import tnrange, tqdm_notebook
import tensorboardX
from prefetch_generator import BackgroundGenerator

In [2]:
# Restrict CUDA to physical GPUs 2-5, then let the project helper choose the device.
os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(gpu_id) for gpu_id in (2, 3, 4, 5))
device = torch_utils.select_device()

Using CUDA device0 _CudaDeviceProperties(name='Tesla P100-PCIE-16GB', total_memory=16280MB)
           device1 _CudaDeviceProperties(name='Tesla P100-PCIE-16GB', total_memory=16280MB)
           device2 _CudaDeviceProperties(name='Tesla P100-PCIE-16GB', total_memory=16280MB)
           device3 _CudaDeviceProperties(name='Tesla P100-PCIE-16GB', total_memory=16280MB)


In [3]:
# Let cuDNN auto-tune convolution algorithms for the input sizes it sees.
torch.backends.cudnn.benchmark = True

# Seed every RNG used by the run from a single constant.
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)
# Training runs on several GPUs (see the DataParallel cell), so seed all CUDA
# devices rather than only the current one. This is a no-op without CUDA.
torch.cuda.manual_seed_all(SEED)

In [4]:
# --- Training hyper-parameters -------------------------------------------
IMG_SIZE = 416
BATCH_SIZE = 64
EPOCHES = 240
START_EPOCH = 0
NUM_WORKERS = 30
FREEZE_BACKBONE = False
FROM_SCRATCH = True

# --- Model / data configuration files ------------------------------------
CFG = 'cfg/yolov3-tiny.cfg'
DATA_CFG = 'cfg/coco.data'

# --- Run name and output locations ---------------------------------------
# The run name carries the current Unix timestamp so each run gets its own log dir.
NAME = 'train_yolo-tiny_%d' % int(time.time())
LOG_PATH = 'logs/'
weights = 'weights/'
latest, best = (os.path.join(weights, fname) for fname in ('latest.pt', 'best.pt'))

# TensorBoard writer logging to LOG_PATH/NAME.
TBoard = tensorboardX.SummaryWriter(log_dir=os.path.join(LOG_PATH, NAME))

In [5]:
# Read the 'train' entry of the data config, then display it as the cell output.
train_path = parse_data_cfg(DATA_CFG)['train']
train_path

'/data/hktxt/e/CV/coco/trainvalno5k.txt'

In [6]:
# Dataset: built from the training list at train_path, with IMG_SIZE passed as img_size.
dataset = LoadDataset(train_path, img_size=IMG_SIZE)

In [7]:
# Model: build Darknet from the CFG file with IMG_SIZE and move it to the selected device.
model = Darknet(CFG, IMG_SIZE).to(device)

In [8]:
# Dataloader over the training dataset, batched with the dataset's own collate_fn.
# NOTE(review): shuffle=False means batches come in dataset order every epoch,
# unless LoadDataset shuffles internally. Confirm this is intended for training.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=dataset.collate_fn,
    pin_memory=True,
)

In [9]:
# Number of batches per epoch (shown as the cell output).
len(dataloader)

1833

In [10]:
# Sanity-check the DataLoader: plot only the first batch to 'train_batch0.jpg', then stop.
for i, (imgs, targets, _, _) in enumerate(dataloader):
    plot_images(imgs=imgs, targets=targets, fname='train_batch0.jpg')
    break

In [10]:
# Initial training state.
cutoff = -1  # backbone reaches to cutoff layer
START_EPOCH = 0
best_loss = float('inf')

# When not training from scratch, load pre-trained darknet backbone weights;
# the file is chosen by whether CFG is a "-tiny" config.
if not FROM_SCRATCH:
    backbone_file = 'yolov3-tiny.conv.15' if '-tiny.cfg' in CFG else 'darknet53.conv.74'
    cutoff = load_darknet_weights(model, weights + backbone_file)

In [11]:
# Wrap the model in DataParallel when more than one GPU is visible.
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    print("Let's use", n_gpus, "GPUs!")
    model = nn.DataParallel(model)
model.to(device);

Let's use 4 GPUs!


In [12]:
# Print a per-parameter summary table of the model (name, gradient, parameters, shape, mu, sigma).
model_info(model)


layer                                     name  gradient   parameters                shape         mu      sigma
    0                   module.0.conv_0.weight      True          432        [16, 3, 3, 3]    0.00273      0.109
    1             module.0.batch_norm_0.weight      True           16                 [16]      0.569      0.303
    2               module.0.batch_norm_0.bias      True           16                 [16]          0          0
    3                   module.2.conv_2.weight      True         4608       [32, 16, 3, 3]   -0.00111     0.0481
    4             module.2.batch_norm_2.weight      True           32                 [32]      0.549      0.305
    5               module.2.batch_norm_2.bias      True           32                 [32]          0          0
    6                   module.4.conv_4.weight      True        18432       [64, 32, 3, 3]   4.86e-05     0.0341
    7             module.4.batch_norm_4.weight      True           64                 [64]     

In [13]:
# Optimizer: SGD with momentum and weight decay, starting at learning rate lr0.
lr0 = 0.001  # initial learning rate
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=lr0,
    momentum=0.9,
    weight_decay=0.0005,
)

# Scheduler: multiply the lr by 0.1 at epochs 218 and 245 (i.e. batches 400k, 450k).
# With EPOCHES = 240 the second milestone is never reached in this run.
scheduler = torch.optim.lr_scheduler.MultiStepLR(
    optimizer,
    milestones=[218, 245],
    gamma=0.1,
    last_epoch=START_EPOCH - 1,
)

In [None]:
# Checkpoint loading.
#   resume=True,  transfer=False -> continue training from weights/latest.pt
#   resume=True,  transfer=True  -> start from yolov3-spp.pt and train only the output layers
#   resume=False                 -> keep the freshly initialised model
resume = True
transfer = False

if resume:  # Load previously saved model
    print('resume. load latest.pt')
    if transfer:  # Transfer learning
        # First-dimension size of the tensors excluded from loading and kept trainable.
        # The original code used the literal 255 for the filter but referenced an
        # undefined name `nf` below, which raised NameError.
        # NOTE(review): presumably the YOLO output width (3 anchors x 85 for COCO); confirm.
        nf = 255
        chkpt = torch.load(weights + 'yolov3-spp.pt', map_location=device)
        # Load every multi-element tensor except those whose first dimension is nf.
        model.load_state_dict({k: v for k, v in chkpt['model'].items() if v.numel() > 1 and v.shape[0] != nf},
                              strict=False)
        # Freeze everything except parameters whose first dimension is nf.
        for p in model.parameters():
            p.requires_grad = p.shape[0] == nf

    else:  # resume from latest.pt
        chkpt = torch.load(latest, map_location=device)  # load checkpoint
        model.load_state_dict(chkpt['model'])

    # Continue from the epoch after the one stored in the checkpoint.
    START_EPOCH = chkpt['epoch'] + 1
    if chkpt['optimizer'] is not None:
        optimizer.load_state_dict(chkpt['optimizer'])
        best_loss = chkpt['best_loss']
    del chkpt


# Start training
train_start = time.time()
nB = len(dataloader)  # number of batches per epoch
n_burnin = min(round(nB / 5 + 1), 1000)  # number of burn-in batches in epoch 0
accumulate = 1  # optimizer steps once every `accumulate` batches
multi_scale = False

# Re-align the scheduler with the epoch we are actually starting from. The
# scheduler was built before any checkpoint was loaded (START_EPOCH was still 0
# then), so without this a resumed run would restart the milestone count from
# zero and the lr decays would fire at the wrong epochs. The scheduler.step()
# at the top of each epoch then advances last_epoch to the current epoch.
scheduler.last_epoch = START_EPOCH - 1

for epoch in range(START_EPOCH, EPOCHES):
    model.train()

    # Update scheduler
    scheduler.step()

    # Running mean of the 5 loss items returned by compute_loss
    # (presumably xy, wh, conf, cls, total -- confirm against compute_loss).
    mloss = torch.zeros(5).to(device)
    with tqdm(enumerate(BackgroundGenerator(dataloader)), total=len(dataloader)) as pbar:
        start_time = time.time()
        for i, (imgs, targets, _, _) in pbar:
            imgs = imgs.to(device)
            targets = targets.to(device)
            nt = len(targets)

            # SGD burn-in: ramp the lr from 0 to lr0 over the first n_burnin batches
            if epoch == 0 and i <= n_burnin:
                lr = lr0 * (i / n_burnin) ** 4
                for x in optimizer.param_groups:
                    x['lr'] = lr

            # Time spent fetching and preparing this batch (positive seconds)
            prepare_time = time.time() - start_time

            # Run model
            pred = model(imgs)

            # Build targets
            target_list = build_targets(model, targets)

            # Compute loss
            loss, loss_items = compute_loss(pred, target_list)

            # Compute gradient
            loss.backward()

            # Accumulate gradient for x batches before optimizing
            if (i + 1) % accumulate == 0 or (i + 1) == nB:
                optimizer.step()
                optimizer.zero_grad()

            # Update running mean of tracked metrics
            mloss = (mloss * i + loss_items) / (i + 1)

            # Batch summary line; the last one of the epoch is written to results.txt
            s = ('%8s%12s' + '%10.3g' * 7) % (
                '%g/%g' % (epoch, EPOCHES - 1),
                '%g/%g' % (i, nB - 1), *mloss, nt, time.time() - train_start)

            # Compute efficiency = share of the batch wall time spent on
            # forward/backward/step rather than waiting for data.
            process_time = time.time() - start_time - prepare_time
            pbar.set_description("Compute efficiency:{:.2f}, epoch:{}/{}".format(
                process_time / (process_time + prepare_time), epoch + 1, EPOCHES))
            start_time = time.time()
            pbar.set_postfix(loss=loss.item())

    # Calculate mAP (evaluation only runs from epoch 30 onwards)
    if epoch >= 30:
        with torch.no_grad():
            results = rtest.test(CFG, DATA_CFG, batch_size=BATCH_SIZE, img_size=IMG_SIZE, model=model, conf_thres=0.1)

        # train_loss is the loss of the last batch of the epoch
        TBoard.add_scalar('gMetrics/train_loss', loss.item(), epoch)
        TBoard.add_scalar('gMetrics/P', results[0], epoch)
        TBoard.add_scalar('gMetrics/R', results[1], epoch)
        TBoard.add_scalar('gMetrics/mAP', results[2], epoch)
        TBoard.add_scalar('gMetrics/F1', results[3], epoch)
        TBoard.add_scalar('gMetrics/test_loss', results[4], epoch)

        # Write epoch results
        with open('results.txt', 'a') as file:
            file.write(s + '%11.3g' * 5 % results + '\n')  # P, R, mAP, F1, test_loss

        # Update best loss
        test_loss = results[4]
        if test_loss < best_loss:
            best_loss = test_loss

    # Save training results (same epoch >= 30 gate, so test_loss is defined here)
    save = True
    if save and epoch >= 30:
        # Create checkpoint. Only DistributedDataParallel is unwrapped, so a
        # DataParallel model is saved with its "module."-prefixed keys; the
        # resume path loads them back into the same wrapped model.
        chkpt = {'epoch': epoch,
                 'best_loss': best_loss,
                 'model': model.module.state_dict() if type(
                     model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                 'optimizer': optimizer.state_dict()}

        # Save latest checkpoint
        torch.save(chkpt, latest)

        # Save best checkpoint
        if best_loss == test_loss:
            torch.save(chkpt, best)

        # Save backup every 10 epochs (optional)
        if epoch > 0 and epoch % 10 == 0:
            torch.save(chkpt, weights + 'backup%g.pt' % epoch)

        # Delete checkpoint
        del chkpt
TBoard.close()
end = time.time()
print('Training finished! using time: {}'.format(end - train_start))

resume. load latest.pt


Compute efficiency:1.00, epoch:79/240: 100%|██████████| 1833/1833 [20:43<00:00,  1.17it/s, loss=1.03]
100%|██████████| 78/78 [01:28<00:00,  1.02s/it]
Compute efficiency:0.87, epoch:80/240: 100%|██████████| 1833/1833 [20:57<00:00,  5.05it/s, loss=0.962]
100%|██████████| 78/78 [01:29<00:00,  1.53it/s]
Compute efficiency:0.95, epoch:81/240: 100%|██████████| 1833/1833 [20:18<00:00,  4.03it/s, loss=0.921]
100%|██████████| 78/78 [01:21<00:00,  1.69it/s]
Compute efficiency:0.94, epoch:82/240: 100%|██████████| 1833/1833 [20:12<00:00,  4.93it/s, loss=0.927]
100%|██████████| 78/78 [01:14<00:00,  1.94it/s]
Compute efficiency:0.93, epoch:83/240: 100%|██████████| 1833/1833 [20:18<00:00,  6.01it/s, loss=0.956]
100%|██████████| 78/78 [01:14<00:00,  1.62it/s]
Compute efficiency:0.95, epoch:84/240: 100%|██████████| 1833/1833 [20:35<00:00,  4.64it/s, loss=1.03]
100%|██████████| 78/78 [01:23<00:00,  2.10it/s]
Compute efficiency:0.94, epoch:85/240: 100%|██████████| 1833/1833 [20:50<00:00,  4.94it/s, loss=