In [None]:
import argparse

import torch.distributed as dist
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

import test  # import test.py to get mAP after each epoch
from models import *
from utils.datasets import *
from utils.utils import *

from mymodel import *

# Checkpoint and log locations (all checkpoints live under the weights dir).
wdir = 'weights' + os.sep
last, best, test_best = (wdir + ckpt_name for ckpt_name in ('last.pt', 'best.pt', 'test_best.pt'))
results_file = 'results.txt'

# Hyperparameters (results68: 59.9 mAP@0.5 yolov3-spp-416) https://github.com/ultralytics/yolov3/issues/310
# Key order matters: the optional hyp*.txt override below assigns values positionally.
hyp = dict(
    giou=3.54,  # giou loss gain
    cls=37.4,  # cls loss gain
    cls_pw=1.0,  # cls BCELoss positive_weight
    obj=64.3,  # obj loss gain (*=img_size/320 if img_size != 320)
    obj_pw=1.0,  # obj BCELoss positive_weight
    iou_t=0.225,  # iou training threshold
    lr0=0.01,  # initial learning rate (SGD=5E-3, Adam=5E-4)
    lrf=-4.,  # final LambdaLR learning rate = lr0 * (10 ** lrf)
    momentum=0.937,  # SGD momentum
    weight_decay=0.000484,  # optimizer weight decay
    fl_gamma=0.5,  # focal loss gamma
    hsv_h=0.0138,  # image HSV-Hue augmentation (fraction)
    hsv_s=0.678,  # image HSV-Saturation augmentation (fraction)
    hsv_v=0.36,  # image HSV-Value augmentation (fraction)
    degrees=1.98,  # image rotation (+/- deg)
    translate=0.05,  # image translation (+/- fraction)
    scale=0.05,  # image scale (+/- gain)
    shear=0.641,  # image shear (+/- deg)
)

# Optional override: if any hyp*.txt exists in the working directory, the first match
# returned by glob replaces the defaults, value by value, in dict order.
hyp_files = glob.glob('hyp*.txt')
if hyp_files:
    print('Using %s' % hyp_files[0])
    hyp_values = np.loadtxt(hyp_files[0])
    for hyp_name, hyp_value in zip(list(hyp), hyp_values):
        hyp[hyp_name] = hyp_value

In [None]:
# Training options. Defaults are used when the notebook is run without command-line
# arguments; parse_known_args() is used so unrecognised argv entries do not abort parsing.
parser = argparse.ArgumentParser()
parser.add_argument('--epochs', type=int, default=5)  # 500200 batches at bs 16, 117263 COCO images = 273 epochs
parser.add_argument('--batch-size', type=int, default=16)  # effective bs = batch_size * accumulate = 16 * 4 = 64
parser.add_argument('--accumulate', type=int, default=4, help='batches to accumulate before optimizing')
parser.add_argument('--cfg', type=str, default='cfg/yolov3-tiny-1cls_1.cfg', help='*.cfg path')
parser.add_argument('--data', type=str, default='data/coco2017.data', help='*.data path')
# argparse %-formats help strings when rendering them, so literal percent signs must be doubled.
parser.add_argument('--multi-scale', action='store_true', help='adjust (67%% - 150%%) img_size every 10 batches')
parser.add_argument('--img-size', nargs='+', type=int, default=[320], help='train and test image-sizes')
parser.add_argument('--rect', action='store_true', help='rectangular training')
parser.add_argument('--resume', action='store_true', help='resume training from last.pt')
parser.add_argument('--nosave', action='store_true', help='only save final checkpoint')
parser.add_argument('--notest', action='store_true', help='only test final epoch')
parser.add_argument('--evolve', action='store_true', help='evolve hyperparameters')
parser.add_argument('--bucket', type=str, default='', help='gsutil bucket')
parser.add_argument('--cache-images', action='store_true', help='cache images for faster training')
parser.add_argument('--weights', type=str, default='', help='initial weights path')
parser.add_argument('--arc', type=str, default='default', help='yolo architecture')  # default, uCE, uBCE
parser.add_argument('--name', default='', help='renames results.txt to results_name.txt if supplied')
parser.add_argument('--device', default='1', help='device id (i.e. 0 or 0,1 or cpu)')
parser.add_argument('--adam', action='store_true', help='use adam optimizer')
parser.add_argument('--single-cls', action='store_true', help='train as single-class dataset')
parser.add_argument('--var', type=float, help='debug variable')
opt = parser.parse_known_args()[0]  # keep the parsed namespace, drop the list of unknown args
opt.weights = last if opt.resume else opt.weights  # --resume always restarts from last.pt
print(opt)
print(opt.weights)
device = torch_utils.select_device(opt.device, batch_size=opt.batch_size)
print(device)

# TensorBoard logging is optional: tb_writer stays None when evolving hyperparameters
# or when a SummaryWriter cannot be created, and later code checks `if tb_writer:`.
tb_writer = None
if not opt.evolve:  # Train normally
    try:
        # Start Tensorboard with "tensorboard --logdir=runs", view at http://localhost:6006/
        from torch.utils.tensorboard import SummaryWriter

        tb_writer = SummaryWriter()
    except Exception as e:
        # Still best-effort, but report the reason instead of hiding it, and let
        # KeyboardInterrupt / SystemExit propagate (a bare `except:` swallowed those too).
        print('WARNING: TensorBoard logging disabled (%s: %s)' % (type(e).__name__, e))

# Short module-level aliases for the options used throughout the remaining cells.
cfg, data = opt.cfg, opt.data
weights = opt.weights  # initial training weights
epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
batch_size, accumulate = opt.batch_size, opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64

# Two --img-size values are taken as (train, test); otherwise the list is doubled so that
# a single value serves as both train and test size.
if len(opt.img_size) == 2:
    img_size, img_size_test = opt.img_size
else:
    img_size, img_size_test = opt.img_size * 2

# Initialize
init_seeds()
if opt.multi_scale:
    # Bounds are kept in units of 32 pixels; the training loop samples between them.
    img_sz_min = round(img_size / 32 / 1.5)
    img_sz_max = round(img_size / 32 * 1.5)
    img_size = 32 * img_sz_max  # initiate with maximum multi_scale size
    print('Using multi-scale %g - %g' % (32 * img_sz_min, img_size))

# Configure run: dataset locations and class count are hard-coded here instead of being
# read from the *.data file (the parse_data_cfg(data) call is left disabled).
train_path = '../DAC-SDC2021/dataset/data_training'
test_path = '../DAC-SDC2021/dataset/sample'
nc = 1

# Remove previous results (batch plots and the results log) before a new run
stale_outputs = glob.glob('*_batch*.png') + glob.glob(results_file)
for f in stale_outputs:
    os.remove(f)

In [None]:
# Instantiate the network, then move it to the device selected earlier.
model = UltraNet_Bypass()
model = model.to(device)

In [None]:
# Sort the model parameters into three optimizer groups by parameter name:
#   pg2 - names containing '.bias'
#   pg1 - names containing 'Conv2d.weight' (the group that later receives weight decay)
#   pg0 - everything else
# NOTE(review): the 'Conv2d.weight' match only fires if the model names its conv
# submodules 'Conv2d' - confirm against UltraNet_Bypass; otherwise pg1 stays empty.
pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
for param_name, param in model.named_parameters():  # e.g. layers.0.weight, layers.1.bias
    if '.bias' in param_name:
        pg2.append(param)
    elif 'Conv2d.weight' in param_name:
        pg1.append(param)
    else:
        pg0.append(param)

In [None]:
# Build the optimizer on pg0 first, then append pg1 and pg2 as extra groups so that the
# groups are indexed 0 (other), 1 (weight-decayed) and 2 (biases) from here on.
if not opt.adam:
    optimizer = optim.SGD(pg0, lr=hyp['lr0'], momentum=hyp['momentum'], nesterov=True)
else:
    # lr0 is used unchanged for Adam (hint from the hyp table: SGD=5E-3, Adam=5E-4)
    optimizer = optim.Adam(pg0, lr=hyp['lr0'])
optimizer.add_param_group(dict(params=pg1, weight_decay=hyp['weight_decay']))  # pg1 with weight_decay
optimizer.add_param_group(dict(params=pg2))  # pg2 (biases)
optimizer.param_groups[2]['lr'] *= 2.0  # bias lr
del pg0, pg1, pg2  # the optimizer now holds the parameter references

In [None]:
# Training state defaults; optionally overwritten from a checkpoint below.
start_epoch = 0
best_fitness = 0.0
test_best_iou = 0.0
if weights.endswith('.pt'):  # pytorch format
    # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
    print("load yolo-weights...")
    # NOTE: torch.load unpickles the file - only load checkpoints from trusted sources.
    chkpt = torch.load(weights, map_location=device)

    # load model
    try:
        # Keep only tensors whose element count matches the current model; a checkpoint
        # key that the model does not have raises KeyError and is reported below.
        model_state = model.state_dict()
        chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model_state[k].numel() == v.numel()}
        # Load the filtered state dict itself. Passing the whole checkpoint dict with
        # strict=False matches no parameter names and silently loads nothing.
        model.load_state_dict(chkpt['model'], strict=False)
        print("successfully load model")
    except KeyError as e:
        s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " % (opt.weights, opt.cfg, opt.weights)
        raise KeyError(s) from e

    # load optimizer (checkpoints written at the final epoch store None here)
    if chkpt['optimizer'] is not None:
        optimizer.load_state_dict(chkpt['optimizer'])
        best_fitness = chkpt['best_fitness']
        print("successfully load optimizer")

    # load results
    if chkpt.get('training_results') is not None:
        with open(results_file, 'w') as file:
            file.write(chkpt['training_results'])  # write results.txt
        print("successfully load results")
        print(chkpt['training_results'])

    # The stored epoch counter is not restored, so training always starts at epoch 0:
    #start_epoch = chkpt['epoch'] + 1
    del chkpt

elif len(weights) > 0:  # darknet format
    # possible weights are '*.weights', 'yolov3-tiny.conv.15',  'darknet53.conv.74' etc.
    load_darknet_weights(model, weights)

In [None]:
def lf(x):
    """Cosine learning-rate multiplier: 1.0 at x=0, 0.01 at x=epochs.

    cosine https://arxiv.org/pdf/1812.01187.pdf
    """
    return (1 + math.cos(x * math.pi / epochs)) / 2 * 0.99 + 0.01


scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
scheduler.last_epoch = start_epoch

# Training set (augmented)
dataset = LoadImagesAndLabels(
    train_path, img_size, batch_size,
    augment=True,
    hyp=hyp,  # augmentation hyperparameters
    rect=opt.rect,  # rectangular training
    cache_images=opt.cache_images,
    single_cls=opt.single_cls,
)

# Never ask for a batch larger than the dataset; cap workers at 8 and use none for bs 1.
batch_size = min(batch_size, len(dataset))
nw = min(os.cpu_count(), batch_size if batch_size > 1 else 0, 8)  # number of workers

dataloader = torch.utils.data.DataLoader(
    dataset,
    batch_size=batch_size,
    num_workers=nw,
    shuffle=not opt.rect,  # Shuffle=True unless rectangular training is used
    pin_memory=True,
    collate_fn=dataset.collate_fn,
)

# Test set: no augmentation, twice the training batch size, same collate function.
test_dataset = LoadImagesAndLabels(
    test_path, img_size_test, batch_size * 2,
    hyp=hyp,
    rect=False,
    cache_images=opt.cache_images,
    single_cls=opt.single_cls,
)
testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size * 2,
    num_workers=nw,
    pin_memory=True,
    collate_fn=dataset.collate_fn,
)

In [None]:
# Bookkeeping read by the training loop
nb = len(dataloader)  # number of batches per epoch
prebias = (start_epoch == 0)  # prebias settings are only applied when starting from epoch 0
maps = np.zeros(nc)  # mAP per class
results = (0,) * 7  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'

# Attributes attached to the model object
model.nc = nc  # number of classes
model.arc = opt.arc  # yolo architecture
model.hyp = hyp  # hyperparameters
model.class_weights = labels_to_class_weights(dataset.labels, nc).to(device)  # class weights

# torch.autograd.set_detect_anomaly(True)
t0 = time.time()  # start time for the final "epochs completed in ... hours" report
torch_utils.model_info(model, report='summary')  # 'full' or 'summary'
print('Using %g dataloader workers' % nw)
print('Starting training for %g epochs...' % epochs)

In [None]:
# Main training loop. Each epoch: optional prebias settings for the bias group, one pass
# over the training loader with gradient accumulation, a scheduler step, evaluation via
# test.test, logging to results.txt / TensorBoard, and checkpointing (last / best / test_best).
for epoch in range(start_epoch, epochs):  # epoch ------------------------------------------------------------------
    model.train()
    model.gr = 1 - (1 + math.cos(min(epoch * 2, epochs) * math.pi / epochs)) / 2  # GIoU <-> 1.0 loss ratio

    # Prebias
    if prebias:
        ne = max(round(30 / nb), 3)  # number of prebias epochs
        ps = np.interp(epoch, [0, ne], [0.1, hyp['lr0'] * 2]), \
                np.interp(epoch, [0, ne], [0.9, hyp['momentum']])  # prebias settings (lr=0.1, momentum=0.9)
        if epoch == ne:
            # print_model_biases(model)
            prebias = False

        # Bias optimizer settings
        optimizer.param_groups[2]['lr'] = ps[0]
        if optimizer.param_groups[2].get('momentum') is not None:  # for SGD but not Adam
            optimizer.param_groups[2]['momentum'] = ps[1]

    mloss = torch.zeros(4).to(device)  # mean losses
    nonfinite_loss = False  # set when the loss diverges so the epoch loop can stop as well
    print(('\n' + '%10s' * 8) % ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'total', 'targets', 'img_size'))
    pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
    for i, (imgs, targets, paths, _) in pbar:  # batch -------------------------------------------------------------
        ni = i + nb * epoch  # number integrated batches (since train start)
        imgs = imgs.to(device).float() / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
        targets = targets.to(device)

        # Disabled: plot the first training batch (and log it to TensorBoard).
        # if ni < 1:
        #     f = 'train_batch%g.png' % i  # filename
        #     plot_images(imgs=imgs, targets=targets, paths=paths, fname=f)
        #     if tb_writer:
        #         tb_writer.add_image(f, cv2.imread(f)[:, :, ::-1], dataformats='HWC')

        # Multi-Scale training
        if opt.multi_scale:
            if ni / accumulate % 1 == 0:  # adjust img_size (67% - 150%) whenever ni is a multiple of accumulate
                img_size = random.randrange(img_sz_min, img_sz_max + 1) * 32
            sf = img_size / max(imgs.shape[2:])  # scale factor
            if sf != 1:
                ns = [math.ceil(x * sf / 32.) * 32 for x in imgs.shape[2:]]  # new shape (stretched to 32-multiple)
                imgs = F.interpolate(imgs, size=ns, mode='bilinear', align_corners=False)

        # Run model
        pred = model(imgs)

        # Compute loss
        loss, loss_items = compute_loss(pred, targets, model)
        if not torch.isfinite(loss):
            # Stop here: back-propagating a NaN/inf loss would corrupt the weights.
            # (A `return` is not possible at notebook top level, so flag and break.)
            print('WARNING: non-finite loss, ending training ', loss_items)
            nonfinite_loss = True
            break

        # Scale loss by nominal batch_size of 64
        loss *= batch_size / 64

        loss.backward()

        # Optimize accumulated gradient
        if ni % accumulate == 0:
            optimizer.step()
            optimizer.zero_grad()

        # Print batch results
        mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
        mem = '%.3gG' % (torch.cuda.memory_cached() / 1E9 if torch.cuda.is_available() else 0)  # (GB)
        s = ('%10s' * 2 + '%10.3g' * 6) % ('%g/%g' % (epoch, epochs - 1), mem, *mloss, len(targets), img_size)
        pbar.set_description(s)

    if nonfinite_loss:
        break  # skip scheduler step, evaluation and checkpointing for the diverged epoch

    scheduler.step()
    final_epoch = epoch + 1 == epochs
    if not opt.notest or final_epoch:  # Calculate mAP
        is_coco = any([x in data for x in ['coco.data', 'coco2014.data', 'coco2017.data']]) and model.nc == 80
        results = test.test(cfg,
                            data,
                            batch_size=batch_size * 2,
                            img_size=img_size_test,
                            model=model,
                            conf_thres=0.001,  # 0.001 if opt.evolve or (final_epoch and is_coco) else 0.01,
                            iou_thres=0.6,
                            save_json=final_epoch and is_coco,
                            single_cls=opt.single_cls,
                            dataloader=testloader)
    # Append the last progress-bar line plus the evaluation results to results.txt
    with open(results_file, 'a') as f:
        f.write(s + '%10.3g' * len(results) % results + '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
    if len(opt.name) and opt.bucket:
        os.system('gsutil cp results.txt gs://%s/results/results%s.txt' % (opt.bucket, opt.name))

    if tb_writer:
        x = list(mloss) + list(results)
        titles = ['GIoU', 'Objectness', 'Classification', 'Train loss',
                  'iou', 'loss', 'Giou loss', 'obj loss']
        for xi, title in zip(x, titles):  # zip stops at the shorter of values / titles
            tb_writer.add_scalar(title, xi, epoch)

    fi = fitness(np.array(results).reshape(1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
    if fi > best_fitness:
        best_fitness = fi

    # The first entry of results is tracked separately as the "test iou" for test_best.pt
    test_iou = results[0]
    if test_iou > test_best_iou:
        test_best_iou = test_iou

    # Save training results
    save = (not opt.nosave) or (final_epoch and not opt.evolve)

    if save:
        with open(results_file, 'r') as f:
            # Create checkpoint
            chkpt = {'epoch': epoch,
                     'best_fitness': best_fitness,
                     'training_results': f.read(),
                     'model': model.module.state_dict() if type(
                             model) is nn.parallel.DistributedDataParallel else model.state_dict(),
                     'optimizer': None if final_epoch else optimizer.state_dict()}

        # Save last checkpoint
        torch.save(chkpt, last)

        # Save best checkpoint
        if best_fitness == fi:
            torch.save(chkpt, best)

        if test_iou == test_best_iou:
            torch.save(chkpt, test_best)

        # Save backup every 10 epochs (optional)
        # if epoch > 0 and epoch % 10 == 0:
        #     torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

        # Delete checkpoint
        del chkpt

In [None]:
# Post-training housekeeping: optionally rename the results / checkpoint files with the
# --name suffix, upload them to the gsutil bucket, report timing and release GPU memory.
n = opt.name
if len(n):
    n = '_' + n if not n.isnumeric() else n
    fresults, flast, fbest = 'results%s.txt' % n, 'last%s.pt' % n, 'best%s.pt' % n
    os.rename('results.txt', fresults)
    if os.path.exists(wdir + 'last.pt'):
        os.rename(wdir + 'last.pt', wdir + flast)
    if os.path.exists(wdir + 'best.pt'):
        os.rename(wdir + 'best.pt', wdir + fbest)
    if opt.bucket:  # save to cloud
        os.system('gsutil cp %s gs://%s/results' % (fresults, opt.bucket))
        os.system('gsutil cp %s gs://%s/weights' % (wdir + flast, opt.bucket))
        # os.system('gsutil cp %s gs://%s/weights' % (wdir + fbest, opt.bucket))

#if not opt.evolve:
#    plot_results()  # save as results.png
print('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))
# Only tear down a process group that actually exists: nothing in this file initialises
# one, and destroy_process_group() raises when called without an initialised group.
if dist.is_available() and dist.is_initialized():
    dist.destroy_process_group()
torch.cuda.empty_cache()

print(results)