In [1]:
import time, os, copy, argparse, collections, sys, numpy as np, torch, torchvision, csv
import warnings
warnings.filterwarnings('ignore')

import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
from torchvision import datasets, models, transforms

from anchors import Anchors
from datagen import CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer
from torch.utils.data import Dataset, DataLoader, distributed

# Asserting torch version to be 0.4.x
assert torch.__version__.split('.')[1] == '4'
from tqdm import tqdm
# Importing our custom model file and csv evaluation
import model, csv_eval

import horovod.torch as hvd

print('CUDA available: {}'.format(torch.cuda.is_available()))

class Parser_arg():
    """Plain attribute holder that stands in for command-line arguments.

    Each constructor argument is stored, unchanged, on the instance under
    the same name.

    Args:
        train: stored as ``self.train``.
        classes: stored as ``self.classes``.
        val: stored as ``self.val``.
        savepath: stored as ``self.savepath``.
        steps_per_stats: stored as ``self.steps_per_stats`` (default 100).
        depth: stored as ``self.depth`` (default 34).
        batch_size: stored as ``self.batch_size`` (default 16).
        epochs: stored as ``self.epochs`` (default 50).
        resume: stored as ``self.resume`` (default False).
    """

    def __init__(self, train, classes, val, savepath, steps_per_stats=100,
                 depth=34, batch_size=16, epochs=50, resume=False):
        # Annotation / label inputs.
        self.train, self.classes, self.val = train, classes, val
        # Reporting interval and output location.
        self.steps_per_stats, self.savepath = steps_per_stats, savepath
        # Model and schedule settings.
        self.depth, self.batch_size, self.epochs = depth, batch_size, epochs
        # Checkpoint selector; the default False means nothing is resumed.
        self.resume = resume


CUDA available: True


In [2]:
# Notebook replacement for the command-line arguments of the training run.
parser = Parser_arg(train="data/train/train_annot.csv", classes="data/class_ids.txt",
                    val="data/validation/valid_annot.csv", savepath="models_ao")

# Directory that receives the checkpoints and the training log.
# `exist_ok=True` replaces the former exists()/makedirs() pair: with several
# worker processes starting at once, two of them could both see the directory
# as missing and the slower one would then fail with FileExistsError.
os.makedirs(parser.savepath, exist_ok=True)

# Create the data loaders
# Both annotation inputs are mandatory, so reject a missing one up front.
if parser.train is None:
    raise ValueError('Must provide --train')
if parser.classes is None:
    raise ValueError('Must provide --classes')

print("Preparing the training Dataset")

# Start Horovod and make the GPU with this process' local rank the current device.
hvd.init()
torch.cuda.set_device(hvd.local_rank())

# Extra DataLoader options, applied only when CUDA is available.
if torch.cuda.is_available():
    kwargs = {'num_workers': 4, 'pin_memory': True}
else:
    kwargs = {}

dataset_train = CSVDataset(
    train_file=parser.train,
    class_list=parser.classes,
    transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]),
)

# The sampler is told the Horovod world size and this process' rank.
# (Alternative kept for reference: AspectRatioBasedSampler(dataset_train,
#  batch_size=parser.batch_size, drop_last=False) passed as `batch_sampler`.)
train_sampler = distributed.DistributedSampler(
    dataset_train, num_replicas=hvd.size(), rank=hvd.rank())

print("Preparing the training Dataloader")
dataloader_train = DataLoader(
    dataset_train,
    batch_size=parser.batch_size,
    sampler=train_sampler,
    collate_fn=collater,
    **kwargs)

# Validation data is optional; its sampler and loader exist only when a
# validation annotation file was given.
if parser.val is None:
    dataset_val = None
    print('No validation annotations provided.')
else:
    print("Preparing the validation Dataset")
    dataset_val = CSVDataset(
        train_file=parser.val,
        class_list=parser.classes,
        transform=transforms.Compose([Normalizer(), Resizer()]),
    )
    val_sampler = distributed.DistributedSampler(
        dataset_val, num_replicas=hvd.size(), rank=hvd.rank())
    print("Preparing the validation Dataloader")
    dataloader_val = DataLoader(
        dataset_val,
        batch_size=parser.batch_size,
        sampler=val_sampler,
        collate_fn=collater,
        **kwargs)

print('Num training images: {}'.format(len(dataset_train)))
if parser.val is not None:
    print('Num validation images: {}'.format(len(dataset_val)))

# Create the model
# `start_epoch` stays 0 for a fresh run; when resuming it is the epoch number
# stored in the checkpoint.
start_epoch = 0
if parser.resume:
    print("=> loading checkpoint '{}'".format(parser.resume))
    checkpoint_path = os.path.join(
        parser.savepath, '{}_retinanet_{}.pt'.format(parser.depth, parser.resume))
    # The checkpoint holds the state of a model that lived on a GPU. Without
    # `map_location` torch.load would put every tensor back on the device it
    # was saved from, for every worker. Loading to CPU keeps the restore
    # independent of the saving device; load_state_dict then copies the values
    # into the model's own parameters.
    checkpoint = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)
    start_epoch = checkpoint['epoch']

# Constructor in `model` for each supported backbone depth. Only the selected
# name is looked up on the module.
constructor_by_depth = {
    18: 'resnet18',
    34: 'resnet34',
    50: 'resnet50',
    101: 'resnet101',
    152: 'resnet152',
}
if parser.depth not in constructor_by_depth:
    raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152')
build_retinanet = getattr(model, constructor_by_depth[parser.depth])
retinanet = build_retinanet(num_classes=dataset_train.num_classes(), pretrained=True)

if parser.resume:
    retinanet.load_state_dict(checkpoint['model_state_dict'])
use_gpu = True

if use_gpu:
    retinanet = retinanet.cuda()

# For the MultiGPU training without Horovod:
# retinanet = torch.nn.DataParallel(retinanet, device_ids=range(torch.cuda.device_count()))

retinanet.training = True

# Broadcast parameters from rank 0 to all other processes.
hvd.broadcast_parameters(retinanet.state_dict(), root_rank=0)

# Plain Adam first, then the Horovod wrapper built from it.
optimizer = optim.Adam(retinanet.parameters(), lr=1e-4)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=retinanet.named_parameters())

if parser.resume:
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print("=> loaded checkpoint {}_retinanet_{}.pt".format(parser.depth, parser.resume))

# The scheduler is created last, on the optimizer object the training loop
# actually steps (the Horovod wrapper, after any checkpoint state has been
# restored). It used to be bound to the un-wrapped Adam instance, created
# before wrapping and before load_state_dict, so a learning-rate reduction was
# not guaranteed to reach the optimizer in use.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True)

# Training mode, then the model's own freeze_bn() hook.
retinanet.train()
retinanet.freeze_bn()

# Rolling window of the most recent 500 loss values.
loss_hist = collections.deque(maxlen=500)

Preparing the training Dataset
Preparing the training Dataloader
Preparing the validation Dataset
Preparing the validation Dataloader
Num training images: 1935
Num validation images: 107


In [22]:
# Progress bars are shown on rank 0 only.
verbose = 1 if hvd.rank() == 0 else 0

# NOTE(review): epochs are numbered from start_epoch + 1 and the upper bound is
# exclusive, so a fresh run performs parser.epochs - 1 epochs; confirm intended.
for epoch_num in range(start_epoch+1, parser.epochs):
    # Give the distributed sampler the epoch number so that each epoch draws a
    # different shuffle; without this call the order never changes.
    train_sampler.set_epoch(epoch_num)

    # ---- training pass -----------------------------------------------------
    with tqdm(total=len(dataloader_train), desc='Train Epoch     #{}'.format(epoch_num), disable=not verbose) as t:
        retinanet.train()
        retinanet.freeze_bn()
        epoch_loss, cls_loss_lst, reg_loss_lst = [], [], []
        stime = time.time()
        for iter_num, data in enumerate(dataloader_train):
            try:
                img, annot = data['img'].cuda(), data['annot'].cuda()
                optimizer.zero_grad()
                classification_loss, regression_loss = retinanet([img, annot])
                # Reduce to scalars before converting to Python floats.
                classification_loss = classification_loss.mean()
                regression_loss = regression_loss.mean()
                cls_loss_lst.append(float(classification_loss))
                reg_loss_lst.append(float(regression_loss))
                loss = classification_loss + regression_loss
                # NOTE(review): a worker that skips its step here while the
                # others continue may desynchronise Horovod's gradient
                # exchange; confirm this cannot happen on one rank only.
                if bool(loss == 0):
                    continue
                loss.backward()
                torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)
                optimizer.step()
                loss_hist.append(float(loss))
                epoch_loss.append(float(loss))
                if(iter_num % parser.steps_per_stats == 0):
                    # Append one statistics line to the log, then restart the
                    # per-interval accumulators and timer.
                    st = 'Epoch: {} | Iter: {} | Ela_time: {:1.5f} | Cls_loss: {:1.5f} | Reg_loss: {:1.5f} | Avg_running_loss: {:1.5f}'.format(epoch_num, iter_num, time.time()-stime, np.mean(cls_loss_lst), np.mean(reg_loss_lst), np.mean(loss_hist))
                    with open(os.path.join(parser.savepath, 'train_log.txt'), 'a') as f:
                        f.write(st+"\n")
                    cls_loss_lst, reg_loss_lst, stime = [], [], time.time()

                del classification_loss
                del regression_loss
                t.update(1)
            except Exception as e:
                # Best effort: report the failing batch and move on.
                print(e)
                continue

    # ---- validation pass (optional) ------------------------------------------
    val_loss = []
    if parser.val is not None:
        with tqdm(total=len(dataloader_val), desc='Val Epoch     #{}'.format(epoch_num), disable=not verbose) as t:
            # The model is deliberately not switched to eval(): presumably it
            # only returns the loss pair in training mode (confirm against
            # model.py). Only the numeric loss is needed, so autograd is
            # disabled instead of building a graph for every batch.
            with torch.no_grad():
                for iter_num, data in enumerate(dataloader_val):
                    try:
                        img, annot = data['img'].cuda(), data['annot'].cuda()
                        classification_loss, regression_loss = retinanet([img, annot])
                        val_loss.append(float(classification_loss.mean() + regression_loss.mean()))
                        t.update(1)
                    except Exception as e:
                        print(e)
                        continue
            # Attach the epoch summary while the bar is still open (it used to
            # be set after the bar had been closed, so it was never shown).
            if verbose:
                t.set_postfix({'epoch':epoch_num, 'train_loss': np.mean(epoch_loss),
                               'val_loss': np.mean(val_loss)})

    # ---- end of epoch --------------------------------------------------------
    # These steps run whether or not a validation set exists; they used to be
    # nested under the validation branch, so a run without validation data
    # never adjusted the learning rate and never wrote a checkpoint.
    scheduler.step(np.mean(epoch_loss))

    # Only rank 0 writes the checkpoint so that workers do not overwrite the
    # same file concurrently.
    if hvd.rank() == 0:
        torch.save({'epoch':epoch_num,
                    'model_state_dict':retinanet.state_dict(),
                    'optimizer_state_dict':optimizer.state_dict(),
                    'loss':loss},
                   os.path.join(parser.savepath,'{}_retinanet_{}.pt'.format(parser.depth, epoch_num)))


Train Epoch     #1: 100%|██████████| 121/121 [01:50<00:00,  1.10it/s]
Val Epoch     #1: 100%|██████████| 7/7 [00:03<00:00,  2.37it/s]
Train Epoch     #2: 100%|██████████| 121/121 [01:51<00:00,  1.10it/s]
Val Epoch     #2: 100%|██████████| 7/7 [00:03<00:00,  2.31it/s]
Train Epoch     #3: 100%|██████████| 121/121 [01:51<00:00,  1.10it/s]
Val Epoch     #3: 100%|██████████| 7/7 [00:03<00:00,  2.32it/s]
Train Epoch     #4: 100%|██████████| 121/121 [01:52<00:00,  1.10it/s]
Val Epoch     #4: 100%|██████████| 7/7 [00:03<00:00,  2.28it/s]
Train Epoch     #5: 100%|██████████| 121/121 [01:53<00:00,  1.08it/s]
Val Epoch     #5: 100%|██████████| 7/7 [00:03<00:00,  2.26it/s]
Train Epoch     #6: 100%|██████████| 121/121 [01:51<00:00,  1.10it/s]
Val Epoch     #6: 100%|██████████| 7/7 [00:03<00:00,  2.06it/s]
Train Epoch     #7: 100%|██████████| 121/121 [01:52<00:00,  1.09it/s]
Val Epoch     #7: 100%|██████████| 7/7 [00:03<00:00,  2.15it/s]
Train Epoch     #8: 100%|██████████| 121/121 [01:53<00:00,  1.

KeyboardInterrupt: 

  File "/home/ai/Documents/piyush/pytorch-retinanet/datagen.py", line 89, in __getitem__
    img = self.load_image(idx)
  File "/home/ai/Documents/piyush/pytorch-retinanet/datagen.py", line 98, in load_image
    img = skimage.io.imread(self.image_names[image_index])
  File "/home/ai/anaconda3/lib/python3.6/site-packages/skimage/io/_io.py", line 62, in imread
    img = call_plugin('imread', fname, plugin=plugin, **plugin_args)
  File "/home/ai/anaconda3/lib/python3.6/site-packages/skimage/io/manage_plugins.py", line 214, in call_plugin
    return func(*args, **kwargs)
  File "/home/ai/anaconda3/lib/python3.6/site-packages/skimage/io/_plugins/pil_plugin.py", line 37, in imread
    return pil_to_ndarray(im, dtype=dtype, img_num=img_num)
  File "/home/ai/anaconda3/lib/python3.6/site-packages/skimage/transform/_warps.py", line 169, in resize
    preserve_range=preserve_range)
  File "/home/ai/anaconda3/lib/python3.6/site-packages/skimage/io/_plugins/pil_plugin.py", line 53, in pil_to_ndarra

# Testing Space

In [2]:
# Scratch copy of the training-loader setup, for experimenting in isolation.
parser = Parser_arg(
    train="data/train/train_annot.csv",
    classes="data/class_ids.txt",
    val="data/validation/valid_annot.csv",
    savepath="models_ao",
)

# Start Horovod and make the GPU with this process' local rank the current device.
hvd.init()
torch.cuda.set_device(hvd.local_rank())

# Extra DataLoader options, applied only when CUDA is available.
if torch.cuda.is_available():
    kwargs = {'num_workers': 4, 'pin_memory': True}
else:
    kwargs = {}

dataset_train = CSVDataset(
    train_file=parser.train,
    class_list=parser.classes,
    transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]),
)

# The sampler is told the Horovod world size and this process' rank.
# (Alternative kept for reference: AspectRatioBasedSampler(dataset_train,
#  batch_size=parser.batch_size, drop_last=False) passed as `batch_sampler`.)
train_sampler = distributed.DistributedSampler(
    dataset_train, num_replicas=hvd.size(), rank=hvd.rank())

print("Preparing the training Dataloader")
dataloader_train = DataLoader(
    dataset_train,
    batch_size=parser.batch_size,
    sampler=train_sampler,
    collate_fn=collater,
    **kwargs)

Preparing the training Dataloader


In [29]:
# Scratch check of how a tqdm bar and tqdm.write() output interleave:
# three bars of five half-second ticks, each followed by one written line.
for epoch_num in range(3):
    bar_title = 'Epoch     #{}'.format(epoch_num)
    with tqdm(total=5, desc=bar_title, disable=not verbose) as bar:
        for step in range(5):
            time.sleep(0.5)
            bar.update(1)
        bar.write("Hi")

Epoch     #0: 100%|██████████| 5/5 [00:02<00:00,  1.99it/s]
Epoch     #1:   0%|          | 0/5 [00:00<?, ?it/s]

Hi


Epoch     #1: 100%|██████████| 5/5 [00:02<00:00,  1.99it/s]
Epoch     #2:   0%|          | 0/5 [00:00<?, ?it/s]

Hi


Epoch     #2: 100%|██████████| 5/5 [00:02<00:00,  1.99it/s]

Hi



