## adapting the below scripts to Places2

Directory: https://github.com/fastai/imagenet-fast/tree/master/imagenet_nv

CIFAR10 training notebook: https://github.com/fastai/imagenet-fast/blob/master/cifar10/cifar10-super-convergence-more-aug.ipynb

Imagenet training script: https://github.com/fastai/imagenet-fast/blob/master/imagenet_nv/main.py


In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [18]:
import argparse, os, shutil, time, warnings, datetime
from pathlib import Path
import numpy as np

from fastai.transforms import *
from fastai.dataset import *
from fastai.fp16 import *
from fastai.conv_learner import *
from pathlib import *

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.distributed as dist
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import pytorch_models
from fp16util import network_to_half, set_grad, copy_in_params


# print(models.cifar10.__dict__)
model_names = sorted(name for name in pytorch_models.__dict__
                     if name.islower() and not name.startswith("__")
                     and callable(pytorch_models.__dict__[name]))

print(model_names)

['bn', 'bnf_resnet50', 'bnz_resnet50', 'conv3x3', 'darknet_50', 'darknet_mini', 'darknet_mini2', 'darknet_mini3', 'darknet_small', 'dpn107', 'dpn131', 'dpn68', 'dpn92', 'dpn98', 'inceptionresnetv2', 'inceptionresnetv2_conc', 'inceptionv4', 'load', 'load_block17', 'load_block35', 'load_block8', 'load_conv2d', 'load_conv2d_nobn', 'load_linear', 'load_mixed_4a_7a', 'load_mixed_5', 'load_mixed_5b', 'load_mixed_6', 'load_mixed_6a', 'load_mixed_7', 'load_mixed_7a', 'pre_resnet101', 'pre_resnet152', 'pre_resnet18', 'pre_resnet34', 'pre_resnet50', 'resnet101', 'resnet152', 'resnet18', 'resnet34', 'resnet50', 'resnet50_3', 'resnext101', 'resnext152', 'resnext18', 'resnext34', 'resnext50', 'se_resnet_101', 'se_resnet_152', 'se_resnet_18', 'se_resnet_34', 'se_resnet_50', 'se_resnet_50_conc', 'se_resnext_101', 'se_resnext_152', 'se_resnext_50', 'test', 'test_block17', 'test_block35', 'test_block8', 'test_conv2d', 'test_conv2d_nobn', 'test_mixed_4a_7a', 'test_mixed_5b', 'test_mixed_6a', 'test_mixed

In [3]:
# Example usage: python run_fastai.py /home/paperspace/ILSVRC/Data/CLS-LOC/ -a resnext_50_32x4d --epochs 1 -j 4 -b 64 --fp16

parser = argparse.ArgumentParser(description='PyTorch Places2 Training')
parser.add_argument('data', metavar='DIR',
                 help='path to dataset')
parser.add_argument('--save-dir', type=str, default=Path.home()/'imagenet_training',
                    help='Directory to save logs and models.')
parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18',
                    choices=model_names,
                    help='model architecture: ' +
                    ' | '.join(model_names) +
                    ' (default: resnet18)')
parser.add_argument('-j', '--workers', default=7, type=int, metavar='N',
                    help='number of data loading workers (default: 7)')
parser.add_argument('--epochs', default=1, type=int, metavar='N',
                    help='number of total epochs to run')
#parser.add_argument('--cycle-len', default=95, type=float, metavar='N',
#                    help='Length of cycle to run')
parser.add_argument('--start-epoch', default=0, type=int, metavar='N',
                    help='manual epoch number (useful on restarts)')
parser.add_argument('-b', '--batch-size', default=256, type=int,
                    metavar='N', help='mini-batch size (default: 256)')
parser.add_argument('--lr', '--learning-rate', default=0.1, type=float,
                    metavar='LR', help='initial learning rate')
parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='momentum')
parser.add_argument('--weight-decay', '--wd', default=1e-4, type=float,
                    metavar='W', help='weight decay (default: 1e-4)')
parser.add_argument('--print-freq', '-p', default=100, type=int,
                    metavar='N', help='print frequency (default: 100 iterations)')
parser.add_argument('--resume', default='', type=str, metavar='PATH',
                    help='path to latest checkpoint (default: none)')
parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true',
                    help='evaluate model on validation set')
parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model')
parser.add_argument('--fp16', action='store_true', help='Run model fp16 mode.')
parser.add_argument('--use-tta', default=True, type=bool, help='Validate model with TTA at the end of traiing.')
parser.add_argument('--train-half', action='store_true', help='Train model on half images. TODO: allow custom epochs and LR')
parser.add_argument('--sz',       default=256, type=int, help='Size of transformed image.')
parser.add_argument('--decay-int', default=30, type=int, help='Decay LR by 10 every decay-int epochs')
#parser.add_argument('--use-clr', default='10,13.68,0.95,0.85', type=str, 
#                    help='div,pct,max_mom,min_mom. Pass in a string delimited by commas. Ex: "20,2,0.95,0.85"')
parser.add_argument('--loss-scale', type=float, default=1,
                    help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
parser.add_argument('--prof', dest='prof', action='store_true', help='Only run a few iters for profiling.')

parser.add_argument('--world-size', default=1, type=int,
                    help='Number of GPUs to use. Can either be manually set ' +
                    'or automatically set by using \'python -m multiproc\'.')
parser.add_argument('--rank', default=0, type=int,
                    help='Used for multi-process training. Can either be manually set ' +
                    'or automatically set by using \'python -m multiproc\'.')


_StoreAction(option_strings=['--rank'], dest='rank', nargs=None, const=None, default=0, type=<class 'int'>, choices=None, help="Used for multi-process training. Can either be manually set or automatically set by using 'python -m multiproc'.", metavar=None)

In [31]:
def get_loaders(traindir, valdir):
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    tensor_tfm = [transforms.ToTensor(), normalize]

    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(args.sz),
            transforms.RandomHorizontalFlip(),
        ] + tensor_tfm))

    train_sampler = (torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(valdir, transforms.Compose([
            transforms.Resize(int(args.sz*1.14)),
            transforms.CenterCrop(args.sz),
        ] + tensor_tfm)),
        batch_size=args.batch_size*2, shuffle=False,
        num_workers=args.workers, pin_memory=False)

    return train_loader,val_loader,train_sampler


# item() is a recent addition, so this helps with backward compatibility.
def to_python_float(t):
    if hasattr(t, 'item'):
        return t.item()
    else:
        return t[0]

    
class data_prefetcher():
    def __init__(self, loader, prefetch=True):
        self.loader,self.prefetch = iter(loader),prefetch
        if prefetch:
            self.stream = torch.cuda.Stream()
            self.preload()

    def preload(self):
        try:
            self.next_input, self.next_target = next(self.loader)
        except StopIteration:
            self.next_input = None
            self.next_target = None
            return
        with torch.cuda.stream(self.stream):
            self.next_input = self.next_input.cuda(async=True)
            self.next_target = self.next_target.cuda(async=True)

    def next(self):
        if not self.prefetch:
            input,target = next(self.loader)
            return input.cuda(async=True),target.cuda(async=True)

        torch.cuda.current_stream().wait_stream(self.stream)
        input = self.next_input
        target = self.next_target
        self.preload()
        return input, target


def train(train_loader, model, criterion, optimizer, epoch):
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    # switch to train mode
    model.train()
    end = time.time()

    prefetcher = data_prefetcher(train_loader, prefetch=True)
    input, target = prefetcher.next()
    i = -1
    while input is not None:
        i += 1

        if args.prof and (i > 200): break
        # measure data loading time
        data_time.update(time.time() - end)

        input_var = Variable(input)
        target_var = Variable(target)

        # compute output
        output = model(input_var)
        loss = criterion(output, target_var)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        if args.distributed:
            reduced_loss = reduce_tensor(loss.data)
            prec1 = reduce_tensor(prec1)
            prec5 = reduce_tensor(prec5)
        else:
            reduced_loss = loss.data

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        loss = loss*args.loss_scale
        # compute gradient and do SGD step

        if args.fp16:
            model.zero_grad()
            loss.backward()
            set_grad(param_copy, list(model.parameters()))

            if args.loss_scale != 1:
                for param in param_copy:
                    param.grad.data = param.grad.data/args.loss_scale

            optimizer.step()
            copy_in_params(model, param_copy)
            torch.cuda.synchronize()
        else:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)

        end = time.time()
        input, target = prefetcher.next()

        if args.rank == 0 and i % args.print_freq == 0 and i > 1:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1, top5=top5))


def validate(val_loader, model, criterion, epoch, start_time):
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    model.eval()
    end = time.time()

    prefetcher = data_prefetcher(val_loader)
    input, target = prefetcher.next()
    i = -1
    while input is not None:
        i += 1

        target = target.cuda(async=True)
        input_var = Variable(input)
        target_var = Variable(target)

        # compute output
        with torch.no_grad():
            output = model(input_var)
            loss = criterion(output, target_var)

        reduced_loss = reduce_tensor(loss.data)

        # measure accuracy and record loss
        prec1, prec5 = accuracy(output.data, target, topk=(1, 5))

        reduced_prec1 = reduce_tensor(prec1)
        reduced_prec5 = reduce_tensor(prec5)

        losses.update(to_python_float(reduced_loss), input.size(0))
        top1.update(to_python_float(prec1), input.size(0))
        top5.update(to_python_float(prec5), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if args.rank == 0 and i % args.print_freq == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                  'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                   i, len(val_loader), batch_time=batch_time, loss=losses,
                   top1=top1, top5=top5))

        input, target = prefetcher.next()

    time_diff = datetime.now()-start_time
    print(f'~~{epoch}\t{float(time_diff.total_seconds() / 3600.0)}\t{top5.avg:.3f}\n')
    print(' * Prec@1 {top1.avg:.3f} Prec@5 {top5.avg:.3f}'.format(top1=top1, top5=top5))

    return top1.avg


def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, f'{args.save_dir}/model_best.pth.tar')


class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch):
    """Sets the learning rate to the initial LR decayed by 10 every few epochs"""
    if   epoch<4 : lr = args.lr/(4-epoch)
    elif epoch<28: lr = args.lr/1
    elif epoch<47: lr = args.lr/10
    elif epoch<57: lr = args.lr/100
    else         : lr = args.lr/1000
    for param_group in optimizer.param_groups: param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k"""
    maxk = max(topk)
    batch_size = target.size(0)

    _, pred = output.topk(maxk, 1, True, True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))

    res = []
    for k in topk:
        correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res


def reduce_tensor(tensor):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.reduce_op.SUM)
    rt /= args.world_size
    return rt



## Resnet block

In [5]:
args_input = [
    'data', 
    '--save-dir', 'training/test1', 
#     '-a', 'resnext29_8_64', 
#     '-j', '6', 
#     '--prof', 
    '-b', '512', 
#     '--sz', '32',
#     '--loss-scale', '128',
    '--fp16',
#     '--epochs', '1',
#     '--use-clr', '10,13.68,0.95,0.85',
#    '--wd', '2e-4',
#    '--lr', '1',
     '--train-half' # With fp16, iterations are so fast this doesn't matter
]

In [6]:
# This is important for speed 
cudnn.benchmark = True

global arg
args = parser.parse_args(args_input); 
args.distributed = args.world_size > 1
print(args)

if args.fp16:
    assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    
#global arg
#args = parser.parse_args(args_input); args
#if args.cycle_len > 1: args.cycle_len = int(args.cycle_len)

Namespace(arch='resnet18', batch_size=512, data='data', decay_int=30, distributed=False, epochs=1, evaluate=False, fp16=True, loss_scale=1, lr=0.1, momentum=0.9, pretrained=False, print_freq=100, prof=False, rank=0, resume='', save_dir='training/test1', start_epoch=0, sz=256, train_half=True, use_tta=True, weight_decay=0.0001, workers=7, world_size=1)


In [7]:
# create model
if args.pretrained: 
    print("=> using pre-trained model '{}'".format(args.arch))
    model = pytorch_models.__dict__[args.arch](pretrained=True)
else: 
    print("=> creating model '{}'".format(args.arch))
    model = pytorch_models.__dict__[args.arch]()

=> creating model 'resnet18'


In [8]:
model = model.cuda()
if args.fp16: model = network_to_half(model)

global param_copy
if args.fp16:
    param_copy = [param.clone().type(torch.cuda.FloatTensor).detach() for param in model.parameters()]
    for param in param_copy: param.requires_grad = True
else: param_copy = list(model.parameters())    


In [9]:
# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss().cuda()
optimizer = torch.optim.SGD(param_copy, args.lr, momentum=args.momentum, weight_decay=args.weight_decay)

In [10]:
best_prec1 = 0
# optionally resume from a checkpoint
if args.resume:
    if os.path.isfile(args.resume):
        checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu))
        args.start_epoch = checkpoint['epoch']
        best_prec1 = checkpoint['best_prec1']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
    else: print("=> no checkpoint found at '{}'".format(args.resume))

In [11]:
traindir = os.path.join(args.data, 'train')
valdir = os.path.join(args.data, 'valid')
train_loader,val_loader,train_sampler = get_loaders(traindir, valdir)

In [12]:
#if args.evaluate: return validate(val_loader, model, criterion, epoch, start_time)
#validate(val_loader, model, criterion, epoch, start_time)

In [13]:
epoch = 1
adjust_learning_rate(optimizer, epoch)

In [14]:
# this is some trick we will try later ...
# in last few epochs, use different image size to train
#if epoch==args.epochs-6:
#    args.sz=288
#    args.batch_size=128
#    train_loader,val_loader,train_sampler,val_sampler = get_loaders(
#        traindir, valdir, use_val_sampler=False, min_scale=0.5)

In [None]:
#torch.cuda.empty_cache()

# Time 0.563 (0.599) : the current 1-batch processing time in sec (running average)
# Data 0.258 (0.225) : the current batch loading time in sec (running average)
# Prec@1 2.344 (1.412) : top 1-category accuracy in % (running average)
# Prec@5 9.961 (5.736) : top 5-category accuracy in % (running average)

start_time = time.time()

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=UserWarning)
    train(train_loader, model, criterion, optimizer, epoch)

Epoch: [1][100/3523]	Time 0.563 (0.599)	Data 0.258 (0.225)	Loss 5.3750 (5.7704)	Prec@1 2.344 (1.412)	Prec@5 9.961 (5.736)
Epoch: [1][200/3523]	Time 0.459 (0.560)	Data 0.151 (0.218)	Loss 5.0352 (5.4744)	Prec@1 4.297 (2.452)	Prec@5 13.867 (9.100)
Epoch: [1][300/3523]	Time 0.312 (0.547)	Data 0.001 (0.216)	Loss 4.7305 (5.2760)	Prec@1 7.422 (3.492)	Prec@5 22.266 (12.089)
Epoch: [1][400/3523]	Time 1.348 (0.541)	Data 1.029 (0.215)	Loss 4.4414 (5.1231)	Prec@1 9.961 (4.436)	Prec@5 28.320 (14.643)
Epoch: [1][500/3523]	Time 0.311 (0.535)	Data 0.001 (0.212)	Loss 4.3633 (4.9987)	Prec@1 8.594 (5.307)	Prec@5 28.320 (16.866)
Epoch: [1][600/3523]	Time 0.315 (0.531)	Data 0.001 (0.210)	Loss 4.2852 (4.8971)	Prec@1 11.914 (6.057)	Prec@5 30.664 (18.747)
Epoch: [1][700/3523]	Time 0.312 (0.529)	Data 0.001 (0.208)	Loss 4.1953 (4.8078)	Prec@1 11.328 (6.803)	Prec@5 32.227 (20.481)
Epoch: [1][800/3523]	Time 0.314 (0.528)	Data 0.002 (0.209)	Loss 4.1250 (4.7270)	Prec@1 11.328 (7.536)	Prec@5 34.375 (22.122)
Epoch: [

In [23]:
datetime.ctime()

AttributeError: module 'datetime' has no attribute 'ctime'

In [32]:
start_time = time.time()

prec1 = validate(val_loader, model, criterion, epoch, start_time)

AttributeError: module 'torch' has no attribute 'no_grad'

In [33]:
print(torch.__version__)
torch.g

0.3.1.post2
