In [1]:
import numpy as np

import os
from os.path import join
import time
import shutil

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.utils.trainer as trainer

from preproc_data import mkdir

# Params

In [2]:
# Run identity: `dataset_name` selects the data folder, `train_name` labels the
# model/submission output folders and is stored as 'arch' in checkpoints.
train_name = 'resnet101_003'
dataset_name = 'preproc'

# SGD hyper-parameters consumed when the optimizer is built.
LR = 1e-4
weight_decay = 1e-4
momentum = 0.9

# Mini-batch size used by both the training and validation loaders.
batch_size = 16

# Loop control: total epochs, validate every `test_iter` epochs,
# print a progress line every `print_freq` batches.
n_epoch = 10000
test_iter = 1
print_freq = 500

# Data loader

In [3]:
# Input and output locations derived from the run parameters.
data_dir = '/workdir/data/{0}/'.format(dataset_name)
model_save_dir = '/workdir/data/models/{0}/{1}/'.format(dataset_name, train_name)
subm_dir = '/workdir/data/submissions/{0}/{1}/'.format(dataset_name, train_name)
mkdir(model_save_dir)
mkdir(subm_dir)

traindir = join(data_dir, 'train')
valdir = join(data_dir, 'val')

# torchvision renamed RandomSizedCrop -> RandomResizedCrop and Scale -> Resize,
# and newer releases dropped the old names entirely. Prefer the current names
# and fall back to the legacy ones so the cell runs on either generation.
_RandomResizedCrop = (getattr(transforms, 'RandomResizedCrop', None)
                      or transforms.RandomSizedCrop)
_Resize = getattr(transforms, 'Resize', None) or transforms.Scale

# Per-channel mean/std applied after ToTensor(); these are the statistics
# documented for torchvision's pretrained models.
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

# Training pipeline: random 224px crop + horizontal flip augmentation,
# reshuffled every epoch.
train_loader = torch.utils.data.DataLoader(
    datasets.ImageFolder(traindir, transforms.Compose([
        _RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=batch_size, shuffle=True,
    num_workers=2, pin_memory=True)

# Validation pipeline: deterministic resize to 256 then centre crop to 224,
# iterated in a fixed order.
val_loader = torch.utils.data.DataLoader(
    datasets.ImageFolder(valdir, transforms.Compose([
        _Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=batch_size, shuffle=False,
    num_workers=2, pin_memory=True)

In [4]:
# Display the class-name -> label-index mapping exposed by the training
# ImageFolder dataset, as a sanity check on the labels the model will see.
train_loader.dataset.class_to_idx

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}

# Model

In [5]:
def resnet50(num_classes=5):
    """Build the fine-tuning setup: pretrained ResNet-50, loss and optimizer.

    Args:
        num_classes: number of output classes for the replacement
            classification head. Defaults to 5, the value this notebook
            originally hard-coded.

    Returns:
        A ``(model, criterion, optimizer)`` tuple. ``model`` is wrapped in
        ``DataParallel`` and lives on the GPU, ``criterion`` is a
        cross-entropy loss on the GPU, and ``optimizer`` is SGD configured
        from the notebook-level ``LR``, ``momentum`` and ``weight_decay``.
    """
    model = models.resnet50(pretrained=True)

    # Swap the final fully-connected layer for one sized to our classes.
    # The input width is read from the existing head instead of hard-coding it.
    model.fc = nn.Linear(model.fc.in_features, num_classes)

    # Move/wrap the model first: the torch.optim docs ask for the optimizer
    # to be constructed only after the model is on its final device.
    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.SGD(model.parameters(), LR,
                                momentum=momentum,
                                weight_decay=weight_decay)

    criterion = nn.CrossEntropyLoss().cuda()

    return model, criterion, optimizer

In [6]:
# Instantiate the network, loss and optimizer used by the training cells below.
model, criterion, optimizer = resnet50()

# Train

In [None]:
def train(train_loader, model, criterion, optimizer, epoch):
    """Run one training epoch, printing running statistics.

    Args:
        train_loader: iterable yielding ``(images, target)`` batches.
        model: network to optimise; it is switched to train mode.
        criterion: loss function called as ``criterion(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used only in the progress line.

    Prints a progress line every ``print_freq`` batches. Returns nothing.
    """
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to train mode
    model.train()

    end = time.time()
    for i, (images, target) in enumerate(train_loader):
        # measure data loading time
        data_time.update(time.time() - end)

        # `async` is a reserved word since Python 3.7; `non_blocking` is the
        # replacement keyword for the same asynchronous host->GPU copy.
        target = target.cuda(non_blocking=True)

        # compute output (tensors are used directly; the Variable wrapper is
        # no longer needed). The images are passed to the model as loaded.
        output = model(images)
        loss = criterion(output, target)

        # measure accuracy and record loss; .item() extracts a Python float
        # whether the tensor is 0-dim or holds a single element.
        n_samples = images.size(0)
        prec1 = accuracy(output.detach(), target, topk=(1,))
        losses.update(loss.item(), n_samples)
        top1.update(prec1[0].item(), n_samples)

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                   epoch, i, len(train_loader), batch_time=batch_time,
                   data_time=data_time, loss=losses, top1=top1))


def validate(val_loader, model, criterion):
    """Evaluate the model on the validation loader.

    Args:
        val_loader: iterable yielding ``(images, target)`` batches.
        model: network to evaluate; it is switched to eval mode.
        criterion: loss function called as ``criterion(output, target)``.

    Returns:
        The sample-weighted average validation loss.

    Prints a progress line every ``print_freq`` batches and a final summary
    with the average top-1 precision and loss.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # switch to evaluate mode
    model.eval()

    end = time.time()
    # `volatile=True` is ignored by current PyTorch, which would leave
    # autograd recording during evaluation; no_grad() is the replacement.
    with torch.no_grad():
        for i, (images, target) in enumerate(val_loader):
            # `async` is a reserved word since Python 3.7; use non_blocking.
            target = target.cuda(non_blocking=True)

            # compute output
            output = model(images)
            loss = criterion(output, target)

            # measure accuracy and record loss; .item() works for both 0-dim
            # and single-element tensors.
            n_samples = images.size(0)
            prec1 = accuracy(output, target, topk=(1,))
            losses.update(loss.item(), n_samples)
            top1.update(prec1[0].item(), n_samples)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            if i % print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                      'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                       i, len(val_loader), batch_time=batch_time, loss=losses,
                       top1=top1))

    print(' * Prec@1 {top1.avg:.3f}\t'
          ' Loss {loss.avg:.4f}'.format(top1=top1, loss=losses))

    return losses.avg


def save_checkpoint(state, is_best, filename=join(model_save_dir, 'checkpoint.pth.tar')):
    """Write `state` to `filename`; when `is_best`, also copy it to
    'model_best.pth.tar' inside `model_save_dir` and report it."""
    torch.save(state, filename)
    if not is_best:
        return
    best_path = join(model_save_dir, 'model_best.pth.tar')
    shutil.copyfile(filename, best_path)
    print('Best model saved')


class AverageMeter(object):
    """Keeps the most recent value together with a running weighted mean."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, mean, weighted sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def adjust_learning_rate(optimizer, epoch, base_lr=None, decay=0.9, step=30):
    """Apply a step-decay schedule to every parameter group.

    The learning rate is ``base_lr * decay ** (epoch // step)``. With the
    defaults this multiplies the rate by 0.9 (a 10% reduction) every 30
    epochs; it is not a division by 10.

    Args:
        optimizer: object exposing ``param_groups``, a list of dicts whose
            ``'lr'`` entry is overwritten.
        epoch: zero-based epoch index.
        base_lr: starting learning rate; ``None`` means the notebook-level
            ``LR`` constant.
        decay: multiplicative factor applied at each step.
        step: number of epochs between decays.
    """
    if base_lr is None:
        base_lr = LR
    lr = base_lr * (decay ** (epoch // step))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr


def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: 2-D tensor of per-class scores, one row per sample.
        target: 1-D tensor of true class indices, one per sample.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one single-element tensor per k, holding the percentage
        of samples whose true class is among the k highest-scoring classes.
    """
    with torch.no_grad():
        maxk = max(topk)
        n_samples = target.size(0)

        # Indices of the maxk best classes per sample, transposed so that
        # row r holds every sample's rank-r prediction.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape (not view): the sliced comparison result may be
            # non-contiguous. keepdim keeps a one-element tensor so callers
            # that index the result with [0] keep working.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / n_samples))
        return res

# Start training

In [None]:
# Baseline: evaluate the freshly built model once before any training.
validate(val_loader, model, criterion)

best_loss = np.inf

for epoch in range(0, n_epoch):
    adjust_learning_rate(optimizer, epoch)

    # train for one epoch
    train(train_loader, model, criterion, optimizer, epoch)

    if epoch % test_iter == 0:
        # evaluate on validation set
        loss = validate(val_loader, model, criterion)

        # Remember the lowest validation loss and save a checkpoint.
        # best_loss is only replaced on a strict improvement: min(loss, best_loss)
        # would return NaN for a NaN loss and permanently prevent any later
        # epoch from being recognised as the best.
        is_best = loss < best_loss
        if is_best:
            best_loss = loss
        save_checkpoint({
            'epoch': epoch + 1,
            'arch': train_name,
            'state_dict': model.state_dict(),
            'best_loss': best_loss,
        }, is_best)

Test: [0/440]	Time 0.631 (0.631)	Loss 2.5915 (2.5915)	Prec@1 0.000 (0.000)
 * Prec@1 41.190	 Loss 1.3948
Epoch: [0][0/1757]	Time 0.196 (0.196)	Data 0.100 (0.100)	Loss 1.2671 (1.2671)	Prec@1 50.000 (50.000)
Epoch: [0][500/1757]	Time 0.221 (0.220)	Data 0.000 (0.000)	Loss 0.7157 (0.8932)	Prec@1 75.000 (72.692)
Epoch: [0][1000/1757]	Time 0.220 (0.220)	Data 0.000 (0.000)	Loss 1.2031 (0.8616)	Prec@1 56.250 (73.258)
Epoch: [0][1500/1757]	Time 0.220 (0.220)	Data 0.000 (0.000)	Loss 0.4703 (0.8522)	Prec@1 93.750 (73.522)
Test: [0/440]	Time 0.142 (0.142)	Loss 2.7675 (2.7675)	Prec@1 0.000 (0.000)
 * Prec@1 72.459	 Loss 0.8367
Best model saved
Epoch: [1][0/1757]	Time 0.244 (0.244)	Data 0.125 (0.125)	Loss 0.3628 (0.3628)	Prec@1 93.750 (93.750)
Epoch: [1][500/1757]	Time 0.220 (0.220)	Data 0.000 (0.000)	Loss 0.9729 (0.8214)	Prec@1 68.750 (73.491)
Epoch: [1][1000/1757]	Time 0.219 (0.220)	Data 0.000 (0.000)	Loss 0.9410 (0.8190)	Prec@1 68.750 (73.514)
Epoch: [1][1500/1757]	Time 0.220 (0.220)	Data 0.000 (

# Load best model

In [9]:
# Read back the checkpoint saved for the best validation loss and show its
# metadata. Note that `state_dict` here is the whole checkpoint dict; the
# model weights sit under its 'state_dict' key.
best_checkpoint_path = join(model_save_dir, 'model_best.pth.tar')
state_dict = torch.load(best_checkpoint_path)
print(*[state_dict[key] for key in ('arch', 'best_loss', 'epoch')])

resnet101_003 0.6747690619238271 22


In [10]:
# Restore the best weights into the live model; the checkpoint dict keeps them
# under its 'state_dict' key.
model.load_state_dict(state_dict['state_dict'])