In [1]:
import math
import argparse
import os
import shutil
import time
import numpy as np
import sys
from collections import OrderedDict

import tensorflow as tf

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.autograd import Variable
from torch.autograd import Function

# used for logging to TensorBoard
from tensorboard_logger import configure, log_value

  from ._conv import register_converters as _register_converters


In [2]:
class Bottleneck(nn.Module):
    """DenseNet bottleneck layer: BN-ReLU-Conv(1x1) then BN-ReLU-Conv(3x3).

    The 1x1 convolution maps the input to ``4 * growthRate`` channels, the
    3x3 convolution reduces that to ``growthRate`` new feature maps, and the
    new maps are concatenated onto the input along the channel axis.

    Args:
        nChannels: number of input channels.
        growthRate: number of feature maps this layer adds.
    """

    def __init__(self, nChannels, growthRate):
        super(Bottleneck, self).__init__()
        hidden = 4 * growthRate
        # Registration order (bn1, conv1, bn2, conv2) is kept so parameter
        # ordering and state_dict keys are unchanged.
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, hidden, kernel_size=1, bias=False)
        self.bn2 = nn.BatchNorm2d(hidden)
        self.conv2 = nn.Conv2d(hidden, growthRate, kernel_size=3, padding=1,
                               bias=False)

    def forward(self, x):
        """Return ``x`` with ``growthRate`` freshly computed channels appended."""
        features = self.bn1(x)
        features = self.conv1(F.relu(features))
        features = self.bn2(features)
        features = self.conv2(F.relu(features))
        return torch.cat((x, features), 1)

class SingleLayer(nn.Module):
    """Plain DenseNet layer: BN-ReLU-Conv(3x3), concatenated with its input.

    Args:
        nChannels: number of input channels.
        growthRate: number of feature maps this layer adds.
    """

    def __init__(self, nChannels, growthRate):
        super(SingleLayer, self).__init__()
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, growthRate, kernel_size=3, padding=1,
                               bias=False)

    def forward(self, x):
        """Return ``x`` with ``growthRate`` freshly computed channels appended."""
        activated = F.relu(self.bn1(x))
        new_maps = self.conv1(activated)
        return torch.cat((x, new_maps), 1)

class Transition(nn.Module):
    """Transition between dense blocks: BN-ReLU-Conv(1x1) then 2x2 average pooling.

    Args:
        nChannels: number of input channels.
        nOutChannels: number of channels produced by the 1x1 convolution.
    """

    def __init__(self, nChannels, nOutChannels):
        super(Transition, self).__init__()
        self.bn1 = nn.BatchNorm2d(nChannels)
        self.conv1 = nn.Conv2d(nChannels, nOutChannels, kernel_size=1,
                               bias=False)

    def forward(self, x):
        """Change the channel count to ``nOutChannels`` and halve the spatial size."""
        activated = F.relu(self.bn1(x))
        projected = self.conv1(activated)
        return F.avg_pool2d(projected, 2)


class DenseNet(nn.Module):
    """DenseNet with three dense blocks for single-channel images.

    Layout: Conv(3x3) -> dense1 -> trans1 -> dense2 -> trans2 -> dense3 ->
    BN-ReLU -> 7x7 average pool -> Linear -> log-softmax.

    Args:
        growthRate: feature maps added by every dense layer.
        depth: nominal network depth; each dense block gets
            ``(depth - 4) // 3`` layers (halved again when ``bottleneck``).
        reduction: channel multiplier applied at each transition
            (the result is floored to an int).
        nClasses: number of output classes.
        bottleneck: use ``Bottleneck`` layers if truthy, else ``SingleLayer``.
    """

    def __init__(self, growthRate, depth, reduction, nClasses, bottleneck):
        super(DenseNet, self).__init__()

        nDenseBlocks = (depth-4) // 3
        if bottleneck:
            # A bottleneck layer holds two convolutions, so half as many fit.
            nDenseBlocks //= 2

        nChannels = 2*growthRate
        # The stem takes 1 input channel (grayscale images).
        self.conv1 = nn.Conv2d(1, nChannels, kernel_size=3, padding=1,
                               bias=False)
        self.dense1 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks*growthRate
        nOutChannels = int(math.floor(nChannels*reduction))
        self.trans1 = Transition(nChannels, nOutChannels)

        nChannels = nOutChannels
        self.dense2 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks*growthRate
        nOutChannels = int(math.floor(nChannels*reduction))
        self.trans2 = Transition(nChannels, nOutChannels)

        nChannels = nOutChannels
        self.dense3 = self._make_dense(nChannels, growthRate, nDenseBlocks, bottleneck)
        nChannels += nDenseBlocks*growthRate

        self.bn1 = nn.BatchNorm2d(nChannels)
        self.fc = nn.Linear(nChannels, nClasses)

        # Weight init: conv weights ~ N(0, sqrt(2 / (kh * kw * out_channels))),
        # batch-norm scale 1 / shift 0, linear bias 0.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.bias.data.zero_()

    def _make_dense(self, nChannels, growthRate, nDenseBlocks, bottleneck):
        """Build one dense block of ``nDenseBlocks`` layers as an ``nn.Sequential``.

        Every layer receives ``growthRate`` more input channels than the one
        before it, because each layer concatenates its output onto its input.
        """
        layer_cls = Bottleneck if bottleneck else SingleLayer
        layers = []
        for _ in range(int(nDenseBlocks)):
            layers.append(layer_cls(nChannels, growthRate))
            nChannels += growthRate
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, nClasses)``."""
        out = self.conv1(x)
        out = self.trans1(self.dense1(out))
        out = self.trans2(self.dense2(out))
        out = self.dense3(out)
        out = F.avg_pool2d(F.relu(self.bn1(out)), 7)
        # Flatten everything except the batch axis. A bare torch.squeeze()
        # would also remove the batch axis when the batch holds one sample.
        out = out.reshape(out.size(0), -1)
        # Explicit dim: the implicit-dim form of log_softmax is deprecated.
        return F.log_softmax(self.fc(out), dim=1)

In [3]:
# Experiment configuration; this dict is what gets passed to main().
hparam = {
    # --- Dataset Configuration ---
    'dataset': 'FashionMNIST',      # Name of the dataset
    'classes': ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot'],  # List of classes
    'data_split': 30000,            # Rearranging Train/Val Ratio (False if default)

    # --- Training Configuration ---
    'print_freq': 10,               # Print frequency
    'start_epoch': 0,               # Manual Epoch number (useful on restarts)
    'resume': None,                 # Path to latest checkpoint
    'name': 'DenseNet_28_10',       # Name of experiment
    'tensorboard': False,           # Log progress to Tensorboard
    'manual_seed': 40000,           # Seed Number

    # --- Hyperparameter Configuration ---
    'epochs': 300,                  # Number of total epochs to run
    'batch_size': 128,              # Mini-batch size (256 recommended for 1080ti)
    'lr': 0.1,                      # Initial Learning Rate
    'momentum': 0.9,                # Momentum
    'nesterov': True,               # Nesterov momentum
    'weight_decay': 1e-4,           # Weight-decay
    'layers': 100,                  # Total number of layers
    'growthrate': 12,               # Growth Rate (DenseNet)
    'reduction': 0.5,               # Reduction Rate (DenseNet)
    'bottleneck': True,             # Bottleneck usage (DenseNet)
    'augment': True,                # Whether to use standard augmentation

    # --- GPU Configuration ---
    'cuda_device': '1',             # Which GPU Devices to use (CPU: False)

    # --- Hardware Usage ---
    'num_workers': 4,               # Number of Workers (1 for Single CUDA)
}

# Best top-1 validation precision seen so far; main() updates it via `global`.
best_prec1 = 0

In [4]:
def main(hparam):
    """Build the data loaders, model and optimizer, then run the training loop.

    For every epoch in ``[hparam['start_epoch'], hparam['epochs'])`` the
    learning rate is adjusted, the model is trained for one epoch, evaluated
    on the validation loader, and a checkpoint is saved. The module-level
    ``best_prec1`` is updated with the best validation precision seen.

    Args:
        hparam: configuration dict (dataset, training, hyperparameter, GPU
            and worker settings) as defined in the configuration cell.
    """
    global best_prec1

    if hparam['tensorboard']:
        configure("runs/%s" % (hparam['name']))

    torch.manual_seed(hparam['manual_seed'])
    print("Current CPU Random Seed: {}".format(torch.initial_seed()))

    # Resolve the primary device. When a list of GPU ids is given, the first
    # one is the primary device; formatting the whole list into the device
    # string (e.g. "cuda:0, 1") is not a valid torch.device.
    cuda_device = hparam['cuda_device']
    if cuda_device:
        if isinstance(cuda_device, (list, tuple)):
            primary_gpu = cuda_device[0]
        else:
            primary_gpu = cuda_device
        device = torch.device("cuda:{}".format(primary_gpu))
        torch.cuda.manual_seed_all(hparam['manual_seed'])
        print("Current GPU Random Seed: {}".format(torch.cuda.initial_seed()))
    else:
        device = torch.device("cpu")

    # NOTE(review): these are 3-channel statistics while the datasets handled
    # here are single-channel -- confirm the intended normalization constants.
    normalize = transforms.Normalize(mean=[x/255.0 for x in [125.3, 123.0, 113.9]],
                                     std=[x/255.0 for x in [63.0, 62.1, 66.7]])

    if hparam['augment']:
        transform_train = transforms.Compose([
            transforms.ToTensor(),
            # Reflect-pad 4 pixels on every side before the random crop.
            transforms.Lambda(lambda x: F.pad(x.unsqueeze(0),
                                (4,4,4,4),mode='reflect').squeeze()),
            transforms.ToPILImage(),
            transforms.RandomCrop(32),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
            ])
    else:
        transform_train = transforms.Compose([
            transforms.ToTensor(),
            normalize,
            ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        normalize
        ])

    # Load Dataset
    if hparam['data_split']:
        # Concat and Split Dataset into new Train/Val ratio: the first N
        # training samples stay in the training set, the remainder is
        # appended to the test set.
        # NOTE(review): this path feeds the raw pixel values (cast to float)
        # to the model; neither transform_train nor transform_test is applied.
        (train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.fashion_mnist.load_data()

        N = hparam['data_split']

        test_data = np.concatenate((test_data, train_data[N:]), axis=0)
        test_labels = np.concatenate((test_labels, train_labels[N:]), axis=0)

        train_data = train_data[:N]
        train_labels = train_labels[:N]

        train_data = torch.from_numpy(train_data).float()
        train_labels = torch.from_numpy(train_labels).long()
        test_data = torch.from_numpy(test_data).float()
        test_labels = torch.from_numpy(test_labels).long()

        print("Test Data:", test_data.shape)
        print("Train Data:", train_data.shape)

        # unsqueeze(1) adds the channel axis expected by the conv stem.
        train_dataset = torch.utils.data.TensorDataset(train_data.unsqueeze(1), train_labels)
        val_dataset = torch.utils.data.TensorDataset(test_data.unsqueeze(1), test_labels)

        print("(Rearranged)Train: {}\t Test: {}".format(len(train_dataset), len(val_dataset)))

    else:
        # NOTE(review): the training set uses transform_test here, so
        # transform_train (and hparam['augment']) currently has no effect --
        # confirm whether that is intended before switching it.
        train_dataset = datasets.__dict__[hparam['dataset']](
            './data', train=True, download=True, transform=transform_test)
        val_dataset = datasets.__dict__[hparam['dataset']](
            './data', train=False, transform=transform_test)

    kwargs = {'num_workers': hparam['num_workers'], 'pin_memory': True}
    assert(hparam['dataset'] == 'FashionMNIST' or hparam['dataset'] == 'MNIST')

    # Import dataset to DataLoader
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=hparam['batch_size'], shuffle=True, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=hparam['batch_size'], shuffle=True, **kwargs)

    # create model
    # The class count comes straight from the class list. The previous
    # `dataset == 'FashionMNIST' and len(...)` expression evaluated to False
    # for MNIST, which the assert above explicitly allows.
    model = DenseNet(growthRate=hparam['growthrate'], depth=hparam['layers'],
                     reduction=hparam['reduction'], bottleneck=hparam['bottleneck'],
                     nClasses=len(hparam['classes']))

    # for training on multiple GPUs.
    # Use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    # WARNING: Multi-GPU Mode is not working properly for now...
    if isinstance(hparam['cuda_device'], list) and len(hparam['cuda_device']) > 1:
        print('\n===> Training on Multi-GPU!')
        model = nn.DataParallel(model, device_ids=hparam['cuda_device'], dim=0)
    else:
        print('\n===> Training on Single-GPU!')
    model = model.to(device)

    # get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    # optionally resume from a checkpoint
    if hparam['resume']:
        if os.path.isfile(hparam['resume']):
            print("=> loading checkpoint '{}'".format(hparam['resume']))
            # map_location lets a checkpoint saved on another device load here.
            checkpoint = torch.load(hparam['resume'], map_location=device)
            hparam['start_epoch'] = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(hparam['resume'], checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(hparam['resume']))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), hparam['lr'],
                                momentum=hparam['momentum'],
                                nesterov = hparam['nesterov'],
                                weight_decay=hparam['weight_decay'])

    start_time = time.time()

    for epoch in range(hparam['start_epoch'], hparam['epochs']):
        # The schedule is evaluated on the 1-based epoch number.
        adjust_learning_rate(optimizer, epoch+1, hparam)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, hparam)

        # evaluate on validation set
        prec1 = validate(val_loader, model, criterion, epoch, hparam)

        # remember best prec@1 and save checkpoint
        is_best = prec1 > best_prec1
        best_prec1 = max(prec1, best_prec1)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_prec1': best_prec1,
        }, is_best, hparam)
    print('Best accuracy: ', best_prec1)

    # Split the elapsed wall-clock time into hours / minutes / seconds and
    # substitute it into the message (the template used to be printed raw).
    elapsed = time.time() - start_time
    end_h, remainder = divmod(elapsed, 3600)
    end_m, end_s = divmod(remainder, 60)
    print("[Total Time Elapsed: {:02d}h {:02d}m {:.2f}s]".format(
        int(end_h), int(end_m), end_s))

In [5]:
def train(train_loader, model, criterion, optimizer, epoch, hparam):
    """Train for one epoch on the training set.

    Args:
        train_loader: iterable yielding ``(input, target)`` batches.
        model: network to optimize; switched to train mode here.
        criterion: loss callable taking ``(output, target)``.
        optimizer: optimizer stepped once per batch.
        epoch: epoch index, used for progress printing and TensorBoard.
        hparam: configuration dict; reads ``cuda_device``, ``print_freq``
            and ``tensorboard``.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # Device Mode: with a list of GPU ids the first one is the primary
    # device; formatting the whole list (e.g. "cuda:0, 1") is not a valid
    # torch.device string.
    cuda_device = hparam['cuda_device']
    if cuda_device:
        if isinstance(cuda_device, (list, tuple)):
            primary_gpu = cuda_device[0]
        else:
            primary_gpu = cuda_device
        device = torch.device("cuda:{}".format(primary_gpu))
    else:
        device = torch.device("cpu")

    # switch to train mode
    model.train()

    end = time.time()
    for i, (input, target) in enumerate(train_loader):
        input = input.to(device)
        target = target.to(device, non_blocking=True)

        # compute output (tensors are used directly; the Variable wrapper is
        # a deprecated no-op)
        output = model(input)
        loss = criterion(output, target)

        # measure accuracy and record loss
        prec1 = accuracy(output.detach(), target, topk=(1,))[0]
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))

        # compute gradient and do SGD step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % hparam['print_freq'] == 0:
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      epoch, i, len(train_loader), batch_time=batch_time,
                      loss=losses, top1=top1))
    # log to TensorBoard
    if hparam['tensorboard']:
        log_value('train_loss', losses.avg, epoch)
        log_value('train_acc', top1.avg, epoch)

In [6]:
def validate(val_loader, model, criterion, epoch, hparam):
    """Perform validation on the validation set.

    Args:
        val_loader: iterable yielding ``(input, target)`` batches.
        model: network to evaluate; switched to eval mode here.
        criterion: loss callable taking ``(output, target)``.
        epoch: epoch index, used for TensorBoard logging.
        hparam: configuration dict; reads ``cuda_device``, ``print_freq``
            and ``tensorboard``.

    Returns:
        The sample-weighted average top-1 precision over the loader.
    """
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    # Device Mode: with a list of GPU ids the first one is the primary
    # device; formatting the whole list (e.g. "cuda:0, 1") is not a valid
    # torch.device string.
    cuda_device = hparam['cuda_device']
    if cuda_device:
        if isinstance(cuda_device, (list, tuple)):
            primary_gpu = cuda_device[0]
        else:
            primary_gpu = cuda_device
        device = torch.device("cuda:{}".format(primary_gpu))
    else:
        device = torch.device("cpu")

    # switch to evaluate mode
    model.eval()

    end = time.time()
    for i, (input, target) in enumerate(val_loader):
        input = input.to(device)
        target = target.to(device, non_blocking=True)

        # compute output and loss without building an autograd graph
        with torch.no_grad():
            output = model(input)
            loss = criterion(output, target)

        # measure accuracy and record loss
        prec1 = accuracy(output, target, topk=(1,))[0]
        losses.update(loss.item(), input.size(0))
        top1.update(prec1.item(), input.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % hparam['print_freq'] == 0:
            print('Test: [{0}/{1}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                  'Prec@1 {top1.val:.3f} ({top1.avg:.3f})'.format(
                      i, len(val_loader), batch_time=batch_time, loss=losses,
                      top1=top1))

    print(' * Prec@1 {top1.avg:.3f}'.format(top1=top1))
    # log to TensorBoard
    if hparam['tensorboard']:
        log_value('val_loss', losses.avg, epoch)
        log_value('val_acc', top1.avg, epoch)
    return top1.avg

In [7]:
def save_checkpoint(state, is_best, hparam, filename='checkpoint.pth.tar'):
    """Write ``state`` to ``runs/<name>/<filename>``, creating the directory if needed.

    When ``is_best`` is truthy the saved file is also copied to
    ``runs/<name>/model_best.pth.tar``.
    """
    run_dir = "runs/{}/".format(hparam['name'])
    if not os.path.exists(run_dir):
        os.makedirs(run_dir)

    checkpoint_path = run_dir + filename
    torch.save(state, checkpoint_path)

    if not is_best:
        return
    best_path = 'runs/%s/'%(hparam['name']) + 'model_best.pth.tar'
    shutil.copyfile(checkpoint_path, best_path)

In [8]:
class AverageMeter(object):
    """Keeps the most recent value and a running weighted average.

    Attributes (all reset to 0 by ``reset``):
        val: last value passed to ``update``.
        sum: weighted sum of all values seen.
        count: total weight (number of samples) seen.
        avg: ``sum / count``.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed for ``n`` samples and refresh the average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [9]:
def adjust_learning_rate(optimizer, epoch, hparam):
    """Step schedule: multiply the initial LR by 0.1 once ``epoch`` reaches 150, and again at 225.

    The resulting rate is written into every parameter group of
    ``optimizer`` and, when ``hparam['tensorboard']`` is set, logged as
    ``learning_rate``.
    """
    decay = 1.0
    for milestone in (150, 225):
        if epoch >= milestone:
            decay *= 0.1
    learning_rate = hparam['lr'] * decay

    # log to TensorBoard
    if hparam['tensorboard']:
        log_value('learning_rate', learning_rate, epoch)

    for group in optimizer.param_groups:
        group['lr'] = learning_rate

In [10]:
def accuracy(output, target, topk=(1,)):
    """Computes the precision@k for the specified values of k.

    Args:
        output: score tensor; ``topk`` is taken along dimension 1.
        target: tensor of class indices, one per row of ``output``.
        topk: iterable of k values to evaluate.

    Returns:
        A list with one 0-dim tensor per k, holding the percentage of
        samples whose target is among the k highest-scoring classes.
    """
    # Pure metric computation: no autograd graph is needed.
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # pred: (maxk, batch) after the transpose, best guess in row 0.
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape, not view: `correct` inherits the transposed layout of
            # `pred`, so slices with k > 1 rows cannot be flattened by view.
            correct_k = correct[:k].reshape(-1).float().sum(0)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res

In [None]:
# Entry point: run the full training procedure with the `hparam` configuration.
if __name__ == '__main__':
    main(hparam)

Current CPU Random Seed: 40000
Current GPU Random Seed: 40000
Test Data: torch.Size([40000, 28, 28])
Train Data: torch.Size([30000, 28, 28])
(Rearranged)Train: 30000	 Test: 40000

===> Training on Single-GPU!
Number of model parameters: 768730




Epoch: [0][0/235]	Time 1.813 (1.813)	Loss 2.3503 (2.3503)	Prec@1 6.250 (6.250)
Epoch: [0][10/235]	Time 0.142 (0.302)	Loss 1.4891 (1.9845)	Prec@1 50.781 (28.693)
Epoch: [0][20/235]	Time 0.143 (0.226)	Loss 1.1294 (1.6423)	Prec@1 62.500 (41.592)
Epoch: [0][30/235]	Time 0.144 (0.199)	Loss 0.9251 (1.4187)	Prec@1 72.656 (49.294)
Epoch: [0][40/235]	Time 0.143 (0.186)	Loss 0.8480 (1.2566)	Prec@1 71.875 (55.202)
Epoch: [0][50/235]	Time 0.144 (0.177)	Loss 0.6458 (1.1446)	Prec@1 75.781 (59.099)
Epoch: [0][60/235]	Time 0.143 (0.172)	Loss 0.7227 (1.0729)	Prec@1 69.531 (61.335)
Epoch: [0][70/235]	Time 0.143 (0.168)	Loss 0.7406 (1.0094)	Prec@1 70.312 (63.567)
Epoch: [0][80/235]	Time 0.143 (0.165)	Loss 0.5143 (0.9623)	Prec@1 80.469 (65.191)
Epoch: [0][90/235]	Time 0.143 (0.162)	Loss 0.5522 (0.9158)	Prec@1 78.125 (66.784)
Epoch: [0][100/235]	Time 0.143 (0.160)	Loss 0.5178 (0.8808)	Prec@1 78.125 (68.015)
Epoch: [0][110/235]	Time 0.143 (0.159)	Loss 0.5580 (0.8488)	Prec@1 80.469 (69.123)
Epoch: [0][120/23

Test: [220/313]	Time 0.041 (0.042)	Loss 0.3537 (0.4178)	Prec@1 92.188 (85.701)
Test: [230/313]	Time 0.041 (0.042)	Loss 0.2963 (0.4169)	Prec@1 88.281 (85.697)
Test: [240/313]	Time 0.041 (0.042)	Loss 0.5729 (0.4175)	Prec@1 82.812 (85.662)
Test: [250/313]	Time 0.041 (0.042)	Loss 0.3128 (0.4170)	Prec@1 92.969 (85.701)
Test: [260/313]	Time 0.041 (0.042)	Loss 0.3435 (0.4170)	Prec@1 88.281 (85.698)
Test: [270/313]	Time 0.041 (0.042)	Loss 0.2828 (0.4177)	Prec@1 91.406 (85.669)
Test: [280/313]	Time 0.041 (0.042)	Loss 0.4302 (0.4175)	Prec@1 84.375 (85.654)
Test: [290/313]	Time 0.041 (0.042)	Loss 0.4773 (0.4172)	Prec@1 86.719 (85.701)
Test: [300/313]	Time 0.041 (0.042)	Loss 0.4150 (0.4179)	Prec@1 83.594 (85.660)
Test: [310/313]	Time 0.041 (0.042)	Loss 0.4680 (0.4186)	Prec@1 86.719 (85.666)
 * Prec@1 85.657
Epoch: [2][0/235]	Time 0.300 (0.300)	Loss 0.2812 (0.2812)	Prec@1 92.188 (92.188)
Epoch: [2][10/235]	Time 0.144 (0.158)	Loss 0.2099 (0.2343)	Prec@1 92.969 (91.477)
Epoch: [2][20/235]	Time 0.144 

Test: [120/313]	Time 0.041 (0.043)	Loss 0.3550 (0.2627)	Prec@1 83.594 (90.567)
Test: [130/313]	Time 0.041 (0.043)	Loss 0.2340 (0.2628)	Prec@1 92.188 (90.571)
Test: [140/313]	Time 0.041 (0.043)	Loss 0.3847 (0.2632)	Prec@1 90.625 (90.603)
Test: [150/313]	Time 0.041 (0.042)	Loss 0.2012 (0.2662)	Prec@1 91.406 (90.465)
Test: [160/313]	Time 0.041 (0.042)	Loss 0.2437 (0.2659)	Prec@1 92.969 (90.431)
Test: [170/313]	Time 0.041 (0.042)	Loss 0.2800 (0.2661)	Prec@1 89.844 (90.470)
Test: [180/313]	Time 0.041 (0.042)	Loss 0.2192 (0.2670)	Prec@1 89.062 (90.418)
Test: [190/313]	Time 0.041 (0.042)	Loss 0.3208 (0.2662)	Prec@1 91.406 (90.502)
Test: [200/313]	Time 0.041 (0.042)	Loss 0.2940 (0.2674)	Prec@1 91.406 (90.442)
Test: [210/313]	Time 0.041 (0.042)	Loss 0.2837 (0.2689)	Prec@1 92.188 (90.421)
Test: [220/313]	Time 0.041 (0.042)	Loss 0.3064 (0.2678)	Prec@1 87.500 (90.480)
Test: [230/313]	Time 0.041 (0.042)	Loss 0.2268 (0.2675)	Prec@1 91.406 (90.483)
Test: [240/313]	Time 0.041 (0.042)	Loss 0.3781 (0.26

Test: [20/313]	Time 0.041 (0.050)	Loss 0.2636 (0.2445)	Prec@1 88.281 (91.295)
Test: [30/313]	Time 0.041 (0.047)	Loss 0.3491 (0.2445)	Prec@1 89.062 (91.482)
Test: [40/313]	Time 0.041 (0.046)	Loss 0.2413 (0.2453)	Prec@1 90.625 (91.425)
Test: [50/313]	Time 0.041 (0.045)	Loss 0.2463 (0.2409)	Prec@1 92.188 (91.820)
Test: [60/313]	Time 0.041 (0.044)	Loss 0.1851 (0.2407)	Prec@1 92.188 (91.829)
Test: [70/313]	Time 0.041 (0.044)	Loss 0.2236 (0.2421)	Prec@1 92.969 (91.824)
Test: [80/313]	Time 0.041 (0.044)	Loss 0.3414 (0.2418)	Prec@1 89.062 (91.792)
Test: [90/313]	Time 0.041 (0.043)	Loss 0.1815 (0.2375)	Prec@1 92.188 (91.870)
Test: [100/313]	Time 0.041 (0.043)	Loss 0.1967 (0.2401)	Prec@1 92.969 (91.808)
Test: [110/313]	Time 0.041 (0.043)	Loss 0.2162 (0.2387)	Prec@1 94.531 (91.829)
Test: [120/313]	Time 0.041 (0.043)	Loss 0.2545 (0.2380)	Prec@1 90.625 (91.871)
Test: [130/313]	Time 0.041 (0.043)	Loss 0.3240 (0.2379)	Prec@1 87.500 (91.854)
Test: [140/313]	Time 0.041 (0.043)	Loss 0.2184 (0.2369)	Prec

Best Accuracy: 93.81