# Train Model with Cosine Annealing + Cutout + Gradient Clipping
Use the training procedure outlined in [source](https://github.com/D-X-Y/NAS-Projects).

### Imports

In [1]:
PATH = ''
import os
import sys
import argparse
from datetime import datetime

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

from torch.utils.data import DataLoader
from torch.autograd import Variable

from tensorboardX import SummaryWriter

# os.chdir('') raises FileNotFoundError, so only switch directories when a
# project path has actually been configured; otherwise the project-local
# imports below are resolved from the current working directory.
if PATH:
    os.chdir(PATH)
from conf import settings
from utils.model_utils import get_network

### Parameters

In [2]:
def _str2bool(value):
    """Convert a command-line token such as 'true' or '0' into a bool.

    argparse's ``type=bool`` simply calls ``bool(token)``, which is True for
    every non-empty string (including 'False'), so an explicit parser is needed.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % (value,))


def parse_args(args):
    """
    Parse a whitespace-separated string of command-line style options.

    Args:
        args: option string, e.g. '-net nasnet -b 64 -lr 0.025'

    Returns: argparse.Namespace with attributes net, gpu, w, b, s, warm, lr
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-net', type=str, required=True, help='net type')
    parser.add_argument('-gpu', type=_str2bool, default=True, help='use gpu or not')
    parser.add_argument('-w', type=int, default=2, help='number of workers for dataloader')
    parser.add_argument('-b', type=int, default=32, help='batch size for dataloader')
    parser.add_argument('-s', type=_str2bool, default=True, help='whether shuffle the dataset')
    parser.add_argument('-warm', type=int, default=1, help='warm up training phase')
    parser.add_argument('-lr', type=float, default=0.1, help='initial learning rate')
    return parser.parse_args(args.split())
    

In [3]:
# Options for this run: network 'nasnet', batch size 64, initial learning rate 0.025.
args = parse_args('-net nasnet -b 64 -lr 0.025')

### Load Model

In [4]:
# Build the network named by -net; the -gpu flag is forwarded as use_gpu.
net = get_network(
    args,
    use_gpu=args.gpu,
)

### Load Data

#### Cutout

In [5]:
class Cutout(object):
    """Transform that zeroes one randomly placed square patch of an image tensor.

    Args:
        length: side length of the patch; the patch is clipped at the image
            borders, so fewer pixels are erased near an edge.
    """

    def __init__(self, length):
        self.length = length

    def __repr__(self):
        return '{0}(length={1})'.format(self.__class__.__name__, self.length)

    def __call__(self, img):
        """Erase the patch from ``img`` in place and return the same tensor.

        Dimensions 1 and 2 of ``img`` are read as height and width.
        """
        rows, cols = img.size(1), img.size(2)
        half = self.length // 2

        # Patch centre: the row is drawn first, then the column.
        centre_row = np.random.randint(rows)
        centre_col = np.random.randint(cols)

        # Clip the patch bounds so they never leave the image.
        top, bottom = (np.clip(centre_row + shift, 0, rows) for shift in (-half, half))
        left, right = (np.clip(centre_col + shift, 0, cols) for shift in (-half, half))

        # 1 keeps a pixel, 0 erases it; the 2-D mask is expanded to img's shape.
        keep = np.ones((rows, cols), np.float32)
        keep[top:bottom, left:right] = 0.
        img *= torch.from_numpy(keep).expand_as(img)
        return img

In [6]:
def get_transform_train():
    """Return the training-time transform pipeline.

    Steps, in order: random 32x32 crop with 4 pixels of padding, random
    horizontal flip, conversion to tensor, normalisation with the CIFAR-100
    training mean/std from ``settings``, and Cutout with a patch length of 16.
    """
    mean, std = settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD
    steps = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
        Cutout(16),
    ]
    return transforms.Compose(steps)
    
    
def get_transform_test():
    """Return the evaluation transform pipeline.

    Only converts to tensor and normalises with the CIFAR-100 training
    mean/std from ``settings``; no augmentation is applied.
    """
    mean, std = settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD
    steps = [
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ]
    return transforms.Compose(steps)

In [7]:
def get_training_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return the CIFAR-100 training dataloader
    Args:
        mean: mean of cifar100 training dataset (accepted but not read here;
            get_transform_train() takes its constants from settings)
        std: std of cifar100 training dataset (accepted but not read here)
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle
    Returns: torch DataLoader over the training split stored under ./data
        (downloaded if missing)
    """
    dataset = torchvision.datasets.CIFAR100(
        root='./data',
        train=True,
        download=True,
        transform=get_transform_train(),
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

def get_test_dataloader(mean, std, batch_size=16, num_workers=2, shuffle=True):
    """ return the CIFAR-100 test dataloader
    Args:
        mean: mean used for normalisation (accepted but not read here;
            get_transform_test() takes its constants from settings)
        std: std used for normalisation (accepted but not read here)
        batch_size: dataloader batchsize
        num_workers: dataloader num_works
        shuffle: whether to shuffle
    Returns: torch DataLoader over the test split stored under ./data
        (downloaded if missing)
    """
    dataset = torchvision.datasets.CIFAR100(
        root='./data',
        train=False,
        download=True,
        transform=get_transform_test(),
    )
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [8]:
# Data preprocessing: both loaders share the worker count, batch size and
# shuffle flag taken from the parsed options.
_loader_kwargs = dict(num_workers=args.w, batch_size=args.b, shuffle=args.s)

cifar100_training_loader = get_training_dataloader(
    settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, **_loader_kwargs)

cifar100_test_loader = get_test_dataloader(
    settings.CIFAR100_TRAIN_MEAN, settings.CIFAR100_TRAIN_STD, **_loader_kwargs)

Files already downloaded and verified
Files already downloaded and verified


### Loss Function

In [9]:
# Cross-entropy over the class logits. Only move the criterion to the GPU when
# -gpu is enabled, matching how the network itself is built from args.gpu.
loss_function = nn.CrossEntropyLoss()
if args.gpu:
    loss_function = loss_function.cuda()

### Optimizer

In [10]:
# SGD with momentum 0.9 and weight decay 3e-4; the initial learning rate comes from -lr.
optimizer = optim.SGD(
    net.parameters(),
    lr=args.lr,
    momentum=0.9,
    weight_decay=0.0003,
)

### Train Scheduler
Use cosine annealing for the learning rate. The paper trains for a very large number of epochs.

In [11]:
# Length of the cosine annealing period, in epochs.
epochs = 600
# Floor that the learning rate is annealed down to.
lr_min = 1e-4

In [12]:
# Anneal the learning rate along a cosine curve from the optimizer's initial
# value down to lr_min over `epochs` scheduler steps.
train_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer,
    T_max=float(epochs),
    eta_min=float(lr_min),
)

### Setup Checkpoint

In [13]:
# Checkpoints for this run go under <CHECKPOINT_PATH>/<net>/<TIME_NOW>.
checkpoint_path = os.path.join(
    settings.CHECKPOINT_PATH,
    args.net,
    settings.TIME_NOW,
)

### Use Tensorboard

In [14]:
# Create the log root (and any missing parents) without a separate existence
# check, which avoids failing when the directory appears between check and mkdir.
os.makedirs(settings.LOG_DIR, exist_ok=True)
# One TensorBoard run directory per network and start time.
writer = SummaryWriter(log_dir=os.path.join(
        settings.LOG_DIR, args.net, settings.TIME_NOW))

### Create Checkpoint Folder to Save Model

In [15]:
# Create the checkpoint folder for this run; exist_ok makes this safe when the
# folder is already there.
os.makedirs(checkpoint_path, exist_ok=True)
# From here on checkpoint_path is a filename template that is filled in with
# .format(net=..., epoch=..., type=...) when a model is saved.
checkpoint_path = os.path.join(checkpoint_path, '{net}-{epoch}-{type}.pth')

### Train the Model

In [16]:
def train(epoch):
    """Run one optimisation pass over cifar100_training_loader.

    Uses the module-level net, optimizer and loss_function. Gradients are
    clipped to a total norm of 5 before every optimizer step.

    Args:
        epoch: current epoch number (not used inside the function)
    """
    net.train()
    # Send each batch to whichever device the model's parameters live on,
    # rather than assuming CUDA is in use.
    device = next(net.parameters()).device

    for images, labels in cifar100_training_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = net(images)
        loss = loss_function(outputs, labels)
        loss.backward()
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(net.parameters(), 5)
        optimizer.step()
        
        
def eval_training(epoch):
    """Evaluate net on cifar100_test_loader and log the results to TensorBoard.

    Writes 'Test/Average loss' (per-sample mean loss) and 'Test/Accuracy'
    for the given epoch through the module-level writer.

    Args:
        epoch: epoch number used as the TensorBoard step

    Returns: accuracy over the test dataset as a 0-dim float tensor
    """
    net.eval()
    # Send each batch to whichever device the model's parameters live on.
    device = next(net.parameters()).device

    test_loss = 0.0  # sum of per-sample losses
    correct = 0

    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for images, labels in cifar100_test_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = net(images)
            loss = loss_function(outputs, labels)
            # The criterion returns the batch mean, so weight it by the batch
            # size before summing; dividing by the dataset size below then
            # gives a true per-sample average.
            test_loss += loss.item() * labels.size(0)
            _, preds = outputs.max(1)
            correct += preds.eq(labels).sum()

    num_samples = len(cifar100_test_loader.dataset)
    # as_tensor also covers the empty-loader case where `correct` is still an int.
    accuracy = torch.as_tensor(correct, dtype=torch.float) / num_samples

    #add informations to tensorboard
    writer.add_scalar('Test/Average loss', test_loss / num_samples, epoch)
    writer.add_scalar('Test/Accuracy', accuracy, epoch)

    return accuracy

In [None]:
best_acc = 0.0
# Run exactly `epochs` epochs so the loop length matches the T_max the cosine
# scheduler was built with; otherwise the learning rate never completes its
# annealing curve.
for epoch in range(1, epochs + 1):
    train(epoch)
    train_scheduler.step()
    acc = eval_training(epoch)
    # Keep a plain float so best_acc does not hold on to a device tensor.
    acc_value = acc.item()

    # NOTE(review): send_message is not defined or imported in the visible
    # cells -- confirm it is provided elsewhere, or this raises NameError
    # after the first epoch.
    send_message('Epoch ' + str(epoch) + ' complete. Accuracy: ' + str(acc_value), channel=args.net)

    # Start saving the best-performing model only after the epoch given by
    # settings.MILESTONES[1]; a 'best' save skips the periodic save for that epoch.
    if epoch > settings.MILESTONES[1] and best_acc < acc_value:
        torch.save(net.state_dict(), checkpoint_path.format(net=args.net, epoch=epoch, type='best'))
        best_acc = acc_value
        continue

    # Periodic checkpoint every settings.SAVE_EPOCH epochs.
    if not epoch % settings.SAVE_EPOCH:
        torch.save(net.state_dict(), checkpoint_path.format(net=args.net, epoch=epoch, type='regular'))

writer.close()