In [None]:
# Shell escape: print the NVIDIA driver's view of the GPUs on this machine
# before one is selected for the run.
!nvidia-smi

In [None]:
import os
# Restrict this process to GPU 0. This cell runs before the `mxnet` import in
# the next cell -- presumably so the restriction is in place before any CUDA
# context is created; keep that ordering when rearranging cells.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [None]:
# MXNet package
import mxnet as mx
from mxnet import nd, init, cpu, gpu, gluon, autograd
# from mxnet.gluon import nn
from mxnet.gluon.data import DataLoader
from mxnet.gluon.data.vision import CIFAR10, transforms as T
from gluoncv.data import transforms as gcv_T
from gluoncv.model_zoo import cifar_resnet20_v1

# Normal package
import time
from tensorboardX import SummaryWriter

In [None]:
class Config:
    """Static settings for this run; every value is read as a class attribute."""

    # ---- Model ----
    num_class = 10  # passed to evaluate() by the training loop

    # ---- Train ----
    max_steps = 80_000  # step budget checked by the training loop
    train_batch_size = 128
    val_batch_size = 256
    train_num_workers = 4  # DataLoader workers for the training split
    val_num_workers = 4  # DataLoader workers for the test split
    lr = 1e-1  # initial learning rate handed to the Trainer
    wd = 1e-4  # weight decay handed to the Trainer
    momentum = 0.9
    # (step, factor) pairs: once `step` is reached the current learning rate
    # is multiplied by `factor`.
    lr_decay = [(40_000, 0.1), (60_000, 0.1)]

    # ---- Record ----
    ckpt_dir = './tmp/checkpoints'  # created up front by the notebook
    main_tag = 'baseline'  # TensorBoard tag prefix and run-directory prefix
    ckpt_prefix = 'baseline'  # file-name prefix used by the checkpoint code
    train_record_per_steps = 200  # log training loss/accuracy every N steps
    val_per_steps = 400  # run evaluation every N steps
    # The four settings below are consumed by the spotter / early-stop /
    # snapshot sections of the training loop and by the summary printout.
    spotter_starts_at = 10_000
    spotter_window_size = 10
    patience = 20
    snapshot_per_steps = 400

In [None]:
# Make sure the checkpoint directory exists. `exist_ok=True` keeps this cell
# safe to re-run and avoids the check-then-create race of a separate
# `os.path.exists` test.
os.makedirs(Config.ckpt_dir, exist_ok=True)

In [None]:
# Each run logs into its own TensorBoard directory, named after the run tag
# and the local wall-clock time at which this cell executed.
# (`time.strftime` formats the current local time when no time tuple is given.)
datetime_stamp = time.strftime('%Y%m%d_%H%M%S')
run_log_dir = "tmp/runs/{}_{}".format(Config.main_tag, datetime_stamp)
writer = SummaryWriter(log_dir=run_log_dir)

In [None]:
# Build the GluonCV CIFAR ResNet-20 (v1) with the factory's default arguments.
# NOTE(review): the class count is not passed here -- confirm the factory
# default matches Config.num_class.
net = cifar_resnet20_v1()
# No initializer or context is given, so Gluon's defaults are used.
net.initialize()
# Move every parameter to the first visible GPU, which is also where the
# training loop and evaluate() place their batches.
net.collect_params().reset_ctx(gpu(0))

In [None]:
def evaluate(net, num_class, loss_func, dataloader, ctx):
    """Run one full pass over ``dataloader`` and report mean loss and accuracy.

    Args:
        net: Callable model; ``net(X)`` returns scores whose axis 1 is
            arg-maxed to obtain the predicted label.
        num_class: Unused; kept so existing call sites keep working.
        loss_func: Callable ``loss_func(outputs, y)``; its result is summed,
            so it is presumably a per-sample loss.
        dataloader: Iterable yielding ``(X, y)`` batches.
        ctx: Device context each batch is copied to before the forward pass.

    Returns:
        ``(eval_loss, eval_acc, elapsed_seconds)``. Loss and accuracy are
        averaged over the samples actually yielded by ``dataloader``; both
        are ``0.0`` when the loader yields nothing.
    """
    start = time.time()
    num_correct = 0
    num_samples = 0
    loss_total = 0.

    for X, y in dataloader:
        X = X.as_in_context(ctx)
        y = y.as_in_context(ctx)

        outputs = net(X)
        loss = loss_func(outputs, y)
        loss_total += loss.sum().asscalar()
        pred = outputs.argmax(axis=1)
        # Labels are cast to float32 before the comparison, as in the
        # training loop, so both operands share a dtype.
        num_correct += (pred == y.astype('float32')).sum().asscalar()
        num_samples += X.shape[0]

    # Normalise by what this loader produced rather than by a module-level
    # dataset, so the function is correct for any loader passed in.
    denom = max(num_samples, 1)
    return loss_total / denom, num_correct / denom, time.time() - start

In [None]:
# Per-channel normalisation constants shared by both pipelines
# (presumably CIFAR-10 channel statistics -- TODO confirm the source).
norm_mean = [0.4914, 0.4822, 0.4465]
norm_std = [0.2023, 0.1994, 0.2010]

# Training pipeline: random 32x32 crop with 4 px padding and random
# left/right flip, followed by tensor conversion and normalisation.
train_transformer = T.Compose([
    gcv_T.RandomCrop(32, pad=4),
    T.RandomFlipLeftRight(),
    T.ToTensor(),
    T.Normalize(norm_mean, norm_std),
])

# Evaluation pipeline: no augmentation, only tensor conversion and the same
# normalisation as training.
eval_transformer = T.Compose([
    T.ToTensor(),
    T.Normalize(norm_mean, norm_std),
])

In [None]:
# Training split. `shuffle=True` is required here: without it the loader
# walks the dataset in the same fixed order every epoch, so every epoch sees
# identical mini-batches. The incomplete final batch is discarded so every
# training step sees exactly Config.train_batch_size samples.
train_dataset = CIFAR10(train=True).transform_first(train_transformer)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=Config.train_batch_size,
                          shuffle=True,
                          num_workers=Config.train_num_workers,
                          last_batch='discard')

# Test split. Order is irrelevant for evaluation, and the final partial batch
# is kept so every test sample is scored.
test_dataset = CIFAR10(train=False).transform_first(eval_transformer)
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=Config.val_batch_size,
                         shuffle=False,
                         num_workers=Config.val_num_workers,
                         last_batch='keep')

In [None]:
# Summary of dataset sizes and of how the step-based intervals in Config
# translate into samples and epochs.
train_size = len(train_dataset)
val_size = len(test_dataset)
print(f'trainset size => {train_size}')
print(f'valset size => {val_size}')

# Ask the loader for its real number of batches per epoch instead of dividing
# sizes: the training loader drops the incomplete last batch, so plain
# division would report a fractional step that never runs.
steps_per_epoch = len(train_loader)
print(f'{steps_per_epoch} steps for per epoch (BATCH_SIZE={Config.train_batch_size})')

print("record per {} steps ({} samples, {} times per epoch)".format(
    Config.train_record_per_steps,
    Config.train_record_per_steps * Config.train_batch_size,
    steps_per_epoch / Config.train_record_per_steps))
print("evaluate per {} steps ({} times per epoch)".format(
    Config.val_per_steps,
    steps_per_epoch / Config.val_per_steps))
print("spotter start at {} steps ({} epoches)".format(
    Config.spotter_starts_at,
    Config.spotter_starts_at / steps_per_epoch))
print("size of spotter window is {} ({} steps)".format(
    Config.spotter_window_size,
    Config.spotter_window_size * Config.val_per_steps))

# Patience is counted in evaluations; convert it to steps once and reuse it.
patience_steps = Config.patience * Config.val_per_steps
print("max patience: {} ({} steps; {} samples; {} epoches)".format(
    Config.patience,
    patience_steps,
    patience_steps * Config.train_batch_size,
    patience_steps / steps_per_epoch))
print("snapshot per {} steps ({} times per epoch)".format(
    Config.snapshot_per_steps,
    steps_per_epoch / Config.snapshot_per_steps))

In [None]:
# Loss used by both the training loop and evaluate().
loss_func = gluon.loss.SoftmaxCrossEntropyLoss()

# Step counter. It lives in its own cell, so re-running only the training
# cell continues counting from the current value.
global_steps = 0

# Fixed-length, zero-filled history buffers referenced by the spotter and
# early-stop sections of the training loop.
good_acc_window = [0.0 for _ in range(Config.spotter_window_size)]
estop_loss_window = [0.0 for _ in range(Config.patience)]

In [None]:
# Optimiser hyper-parameters, all taken from Config.
optimizer_params = {
    'learning_rate': Config.lr,
    'wd': Config.wd,
    'momentum': Config.momentum,
}
# 'nag' selects the optimiser by its registered name.
trainer = gluon.Trainer(net.collect_params(), 'nag', optimizer_params)

In [None]:
def _scheduled_lr(step):
    """Return the learning rate that should be active once ``step`` has run.

    Starts from ``Config.lr`` and multiplies in the factor of every
    ``(milestone, factor)`` entry of ``Config.lr_decay`` whose milestone has
    been reached. It is a pure function of ``step`` and never mutates
    ``Config``, so the schedule survives re-running this cell and stays
    correct when training resumes from a non-zero ``global_steps``.
    """
    lr = Config.lr
    for milestone, factor in Config.lr_decay:
        if milestone <= step:
            lr *= factor
    return lr


train_ctx = gpu(0)
# Number of samples covered by one training-record window; used to turn the
# accumulated loss / correct counts into per-sample averages.
size_per_record = Config.train_record_per_steps * Config.train_batch_size
flag_early_stop = False
train_loss = 0.
train_num_correct = 0
t = time.time()

while global_steps < Config.max_steps and not flag_early_stop:
    for X, y in train_loader:
        global_steps += 1
        # Move data to gpu
        X = X.as_in_context(train_ctx)
        y = y.as_in_context(train_ctx)

        # Forward & Backward
        with autograd.record():
            outputs = net(X)
            loss = loss_func(outputs, y)

        loss.backward()
        trainer.step(Config.train_batch_size)

        train_loss += loss.sum().asscalar()
        pred = outputs.argmax(axis=1)
        train_num_correct += (pred == y.astype('float32')).sum().asscalar()

        # Record training info (global_steps is >= 1 here, so no zero guard is needed)
        if global_steps % Config.train_record_per_steps == 0:
            writer.add_scalars(f'{Config.main_tag}/Loss', {'train': train_loss/size_per_record}, global_steps)
            writer.add_scalars(f'{Config.main_tag}/Acc', {'train': train_num_correct/size_per_record}, global_steps)
            train_loss = 0.
            train_num_correct = 0

        # Evaluate
        if global_steps % Config.val_per_steps == 0:
            eval_loss, eval_acc, _eval_seconds = evaluate(net, Config.num_class, loss_func, test_loader, ctx=train_ctx)
            # NOTE: the elapsed window is measured after evaluate() returns,
            # so this steps/second figure includes evaluation time.
            writer.add_scalar(f'{Config.main_tag}/Speed', Config.val_per_steps / (time.time() - t), global_steps)
            writer.add_scalars(f'{Config.main_tag}/Loss', {'val': eval_loss}, global_steps)
            writer.add_scalars(f'{Config.main_tag}/Acc', {'val': eval_acc}, global_steps)

#             # Spotter
#             good_acc_window.pop(0)
#             if global_steps >= Config.spotter_starts_at and eval_acc > max(good_acc_window):
#                 print( "catch a good model with acc {:.6f} at {} step".format(eval_acc, global_steps) )
#                 writer.add_text(Config.main_tag, "catch a good model with acc {:.6f}".format(eval_acc), global_steps)
#                 net.save_parameters("{}/{}-{:06d}.params".format(Config.ckpt_dir, Config.ckpt_prefix, global_steps))
#             good_acc_window.append(eval_acc)

#             # Early stop
#             estop_loss_window.pop(0)
#             estop_loss_window.append(eval_loss)
#             if global_steps > Config.val_per_steps * len(estop_loss_window):
#                 min_index = estop_loss_window.index( min(estop_loss_window) )
#                 writer.add_scalar(f'{Config.main_tag}/val/Patience', min_index, global_steps)
#                 if min_index == 0:
#                     flag_early_stop = True
#                     print("early stop at {} steps".format(global_steps))
#                     break

            # Restart the speed timer for the next window.
            t = time.time()

        # lr decay: checked every step so a milestone takes effect exactly
        # when it is reached, even if it is not a multiple of val_per_steps.
        new_lr = _scheduled_lr(global_steps)
        if new_lr != trainer.learning_rate:
            print(f"LR Decay: {trainer.learning_rate} -> {new_lr}")
            trainer.set_learning_rate(new_lr)

#         # Snapshot
#         if global_steps and global_steps % Config.snapshot_per_steps == 0:
#             net.save_parameters("{}/{}-{:06d}.params".format(Config.ckpt_dir, Config.ckpt_prefix, global_steps))

        # Enforce the step budget inside the epoch; the outer `while` alone
        # would only notice it after the current epoch had finished.
        if global_steps >= Config.max_steps:
            break