In [1]:
# Enable the autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Debugging aid, currently disabled:
# from IPython.core.debugger import Tracer

In [None]:
import numpy
import numpy as np
import pandas as pd
import torch
import torchvision
import torch.nn as nn
import torch.functional as F
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import os, copy

from torch.utils.tensorboard import SummaryWriter

from models.nets import *
from utils.data_loader import get_train_valid_loader, get_test_loader
from utils.utils import num_params, save_summary, read_summary, format_scientific

In [None]:

from fastai.torch_core import defaults
from fastai.vision.data import ImageDataBunch

import fastai
from fastai.vision import *
from fastai.vision.data import *
from fastai.basics import *
from fastai.basic_data import *
from fastai.callbacks import EarlyStoppingCallback, CSVLogger
from utils.callbacks import ReduceLROnPlateauCallback, SaveModelCallback, MetricTracker
from utils.tensorboard import LearnerTensorboardWriter

In [None]:
# Experiment selection: which GPU to use and which network to train.
cuda = 1  # index of the CUDA device (used to build the "cuda:<index>" device string)
k = 1     # argument forwarded to the gaussian_resnet20 constructor
model = gaussian_resnet20(k)
# Identifier reused as the name for saved checkpoints, CSV logs and
# TensorBoard runs.
model_code = 'gaussian_resnet20(k={})'.format(k)

In [None]:
# Filesystem layout for this experiment.
# The dataset location can be overridden through the CIFAR10_DIR environment
# variable, so the notebook is not tied to one machine; when the variable is
# unset the original location is used.
data_path = Path(os.environ.get('CIFAR10_DIR', '/root/data/cifar10'))
logs_path = Path('logs')  # relative to project directory
# Sub-directories for run artifacts: tb_dir is joined onto logs_path when the
# TensorBoard writer is configured; the other two are handed to the Learner
# (model_dir) and to CSVLogger (filename prefix).
model_saves_dir = Path('model_saves')
csv_logs_dir = Path('csv_logs')
tb_dir = Path('tensorboard')

In [None]:
# Switches for the optional callbacks attached to the Learner.
save_model = True  # register SaveModelCallback
log = True         # register CSVLogger
write = True       # register LearnerTensorboardWriter

In [None]:
# --- learning-rate schedule ---
max_lr = 0.1   # learning rate passed to learn.fit
min_lr = 1e-4  # min_lr handed to ReduceLROnPlateauCallback
epochs = 150   # number of epochs requested from learn.fit

# --- SGD optimizer ---
momentum = 0.9
weight_decay = 1e-4
nesterov = False

# --- data loading ---
# Batch size as used in resnet paper. Takes 1.5 MB of RAM, so not an issue.
bs = 128
# Optimal for the given machine. Sometimes gives an error if num_workers>0.
num_workers = 4
# No difference for the given machine.
pin_memory = False

In [None]:
# Use the GPU selected by `cuda` when CUDA is available, otherwise the CPU.
device_name = 'cuda:{}'.format(cuda) if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
# Point fastai's default device at the same target and move the model there.
defaults.device = device
# The trailing ';' suppresses the cell output.
model.to(device);

# Multi-GPU training (currently disabled):
# if torch.cuda.device_count() > 1:
#     print("Let's use", torch.cuda.device_count(), "GPUs!")
#     model = nn.DataParallel(model)

In [None]:
# Display fastai's default device as a sanity check of the assignment above.
defaults.device

In [None]:
# Options shared by every loader built in this cell.
loader_kwargs = dict(batch_size=bs, num_workers=num_workers, pin_memory=pin_memory)

# Train / validation loaders over the dataset in data_path
# (valid_size=0.1, augmentation on, fixed split seed).
train_loader, valid_loader = get_train_valid_loader(
    data_dir=data_path,
    valid_size=0.1,
    augment=True,
    random_seed=42,
    shuffle=True,
    show_sample=False,
    **loader_kwargs
)

# Test loader; no shuffling.
test_loader = get_test_loader(
    data_dir=data_path,
    shuffle=False,
    **loader_kwargs
)

# Number of batches per training epoch; used to space out TensorBoard writes.
train_epoch_len = len(train_loader)

In [None]:
# Callback factories that are always attached to the Learner.
callback_fns = [
    # LR reduction driven by validation loss (factor 0.1, patience 10, floor min_lr).
    partial(ReduceLROnPlateauCallback, monitor='valid_loss', mode='auto',
            patience=10, factor=0.1, min_delta=0, min_lr=min_lr),
    # Early stopping driven by validation loss (patience 20).
    partial(EarlyStoppingCallback, monitor='valid_loss', min_delta=0, patience=20),
    # Additionally track train accuracy.
    partial(MetricTracker, func=accuracy, train=True, name='train_accu'),
]

# Optional callbacks, controlled by the save_model / log / write switches.
if save_model:
    callback_fns.append(
        partial(SaveModelCallback, every='improvement', monitor='accuracy',
                mode='max', name=model_code))

if log:
    callback_fns.append(
        partial(CSVLogger, append=False, filename=csv_logs_dir/model_code))

if write:
    # Stats and histograms are both written every 10 epochs' worth of iterations.
    callback_fns.append(
        partial(LearnerTensorboardWriter, base_dir=logs_path/tb_dir, name=model_code,
                stats_iters=10*train_epoch_len, hist_iters=10*train_epoch_len))

In [None]:
# Wrap the pre-built loaders in a fastai data bunch.
bunch = ImageDataBunch(
    train_loader,
    valid_loader,
    test_dl=test_loader,
    device=device,
    path=data_path,
)

# Optimizer factory; the learning rate is set by fit.
sgd = partial(
    torch.optim.SGD,
    momentum=momentum,
    weight_decay=weight_decay,
    nesterov=nesterov,
)

# Learner tying together data, model, loss, optimizer, metric and callbacks.
# Artifacts are written under logs_path (checkpoints use model_saves_dir).
learn = Learner(
    bunch,
    model,
    loss_func=nn.CrossEntropyLoss(),
    opt_func=sgd,
    true_wd=False,
    wd=weight_decay,
    metrics=[accuracy],
    callback_fns=callback_fns,
    path=logs_path,
    model_dir=model_saves_dir,
)

In [None]:
# Train for up to `epochs` epochs at learning rate max_lr with the configured
# weight decay; the registered EarlyStoppingCallback may end the run sooner.
learn.fit(epochs, lr=max_lr, wd=weight_decay)

In [None]:
# Collect training statistics recorded by the callbacks.
# SaveModelCallback is only registered when save_model is True, so fail with
# an explicit message instead of an opaque AttributeError when it is absent.
save_cb = getattr(learn, 'save_model_callback', None)
if save_cb is None:
    raise RuntimeError(
        "learn.save_model_callback is missing: SaveModelCallback is only registered "
        "when save_model=True, and the summary cells need its best-epoch statistics.")

best_epoch, best_value = save_cb.best_epoch, save_cb.best
time_to_best_epoch = save_cb.time_to_best_epoch
# Record kept by the LR-reduction callback; its keys are written to the summary.
changed_lr_on_epochs = learn.reduce_lr_on_plateau_callback.changed_lr_on_epochs

print("Best model was found at epoch {} with accuracy value {:.4f} in {:.2f} seconds.".format(best_epoch, best_value, time_to_best_epoch))

In [None]:
# Evaluate the model on each split. Every call is unpacked into
# (loss, accuracy), accuracy being the Learner's only metric.
eval_data = learn.data
loss_train, accu_train = learn.validate(dl=eval_data.train_dl)
loss_valid, accu_valid = learn.validate(dl=eval_data.valid_dl)
loss_test, accu_test = learn.validate(dl=eval_data.test_dl)
# Optional conversion to plain Python scalars (currently disabled;
# TODO confirm the return types before enabling):
# accu_train, accu_valid, accu_test = accu_train.item(), accu_valid.item(), accu_test.item()

In [None]:
# Size of the model as reported by the project helper; n_params goes into the
# summary row built below (n_layers is not used in the cells shown here).
n_params, n_layers = num_params(model)

In [None]:
# One summary row for this run.
# 'name' reuses model_code so the row always carries the same identifier as the
# saved checkpoint, CSV log and TensorBoard run, instead of rebuilding the
# string from the constructor's __name__.
val_dict = {
    'name': model_code,
    'accu_test': accu_test * 100,
    'n_params': n_params,
    'epochs': best_epoch + 1,  # best_epoch + 1, presumably converting a 0-based index to a count — TODO confirm
    'time': time_to_best_epoch,
    'changed_lr_on': ','.join(map(format_scientific, changed_lr_on_epochs.keys())),
    'loss_train': loss_train,
    'loss_valid': loss_valid,
    'loss_test': loss_test,
    'accu_train': accu_train * 100,
    'accu_valid': accu_valid * 100,
    # NOTE(review): duplicates 'accu_test'; kept so the summary columns stay unchanged.
    'accu_test (again)': accu_test * 100,
    'other': '',
}
val_dict

In [None]:
# Store this run's summary row in the shared summary CSV under logs_path
# (whether the helper appends or overwrites is not visible here — TODO confirm).
save_summary(logs_path/'models_summary.csv', val_dict)

In [None]:
# Read the summary CSV back and display it as the cell output.
read_summary(logs_path/'models_summary.csv')