
Experiments Management with Tensorboard
=====================

실제 딥러닝 연구나 개발을 하게 되면 꼭 마주치게 되는 것이 바로 수십개의 실험 결과를 관리하고 하이퍼파라미터에 따른 성능변화를 모니터링하고, 해당 실험 결과를 다시 재현하는 것입니다.  

본 실습에서는 Tensorboard와 Hparam 기능을 활용해서 Cifar10 데이터셋에 대한 다수의 실험 결과를 어떻게 관리하고 성능을 모니터링하는지 알아봅니다. 

Cifar 10 튜토리얼 코드는 [이곳](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)을 참조하였습니다.

## 1. Dataset과 DataLoader 준비하기  

Cifar 10 데이터셋을 준비합니다. 

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms

# Convert images to tensors, then shift/scale every RGB channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# Seed the global RNG before drawing the random subsets below; without this
# the train/val/test subsets differ on every run and results from separate
# sessions cannot be compared or reproduced.
torch.manual_seed(123)

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

# Keep only 5,000 of the training images (the remaining 45,000 are discarded),
# then divide them into 4,000 training and 1,000 validation samples.
trainset, _ = torch.utils.data.random_split(trainset, [5000, 45000])
trainset, valset = torch.utils.data.random_split(trainset, [4000, 1000])

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
# Evaluate on a 2,000-image subset of the test split (8,000 are discarded).
testset, _ = torch.utils.data.random_split(testset, [2000, 8000])


# Human-readable class names, indexed by label id.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


## 2. Model Construction

간단한 CNN 아키텍쳐를 구현해봅니다.  
두 개의 `Conv` 모듈과 `MaxPool` 모듈을 통과한 후 `FC` 레이어를 통해 10개의 클래스를 분류합니다. 이때 `Dropout` 모듈의 드롭 확률(`dp_rate`, 각 유닛을 0으로 만들 확률)을 외부 변수로 입력받습니다.

In [18]:
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter


class Net(nn.Module):
    """Small CNN classifier: two conv/pool stages followed by three FC layers.

    Args:
        args: Configuration object; only ``args.dp_rate`` is read here and is
            used as the drop probability ``p`` of the dropout layer.
    """

    def __init__(self, args):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.dropout = nn.Dropout(p=args.dp_rate)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the 10 unnormalized class scores for a batch of images.

        The flatten step expects 16 * 5 * 5 features per sample, which is what
        the two conv/pool stages produce for 3x32x32 inputs.
        """
        # Use the module's own ReLU everywhere; the functional alias ``F`` is
        # never imported in this file, so ``F.relu`` raised a NameError.
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        # Flatten the feature maps into one vector per sample.
        x = x.view(-1, 16 * 5 * 5)
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        # No activation on the last layer: the scores are returned as-is.
        x = self.fc3(x)
        return x

## 3. Train, Validation, Test

Data, Model, Loss, Optimization을 모두 같이 사용하여 봅시다. Epoch 별로 train과 validation, test가 이루어질 수 있게 함수를 나누었습니다. 이 때 train_loss, val_loss, accuracy가 기록되도록 합니다.

In [21]:
def train(model, dataloader, optimizer, criterion):
    """Train ``model`` for one pass over ``dataloader``.

    Args:
        model: Network to optimize; it must own at least one parameter, since
            the target device is taken from its first parameter.
        dataloader: Sized iterable yielding ``(images, labels)`` batches.
        optimizer: Optimizer that updates ``model``'s parameters.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            scalar loss tensor.

    Returns:
        A ``(model, epoch_train_loss)`` tuple, where ``epoch_train_loss`` is
        the per-batch loss averaged over ``len(dataloader)`` batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` has no batches.
    """
    # Move batches to wherever the model lives rather than reading a global
    # configuration object, so the function works on its own.
    device = next(model.parameters()).device

    # Training mode (e.g. active dropout) only has to be enabled once.
    model.train()

    epoch_train_loss = 0
    for images, y in dataloader:
        images, y = images.to(device), y.to(device)

        optimizer.zero_grad()
        pred_y = model(images)
        train_loss = criterion(pred_y, y)
        epoch_train_loss += train_loss.item()

        train_loss.backward()
        optimizer.step()

    epoch_train_loss /= len(dataloader)
    return model, epoch_train_loss


def validate(model, dataloader, criterion):
    """Compute the mean per-batch loss of ``model`` over ``dataloader``.

    Runs in evaluation mode with gradient tracking disabled; no parameters
    are updated.

    Args:
        model: Network to evaluate; it must own at least one parameter, since
            the target device is taken from its first parameter.
        dataloader: Sized iterable yielding ``(images, labels)`` batches.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            scalar loss tensor.

    Returns:
        The per-batch loss averaged over ``len(dataloader)`` batches.

    Raises:
        ZeroDivisionError: If ``dataloader`` has no batches.
    """
    # Move batches to wherever the model lives rather than reading a global
    # configuration object, so the function works on its own.
    device = next(model.parameters()).device

    # Evaluation mode (e.g. dropout disabled) only has to be set once.
    model.eval()

    epoch_val_loss = 0
    with torch.no_grad():
        for images, y in dataloader:
            images, y = images.to(device), y.to(device)

            pred_y = model(images)
            val_loss = criterion(pred_y, y)
            epoch_val_loss += val_loss.item()

    epoch_val_loss /= len(dataloader)
    return epoch_val_loss


def test(model, dataloader):
    """Return the classification accuracy (in percent) of ``model``.

    Args:
        model: Network to evaluate; it must own at least one parameter, since
            the target device is taken from its first parameter.
        dataloader: Iterable yielding ``(images, labels)`` batches.

    Returns:
        ``100 * correct / total`` as a float, where a prediction is the index
        of the largest output score along dimension 1.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no samples.
    """
    # Move batches to wherever the model lives rather than reading a global
    # configuration object, so the function works on its own.
    device = next(model.parameters()).device

    # Switch to evaluation mode explicitly: without this, dropout stays active
    # whenever the caller last put the model in training mode, which makes the
    # reported accuracy noisy and pessimistic.
    model.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy


# Despite its name this is an evaluation helper, not a test case; keep test
# runners such as pytest from trying to collect it.
test.__test__ = False


def experiment(partition, args):
    """Train and evaluate one model configuration, logging to TensorBoard.

    Args:
        partition: Mapping with ``'train'``, ``'val'`` and ``'test'`` entries,
            each a dataloader passed to ``train``/``validate``/``test``.
        args: Configuration namespace. Reads ``device``, ``optim`` (one of
            ``'ADAM'``, ``'RMSProp'``, ``'SGD'``), ``lr``, ``l2_coef`` and
            ``epoch`` here, and is also handed to ``Net``. ``best_acc`` and
            ``best_epoch`` are written back onto it.

    Returns:
        A ``(model, args)`` tuple: the trained model and the same ``args``
        object, now carrying ``best_acc`` and ``best_epoch``.

    Raises:
        ValueError: If ``args.optim`` is not a supported optimizer name.
    """
    # Reject a bad optimizer name up front, before any model is built or any
    # log directory is created. A real exception is used because ``assert``
    # statements are stripped when Python runs with ``-O``.
    optimizer_classes = {
        'ADAM': optim.Adam,
        'RMSProp': optim.RMSprop,
        'SGD': optim.SGD,
    }
    if args.optim not in optimizer_classes:
        raise ValueError("Undefined Optimizer Type: {!r}".format(args.optim))

    # Fixed seeds so weight initialization and shuffling repeat across runs.
    seed = 123
    np.random.seed(seed)
    torch.manual_seed(seed)

    model = Net(args)
    model.to(args.device)
    criterion = nn.CrossEntropyLoss()

    # Initialize Optimizer
    trainable_parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = optimizer_classes[args.optim](
        trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)

    # Reset both "best" fields so that nothing left over from a previous call
    # on the same namespace leaks into this run; -1 means "no epoch improved
    # on the initial accuracy of 0".
    args.best_acc = 0
    args.best_epoch = -1

    writer = SummaryWriter()
    try:
        for epoch in range(args.epoch):
            model, train_loss = train(model, partition['train'], optimizer, criterion)
            val_loss = validate(model, partition['val'], criterion)
            accuracy = test(model, partition['test'])

            if accuracy > args.best_acc:
                args.best_acc = accuracy
                args.best_epoch = epoch

            writer.add_scalar('Loss/train', train_loss, epoch)
            writer.add_scalar('Loss/val', val_loss, epoch)
            writer.add_scalar('Metric/acc', accuracy, epoch)

        writer.add_hparams(
            hparam_dict=vars(args),
            metric_dict={'best_acc': args.best_acc, 'best_epoch': args.best_epoch}
        )
    finally:
        # Flush pending events and release the event file even if a step fails.
        writer.close()

    return model, args

In [22]:
import argparse
import time 
import numpy as np


# An argument-less parse yields an empty Namespace that is then used as a
# plain attribute container for the configuration.
parser = argparse.ArgumentParser()
args = parser.parse_args([])


# ==== Model Architecture Config ==== #
args.dp_rate = 0.3


# ==== Optimizer Config ==== #
args.lr = 0.00005  # baseline value; each experiment below overrides it
args.l2_coef = 0.0001
args.optim = 'ADAM'


# ==== Training Config ==== #
args.epoch = 10
args.batch_size = 256
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
args.exp_name = 'exp1_lr_stage'


# ==== DataLoader Preparation ==== #
trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size,
                                          shuffle=True)
valloader = torch.utils.data.DataLoader(valset, batch_size=args.batch_size,
                                        shuffle=False)
testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size*2,
                                         shuffle=False)
partition = {'train': trainloader, 'val': valloader, 'test': testloader}


# ==== Experiment ==== #
list_lr = [0.001, 0.005]

for cnt_exp, lr in enumerate(list_lr, start=1):
    # Give every run its own copy of the base configuration: the experiment
    # writes its results onto the namespace it receives, so sharing a single
    # object would let one run's values bleed into the next.
    exp_args = argparse.Namespace(**vars(args))
    exp_args.lr = lr

    model, result = experiment(partition, exp_args)

    print('[Exp {:2}] got acc: {:2.3f}, at epoch {:2}'.format(cnt_exp, result.best_acc, result.best_epoch))

[Exp  1] got acc: 36.850, at epoch  7
[Exp  2] got acc: 43.400, at epoch  9
