In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import time
from copy import deepcopy
import argparse
import sys
import traceback
import json
from tqdm import tqdm_notebook, tqdm

import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_absolute_error

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib.font_manager import FontProperties
from utils import *


parser = argparse.ArgumentParser()

seed = 123
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7fae98028110>

# 1. Prepare Dataset

In [4]:
partition = make_partition(50000, 0.1, 0.1, 123)

# 2. Model Architecture

Create char_to_ix Ref: https://github.com/pytorch/tutorials/blob/master/beginner_source/nlp/word_embeddings_tutorial.py  
Pre-defined Embedding Layer Ref: https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76  
ResNet Variation Ref: https://towardsdatascience.com/an-overview-of-resnet-and-its-variants-5281e2f56035

# 3. Train, Validate, Evaluate

In [90]:
def train(model, partition, optimizer, criterion, args, **kwargs):
    data_iter = DataLoader(
        partition['train'],
        batch_size=args.batch_size,
        shuffle=args.shuffle
    )
    
    epoch_train_loss = 0
    list_train_loss = list()
    cnt_iter = 0
    for batch_idx, batch in enumerate(data_iter):
        X, A, y = batch
        X, A, y = X.to(args.device).long(), A.to(args.device).long(), y.to(args.device).float()
    
        model.train()
        optimizer.zero_grad()

        pred_y = model(X, A)
#         pred_y.require_grad = False
        train_loss = criterion(pred_y, y)
        epoch_train_loss += train_loss.item()
        list_train_loss.append({'epoch':batch_idx/len(data_iter)+kwargs['epoch'], 'train_loss':train_loss.item()})
        train_loss.backward()
        optimizer.step()
        
        cnt_iter += 1
        args.bar.update(len(X))
    return model, list_train_loss

def validate(model, partition, criterion, args):
    data_iter = DataLoader(
        partition['val'],
        batch_size=args.test_batch_size,
        shuffle=args.shuffle
    )
    
    epoch_val_loss = 0
    cnt_iter = 0
    for batch_idx, batch in enumerate(data_iter):
        X, A, y = batch
        X, A, y = X.to(args.device).long(), A.to(args.device).long(), y.to(args.device).float()
    
        model.eval()
        pred_y = model(X, A)
        pred_y.require_grad = False
        val_loss = criterion(pred_y, y)
        epoch_val_loss += val_loss.item()
        cnt_iter += 1

    return epoch_val_loss/cnt_iter

def test(model, partition, args, **kwargs):
    data_iter = DataLoader(
        partition['test'],
        batch_size=args.test_batch_size,
        shuffle=False
    )
    
    list_y, list_pred_y = list(), list()
    for batch_idx, batch in enumerate(data_iter):
        X, A, y = batch
        X, A, y = X.to(args.device).long(), A.to(args.device).long(), y.to(args.device).float()
    
        model.eval()
        pred_y = model(X, A)
        list_y += y.cpu().detach().numpy().tolist()
        list_pred_y += pred_y.cpu().detach().numpy().tolist()
        args.bar.update(len(X))

    mae = mean_absolute_error(list_y, list_pred_y)
    std = np.std(np.array(list_y)-np.array(list_pred_y))
    return mae, std, list_y, list_pred_y

def experiment(partition, args):
    ts = time.time()
    args.vocab_size = 40
    args.max_len = 50
    args.input_shape = (args.max_len, args.vocab_size)
    model = Net(args)
    model.to(args.device)
    criterion = nn.MSELoss()
    
    # Initialize Optimizer
    trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
    if args.optim == 'ADAM':
        optimizer = optim.Adam(trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)
    elif args.optim == 'RMSProp':
        optimizer = optim.RMSprop(trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)
    elif args.optim == 'SGD':
        optimizer = optim.SGD(trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)
    else:
        assert False, "Undefined Optimizer Type"
        
    # Train, Validate, Evaluate
    list_train_loss = list()
    list_val_loss = list()
    list_mae = list()
    list_std = list()
    
    args.best_mae = 10000
    for epoch in range(args.epoch):
        model, train_losses = train(model, partition, optimizer, criterion, args, **{'epoch':epoch})
        val_loss = validate(model, partition, criterion, args)
        mae, std, true_y, pred_y = test(model, partition, args, **{'epoch':epoch})

        list_train_loss += train_losses
        list_val_loss.append({'epoch':epoch, 'val_loss':val_loss})
        list_mae.append({'epoch':epoch, 'mae':mae})
        list_std.append({'epoch':epoch, 'std':std})
        
        if args.best_mae > mae or epoch==0:
            args.best_epoch = epoch
            args.best_mae = mae
            args.best_std = std
            args.best_true_y = true_y
            args.best_pred_y = pred_y
    
    te = time.time()
    
    # Logging Experiment Results
    args.elapsed = te-ts
    args.train_losses = list_train_loss
    args.val_losses = list_val_loss
    args.maes = list_mae
    args.stds = list_std
    return args

# Experiment.1  n_stage vs n_layer

In [91]:
class GConv(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GConv, self).__init__()
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, X, A):
        x = self.fc(X)
        x = torch.matmul(A, x)
        return x

In [92]:
class ResBlock(nn.Module):
    def __init__(self, in_dim, out_dim, use_bn, dp_rate, sc_type):
        super(ResBlock, self).__init__()   
        self.use_bn = use_bn
        self.sc_type = sc_type
        self.gconv = nn.GConv(in_dim, out_dim)
        self.bn1 = nn.BatchNorm1d(out_dim)
        self.bn2 = nn.BatchNorm1d(out_dim)

        self.dropout = nn.Dropout2d(dp_rate)
        self.relu = nn.ReLU()

        if self.sc_type != 'no':
            self.shortcut = nn.Sequential()
            if in_filter != out_filter:
                self.shortcut.add_module(
                    'shortcut', nn.Linear(in_dim, out_dim, bias=False)
                )
                
        if self.sc_type == 'gsc':
            self.g_fc1 = nn.Linear(in_dim, out_dim, bias=True)
            self.g_fc2 = nn.Linear(out_dim, out_dim, bias=True)
            self.sigmoid = nn.Sigmoid()

    def forward(self, _x):
        if self.sc_type == 'no': #no skip-connection
            x = self.relu(self.bn1(self.gconv(_x))) if self.use_bn else self.relu(self.gconv(_x))
            return self.dropout(x)
        
        elif self.sc_type == 'sc': # basic skip-connection
            x = self.relu(self.bn1(self.gconv(_x))) if self.use_bn else self.relu(self.gconv(_x))
            x = x + self.shortcut(_x)
            return self.dropout(self.relu(self.bn2(x)) if self.use_bn else self.dropout(self.relu(x)))
        
        elif self.block_type == 'gsc': # gated skip-connection
            x = self.relu(self.bn1(self.gconv(_x))) if self.use_bn else self.relu(self.convg(_x))
            x1, x2 = self.g_fc1(_x), self.g_fc2(x)
            gate_coef = self.sigmoid(x1+x2)
            
            x = self.relu(self.bn2(self.conv2(x))) if self.use_bn else self.relu(self.conv2(x))
            return self.dropout(x + self.shortcut(_x))
        
        elif self.block_type == 'd': # ReLU-only pre-activation
            x = self.bn1(self.conv1(self.relu(_x))) if self.use_bn else self.conv1(self.relu(_x))
            x = self.bn2(self.conv2(self.relu(x))) if self.use_bn else self.conv2(self.relu(x))
            return self.dropout(x + self.shortcut(_x))
        
        elif self.block_type == 'e': # full pre-activation
            x = self.conv1(self.relu(self.bn1(_x))) if self.use_bn else self.conv1(self.relu(_x))
            x = self.conv2(self.relu(self.bn2(x))) if self.use_bn else self.conv2(self.relu(x))
            return self.dropout(x + self.shortcut(_x))
             
            

class Net(nn.Module):
    def __init__(self, args):
        super(Net, self).__init__()   
        
        # Create Atom Element embedding layer
        self.embedding = self.create_emb_layer(args.vocab_size, args.emb_train)
        self.gconv1 = GConv(58, args.hidden_dim1)
        self.gconv2 = GConv(args.hidden_dim1, args.hidden_dim1)
        
        # Create Residual Convolution layer
        list_gconvs = list()
        """
        hidden_dim = args.start_hidden_dim
        for i in range(args.n_stage):
            if i==0:
                list_gconvs.append(ResBl(n_channel, n_channel*args.start_channel, args.stride, args.use_bn, args.dp_rate, args.block_type))
                n_channel *= args.start_channel
            else:
                list_res_blocks.append(ResBlock(n_channel, n_channel*2, args.stride, args.use_bn, args.dp_rate, args.block_type))
                n_channel *= 2
            for j in range(args.n_layer-1):
                list_res_blocks.append(ResBlock(n_channel, n_channel, 1, args.use_bn, args.dp_rate, args.block_type))
        self.gconvs = nn.Sequential(*list_res_blocks)
        
        # Create MLP layers
#         fc_shape = self._estimate_fc_shape((1, args.max_len,))
        """
        self.fc1 = nn.Linear(args.hidden_dim1*args.max_len, args.hidden_dim2)
        self.fc2 = nn.Linear(args.hidden_dim2, args.hidden_dim2//2)
        self.fc3 = nn.Linear(args.hidden_dim2//2, 1)
        self.relu = nn.ReLU()
    

    def forward(self, x, A):
        A = A.float()
        x = self._embed(x)
        x = self.gconv1(x, A)
        x = self.gconv2(x, A)

#         x = self._gconv_forward(x)
        x = x.view(x.shape[0], -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return torch.squeeze(x)
        
    
    def _embed(self, x):
        embed_x = self.embedding(x[:,:,1])
        x = torch.cat((embed_x, x[:,:,1:].float()), 2)
        return x 
        
    
    def _gconv_forward(self, x):
        embeds = self.embedding(x)
        embeds = embeds.view(embeds.shape[0], 1, embeds.shape[1], embeds.shape[2])
        x = self.gconvs(embeds)
        return x
    
    def _estimate_fc_shape(self, input_shape):
        dummy_input = torch.zeros(input_shape).long()
        dummy_output = self._gconv_forward(dummy_input)
        fc_shape = dummy_output.view(dummy_output.shape[0], -1).shape
        return fc_shape
        

    def create_emb_layer(self, vocab_size, emb_train):
        emb_layer = nn.Embedding(vocab_size, vocab_size)
        weight_matrix = torch.zeros((vocab_size, vocab_size))
        for i in range(vocab_size):
            weight_matrix[i][i] = 1
        emb_layer.load_state_dict({'weight': weight_matrix})

        if not emb_train:
            emb_layer.weight.requires_grad = False
        return emb_layer

In [93]:
a = torch.zeros((256, 50, 19))
b = a[:,:,1]
print(b.shape)

torch.Size([256, 50])


In [99]:
exp_name = 'exp1_layer_stage'
args = parser.parse_args("")
args.exp_name = exp_name
args.n_layer = 2
args.n_stage = 1
args.hidden_dim1 = 64
args.hidden_dim2 = 256
args.lr = 0.00005
args.l2_coef = 0.0001
args.optim = 'ADAM'
args.epoch = 20
args.batch_size= 256
args.test_batch_size= 256
args.emb_train = False
args.start_channel = 8
args.stride = 2
args.use_bn = True
args.dp_rate = 0.3
args.block_type = 'a'
args.max_len = 50
args.shuffle = True
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = Writer(prior_keyword=['n_layer', 'n_stage', 'block_type', 'use_bn', 'dp_rate', 'emb_train', 'epoch', 'batch_size'])

n_iter = args.epoch * (len(partition['train']) + len(partition['test']))

args.bar = tqdm_notebook(total=n_iter, file=sys.stdout, position=0)

result = experiment(partition, args)
"""
# Define Hyperparameter Search Space
list_n_layer = [1,2,3,4]
list_n_stage = [1,2,3,4]

# Initialize num iteration, num experiment, progress bar
n_iter = args.epoch * (len(partition['train']) + len(partition['test']))
n_exp = len(list_n_layer)*len(list_n_stage)
cnt_exp = 0
bar = tqdm_notebook(total=n_exp*n_iter, file=sys.stdout, position=0)
bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))

# writer.clear(exp_name)
for n_layer in list_n_layer:
    for n_stage in list_n_stage:
        # Update hyperparameter
        args.n_layer = n_layer
        args.n_stage = n_stage
        args.bar = bar
        result = experiment(partition, args)
        writer.write(result)
        torch.cuda.empty_cache()

        cnt_exp += 1
        bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))
        print('[Exp {:2}] got mae: {:2.3f}, std: {:2.3f} at epoch {:2}'.format(cnt_exp, result.best_mae, result.best_std, result.epoch))
"""

HBox(children=(IntProgress(value=0, max=910000), HTML(value='')))

"\n# Define Hyperparameter Search Space\nlist_n_layer = [1,2,3,4]\nlist_n_stage = [1,2,3,4]\n\n# Initialize num iteration, num experiment, progress bar\nn_iter = args.epoch * (len(partition['train']) + len(partition['test']))\nn_exp = len(list_n_layer)*len(list_n_stage)\ncnt_exp = 0\nbar = tqdm_notebook(total=n_exp*n_iter, file=sys.stdout, position=0)\nbar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))\n\n# writer.clear(exp_name)\nfor n_layer in list_n_layer:\n    for n_stage in list_n_stage:\n        # Update hyperparameter\n        args.n_layer = n_layer\n        args.n_stage = n_stage\n        args.bar = bar\n        result = experiment(partition, args)\n        writer.write(result)\n        torch.cuda.empty_cache()\n\n        cnt_exp += 1\n        bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))\n        print('[Exp {:2}] got mae: {:2.3f}, std: {:2.3f} at epoch {:2}'.format(cnt_exp, result.best_mae, result.best_std, result.epoch))\n"

In [100]:
print(result.best_mae)

0.7916863008257992


In [None]:
results = writer.read(exp_name='exp1_layer_stage')
results = results.loc[results['epoch']==50]
variable1 = 'n_stage'
variable2 = 'n_layer'


plot_performance(results, variable1, variable2,
                'Performance depends on {} vs {}'.format(variable1, variable2),
                'exp1_Performance {} vs {}'.format(variable1, variable2))

plot_distribution(results, variable1, variable2, 'true_y', 'pred_y', 
                  'Prediction results depends on {} vs {}'.format(variable1, variable2),
                  'exp1_Prediction {} vs {}'.format(variable1, variable2))

plot_loss(results, variable1, variable2, 'epoch', 'loss', 
                  'Loss depends on {} vs {}'.format(variable1, variable2),
                  'exp1_Loss {} vs {}'.format(variable1, variable2))

plt.show()


# Experiment.1 n_stage vs n_layer  

In this experiment, regression performance was measured by 'mae' metric among the variation of the number of the residual block(n_stage) and the number of the layer per block(n_layer).  

**Variable Domain**  
- n_stage = [1, 2, 3, 4]  
- n_layer = [1, 2, 3, 4]

# Results and Discussion

1. The first figure shows the mae and std variation depends on the number of residual block and number of convolution layer in the residual block. Also, it notify the experiment settings.  
2. The second figure shows the distribution between ground truth y and predicted y with y=x line(dashed).  
3. The last figure shows the train loss, validation loss(left y-axis) and mae score (right y-axis).  

**Notable Results**  
- As the number of the residual block increased, overall performance was increased.  
- As the number of the layer in a residual block increased, overall performance was decreased.  
- Therefore, 4 block with 1 layer per block achieved highest performance. 

**Discussion**  
- I expected that 4 block with 4 layer per block would outperform other models, however it was not. This results should be reconsidered since the loss chart of 4 block with 4 layer per block shows that the validation loss is still lower than the train loss. Therefore, longer training should be conducted. 
- As expected, the models with less residual block shows poor prediction performance. 

# Experiment.2 Block Type vs Batch Normalization

In [None]:
exp_name = 'exp2_block_type_batch_norm'
args = parser.parse_args("")
args.exp_name = exp_name
args.n_layer = 1
args.n_stage = 4
args.lr = 0.00005
args.l2_coef = 0.0001
args.optim = 'ADAM'
args.epoch = 50
args.batch_size= 256
args.test_batch_size= 256
args.emb_train = False
args.start_channel = 8
args.stride = 2
args.dp_rate = 0.3
args.max_len= 120
args.shuffle = True
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = Writer(prior_keyword=['n_layer', 'n_stage','block_type', 'use_bn', 'dp_rate', 'emb_train', 'epoch', 'batch_size'])
partition = list_partition[0]

# Define Hyperparameter Search Space
list_use_bn = [True, False]
list_block_type = ['a', 'b', 'c', 'd']


# Initialize num iteration, num experiment, progress bar
n_iter = args.epoch * (len(partition['train']) + len(partition['test']))
n_exp = len(list_use_bn)*len(list_block_type)
cnt_exp = 0
bar = tqdm_notebook(total=n_exp*n_iter, file=sys.stdout, position=0)
bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))

# writer.clear(exp_name)
for use_bn in list_use_bn:
    for block_type in list_block_type:
        # Update hyperparameter
        args.use_bn = use_bn
        args.block_type = block_type
        args.bar = bar
        result = experiment(partition, args)
        writer.write(result)
        torch.cuda.empty_cache()

        cnt_exp += 1
        bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))
        print('[Exp {:2}] got mae: {:2.3f}, std: {:2.3f} at epoch {:2}'.format(cnt_exp, result.best_mae, result.best_std, result.epoch))


In [None]:
results = writer.read(exp_name='exp2_block_type_batch_norm')
results = results.loc[results['epoch']==50]

variable1 = 'block_type'
variable2 = 'use_bn'


plot_performance(results, variable1, variable2,
                'Performance depends on {} vs {}'.format(variable1, variable2),
                'exp1_Performance {} vs {}'.format(variable1, variable2))

plot_distribution(results, variable1, variable2, 'true_y', 'pred_y', 
                  'Prediction results depends on {} vs {}'.format(variable1, variable2),
                  'exp1_Prediction {} vs {}'.format(variable1, variable2), top=0.9)

plot_loss(results, variable1, variable2, 'epoch', 'loss', 
                  'Loss depends on {} vs {}'.format(variable1, variable2),
                  'exp1_Loss {} vs {}'.format(variable1, variable2), top=0.9)

plt.show()


# Experiment.2 Block Type vs Batch Normalization

In this experiment, regression performance was measured by 'mae' metric among the variation of the type of the residual block(block_type) and the usage of the batch normalization layer in each residual block(use_bn).     

**Variable Domain**  
- block_type = ['a':'original, 'b':BN after addition, 'c':ReLU before addition, 'd':ReLU-only pre-activation]  
- use_bn = [True, False]

# Results and Discussion

1. The first figure shows the mae and std variation depends on the residual block type and the usage of the batch normalization layer. Also, it notify the experiment settings.  
2. The second figure shows the distribution between ground truth y and predicted y with y=x line(dashed).  
3. The last figure shows the train loss, validation loss(left y-axis) and mae score (right y-axis).  

**Notable Results**  
- Among 4 types of residual block, block A and C types are outperformed.
- When the batch normalization layer is used, the mae was almost reduced as half. 
- Also, when the batch normalization is applied, the mae values recorded less variation among different block types.

**Discussion**  
- The batch normalization layer boosts up the performance more than I expected. (very powerful).  
- Different residual block has less significant impact on the results.  

# Experiment.3 Trainable Embedding vs Start Channel

In [None]:
exp_name = 'exp3_emb_train_start_channel'
args = parser.parse_args("")
args.exp_name = exp_name
args.n_layer = 1
args.n_stage = 4
args.lr = 0.00005
args.l2_coef = 0.0001
args.optim = 'ADAM'
args.epoch = 50
args.batch_size= 256
args.test_batch_size= 256
args.start_channel = 8
args.stride = 2
args.dp_rate = 0.3
args.max_len= 120
args.shuffle = True
args.use_bn = True
args.block_type = 'a'
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = Writer(prior_keyword=['n_layer', 'n_stage','block_type', 'use_bn', 'dp_rate', 'emb_train', 'epoch', 'start_channel'])
partition = list_partition[0]

# Define Hyperparameter Search Space
list_emb_train = [True, False]
list_start_channel = [4,8,16,32]


# Initialize num iteration, num experiment, progress bar
n_iter = args.epoch * (len(partition['train']) + len(partition['test']))
n_exp = len(list_emb_train)*len(list_start_channel)
cnt_exp = 0
bar = tqdm_notebook(total=n_exp*n_iter, file=sys.stdout, position=0)
bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))

# writer.clear(exp_name)
for emb_train in list_emb_train:
    for start_channel in list_start_channel:
        # Update hyperparameter
        args.emb_train = emb_train
        args.start_channel = start_channel
        args.bar = bar
        
        ts = time.time()
        result = experiment(partition, args)
        writer.write(result)
        torch.cuda.empty_cache()
        te = time.time()
        
        cnt_exp += 1
        bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))
        print('[Exp {:2}] got mae: {:2.3f}, std: {:2.3f} at epoch {:2} Took {:3.1f}'.format(cnt_exp, result.best_mae, result.best_std, result.epoch, te-ts))


In [None]:
results = writer.read(exp_name='exp3_emb_train_start_channel')
# results = results.loc[results['epoch']==50]

variable1 = 'start_channel'
variable2 = 'emb_train'


plot_performance(results, variable1, variable2,
                'Performance depends on {} vs {}'.format(variable1, variable2),
                'exp1_Performance {} vs {}'.format(variable1, variable2))

plot_distribution(results, variable1, variable2, 'true_y', 'pred_y', 
                  'Prediction results depends on {} vs {}'.format(variable1, variable2),
                  'exp1_Prediction {} vs {}'.format(variable1, variable2), top=0.9)

plot_loss(results, variable1, variable2, 'epoch', 'loss', 
                  'Loss depends on {} vs {}'.format(variable1, variable2),
                  'exp1_Loss {} vs {}'.format(variable1, variable2), top=0.9)

plt.show()


# Experiment.3 Trainable Embedding vs Start Channel

In this experiment, regression performance was measured by 'mae' metric among the number of the starting convolution filters(start_channel) and the usage of trainable atom embedding(emb_train).  

**Variable Domain**  
- emb_train = [True, False]
- start_channel = [4,8,16,32,64]

# Results and Discussion

1. The first figure shows the mae and std variation depends on the number of convolution channel of the first residual block(start_channel) and the trainability of the atom embedding vector(emb_train). Also, it notify the experiment settings.  
2. The second figure shows the distribution between ground truth y and predicted y with y=x line(dashed).  
3. The last figure shows the train loss, validation loss(left y-axis) and mae score (right y-axis).  

**Notable Results**  
- As the number of the start channel increased, the performance drastically improved. 
- When the embedding vectors were trained, the performance was slightly increased. 

**Discussion**  
- Increasing the number of the convolution filter results in the increasing performance. However lots of computation power required.     
- Training original embedding vector is meaningful to increase the performance. 

In [None]:
exp_name = 'exp4_lr_l2'
args = parser.parse_args("")
args.exp_name = exp_name
args.n_layer = 1
args.n_stage = 4
args.emb_train = True
args.optim = 'ADAM'
args.epoch = 50
args.batch_size= 512
args.test_batch_size= 512
args.start_channel = 16
args.stride = 2
args.dp_rate = 0.3
args.max_len= 120
args.shuffle = True
args.use_bn = True
args.block_type = 'a'
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = Writer(prior_keyword=['n_layer', 'n_stage','block_type', 'use_bn', 'dp_rate', 'emb_train', 'epoch', 'start_channel'])
partition = list_partition[0]

# Define Hyperparameter Search Space
list_lr = [0.00005, 0.0005, 0.005, 0.05]
list_l2_coef = [0.0001, 0.001, 0.01, 0.1]

# Initialize num iteration, num experiment, progress bar
n_iter = args.epoch * (len(partition['train']) + len(partition['test']))
n_exp = len(list_lr)*len(list_l2_coef)
cnt_exp = 0
bar = tqdm_notebook(total=n_exp*n_iter, file=sys.stdout, position=0)
bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))

# writer.clear(exp_name)
for lr in list_lr:
    for l2_coef in list_l2_coef:
        # Update hyperparameter
        args.lr = lr
        args.l2_coef = l2_coef
        args.bar = bar
        
        ts = time.time()
        result = experiment(partition, args)
        writer.write(result)
        torch.cuda.empty_cache()
        te = time.time()
        
        cnt_exp += 1
        bar.set_description('P {:2}/{} Exp'.format(cnt_exp, n_exp))
        print('[Exp {:2}] got mae: {:2.3f}, std: {:2.3f} at epoch {:2} Took {:3.1f}sec'.format(cnt_exp, result.best_mae, result.best_std, result.epoch, te-ts))
