In [1]:
%load_ext autoreload
%autoreload 2

In [26]:
import time
from copy import deepcopy
import argparse
import sys
import traceback
import json
from tqdm import tqdm_notebook, tqdm
from pprint import pprint

import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from torch.autograd import Variable
import torch.optim as optim
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext.data import Field, RawField, NestedField, TabularDataset, BucketIterator
from sklearn.metrics import mean_absolute_error

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib.font_manager import FontProperties
from utils import *
from visual_tool import *

from tensorboardX import SummaryWriter



parser = argparse.ArgumentParser()

seed = 123
np.random.seed(seed)
torch.manual_seed(seed)


<torch._C.Generator at 0x7f829a2c4830>

# 1. Data Loading & Vocabulary Building

torchtext character embedding ref : http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/  
torchtext ref : http://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/  
NestedField ref : https://github.com/pytorch/text/blob/master/test/sequence_tagging.py  


In [3]:
SMILE = Field(sequential=True, tokenize=list, lower=False)
CHAR_SMILE = NestedField(SMILE) #, init_token="<bos>", eos_token="<eos>")
LOGP = RawField(preprocessing=float)
LENGTH = RawField(preprocessing=int)

In [4]:
datafields = [('smile', CHAR_SMILE),
              ('logp', LOGP),
              ('mr', None), 
              ('tpsa', None), 
              ('length', LENGTH)]

train_dataset, val_dataset = TabularDataset.splits(
    path="../Data/",
    train="train000000.csv", validation="val000000.csv",
    format='csv',
    skip_header=True,
    fields=datafields
)

CHAR_SMILE.build_vocab(train_dataset.smile, val_dataset.smile)

In [32]:
print("# === Integer to Char === #")
print(CHAR_SMILE.vocab.itos)
print("# === Char to Integer === #")
print(CHAR_SMILE.vocab.stoi)

# === Integer to Char === #
['<unk>', '<pad>', 'C', 'c', '(', ')', '1', 'O', '@', 'N', '2', '=', '[', ']', 'H', 'n', 'F', '3', 'S', 'l', '-', 's', 'B', 'r', '#', 'o', '4', '+', '/', '\\', 'I', '5', 'i', 'P', '6']
# === Char to Integer === #
defaultdict(<function _default_unk_index at 0x7f825b8b5d90>, {'<unk>': 0, '<pad>': 1, 'C': 2, 'c': 3, '(': 4, ')': 5, '1': 6, 'O': 7, '@': 8, 'N': 9, '2': 10, '=': 11, '[': 12, ']': 13, 'H': 14, 'n': 15, 'F': 16, '3': 17, 'S': 18, 'l': 19, '-': 20, 's': 21, 'B': 22, 'r': 23, '#': 24, 'o': 25, '4': 26, '+': 27, '/': 28, '\\': 29, 'I': 30, '5': 31, 'i': 32, 'P': 33, '6': 34})


# 2. Model Architecture

In [33]:
class BN1d(nn.Module):
    def __init__(self, out_dim):
        super(BN1d, self).__init__()
        self.bn = nn.BatchNorm1d(out_dim)
             
    def forward(self, x):
        origin_shape = x.shape
        x = x.view(-1, origin_shape[-1])
        x = self.bn(x)
        x = x.view(origin_shape)
        return x
    
class LSTM(nn.Module):

    def __init__(self, vocab_size, hidden_dim, batch_size, output_dim=1,
                 num_layers=2, dropout=0.1, 
                 bidirectional=True, emb_train=True, skip_connection=True, 
                 cell_type='lstm', norm_type='ln'):
        
        super(LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.batch_size = batch_size
        self.highway_dim = hidden_dim * 2 if bidirectional else hidden_dim
        
        self.num_layers = num_layers
        self.dropout = dropout
        
        self.bidirectional = bidirectional
        self.emb_train = emb_train
        self.skip_connection = skip_connection
        
        self.cell_type = cell_type
        self.norm_type = norm_type
        
        # Define embedding layer
        self.embedding = self.create_emb_layer(self.vocab_size, self.emb_train)
        self.feeding_fc = nn.Linear(self.vocab_size, self.highway_dim)
        
        # Define Normalization layer
        if self.norm_type == 'bn':
            self.norm = BN1d(self.highway_dim)
        elif self.norm_type == 'ln':
            self.norm = nn.LayerNorm(self.highway_dim)
        else:
            self.norm = nn.Sequential()
        
        # Define the Recurrent layer
        self.lstms = nn.ModuleList()
        for i in range(self.num_layers):
            if self.cell_type == 'lstm':
                self.lstms.append(nn.LSTM(self.highway_dim, self.hidden_dim, 
                                       self.num_layers, dropout=self.dropout, 
                                       bidirectional=self.bidirectional))
            elif self.cell_type == 'gru':
                self.lstms.append(nn.GRU(self.highway_dim, self.hidden_dim, 
                                         self.num_layers, dropout=self.dropout, 
                                         bidirectional=self.bidirectional))
        # Define the output layer
        self.fc1 = nn.Linear(self.highway_dim, self.highway_dim // 2)
        self.fc2 = nn.Linear(self.highway_dim // 2, self.highway_dim )
        self.fc3 = nn.Linear(self.highway_dim, self.output_dim)
        self.dropout = nn.Dropout(p=self.dropout)
        self.relu = nn.ReLU()

    def init_hidden(self):
        # This is what we'll initialise our hidden state as
        return (torch.zeros(self.num_layers, self.batch_size, self.highway_dim),
                torch.zeros(self.num_layers, self.batch_size, self.highway_dim))

    def forward(self, input):
        # Forward pass through LSTM layer
        # shape of self.hidden: (a, b), where a and b both 
        # have shape (num_layers, batch_size, hidden_dim).

        # shape of input: [seq_length, batch_size, 1]
        # shape of emb_input: [seq_length, batch_size, 1, vocab_size]
        # shape of lstm_out: [input_size, batch_size, hidden_dim]

        lstm_input = self.feeding_fc(self.embedding(input).squeeze())
         
        for i, lstm in enumerate(self.lstms):
            lstm_out, self.hidden = lstm(lstm_input.view(len(input), self.batch_size, -1))
            lstm_input = lstm_out + lstm_input if self.skip_connection else lstm_out # skip connection
            print(lstm_input.shape)
            lstm_input = self.norm(lstm_input)
            
        # Only take the output from the final timetep
        # Can pass on the entirety of lstm_out to the next layer if it is a seq2seq prediction
        x = self.dropout(self.relu(self.fc1(lstm_out[-1].view(self.batch_size, -1))))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.fc3(x)
        return x.view(-1)
    
    def create_emb_layer(self, vocab_size, emb_train):
        emb_layer = nn.Embedding(vocab_size, vocab_size)
        weight_matrix = torch.zeros((vocab_size, vocab_size))
        for i in range(vocab_size):
            weight_matrix[i][i] = 1
        emb_layer.load_state_dict({'weight': weight_matrix})

        if not emb_train:
            emb_layer.weight.requires_grad = False
        return emb_layer

# 3. Train & Validation

In [None]:
def train(model, data_iter, optimizer, criterion, args, **kwargs):
        
    epoch_train_loss = 0
    list_train_loss = list()
    cnt_iter = 0
    
    for batch_idx, batch in enumerate(data_iter):
        if batch.smile.shape[0] != args.batch_size:
            continue

        model.train()
        model.zero_grad()
        model.hidden = model.init_hidden()
        optimizier.zero_grad()

        input_smile = batch.smile.transpose(0, 2).transpose(1, 2)
        true_logp = Variable(torch.Tensor(batch.logp)).to(args.device)
        pred_logp = model(input_smile)
        
        train_loss = criterion(pred_logp, true_logp)        
        train_loss.backward()
        optimizer.step()
        
        epoch_train_loss += train_loss.item()
        list_train_loss.append({'epoch':batch_idx/len(data_iter)+kwargs['epoch'], 'train_loss':train_loss.item()})
        cnt_iter += 1
        args.bar.update(len(X))

    return model, list_train_loss


def validate(model, data_iter, criterion, args):
    
    epoch_val_loss = 0
    cnt_iter = 0
    for batch_idx, batch in enumerate(data_iter):
        if batch.smile.shape[0] != args.batch_size:
            continue

        model.eval()
        model.zero_grad()
        model.hidden = model.init_hidden()

        input_smile = batch.smile.transpose(0, 2).transpose(1, 2)
        true_logp = Variable(torch.Tensor(batch.logp)).to(args.device)
        pred_logp = model(input_smile)
        
        val_loss = criterion(pred_logp, true_logp)        
        epoch_val_loss += val_loss.item()
        cnt_iter += 1

    return epoch_val_loss/cnt_iter


def test(model, data_iter, args, **kwargs):

    list_y, list_pred_y = list(), list()
    for batch_idx, batch in enumerate(data_iter):
        
        if batch.smile.shape[0] != args.batch_size:
            continue

        model.eval()
        model.zero_grad()
        model.hidden = model.init_hidden()

        input_smile = batch.smile.transpose(0, 2).transpose(1, 2)
        true_logp = Variable(torch.Tensor(batch.logp)).to(args.device)
        pred_logp = model(input_smile)
        
        list_y += true_logp.cpu().detach().numpy().tolist()
        list_pred_y += pred_logp.cpu().detach().numpy().tolist()
        args.bar.update(len(X))

    mae = mean_absolute_error(list_y, list_pred_y)
    std = np.std(np.array(list_y)-np.array(list_pred_y))
    return mae, std, list_y, list_pred_y

def experiment(partition, args):
    ts = time.time()

    # ===== Construct Model ===== #
    model = LSTM(args.vocab_size, args.hidden_dim, batch_size=args.batch_size, 
                 output_dim=args.output_dim, num_layers=args.n_layer, dropout=0.1,
                 bidirectional=args.bidirectional, emb_train=args.emb_train, 
                 skip_connection=args.skip_connection, cell_type=args.cell_type, norm_type=args.norm_type)    
    model.to(args.device)
    criterion = nn.MSELoss().to(args.device)
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    overall_params = sum(p.numel() for p in model.parameters())

    print("##############################################")
    print("Total Model Parameters : {}".format(trainable_parameters))
    print("Trainable   Parameters : {}".format(overall_params))
    print("##############################################")
    
    # Initialize Optimizer
    trainable_parameters = filter(lambda p: p.requires_grad, model.parameters())
    if args.optim == 'ADAM':
        optimizer = optim.Adam(trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)
    elif args.optim == 'RMSProp':
        optimizer = optim.RMSprop(trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)
    elif args.optim == 'SGD':
        optimizer = optim.SGD(trainable_parameters, lr=args.lr, weight_decay=args.l2_coef)
    else:
        assert False, "Undefined Optimizer Type"
    print("{} Optimizer is Constructed".format(str(optimizier)))
    print("##############################################")

    # Train, Validate, Evaluate
    list_train_loss = list()
    list_val_loss = list()
    list_mae = list()
    list_std = list()
    
    args.best_mae = 10000
    for epoch in range(args.epoch):
        model, train_losses = train(model, partition['train'], optimizer, criterion, args, **{'epoch':epoch})
        val_loss = validate(model, partition['val'], criterion, args)
        mae, std, true_y, pred_y = test(model, partition['val'], args, **{'epoch':epoch})

        list_train_loss += train_losses
        list_val_loss.append({'epoch':epoch, 'val_loss':val_loss})
        list_mae.append({'epoch':epoch, 'mae':mae})
        list_std.append({'epoch':epoch, 'std':std})
        
        if args.best_mae > mae or epoch==0:
            args.best_epoch = epoch
            args.best_mae = mae
            args.best_std = std
            args.best_true_y = true_y
            args.best_pred_y = pred_y
    
    te = time.time()
    
    # Logging Experiment Results
    args.elapsed = te-ts
    args.train_losses = list_train_loss
    args.val_losses = list_val_loss
    args.maes = list_mae
    args.stds = list_std
    return args

In [25]:
seed = 123
np.random.seed(seed)
torch.manual_seed(seed)

exp_name = 'exp1_test'
args = parser.parse_args("")
args.exp_name = exp_name
args.n_layer = 4
args.hidden_dim = 128
args.output_dim = 1
args.vocab_size = len(CHAR_SMILE.vocab)

args.dropout = 0.1
args.emb_train = True
args.bidirectional = True
args.skip_connection = True
args.cell_type = 'lstm'
args.norm_type = 'bn'

args.lr = 0.01
args.l2_coef = 0.001
args.optim = 'ADAM'
args.epoch = 100
args.batch_size= 8
args.test_batch_size= 8
args.shuffle = True
args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
writer = Writer(prior_keyword=['n_layer', 'n_stage', 'sc_type', 'use_bn', 'use_attn', 'dp_rate', 'emb_train', 'epoch', 'l2_coef', 'lr'])


train_iter, val_iter = BucketIterator.splits(
    (train_dataset, val_dataset),
    batch_sizes=(args.batch_size, args.test_batch_size),
    device='cuda',
    sort_key=lambda x: x.length,
    sort=True,
    sort_within_batch=True,
    repeat=False,
    shuffle=args.shuffle,
)



print(num_params)
for epoch in range(1):
    epoch_loss = 0
    print('start', epoch)
    for batch in tqdm(train_iter):
        if batch.smile.shape[0] != args.batch_size:
            continue

        model.zero_grad()
        optimizier.zero_grad()
        model.hidden = model.init_hidden()

        input_smile = batch.smile.transpose(0, 2).transpose(1, 2)
        true_logp = Variable(torch.Tensor(batch.logp)).to(args.device)

        pred_logp = model(input_smile)
        loss = criterion(pred_logp, true_logp)
#         print(loss.item())
        loss.backward()
        optimizier.step()
        epoch_loss += loss.item()

        break

    #     break
    print(epoch, epoch_loss)
torch.cuda.empty_cache()

    

  0%|          | 0/24649 [00:00<?, ?it/s]

6401354
start 0
torch.Size([24, 8, 256])
torch.Size([24, 8, 256])
torch.Size([24, 8, 256])
torch.Size([24, 8, 256])
0 2.9893321990966797





In [None]:
torch.cuda.empty_cache()
