# ChemVAE Implementation

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from utils import *
from model import ConvEncoder, GRUDecoder, ChemVAE
from trainer import train

## Import and Pre-Process Data

In [2]:
# Path of the CSV file handed to import_data().
args_input = '../data/zinc.csv'
# Requested size of the training split (passed to return_splits as n_train).
args_train = 1000
# Requested size of the held-out split (passed to return_splits as n_test).
args_val = 100
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed.
args_output = True
# Name of the column holding the molecule strings; also used to build the weights path.
args_colc = 'SMILES'

In [3]:
# The GitHub data directory contains only a tar.gz archive, so the CSV at
# `args_input` presumably has to be extracted from it first.
# The disabled snippet below is kept for reference only.
# NOTE(review): if it is ever revived, `df.columns[0] = ...` will raise because
# a pandas Index does not support item assignment (use `df.rename(columns=...)`).
# import pandas as pd
# df = pd.read_csv('../data/zinc.tar.gz', compression='gzip', header=0, sep=',', error_bad_lines=False)
# df.columns[0] = 'SMILES'

# Pipeline (helpers come from `utils`; exact contracts are not visible here):
#   1. read the raw table,
#   2. draw the train / held-out splits,
#   3. build the char<->index vocabularies and the index-encoded splits,
#   4. check the conversions and receive the one-hot arrays.
df = import_data(args_input)
X_train, X_test = return_splits(df, n_train=args_train, n_test=args_val, col_chem=args_colc)
char2idx, idx2char, train_idx, test_idx = create_data(X_train, X_test, colname=args_colc)
train_oh, test_oh = check_conversions(idx2char, train_idx, X_train, test_idx, X_test)

# Debug aid: longest string in each split.
# print(max([len(i) for i in X_train]))
# print(max([len(i) for i in X_test]))

There are 0 training index conversion errors
There are 0 testing index conversion errors

There are 0 training one-hot conversion errors
There are 0 testing one-hot conversion errors


In [4]:
# Initial learning rate given to the Adam optimizer.
args_lr = 0.001
# When True, a ReduceLROnPlateau scheduler is created alongside the optimizer.
args_dynlr = True
# Mini-batch size for both DataLoaders (200 was the previous setting).
args_batch_size = 10 #200
## per Gómez-Bombarelli Zinc GRU hidden dim 488
# NOTE(review): this value is passed to ConvEncoder/GRUDecoder as the *latent* size,
# while the comment above cites a GRU hidden dimension — confirm which is intended.
args_latent_size = 488
# Seed passed to torch.manual_seed().
arg_seed = 123
# Directory passed to train(); one sub-folder per molecule column name.
args_model_path = '../weights/' + args_colc + '/'

In [5]:
# Sizes of axes 1 and 2 of the one-hot training array; both are later passed
# to the encoder and decoder constructors.
n_length, n_char = train_oh.shape[1:3]

In [6]:
# Seed *before* any module is constructed so that weight initialisation (and
# the DataLoader shuffling that follows) is reproducible after a kernel restart.
torch.manual_seed(arg_seed)

# Use the first GPU when one is available, otherwise fall back to the CPU.
# To force CPU execution, replace the line below with: device = torch.device('cpu')
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Encoder and decoder share the latent size and the one-hot input dimensions.
enc = ConvEncoder(args_latent_size, n_length, n_char).to(device)
dec = GRUDecoder(args_latent_size, n_length, n_char).to(device)
model = ChemVAE(enc, dec).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=args_lr)

# Always bind `scheduler`, so the train(...) call cannot fail with a NameError
# when args_dynlr is False.
# NOTE(review): confirm that train() tolerates scheduler=None.
scheduler = None
if args_dynlr:
    # Multiply the learning rate by 0.8 after 3 epochs without improvement of
    # the monitored (minimised) quantity, never going below 1e-4.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min',
                                                           factor=0.8,
                                                           patience=3,
                                                           min_lr=0.0001)

# The one-hot arrays become float32 tensors (they stay on the CPU here).
# Note that this rebinds X_train / X_test, which previously held the raw splits.
X_train = torch.from_numpy(train_oh.astype(np.float32))
X_test = torch.from_numpy(test_oh.astype(np.float32))

# drop_last=True discards a trailing incomplete batch, so every batch has
# exactly args_batch_size samples.
train_loader = torch.utils.data.DataLoader(X_train,
                                           batch_size=args_batch_size,
                                           shuffle=True,
                                           num_workers=6,
                                           drop_last=True)

test_loader = torch.utils.data.DataLoader(X_test,
                                          batch_size=args_batch_size,
                                          shuffle=True,
                                          num_workers=6,
                                          drop_last=True)

In [10]:
# Run the training loop imported from `trainer`; whatever it returns is kept as `history`.
# NOTE(review): no epoch count is passed, so train() relies on its own default
# (the recorded log reports epochs=100).
history = train(model, optimizer, scheduler, train_loader, test_loader, args_model_path, device)

train() called: model=ChemVAE, opt=Adam(lr=0.001000), epochs=100, device=cuda:0

Epoch   1/100, train loss: 140.14, train acc:  0.53, val loss: 117.47, val acc:  0.59


KeyboardInterrupt: 

In [None]:
# Reconstruct the whole held-out set in a single forward pass.
# - X_test is a CPU tensor while the model lives on `device`, so the input is
#   moved to the same device first (otherwise the call fails on CUDA machines).
# - eval() puts the model into inference mode; no_grad() skips building the
#   autograd graph, which is not needed for inspection.
model.eval()
with torch.no_grad():
    output = model(X_test.to(device))

In [199]:
# Arg-max over axis 2, copied to the host as NumPy and split into one array per
# entry of the leading axis.
# NOTE(review): assumes `output` is laid out as (batch, length, n_char) — confirm.
output_idx = [row for row in output.argmax(dim=2).cpu().numpy()]

In [200]:
# Display the predicted index sequence of the first sample.
output_idx[0]

array([25, 11, 11, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
       19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19,
        4, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28],
      dtype=int64)

In [201]:
# Turn every predicted index sequence back into text using the idx2char vocabulary.
output_char = [convert_num2str(index_seq, idx2char) for index_seq in output_idx]

In [203]:
# Display all decoded strings (one per entry of output_idx).
output_char

['CCccccccccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccccccc',
 'CCcccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccccccc',
 'CCccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccc11',
 'CCccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccccc',
 'CCccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccccccc',
 'CCcccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccccc',
 'CCccccccccccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccccc',
 'CCcccccccccccccccccccccccccc1',
 'CCccccccccccccccccccccccccccc1',
 'CCcccccccccccccccccccccccccccc

In [None]:
def init_weights(layer):
    '''
    Initialize the parameters of a single layer according to its type.

    The function takes one module and returns nothing, so it can be passed to
    ``nn.Module.apply`` to initialise every sub-module of a model.

    Initialization scheme:
        * ``nn.Conv1d``: weights ~ N(0, 1), bias filled with 0.01
        * ``nn.Linear``: weights ~ U(-1/sqrt(in_features), 1/sqrt(in_features)),
          bias filled with 0
        * ``nn.GRU``: orthogonal init for parameters with 2 or more dimensions
          (weight matrices), N(0, 1) for the remaining ones (bias vectors)

    Layers of any other type are left untouched.

    Args:
        layer (torch.nn.Module): layer whose parameters are initialized in place

    Returns:
        None
    '''
    if isinstance(layer, nn.Conv1d):
        nn.init.normal_(layer.weight)
        # A layer built with bias=False has no bias tensor to fill.
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0.01)
    elif isinstance(layer, nn.Linear):
        # Symmetric uniform range scaled by the fan-in of the layer.
        bound = 1.0 / np.sqrt(layer.in_features)
        nn.init.uniform_(layer.weight, -bound, bound)
        if layer.bias is not None:
            nn.init.constant_(layer.bias, 0.0)
    elif isinstance(layer, nn.GRU):
        for param in layer.parameters():
            if param.dim() >= 2:
                # Weight matrices (input-hidden and hidden-hidden).
                nn.init.orthogonal_(param)
            else:
                # Bias vectors.
                nn.init.normal_(param)