In [None]:
! python --version

In [None]:
%env CUDA_VISIBLE_DEVICES=2

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import print_function
from __future__ import division
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import torch 
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


torch.manual_seed(1)

In [None]:
#from sklearn.metrics import r2_score
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.Fingerprints import FingerprintMols

In [None]:
import time
import sys

In [None]:
import data_preprocessing as dp
import csv

In [None]:
# Relative path to the input CSV that is handed to PredictorData in the next cell;
# it is resolved against the notebook's current working directory.
filename = './data/jak2_data.csv' 

In [None]:
from data import read_smiles_property_file
from data import cross_validation_split
from data import PredictorData

In [None]:
# Build the project-local PredictorData container from the CSV. Later cells read its
# .smiles, .property and .n_characters attributes and call .load_dictionary() on it.
my_data = PredictorData(filename)

In [None]:
# Character vocabulary handed to my_data.load_dictionary below.
# The order of this list matters wherever characters are encoded by list position,
# so do not sort or reorder it.
# The final ' ' entry is the same character the padding cell appends to every
# SMILES string, so padded positions remain encodable.
# NOTE(review): '<' and '>' are presumably start/end markers — confirm in data.py.
tokens = ['<', '>', '#', '%', ')', '(', '+', '-', '/', '.', '1', '0', '3', '2', '5', '4', '7',
          '6', '9', '8', '=', 'A', '@', 'C', 'B', 'F', 'I', 'H', 'O', 'N', 'P', 'S', '[', ']',
          '\\', 'c', 'e', 'i', 'l', 'o', 'n', 'p', 's', 'r', ' ']
# Starts empty; presumably filled in place by load_dictionary with a
# character -> index mapping — TODO confirm against PredictorData.load_dictionary.
char2idx = {}
my_data.load_dictionary(tokens, char2idx)

In [None]:
# Length of every SMILES string; the longest one defines the padding width.
lens = [len(smiles_string) for smiles_string in my_data.smiles]
max_len = max(lens)

In [None]:
# Right-pad every SMILES string with spaces, in place, up to max_len characters.
for position, smiles_string in enumerate(my_data.smiles):
    my_data.smiles[position] = smiles_string.ljust(max_len)

In [None]:
# Split the (padded) SMILES strings and their property values into cross-validation folds.
# The training loop below treats both results as lists of per-fold arrays: one fold is
# held out per model and the remaining folds are concatenated for training.
# NOTE(review): the number of folds is whatever cross_validation_split defaults to — confirm in data.py.
cross_val_data, cross_val_labels = cross_validation_split(my_data.smiles, my_data.property)

In [None]:
def batch_char_tensor(smiles, use_cuda, all_characters=None):
    """Encode a batch of strings as a 2-D tensor of vocabulary indices.

    Args:
        smiles: sequence of strings. The tensor width is taken from the first
            string, so all strings are expected to share that length (the
            notebook pads every SMILES string to ``max_len`` beforehand).
            Shorter strings leave trailing zeros; longer ones raise IndexError.
        use_cuda: if true, the tensor is moved to the GPU before being wrapped.
        all_characters: ordered sequence of tokens; each character is encoded
            as the position of its first occurrence in this sequence. Defaults
            to the notebook's module-level ``tokens`` list.

    Returns:
        A ``Variable`` wrapping a ``LongTensor`` of shape
        ``(len(smiles), len(smiles[0]))``, or of shape ``(0, 0)`` for an
        empty batch.

    Raises:
        ValueError: if a string contains a character that is not in the
            vocabulary.
    """
    if all_characters is None:
        # The previous body read ``self.all_characters``, but this is a free
        # function with no ``self`` and therefore raised NameError on first use.
        # Fall back to the vocabulary defined at notebook level.
        all_characters = tokens

    # One pass over the vocabulary instead of an O(len(vocabulary)) list.index()
    # call per character; setdefault keeps the first occurrence, like index().
    char_to_index = {}
    for position, token in enumerate(all_characters):
        char_to_index.setdefault(token, position)

    n_rows = len(smiles)
    n_cols = len(smiles[0]) if n_rows else 0  # empty batch -> (0, 0) tensor
    tensor = torch.zeros(n_rows, n_cols).long()
    for row, string in enumerate(smiles):
        for col, char in enumerate(string):
            if char not in char_to_index:
                raise ValueError(
                    "character {!r} at position {} of {!r} is not in the vocabulary".format(
                        char, col, string))
            tensor[row, col] = char_to_index[char]

    if use_cuda:
        return Variable(tensor.cuda())
    return Variable(tensor)

In [None]:
def iterate_minibatches(X, y, batchsize=100, use_cuda=None):
    """Yield shuffled ``(X_batch, y_batch)`` mini-batches covering the data once.

    Args:
        X: array-like of strings, one per sample; each batch is encoded with
            ``batch_char_tensor``.
        y: array-like of numeric targets whose first axis matches ``X``
            (1-D or 2-D).
        batchsize: maximum number of samples per batch; the last batch may be
            smaller.
        use_cuda: whether to place both tensors on the GPU. When left as
            ``None`` a notebook-level ``use_cuda`` variable is honoured if one
            exists, otherwise ``torch.cuda.is_available()`` decides.

    Yields:
        Tuples ``(X_batch, y_batch)`` where ``X_batch`` is the output of
        ``batch_char_tensor`` and ``y_batch`` is a float tensor holding the
        matching rows of ``y``.

    Raises:
        ValueError: if ``batchsize`` is smaller than 1.
    """
    if batchsize < 1:
        raise ValueError("batchsize must be a positive integer, got {!r}".format(batchsize))
    if use_cuda is None:
        # The previous body read a global ``use_cuda`` that this notebook does
        # not visibly define; keep honouring it when present, else auto-detect.
        use_cuda = globals().get('use_cuda')
        if use_cuda is None:
            use_cuda = torch.cuda.is_available()

    # Accept plain lists as well as arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    n = X.shape[0]
    ind = np.random.permutation(n)
    for start_index in range(0, n, batchsize):
        batch_ind = ind[start_index:start_index + batchsize]
        # batch_char_tensor requires the device flag; it was omitted before,
        # which raised TypeError on the first batch.
        X_batch = batch_char_tensor(X[batch_ind], use_cuda)
        # Indexing the first axis only gives the same rows as ``y[batch_ind, :]``
        # for 2-D targets and additionally works for 1-D targets.
        y_batch = torch.from_numpy(y[batch_ind]).float()
        if use_cuda:
            y_batch = y_batch.cuda()
        yield (X_batch, y_batch)

## SMILES-based QSAR with a Recurrent Neural Network

In [None]:
from RecurrentQSAR import RecurrentQSAR

In [None]:
# Cross-validated training: one RecurrentQSAR model per fold, each trained on the
# remaining folds and validated on the held-out one. Per-fold loss histories are
# kept in train_logs / val_logs and drawn on a single figure.
models = []
train_logs = []
val_logs = []
num_epochs = 100
batch_size = 100

# Derive the fold count from the split itself rather than hard-coding 5, so the
# loop stays correct if cross_validation_split is asked for a different number.
n_folds = len(cross_val_data)

for i in range(n_folds):

    train_loss_log = []
    val_loss_log = []

    models.append(RecurrentQSAR(input_dim=my_data.n_characters, data=my_data))
    model = models[i]
    model.cuda()
    criterion = nn.MSELoss()
    optimizer = optim.Adadelta(model.parameters(), lr=0.1, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

    # Fold i is held out; every other fold is concatenated into the training set.
    trX = np.concatenate(cross_val_data[:i] + cross_val_data[i+1:])
    trY = np.concatenate(cross_val_labels[:i] + cross_val_labels[i+1:])
    teX = np.array(cross_val_data[i])
    teY = np.array(cross_val_labels[i])

    for epoch in range(num_epochs):
        # fit() is called for a single epoch at a time so that validation and the
        # learning-rate decay happen once per epoch.
        model.fit(criterion, optimizer, trX, trY.reshape(-1), train_loss_log,
                  num_epochs=1, batch_size=batch_size)
        model.validate(teX, teY, batch_size=batch_size, val_loss_log=val_loss_log)
        # Decay the learning rate only after this epoch's optimizer updates.
        # On PyTorch >= 1.1, stepping the scheduler first skips the first value
        # of the schedule.
        scheduler.step()

    train_logs.append(train_loss_log)
    val_logs.append(val_loss_log)
    # Label each curve so the folds can be told apart on the shared axes.
    plt.plot(train_loss_log, label='fold {} train'.format(i))
    plt.plot(val_loss_log, label='fold {} validation'.format(i))

plt.xlabel('log entry')
plt.ylabel('loss')
plt.legend();

In [None]:
# Display the optimizer state. `optimizer` is rebound on every iteration of the
# cross-validation loop above, so this shows the optimizer of the last fold only.
optimizer.state_dict()