In [1]:
! python --version

Python 3.6.6 :: Anaconda custom (64-bit)


In [2]:
%env CUDA_VISIBLE_DEVICES=2

env: CUDA_VISIBLE_DEVICES=2


In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from __future__ import print_function
from __future__ import division
import math
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
import torch 
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


torch.manual_seed(1)

<torch._C.Generator at 0x7fb8a1712650>

In [6]:
#from sklearn.metrics import r2_score
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.Chem.Fingerprints import FingerprintMols

In [7]:
import time
import sys
import csv

In [8]:
# Path of the CSV file that is handed to PredictorData in the next cell.
filename = './data/jak2_data.csv'

In [9]:
from data import read_smiles_property_file
from data import cross_validation_split
from data import PredictorData

In [10]:
# Build the dataset wrapper from the CSV; later cells read `my_data.smiles`
# and `my_data.property` from it.
my_data = PredictorData(filename)

In [11]:
# Character vocabulary handed to `load_dictionary`. The last entry, ' ', is the
# same character the padding cell below appends to short SMILES strings.
tokens = ['<', '>', '#', '%', ')', '(', '+', '-', '/', '.', '1', '0', '3', '2', '5', '4', '7',
          '6', '9', '8', '=', 'A', '@', 'C', 'B', 'F', 'I', 'H', 'O', 'N', 'P', 'S', '[', ']',
          '\\', 'c', 'e', 'i', 'l', 'o', 'n', 'p', 's', 'r', ' ']
# Passed in empty; presumably filled with a char -> index mapping by
# `load_dictionary` -- TODO confirm in data.py.
char2idx = {}
my_data.load_dictionary(tokens, char2idx)

In [12]:
# Length of every SMILES string, and the longest one (used as the padded width).
lens = [len(sm) for sm in my_data.smiles]
max_len = max(lens)

In [13]:
# Right-pad every SMILES string with spaces up to `max_len` characters, in place.
# Strings that are already `max_len` long (or longer) are left unchanged.
for i, sm in enumerate(my_data.smiles):
    my_data.smiles[i] = sm.ljust(max_len)

In [14]:
# Spot-check one entry after padding (note the trailing spaces in the output).
my_data.smiles[1]

'O=c1cc(-c2nc(-c3ccc(-c4cn(CCP(=O)(O)O)nn4)cc3)[nH]c2-c2ccc(F)cc2)cc[nH]1              '

In [15]:
# Width every SMILES string was padded to.
max_len

86

In [17]:
# Split the padded SMILES and their property values into folds. The training
# loop further down indexes the results as cross_val_data[i] / cross_val_labels[i].
cross_val_data, cross_val_labels = cross_validation_split(my_data.smiles, my_data.property)

In [18]:
# Inspect the per-fold label lists (this displays every value of every fold).
cross_val_labels

[[7.7,
  5.96,
  7.38,
  8.3,
  6.95,
  5.19,
  10.36,
  7.56,
  8.9,
  7.26,
  7.54,
  5.87,
  6.44,
  7.49,
  10.7,
  8.62,
  8.72,
  7.41,
  6.42,
  7.8,
  7.36,
  10.53,
  6.28,
  8.41,
  7.61,
  7.15,
  6.32,
  7.03,
  8.73,
  6.67,
  7.75,
  8.16,
  8.22,
  6.56,
  7.77,
  9.05,
  7.52,
  8.52,
  8.22,
  8.22,
  8.1,
  8.22,
  7.77,
  5.0,
  8.4,
  7.77,
  6.87,
  6.49,
  6.28,
  8.13,
  6.46,
  8.22,
  7.35,
  7.98,
  7.65,
  7.66,
  7.57,
  8.3,
  6.56,
  8.51,
  8.22,
  7.61,
  6.0,
  7.36,
  6.29,
  9.18,
  7.75,
  6.16,
  5.88,
  6.46,
  7.05,
  5.8,
  5.25,
  7.51,
  8.72,
  6.32,
  5.54,
  7.49,
  5.6,
  6.76,
  8.4,
  7.89,
  6.38,
  7.3,
  8.05,
  7.49,
  6.84,
  7.29,
  7.39,
  6.91,
  6.73,
  6.66,
  8.52,
  7.33,
  8.68,
  6.25,
  7.34,
  7.55,
  9.77,
  7.92,
  7.75,
  5.55,
  8.4,
  7.08,
  5.0,
  7.77,
  6.67,
  7.82,
  7.85,
  9.0,
  5.52,
  7.0,
  5.5,
  6.87,
  6.65,
  8.87,
  5.3,
  10.33,
  7.85,
  7.25,
  7.73,
  7.8,
  7.62,
  6.43,
  5.97,
  9.57,
  7.7,
  

In [19]:
def batch_char_tensor(smiles, use_cuda, all_characters=None):
    """Encode a batch of equal-length strings as a LongTensor of character indices.

    Args:
        smiles: sequence of strings. The tensor width is taken from
            ``len(smiles[0])``, so no string may be longer than the first one.
        use_cuda: when true, the tensor is moved to the GPU before returning.
        all_characters: sequence used as the vocabulary; each character is
            encoded as ``all_characters.index(char)``. Defaults to
            ``my_data.all_characters`` (the notebook-level dataset object).

    Returns:
        A ``Variable`` wrapping a ``(len(smiles), len(smiles[0]))`` long tensor.

    Raises:
        ValueError: if a character is not present in ``all_characters``.
    """
    if all_characters is None:
        # The previous body read `self.all_characters`, but a free function has
        # no `self`, so every call failed with NameError. Use the same
        # vocabulary the RecurrentQSAR method reads via `data.all_characters`.
        all_characters = my_data.all_characters
    tensor = torch.zeros(len(smiles), len(smiles[0])).long()
    for i, string in enumerate(smiles):
        for c, char in enumerate(string):
            tensor[i, c] = all_characters.index(char)
    if use_cuda:
        return Variable(tensor.cuda())
    return Variable(tensor)

In [20]:
def iterate_minibatches(X, y, batchsize=100, use_cuda=True):
    """Yield shuffled ``(inputs, targets)`` mini-batches.

    Args:
        X: numpy array of equal-length strings, indexed along axis 0.
        y: 2-D numpy array of targets whose rows line up with ``X``.
        batchsize: maximum number of rows per batch; the last batch may be smaller.
        use_cuda: when true, both tensors of each batch are moved to the GPU.
            Previously this was read from an undefined global and was not
            forwarded to ``batch_char_tensor`` (which requires it), so the
            generator failed on its first batch.

    Yields:
        ``(X_batch, y_batch)`` where ``X_batch`` is the output of
        ``batch_char_tensor`` and ``y_batch`` is a float tensor.
    """
    n = X.shape[0]
    # A fresh random order is drawn on every call.
    ind = np.random.permutation(n)
    for start_index in range(0, n, batchsize):
        batch_idx = ind[start_index:start_index + batchsize]
        X_batch = batch_char_tensor(X[batch_idx], use_cuda)
        y_batch = torch.from_numpy(y[batch_idx, :]).float()
        if use_cuda:
            y_batch = y_batch.cuda()
        yield (X_batch, y_batch)

## SMILES-based QSAR with a Recurrent Neural Network

In [21]:
from __future__ import print_function
from __future__ import division

import torch
import torch.nn as nn
from torch.autograd import Variable

import time
import numpy as np

from rdkit import Chem
from rdkit import DataStructs
from sklearn.ensemble import RandomForestRegressor as RFR

torch.manual_seed(1)


class RecurrentQSAR(nn.Module):
    """Bidirectional-LSTM regressor that maps encoded strings to one value each.

    The forward pass embeds each character index, runs a single-layer
    bidirectional LSTM, takes the LSTM output at the last sequence position and
    feeds it through Linear -> LeakyReLU -> Linear, giving a ``(batch, 1)``
    tensor. The class also bundles its own batching and train/validate loops.
    """

    def __init__(self, input_dim, data):
        """Build the layers.

        Args:
            input_dim: vocabulary size, used as ``num_embeddings``.
            data: object exposing ``all_characters``; ``batch_char_tensor``
                encodes each character as its index in that sequence.
        """
        super(RecurrentQSAR, self).__init__()

        self.data = data
        # NOTE(review): padding_idx=0 only matches the padding symbol if index 0
        # of data.all_characters is the pad character -- confirm in the loader.
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=250,
                                      padding_idx=0)  # Output: (N, W, embedding_dim)
        self.lstm = nn.LSTM(input_size=250, hidden_size=100, bidirectional=True,
                            num_layers=1)  # input(seq_len, batch, input_size)
        # 200 = 2 directions * hidden_size 100.
        self.linear = torch.nn.Linear(in_features=200, out_features=100)
        self.relu = torch.nn.LeakyReLU()
        # Registered (so it has parameters) but not applied in forward().
        self.batch_norm = torch.nn.BatchNorm1d(num_features=100)
        self.output = torch.nn.Linear(in_features=100, out_features=1)

    @staticmethod
    def _align_target(target, reference):
        """Reshape ``target`` to ``reference``'s shape when element counts match.

        forward() returns ``(batch, 1)`` while the targets produced by
        ``iterate_minibatches`` are ``(batch,)``. Subtracting the two directly
        broadcasts to ``(batch, batch)`` and yields a wrong loss. Targets whose
        element count differs are returned untouched.
        """
        if target.shape != reference.shape and target.numel() == reference.numel():
            return target.reshape(reference.shape)
        return target

    def forward(self, inp, dropout=False):
        """Return a ``(batch, 1)`` prediction for a ``(batch, seq_len)`` index tensor.

        ``dropout`` is accepted for interface compatibility but is not used.
        """
        # The LSTM is not batch_first, so move the sequence axis to the front.
        embedded = self.embedding(inp).permute(1, 0, 2)
        output, _ = self.lstm(embedded)
        # Keep only the LSTM output at the last sequence position.
        output = output[-1, :, :]
        output = self.linear(output)
        output = self.relu(output)
        output = self.output(output)
        return output

    def step(self, x, y, criterion, optimizer):
        """Run one optimisation step on a batch and return the loss tensor data."""
        # Reset gradient
        optimizer.zero_grad()
        fx = self.forward(x)
        loss = criterion(fx, self._align_target(y, fx))
        loss.backward()
        # clip_grad_norm_ is the in-place replacement of the deprecated clip_grad_norm.
        torch.nn.utils.clip_grad_norm_(self.parameters(), max_norm=100)
        optimizer.step()
        return loss.data

    def predict(self, x):
        """Return the forward output for ``x`` without tracking gradients."""
        with torch.no_grad():
            output = self.forward(x, dropout=False)
        return output.data

    def batch_char_tensor(self, smiles, use_cuda=True):
        """Encode equal-length strings as a ``(len(smiles), len(smiles[0]))`` long tensor.

        Each character is replaced by its index in ``self.data.all_characters``;
        an unknown character raises ValueError. The result is moved to the GPU
        when ``use_cuda`` is true.
        """
        all_characters = self.data.all_characters
        tensor = torch.zeros(len(smiles), len(smiles[0])).long()
        for i, string in enumerate(smiles):
            for c, char in enumerate(string):
                tensor[i, c] = all_characters.index(char)
        if use_cuda:
            return Variable(tensor.cuda())
        return Variable(tensor)

    def iterate_minibatches(self, X, y, batchsize=100, use_cuda=True):
        """Yield shuffled ``(inputs, targets)`` batches from numpy arrays ``X`` and ``y``.

        ``use_cuda`` now controls the device of both tensors; before, the
        inputs were always moved to the GPU regardless of this flag.
        """
        n = X.shape[0]
        ind = np.random.permutation(n)
        for start_index in range(0, n, batchsize):
            batch_idx = ind[start_index:start_index + batchsize]
            X_batch = self.batch_char_tensor(X[batch_idx], use_cuda=use_cuda)
            y_batch = torch.from_numpy(y[batch_idx]).float()
            if use_cuda:
                y_batch = y_batch.cuda()
            yield (X_batch, Variable(y_batch))

    def fit(self, criterion, optimizer, trX, trY, train_loss_log=None, num_epochs=100,
            batch_size=100, use_cuda=True):
        """Train for ``num_epochs`` epochs and return the per-epoch loss log.

        Args:
            criterion: loss callable ``criterion(prediction, target)``.
            optimizer: optimizer over this model's parameters.
            trX, trY: numpy arrays of training strings and targets.
            train_loss_log: list to append to (mutated in place). A new list is
                created when omitted; the old ``[]`` default was shared between
                calls and kept growing.
            num_epochs: number of passes over the data.
            batch_size: mini-batch size.
            use_cuda: build batches on the GPU (default) or on the CPU.

        Returns:
            ``train_loss_log`` with one entry appended per epoch.
        """
        if train_loss_log is None:
            train_loss_log = []
        for epoch in range(num_epochs):
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in self.iterate_minibatches(trX, trY, batch_size, use_cuda=use_cuda):
                inputs, targets = batch
                train_err_batch = self.step(inputs, targets, criterion, optimizer)
                train_err += train_err_batch.cpu().numpy().mean()
                train_batches += 1
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs, time.time() - start_time))
            # NOTE(review): the mean batch loss is additionally divided by
            # batch_size; kept as-is so existing logs stay comparable -- confirm
            # this scaling is intended.
            epoch_loss = train_err / train_batches / batch_size
            train_loss_log.append(epoch_loss)
            print("  training loss (in-iteration):\t\t{:.6f}".format(epoch_loss))

        return train_loss_log

    def validate(self, teX, teY, batch_size=100, val_loss_log=None, use_cuda=True):
        """Compute the squared-error loss over ``teX``/``teY`` and log it.

        Args:
            teX, teY: numpy arrays of validation strings and targets.
            batch_size: mini-batch size.
            val_loss_log: list to append to (mutated in place). A new list is
                created when omitted instead of sharing one default list.
            use_cuda: build batches on the GPU (default) or on the CPU.

        Returns:
            ``val_loss_log`` with one entry appended.
        """
        if val_loss_log is None:
            val_loss_log = []
        # Full pass over the validation data:
        val_loss = 0
        val_batches = 0
        for batch in self.iterate_minibatches(teX, teY, batch_size, use_cuda=use_cuda):
            inputs, targets = batch
            pred = self.predict(inputs)
            diff = pred - self._align_target(targets.data, pred)
            val_loss += (diff ** 2).cpu().numpy().mean()
            val_batches += 1
        # Same extra division by batch_size as in fit(), kept for comparability.
        epoch_loss = val_loss / val_batches / batch_size
        val_loss_log.append(epoch_loss)
        print("  validation loss:\t\t{:.6f}".format(epoch_loss))

        return val_loss_log

ModuleNotFoundError: No module named 'sklearn'

In [None]:
models = []
train_logs = []
val_logs = []
num_epochs = 100
batch_size = 100

# Train one model per fold: fold i is held out for validation and the
# remaining folds are concatenated into the training set.
for i in range(5):

    train_loss_log = []
    val_loss_log = []

    models.append(RecurrentQSAR(input_dim=my_data.n_characters, data=my_data))
    models[i].cuda()
    criterion = nn.MSELoss()
    optimizer = optim.Adadelta(models[i].parameters(), lr=0.1,  weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

    trX = np.concatenate(cross_val_data[:i] + cross_val_data[i+1:])
    trY = np.concatenate(cross_val_labels[:i] + cross_val_labels[i+1:])
    teX = np.array(cross_val_data[i])
    teY = np.array(cross_val_labels[i])

    for epoch in range(num_epochs):
        # NOTE(review): the scheduler is stepped before the epoch's optimizer
        # updates; whether this is the recommended order depends on the
        # installed PyTorch version -- confirm before upgrading.
        scheduler.step()
        # fit() runs a single epoch per call and appends to train_loss_log.
        models[i].fit(criterion, optimizer, trX, trY.reshape(-1), train_loss_log, num_epochs=1, batch_size=batch_size)
        models[i].validate(teX, teY, batch_size = batch_size, val_loss_log=val_loss_log)

    train_logs.append(train_loss_log)
    val_logs.append(val_loss_log)
    # Label every curve so the ten lines in the shared figure can be told apart.
    plt.plot(train_loss_log, label='fold {} train'.format(i))
    plt.plot(val_loss_log, linestyle='--', label='fold {} validation'.format(i))

plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend();

In [None]:
# `optimizer` is rebound on every fold of the loop above, so this shows the
# state of the optimizer created for the last fold only.
optimizer.state_dict()