In [1]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

#Select the compute device: the CUDA GPU when one is available, the CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [25]:
#Load SMILES data as integer labels and as one-hot encoding
#np.load on an .npz file returns an archive object that keeps the file open,
#so each array is read inside a with-block to guarantee the handle is closed
with np.load("ohesmiles.npz") as ohe_archive:
    data = ohe_archive["arr_0"]

with np.load("intsmiles.npz") as int_archive:
    intdata = int_archive["arr_0"]

#Insert a size-1 middle axis: nn.LSTM (default batch_first=False) expects
#(sequence, batch, feature) and the model is run with a batch of one
data = torch.from_numpy(data).view(np.shape(data)[0], 1, np.shape(data)[1])
intdata = torch.from_numpy(intdata)

#inp() and target() index both tensors with the same row offsets,
#so the two encodings must describe the same number of characters
assert data.size(0) == intdata.size(0), "one-hot and integer datasets differ in length"

print("Dataset size: " + str(data.size()))
print("Integer dataset size: " + str(intdata.size()))

Dataset size: torch.Size([34131372, 1, 55])
Integer dataset size: torch.Size([34131372])


In [3]:
#Get input tensor
def inp(i, shuffle):
    """Return the input slice of the global `data` for one molecule.

    Args:
        i: position in the shuffled order.
        shuffle: indexable sequence; shuffle[i] is the index of the molecule
            to fetch.

    Returns:
        The rows of `data` belonging to molecule shuffle[i], without its last
        row (seq_length - 1 rows), with the remaining two axes kept whole.
    """
    #First row of this molecule in the flat dataset
    start = shuffle[i] * seq_length
    #Stop one row early: the last character is never fed to the model
    stop = start + seq_length - 1

    return data[int(start) : int(stop), :, :]

In [32]:
#Get target tensor
def target(i, shuffle):
    """Return the target slice of the global `intdata` for one molecule.

    Args:
        i: position in the shuffled order.
        shuffle: indexable sequence; shuffle[i] is the index of the molecule
            to fetch.

    Returns:
        The entries of `intdata` belonging to molecule shuffle[i], without its
        first entry (seq_length - 1 entries), i.e. the inputs shifted by one.
    """
    #Offset of this molecule in the flat label array
    offset = shuffle[i] * seq_length
    #Skip the first character: it is never predicted
    first = int(offset + 1)
    last = int(offset + seq_length)

    return intdata[first:last]

In [27]:
#Define model
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer that maps back to the input width.

    The linear layer outputs `input_size` values per time step, so the output
    can be scored against class indices in range(input_size).
    """

    #Define model parameters
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        """Build the network.

        Args:
            input_size: width of each input vector and of each output vector.
            hidden_size: number of hidden units in every LSTM layer.
            num_layers: number of stacked LSTM layers.
            dropout: dropout value forwarded to nn.LSTM.
        """
        super(Model, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, input_size)

    #Define forward propagation
    def forward(self, inp, hidden):
        """Run the LSTM and project every time step with the linear layer.

        Args:
            inp: input tensor handed to nn.LSTM (batch_first is left at its
                default, so the layout is (sequence, batch, input_size)).
            hidden: (hidden state, cell state) tuple, e.g. from init_states().

        Returns:
            (output, hidden): the projected LSTM output and the updated
            (hidden state, cell state) tuple.
        """
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)
        return output, hidden

    #Define initial hidden and cell states
    def init_states(self, num_layers = None, hidden_size = None):
        """Return zero-filled initial (hidden state, cell state) for a batch of one.

        Args:
            num_layers: number of LSTM layers; defaults to self.num_layers.
            hidden_size: hidden units per layer; defaults to self.hidden_size.

        Returns:
            Tuple of two tensors of shape (num_layers, 1, hidden_size).
        """
        if num_layers is None:
            num_layers = self.num_layers
        if hidden_size is None:
            hidden_size = self.hidden_size

        #Create the states with the device and dtype of the model's own weights so
        #they stay usable after model.to(...) / model.double(); plain tensors are
        #used because torch.autograd.Variable is deprecated
        reference = self.linear.weight
        hidden = (torch.zeros(num_layers, 1, hidden_size, device = reference.device, dtype = reference.dtype),
                  torch.zeros(num_layers, 1, hidden_size, device = reference.device, dtype = reference.dtype))

        return hidden
        
        

In [None]:
#Define training
def train(epochs, log_every = 1000):
    """Train the global `model`, taking one optimizer step per molecule.

    Relies on notebook globals: model, optimizer, criterion, data, seq_length,
    num_layers, hidden_size and the helpers inp() / target().

    Args:
        epochs: number of passes over the dataset.
        log_every: print the mean loss of the most recent `log_every`
            molecules each time that many have been processed. Pass 0 or
            None to print only the per-epoch summary.

    Returns:
        None. Progress is reported through print().
    """
    #Number of whole molecules stored back to back in the flat dataset
    num_molecules = int(np.shape(data)[0]) // seq_length

    #Keep every tensor on the device the model parameters live on
    model_device = next(model.parameters()).device

    #Make sure dropout is active while training
    model.train()

    #Iterate over desired number of epochs
    for e in range(epochs):
        #Get random order of SMILES molecules (shuffle data) as integer indices
        shuffle = np.random.permutation(num_molecules)

        epoch_loss = 0.0
        window_loss = 0.0
        window_steps = 0

        #Iterate over each molecule in dataset
        for i in range(num_molecules):

            #Get input and target
            input_data = inp(i, shuffle).float().to(model_device)
            target_data = target(i, shuffle).long().to(model_device)

            #Molecules are independent and visited in random order, so each one
            #starts from fresh zero states; this also keeps the autograd graph
            #limited to the current molecule
            hidden = tuple(state.to(model_device)
                           for state in model.init_states(num_layers, hidden_size))

            #Clear gradients left over from the previous step
            optimizer.zero_grad()

            #Run model, calculate the loss of this molecule only
            output, hidden = model(input_data, hidden)
            #Flatten to (time steps, classes) vs (time steps,) without relying
            #on squeeze(), which would also drop a length-1 time axis
            loss = criterion(output.reshape(-1, output.size(-1)), target_data.reshape(-1))

            #Backpropagate loss; the graph is not reused, so it need not be retained
            loss.backward()
            optimizer.step()

            step_loss = loss.item()
            epoch_loss += step_loss
            window_loss += step_loss
            window_steps += 1

            #Report a running mean instead of one line per molecule
            if log_every and window_steps == log_every:
                print("Epoch {} molecule {}/{} mean loss: {}".format(
                    e + 1, i + 1, num_molecules, window_loss / window_steps))
                window_loss = 0.0
                window_steps = 0

        #Guard against an empty dataset before averaging
        if num_molecules > 0:
            print("Epoch {} mean loss: {}".format(e + 1, epoch_loss / num_molecules))
                

In [None]:
#Hyperparameters shared by the model, the optimizer and the training loop
input_size = data.shape[2]   #size of the last axis of the one-hot dataset
hidden_size = 256            #hidden units per LSTM layer
num_layers = 3               #stacked LSTM layers
dropout = 0.2                #dropout value passed to nn.LSTM
learning_rate = 0.001        #learning rate passed to Adam
epochs = 10                  #passes over the dataset
seq_length = 76              #consecutive dataset rows that make up one molecule

In [None]:
#Build the network, then the Adam optimizer over its parameters and the
#cross-entropy loss used to score predictions against integer labels
model = Model(
    input_size = input_size,
    hidden_size = hidden_size,
    num_layers = num_layers,
    dropout = dropout,
)
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
#Run the training loop for the configured number of epochs
train(epochs = epochs)