In [4]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import time
import sys

#Set device
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
    cuda = True
else:
    device = torch.device("cpu")
    cuda = False
    
print("Device =",device)
gpus = [0]

Device = cuda


In [5]:
def time_elapsed(start_time):
    elapsed = time.time() - start_time
    hours = int(elapsed/3600)
    minutes = int(int(elapsed/60)%60)
    seconds = int(elapsed%60)
    
    return hours, minutes, seconds

In [6]:
#Load SMILES data as integer labels and as one-hot encoding
data = np.load("ohesmiles.npz")
data = data["arr_0"]

intdata = np.load("intsmiles.npz")
intdata = intdata["arr_0"]

data = torch.from_numpy(data).view(np.shape(data)[0], 1, np.shape(data)[1])
intdata = torch.from_numpy(intdata)

print("Dataset size: " + str(data.size()))
print("Integer dataset size: " + str(intdata.size()))

Dataset size: torch.Size([34131372, 1, 55])
Integer dataset size: torch.Size([34131372])


In [118]:
#Get input tensor
def inp(i, shuffle, batch_size):

    inp = torch.zeros(seq_length-1, 1, data.shape[-1]).float()
    
    for j in range(batch_size):
        inputs = data[int(shuffle[i] * seq_length * j) : int((shuffle[i] * seq_length * j) + seq_length - 1), :, :].float()
        inp = torch.cat((inp, inputs), 1)
    
    inp = inp[:, 1:, :]
    
    #Input (does not include last character in SMILES)
    return inp

In [119]:
#Get target tensor
def target(i, shuffle, batch_size):
    
    target = torch.zeros(seq_length-1, 1).float()
    
    for j in range(batch_size):
        targets = intdata[int((shuffle[i] * seq_length * j) + 1) : int((shuffle[i] * seq_length * j) + seq_length)].view(-1, 1).float()
        target = torch.cat((target, targets), 1)

    target = target[:, 1:]

    #Target (does not include first character in SMILES)
    return target

In [130]:
#Define model
class Model(nn.Module):
    
    #Define model parameters
    def __init__(self, input_size, hidden_size, num_layers, dropout, batch_size):
        super(Model, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.batch_size = batch_size

        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, batch_size)
        
        self.cuda()  
        
    #Define initial hidden and cell states
    def init_states(self, num_layers, batch_size, hidden_size):
        hidden = [Variable(torch.zeros(num_layers, batch_size, hidden_size)),
                  Variable(torch.zeros(num_layers, batch_size, hidden_size))]
        
        return hidden
    
    #Define forward propagation
    def forward(self, inp, hidden):
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)
        
        return output, hidden
    

In [515]:
#Set start time
start_time = time.time()

#Define training
def train(epochs):
    #Iterate over desired number of epochs 
    for e in range(epochs):
        
        #Get random order of SMILES molecules (shuffle data)
        shuffle = np.arange((np.shape(data)[0] / batch_size) / seq_length)
        random.shuffle(shuffle)
        
        #Iterate over each batch of molecules and their characters in dataset
        for i in range(int((np.shape(data)[0] / batch_size) / seq_length)):
            
            #Initialize hidden and cell states
            hidden = model.init_states(num_layers, batch_size, hidden_size)
            
            #Run on GPU if available
            if cuda:
                hidden = (hidden[0].cuda(), hidden[1].cuda())
        
            #Set initial gradients
            model.zero_grad()
    
            #Set initial loss
            loss = 0 
            
            #Get input and target
            input_data = inp(i, shuffle, batch_size).float()
            target_data = target(i, shuffle, batch_size).long()
            
            #Run on GPU if available
            if cuda:
                input_data = input_data.cuda()
                target_data = target_data.cuda()
                
            #Run model, calculate loss
            output, hidden = model(input_data, hidden)
            print(output.size())
            print(target_data.size())
            loss += criterion(output.squeeze(), target_data.squeeze())
                
            #Backpropagate loss
            loss.backward()
            optimizer.step()
            
            if i % 10 == 0:
                hours, minutes, seconds = time_elapsed(start_time)
                losses.append(loss.data.item() / seq_length)
                losses[0 * e] = loss.data.item() / seq_length
                print("Loss: {:0.8f}".format(loss.data.item() / seq_length) + " | ΔLoss10: {:+0.4f}".format(losses[-1] - losses[-2]) + " | ΔLossTotal: {:+0.4f}".format(losses[-1] - losses[1]) +" | Epoch: {0:02d}".format(e) + " | Iteration: {0:04d}".format(i) + " | Time elapsed: {0:02d}".format(hours) + "h {0:02d}".format(minutes) + "m {0:02d}".format(seconds) + "s")

In [516]:
#Initialize model parameters
input_size = np.shape(data)[2]
hidden_size = 256
num_layers = 3
dropout = .2
learning_rate = 0.0001
epochs = 10
seq_length = 76
batch_size = 1

#List of losses
losses = [0]

In [517]:
#Call model, set optimizer and loss function
model = Model(input_size, hidden_size, num_layers, dropout, batch_size)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

#Run on GPU if available
if cuda:
    model.cuda()
    criterion.cuda()  

In [518]:
#Total number of parameters
total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters in network: " + str(total_params))

Total number of parameters in network: 1373441


In [519]:
#Train
train(epochs)

torch.Size([75, 1, 1])
torch.Size([75, 1])


RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)