In [23]:
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import time
import sys

#Set device
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
    cuda = True
else:
    device = torch.device("cpu")
    cuda = False
    
print("Device =", device)
gpus = [0]

Device = cuda


In [24]:
def time_elapsed(start_time):
    elapsed = time.time() - start_time
    hours = int(elapsed/3600)
    minutes = int(int(elapsed/60)%60)
    seconds = int(elapsed%60)
    
    return hours, minutes, seconds

In [25]:
#Load SMILES data as integer labels and as one-hot encoding
data = np.load("ohesmiles.npz")
data = data["arr_0"]

intdata = np.load("intsmiles.npz")
intdata = intdata["arr_0"]

data = torch.from_numpy(data).view(np.shape(data)[0], 1, np.shape(data)[1])
intdata = torch.from_numpy(intdata)

print("Dataset size: " + str(data.size()))
print("Integer dataset size: " + str(intdata.size()))

Dataset size: torch.Size([24825178, 1, 53])
Integer dataset size: torch.Size([24825178])


In [102]:
#Get input tensor
def inptarg(i, batch_size):
    while True:
        try:
            #Define input and target tensor sizes
            inp = torch.Tensor((seq_length - 1) * batch_size, 1, np.shape(data)[2])
            target = torch.Tensor((seq_length - 1) * batch_size)
    
            #SMILES molecules in batch 
            inputs = data[int((seq_length * i)) : int((seq_length * i) + (seq_length * batch_size)), :, :]
            targets = intdata[int((seq_length * i)) : int((seq_length * i) + (seq_length * batch_size))]

            #Index counters for input, target
            r = 0
            s = 0 
            for p in range(seq_length * batch_size - 1):
        
                if (p % seq_length != (seq_length - 1)):
                    #Input data (does not include last character in SMILES)
                    inp[r, :, :] = inputs[p, :, :]
                    r += 1
            
                if (p % seq_length != 0):
                    #Target data (does not include first character in SMILES)
                    target[s] = targets[p]
                    s += 1
                    
            return inp, target
        
        except:
            continue
        break

In [103]:
#Define model
class Model(nn.Module):
    
    #Define model parameters
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(Model, self).__init__()
        
        #Model parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        
        #Model layers
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, input_size)
        
    #Define initial hidden and cell states
    def init_states(self, num_layers, hidden_size):
        hidden = [Variable(torch.zeros(num_layers, 1, hidden_size)),
                  Variable(torch.zeros(num_layers, 1, hidden_size))]
         
        #Initialize forget gate bias to 1    
        for names in self.lstm._all_weights:
            for name in filter(lambda n: "bias" in n, names):
                bias = getattr(self.lstm, name)
                n = bias.size(0)
                start, end = n//4, n//2
                
                nn.init.constant_(bias.data[start:end], 1.0)
        
        return hidden
    
    #Define forward propagation
    def forward(self, inp, hidden):
        #LSTM
        output, hidden = self.lstm(inp, hidden)

        #Linear Layer
        output = self.linear(output)
        
        return output, hidden

In [109]:
#Define training
def train():
    #Set start time
    start_time = time.time()
        
    #Iterate set of seq_length characters
    for i in range(int((np.shape(data)[0] - seq_length + 1) / batch_size)):
            
        #Initialize hidden and cell states
        hidden = model.init_states(num_layers, hidden_size)
            
        #Run on GPU if available
        if cuda:
            hidden = (hidden[0].cuda(), hidden[1].cuda())
        
        #Set initial gradients
        model.zero_grad()
    
        #Set initial loss
        loss = 0 
            
        #Get input and target
        input_data, target_data = inptarg(i, batch_size)
        input_data = input_data.float()
        target_data = target_data.long()
            
        #Run on GPU if available
        if cuda:
            input_data = input_data.cuda()
            target_data = target_data.cuda()
                
        #Run model, calculate loss
        output, hidden = model(input_data, hidden)
        loss += criterion(output.squeeze(), target_data.squeeze())
                
        #Backpropagate loss
        loss.backward()
        
        #Clip gradients
        nn.utils.clip_grad_norm_(model.parameters(), 3.0)
        
        #Optimize
        optimizer.step()
            
        #Update list of losses
        if (i % 50 == 0):
            losses[0] = loss.data.item() / seq_length   
        losses.append(loss.data.item() / seq_length)
            
        #Intermediary saves
        if (i % 1000 == 0):
                torch.save(model.state_dict(), "network.pth")

        #Print training info
        hours, minutes, seconds = time_elapsed(start_time)
        print("Loss: {:0.6f}".format(loss.data.item() / seq_length) + " | ΔLossTotal: {:+0.4f}".format(losses[-1] - losses[1]) + " | Iteration: {0:04d}".format(i + 1) + " | Time elapsed: {0:02d}".format(hours) + "h {0:02d}".format(minutes) + " m {0:02d}".format(seconds) + " s")

In [110]:
#Initialize model parameters
input_size = np.shape(data)[2]
hidden_size = 1024
num_layers = 3
dropout = .2
learning_rate = 0.001
seq_length = 75
batch_size = 128

#List of losses
losses = [0]

In [111]:
#Call model, set optimizer and loss function
model = Model(input_size, hidden_size, num_layers, dropout)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss()

#Run on GPU if available
if cuda:
    model.cuda()
    criterion.cuda()

In [112]:
#Total number of parameters
total_params = sum(p.numel() for p in model.parameters())
print("Total number of parameters in network: " + str(total_params))

Total number of parameters in network: 21267509


In [113]:
#Train
train()

Loss: 0.053576 | ΔLossTotal: +0.0000 | Iteration: 0001 | Time elapsed: 00h 00 m 04 s
Loss: 0.048896 | ΔLossTotal: -0.0047 | Iteration: 0002 | Time elapsed: 00h 00 m 08 s
Loss: 0.039394 | ΔLossTotal: -0.0142 | Iteration: 0003 | Time elapsed: 00h 00 m 13 s
Loss: 0.037177 | ΔLossTotal: -0.0164 | Iteration: 0004 | Time elapsed: 00h 00 m 17 s
Loss: 0.036670 | ΔLossTotal: -0.0169 | Iteration: 0005 | Time elapsed: 00h 00 m 21 s
Loss: 0.036317 | ΔLossTotal: -0.0173 | Iteration: 0006 | Time elapsed: 00h 00 m 26 s
Loss: 0.036205 | ΔLossTotal: -0.0174 | Iteration: 0007 | Time elapsed: 00h 00 m 30 s
Loss: 0.036008 | ΔLossTotal: -0.0176 | Iteration: 0008 | Time elapsed: 00h 00 m 34 s
Loss: 0.036190 | ΔLossTotal: -0.0174 | Iteration: 0009 | Time elapsed: 00h 00 m 39 s
Loss: 0.036199 | ΔLossTotal: -0.0174 | Iteration: 0010 | Time elapsed: 00h 00 m 43 s
Loss: 0.036084 | ΔLossTotal: -0.0175 | Iteration: 0011 | Time elapsed: 00h 00 m 47 s
Loss: 0.036081 | ΔLossTotal: -0.0175 | Iteration: 0012 | Time ela

Loss: 0.035920 | ΔLossTotal: -0.0177 | Iteration: 0098 | Time elapsed: 00h 07 m 04 s
Loss: 0.035897 | ΔLossTotal: -0.0177 | Iteration: 0099 | Time elapsed: 00h 07 m 09 s
Loss: 0.035955 | ΔLossTotal: -0.0176 | Iteration: 0100 | Time elapsed: 00h 07 m 13 s
Loss: 0.035982 | ΔLossTotal: -0.0176 | Iteration: 0101 | Time elapsed: 00h 07 m 17 s
Loss: 0.035963 | ΔLossTotal: -0.0176 | Iteration: 0102 | Time elapsed: 00h 07 m 22 s
Loss: 0.035952 | ΔLossTotal: -0.0176 | Iteration: 0103 | Time elapsed: 00h 07 m 26 s
Loss: 0.035953 | ΔLossTotal: -0.0176 | Iteration: 0104 | Time elapsed: 00h 07 m 30 s
Loss: 0.035990 | ΔLossTotal: -0.0176 | Iteration: 0105 | Time elapsed: 00h 07 m 35 s
Loss: 0.035968 | ΔLossTotal: -0.0176 | Iteration: 0106 | Time elapsed: 00h 07 m 39 s
Loss: 0.035961 | ΔLossTotal: -0.0176 | Iteration: 0107 | Time elapsed: 00h 07 m 43 s
Loss: 0.035947 | ΔLossTotal: -0.0176 | Iteration: 0108 | Time elapsed: 00h 07 m 48 s
Loss: 0.035978 | ΔLossTotal: -0.0176 | Iteration: 0109 | Time ela

Loss: 0.035805 | ΔLossTotal: -0.0178 | Iteration: 0195 | Time elapsed: 00h 14 m 03 s
Loss: 0.035767 | ΔLossTotal: -0.0178 | Iteration: 0196 | Time elapsed: 00h 14 m 08 s
Loss: 0.035832 | ΔLossTotal: -0.0177 | Iteration: 0197 | Time elapsed: 00h 14 m 12 s
Loss: 0.035874 | ΔLossTotal: -0.0177 | Iteration: 0198 | Time elapsed: 00h 14 m 16 s
Loss: 0.035851 | ΔLossTotal: -0.0177 | Iteration: 0199 | Time elapsed: 00h 14 m 21 s
Loss: 0.035825 | ΔLossTotal: -0.0178 | Iteration: 0200 | Time elapsed: 00h 14 m 25 s
Loss: 0.035824 | ΔLossTotal: -0.0178 | Iteration: 0201 | Time elapsed: 00h 14 m 30 s
Loss: 0.035800 | ΔLossTotal: -0.0178 | Iteration: 0202 | Time elapsed: 00h 14 m 34 s
Loss: 0.035816 | ΔLossTotal: -0.0178 | Iteration: 0203 | Time elapsed: 00h 14 m 38 s
Loss: 0.035843 | ΔLossTotal: -0.0177 | Iteration: 0204 | Time elapsed: 00h 14 m 43 s
Loss: 0.035847 | ΔLossTotal: -0.0177 | Iteration: 0205 | Time elapsed: 00h 14 m 47 s
Loss: 0.035870 | ΔLossTotal: -0.0177 | Iteration: 0206 | Time ela

Loss: 0.032524 | ΔLossTotal: -0.0211 | Iteration: 0292 | Time elapsed: 00h 21 m 03 s
Loss: 0.032335 | ΔLossTotal: -0.0212 | Iteration: 0293 | Time elapsed: 00h 21 m 07 s
Loss: 0.032148 | ΔLossTotal: -0.0214 | Iteration: 0294 | Time elapsed: 00h 21 m 11 s
Loss: 0.032142 | ΔLossTotal: -0.0214 | Iteration: 0295 | Time elapsed: 00h 21 m 16 s
Loss: 0.032133 | ΔLossTotal: -0.0214 | Iteration: 0296 | Time elapsed: 00h 21 m 20 s
Loss: 0.031982 | ΔLossTotal: -0.0216 | Iteration: 0297 | Time elapsed: 00h 21 m 24 s
Loss: 0.031720 | ΔLossTotal: -0.0219 | Iteration: 0298 | Time elapsed: 00h 21 m 29 s
Loss: 0.031606 | ΔLossTotal: -0.0220 | Iteration: 0299 | Time elapsed: 00h 21 m 33 s
Loss: 0.031746 | ΔLossTotal: -0.0218 | Iteration: 0300 | Time elapsed: 00h 21 m 37 s
Loss: 0.032028 | ΔLossTotal: -0.0215 | Iteration: 0301 | Time elapsed: 00h 21 m 42 s
Loss: 0.032118 | ΔLossTotal: -0.0215 | Iteration: 0302 | Time elapsed: 00h 21 m 46 s
Loss: 0.030860 | ΔLossTotal: -0.0227 | Iteration: 0303 | Time ela

KeyboardInterrupt: 

In [28]:
torch.save(model.state_dict(), "network.pth")

In [29]:
with open('losses.txt', 'w') as f:
    for item in losses:
        f.write("%s\n" % item)