In [10]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import random
import time

#Set device
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `cuda` mirrors USE_CUDA as a plain boolean flag used by later cells.
USE_CUDA = torch.cuda.is_available()
cuda = True if USE_CUDA else False
device = torch.device("cuda" if USE_CUDA else "cpu")

print("Device =", device)
gpus = [0]

Device = cuda


In [11]:
def time_elapsed(start_time):
    """Break the wall-clock time since ``start_time`` into clock components.

    Args:
        start_time: a timestamp previously obtained from ``time.time()``.

    Returns:
        Tuple ``(hours, minutes, seconds)`` of ints, where minutes and
        seconds are the remainders modulo 60 and hours is not wrapped.
    """
    delta = time.time() - start_time
    whole_minutes = int(delta / 60)

    return int(delta / 3600), int(whole_minutes % 60), int(delta % 60)

In [12]:
#Load SMILES data as one-hot encoding
# np.load on an .npz returns an archive object backed by an open file handle;
# the context manager closes it once the array has been read into memory.
with np.load("ohesmiles.npz") as archive:
    ohe_array = archive["arr_0"]

# Insert a singleton middle axis: (rows, cols) -> (rows, 1, cols).
# torch.from_numpy shares memory with ohe_array rather than copying it.
data = torch.from_numpy(ohe_array).view(np.shape(ohe_array)[0], 1, np.shape(ohe_array)[1])

print("Dataset size: " + str(data.size()))

Dataset size: torch.Size([24825178, 1, 53])


In [13]:
#Load vocab dictionary as numpy object array
# Each row mixes a character (str), its index and its one-hot values, so the
# array has dtype=object and is stored via pickle. np.load rejects pickled
# object arrays unless allow_pickle=True is passed explicitly.
# NOTE: unpickling can execute arbitrary code -- only load a vocab.npy that
# comes from a trusted source.
vocab = np.load("vocab.npy", allow_pickle=True)
print(vocab)
print("Vocab encodings size: " + str(np.shape(vocab)))

[['\n' 0 1.0 ... 0.0 0.0 0.0]
 ['#' 1 0.0 ... 0.0 0.0 0.0]
 ['(' 2 0.0 ... 0.0 0.0 0.0]
 ...
 ['r' 50 0.0 ... 1.0 0.0 0.0]
 ['s' 51 0.0 ... 0.0 1.0 0.0]
 ['t' 52 0.0 ... 0.0 0.0 1.0]]
Vocab encodings size: (53, 55)


In [14]:
#Define model
class Model(nn.Module):
    """Multi-layer LSTM followed by a linear layer back to ``input_size``.

    ``forward`` returns the raw output of the linear layer; no softmax is
    applied inside the model.

    Args:
        input_size: width of each input vector (and of each output vector).
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(Model, self).__init__()

        #Model parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        #Model layers
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, input_size)

    #Define initial hidden and cell states
    def init_states(self, num_layers=None, hidden_size=None, batch_size=1):
        """Create zero-filled initial hidden and cell states.

        Args:
            num_layers: number of LSTM layers; defaults to ``self.num_layers``.
            hidden_size: hidden width; defaults to ``self.hidden_size``.
            batch_size: size of the middle (batch) axis; defaults to 1.

        Returns:
            A list ``[hidden, cell]`` of two zero tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, allocated with the
            device and dtype of the model's parameters.
        """
        if num_layers is None:
            num_layers = self.num_layers
        if hidden_size is None:
            hidden_size = self.hidden_size

        # Follow the model's parameters so the states do not need a manual
        # .cuda() afterwards (calling it anyway is still harmless).
        ref = next(self.parameters())
        shape = (num_layers, batch_size, hidden_size)

        # torch.autograd.Variable is a deprecated no-op wrapper; plain tensors
        # are used directly.
        hidden = [torch.zeros(shape, device=ref.device, dtype=ref.dtype),
                  torch.zeros(shape, device=ref.device, dtype=ref.dtype)]

        return hidden

    #Define forward propagation
    def forward(self, inp, hidden):
        """Run the LSTM and project every step through the linear layer.

        Args:
            inp: input tensor accepted by ``nn.LSTM`` (last axis ``input_size``).
            hidden: ``(hidden, cell)`` state pair for the LSTM.

        Returns:
            ``(output, hidden)``: the linear layer's output for every step and
            the updated LSTM state pair.
        """
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)

        return output, hidden

In [20]:
#Initialize model and generation parameters

# Network shape; the input width is read from the last axis of the data tensor
input_size = data.size(2)
hidden_size = 1024
num_layers = 3
dropout = 0.2

# Settings that are not referenced by the cells visible in this notebook
learning_rate = 1e-3
epochs = 1
seq_length = 75
batch_size = 128

# Sampling settings passed to generate() by the generation loop
temperature = 0.75
char_to_gen = 10000
runs = 750 #GPU can't handle generating larger amounts of characters at once, so done in a loop
prime_string = "G"

In [21]:
#Call and load model
model = Model(input_size, hidden_size, num_layers, dropout)

# map_location remaps the checkpoint's tensors onto the selected device, so a
# checkpoint saved from a GPU run can still be loaded on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
model.load_state_dict(torch.load("network.pth", map_location=device))

#Run on GPU when available (no-op move when device is the CPU)
model = model.to(device)

# Evaluation mode; the trailing semicolon suppresses the module repr in the notebook output
model.eval();

In [22]:
def generate(prime_string, char_to_gen, temperature):
    """Sample characters from the model one step at a time.

    Uses the notebook-level ``model``, ``vocab``, ``num_layers`` and
    ``hidden_size``. Runs on whichever device the model's parameters are on.

    Args:
        prime_string: a single vocabulary token fed as the first input.
        char_to_gen: number of characters to sample.
        temperature: divisor applied to the model output before the softmax;
            smaller values sharpen the sampling distribution.

    Returns:
        ``(prediction, mol)``: ``prediction`` is a float64 tensor of shape
        ``(char_to_gen + 1, 1, vocab.shape[1] - 2)`` holding the one-hot row of
        the prime token followed by one row per sampled character; ``mol`` is
        the sampled characters joined into one string with every "G" removed.

    Raises:
        ValueError: if ``prime_string`` does not match exactly one vocab row.
    """
    # Follow the model instead of calling .cuda() unconditionally, which
    # crashed on CPU-only machines.
    dev = next(model.parameters()).device

    # One-hot rows live in columns 2+ of vocab. Convert the table once rather
    # than converting a single row on every step.
    one_hot = torch.from_numpy(vocab[:, 2:].astype(float)).to(dev)
    one_hot_input = one_hot.float()

    #Get input tensor from prime string
    prime_rows = np.where(vocab == str(prime_string))[0]
    if len(prime_rows) != 1:
        raise ValueError("prime_string %r must match exactly one vocab entry" % (prime_string,))
    prime_idx = int(prime_rows[0])

    hidden = model.init_states(num_layers, hidden_size)
    hidden = (hidden[0].to(dev), hidden[1].to(dev))

    # Collect per-step results and join once at the end; growing a tensor and
    # a string inside the loop is quadratic in char_to_gen.
    steps = [one_hot[prime_idx].view(1, 1, -1)]
    chars = []
    inp = one_hot_input[prime_idx].view(1, 1, -1)

    # No gradients are needed for sampling. Without no_grad the autograd graph
    # keeps growing through the recurrent state on every step.
    with torch.no_grad():
        for _ in range(char_to_gen):
            #Run model
            output, hidden = model(inp, hidden)

            #Apply softmax to convert output into probabilities
            probs = F.softmax((output / temperature), dim=2).view(-1)

            # Sample from the network as a multinomial distribution
            top_i = int(torch.multinomial(probs, 1)[0])

            #Record the sampled character and feed it back as the next input
            steps.append(one_hot[top_i].view(1, 1, -1))
            chars.append(str(vocab[top_i, 0]))
            inp = one_hot_input[top_i].view(1, 1, -1)

    prediction = torch.cat(steps, 0)

    # "G" is stripped from the text, as the original per-step replace did.
    mol = "".join(chars).replace("G", "")

    return prediction, mol

In [23]:
start_time = time.time()

#File to save generated molecules in
# Opened once in append mode and closed by the context manager, even if the
# loop is interrupted. Previously a new handle was opened on every run and
# never closed.
with open("generatedsmiles.txt", "a") as new:
    for i in range(runs):

        #Generate molecules
        prediction, mol = generate(prime_string, char_to_gen, temperature)

        #Add to file of generated molecules
        new.write(mol)
        # Flush so the run is on disk before it is reported as saved
        new.flush()

        hours, minutes, seconds = time_elapsed(start_time)
        print("SMILES run: " + str(i) + " saved." + " | Time elapsed: {0:02d}".format(hours) + "h {0:02d}".format(minutes) + " m {0:02d}".format(seconds) + " s")

SMILES run: 0 saved. | Time elapsed: 00h 00 m 01 s
SMILES run: 1 saved. | Time elapsed: 00h 00 m 02 s
SMILES run: 2 saved. | Time elapsed: 00h 00 m 03 s
SMILES run: 3 saved. | Time elapsed: 00h 00 m 04 s
SMILES run: 4 saved. | Time elapsed: 00h 00 m 06 s
SMILES run: 5 saved. | Time elapsed: 00h 00 m 07 s
SMILES run: 6 saved. | Time elapsed: 00h 00 m 08 s
SMILES run: 7 saved. | Time elapsed: 00h 00 m 09 s
SMILES run: 8 saved. | Time elapsed: 00h 00 m 11 s
SMILES run: 9 saved. | Time elapsed: 00h 00 m 12 s
SMILES run: 10 saved. | Time elapsed: 00h 00 m 13 s
SMILES run: 11 saved. | Time elapsed: 00h 00 m 14 s
SMILES run: 12 saved. | Time elapsed: 00h 00 m 16 s
SMILES run: 13 saved. | Time elapsed: 00h 00 m 17 s
SMILES run: 14 saved. | Time elapsed: 00h 00 m 18 s
SMILES run: 15 saved. | Time elapsed: 00h 00 m 19 s
SMILES run: 16 saved. | Time elapsed: 00h 00 m 20 s
SMILES run: 17 saved. | Time elapsed: 00h 00 m 22 s
SMILES run: 18 saved. | Time elapsed: 00h 00 m 23 s
SMILES run: 19 saved. 

KeyboardInterrupt: 