In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
import random
import time

#Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
USE_CUDA = torch.cuda.is_available()
cuda = USE_CUDA
device = torch.device("cuda" if USE_CUDA else "cpu")

print("Device =", device)
gpus = [0]

In [None]:
def time_elapsed(start_time):
    """Split the wall-clock time since `start_time` into hours, minutes and seconds.

    Args:
        start_time: a reference timestamp as returned by time.time().

    Returns:
        A tuple (hours, minutes, seconds) of ints, where minutes and seconds
        are the remainders within the current hour and minute respectively.
    """
    delta = time.time() - start_time

    #Whole minutes since the start; reduced modulo 60 below to get the minute-of-hour
    whole_minutes = int(delta / 60)

    return int(delta / 3600), int(whole_minutes % 60), int(delta % 60)

In [None]:
#Read the one-hot encoded SMILES array stored under the default "arr_0" key of the archive
data = np.load("ohesmiles.npz")["arr_0"]

#Wrap the array as a tensor and insert a singleton middle axis: (rows, cols) -> (rows, 1, cols)
data = torch.from_numpy(data).view(data.shape[0], 1, data.shape[1])

print("Dataset size: " + str(data.size()))

In [None]:
#Load vocab dictionary as numpy object array
#NumPy >= 1.16.3 refuses to read object arrays unless allow_pickle=True is passed.
#Unpickling can execute arbitrary code, so only load a vocab.npy that you created or trust.
vocab = np.load("vocab.npy", allow_pickle=True)
print(vocab)
print("Vocab encodings size: " + str(np.shape(vocab)))

In [None]:
#Define model
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer that maps back to the input width.

    Args:
        input_size: width of each input vector; also the width of the output.
        hidden_size: number of features in the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability handed to nn.LSTM.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(Model, self).__init__()

        #Model parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        #Model layers
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, input_size)

    #Define initial hidden and cell states
    def init_states(self, num_layers, hidden_size, batch_size=1):
        """Return zero-filled initial hidden and cell states.

        The tensors are allocated with the dtype and device of the model's own
        parameters, so they can be fed to forward() without a separate transfer
        or cast.

        Args:
            num_layers: size of the first axis of each state tensor.
            hidden_size: size of the last axis of each state tensor.
            batch_size: size of the middle axis; defaults to 1.

        Returns:
            A list [hidden, cell] of two zero tensors, each shaped
            (num_layers, batch_size, hidden_size).
        """
        #Any parameter works as a template for dtype/device
        reference = next(self.parameters())

        return [reference.new_zeros(num_layers, batch_size, hidden_size) for _ in range(2)]

    #Define forward propagation
    def forward(self, inp, hidden):
        """Run the LSTM and project every step back to the input width.

        Args:
            inp: input tensor for nn.LSTM (sequence-first, since batch_first is not set).
            hidden: the (hidden, cell) state pair to start from.

        Returns:
            A pair (output, hidden): the linear-layer output for every step and
            the updated (hidden, cell) state pair from the LSTM.
        """
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)

        return output, hidden

In [None]:
#Network dimensions: the input width is taken from the last axis of the loaded data
input_size = data.size(2)
hidden_size = 1024
num_layers = 3
dropout = 0.2

#Optimisation and batching values
learning_rate = 1e-3
seq_length = 75
batch_size = 128

#Sampling controls
temperature = 1
char_to_gen = 10000
prime_string = "G"

#Generation is repeated in a loop because the GPU can't handle generating larger amounts of characters at once
runs = 100

In [None]:
#Build the network and restore the saved weights
model = Model(input_size, hidden_size, num_layers, dropout)

#map_location remaps the stored tensors onto the selected device, so a checkpoint
#saved from a GPU can also be loaded on a CPU-only machine
model.load_state_dict(torch.load("network.pth", map_location=device))

#Run on the selected device (GPU when available, otherwise CPU)
model = model.to(device)

#Switch to evaluation mode; the trailing ";" suppresses the module repr in the cell output
model.eval();

In [None]:
def generate(prime_string, char_to_gen, temperature):
    """Sample characters from the model one step at a time.

    Reads the notebook-level `model`, `vocab`, `num_layers` and `hidden_size`.
    Column 0 of `vocab` is used as the character and columns 2 onward as the
    numeric encoding fed to the model. The hidden state is reset to zeros each
    time a newline character is sampled.

    Args:
        prime_string: vocabulary entry whose encoding is the first model input.
        char_to_gen: number of characters to sample.
        temperature: divisor applied to the model output before the softmax.

    Returns:
        A pair (prediction, mol). `prediction` stacks, along axis 0, the
        encoding of the prime followed by the encoding of every sampled
        character. `mol` is the sampled characters joined into one string with
        every "G" removed.

    Raises:
        ValueError: if `prime_string` does not occur in `vocab`.
    """
    #Follow the model's device instead of calling .cuda() unconditionally,
    #so generation also works on CPU-only machines
    run_device = next(model.parameters()).device

    def fresh_states():
        #Zeroed (hidden, cell) pair on the same device as the model
        return tuple(state.to(run_device) for state in model.init_states(num_layers, hidden_size))

    #Locate the prime string in the vocabulary
    prime_rows = np.where(vocab == str(prime_string))[0]
    if prime_rows.size == 0:
        raise ValueError("prime_string " + repr(prime_string) + " not found in vocab")

    #Convert every vocabulary encoding once, rather than on each sampling step
    encodings = torch.from_numpy(vocab[:, 2:].astype(float)).to(run_device)

    #Encodings collected per step; concatenated once at the end
    steps = [torch.from_numpy(vocab[prime_rows, 2:].astype(float)).view(1, 1, -1).to(run_device)]

    #Sampled characters; joined once at the end
    chars = []

    hidden = fresh_states()

    #Inference only: skip building the autograd graph
    with torch.no_grad():
        for _ in range(char_to_gen):
            #The most recent encoding is the next input
            inp = steps[-1].float()

            #Run model
            output, hidden = model(inp, hidden)

            #Apply softmax to convert output into probabilities
            probabilities = F.softmax((output / temperature), dim=2).view(-1)

            #Sample the next vocabulary row from the multinomial distribution
            top_i = int(torch.multinomial(probabilities, 1)[0])
            steps.append(encodings[top_i].view(1, 1, -1))

            #SMILES character predicted
            smile = vocab[top_i, 0]

            #A newline ends the current molecule, so start the next one from zeroed states
            if smile == "\n":
                hidden = fresh_states()

            chars.append(str(smile))

    prediction = torch.cat(steps, 0)

    #Drop the "G" start token from the generated text
    mol = "".join(chars).replace("G", "")

    return prediction, mol

In [None]:
start_time = time.time()

for i in range(runs):

    #Append to the file of generated molecules; the with-block closes (and flushes)
    #the handle after every run, so completed runs are on disk even if a later one fails
    with open("gen.txt", "a") as new:

        #Generate molecules
        prediction, mol = generate(prime_string, char_to_gen, temperature)

        #Add to file of generated molecules
        new.write(mol)

    hours, minutes, seconds = time_elapsed(start_time)
    print("SMILES run: {0} saved. | Time elapsed: {1:02d}h {2:02d} m {3:02d} s".format(i, hours, minutes, seconds))