In [27]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

#Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
#Boolean mirror of USE_CUDA, kept as a separate name for later cells
cuda = USE_CUDA

print("Device =", device)
#List of GPU indices; not read anywhere in the visible cells
gpus = [0]

Device = cuda


In [28]:
#Load SMILES data as one-hot encoding
#np.load on an .npz returns a lazy archive that keeps the file open; the
#context manager closes it once the array has been read into memory.
with np.load("ohesmiles.npz") as archive:
    data_np = archive["arr_0"]

#Insert a singleton middle axis: (rows, cols) -> (rows, 1, cols)
rows, cols = np.shape(data_np)[0], np.shape(data_np)[1]
data = torch.from_numpy(data_np).view(rows, 1, cols)

print("Dataset size: " + str(data.size()))

Dataset size: torch.Size([25246858, 1, 53])


In [29]:
#Load vocab dictionary as numpy object array
#Object arrays are stored pickled; NumPy >= 1.16.3 refuses to load them unless
#allow_pickle=True is passed explicitly.
#NOTE(security): unpickling executes arbitrary code - only load a vocab.npy
#that comes from a trusted source.
vocab = np.load("vocab.npy", allow_pickle=True)
print(vocab)
print("Vocab encodings size: " + str(np.shape(vocab)))

[['\n' 0 1.0 ... 0.0 0.0 0.0]
 ['#' 1 0.0 ... 0.0 0.0 0.0]
 ['(' 2 0.0 ... 0.0 0.0 0.0]
 ...
 ['r' 50 0.0 ... 1.0 0.0 0.0]
 ['s' 51 0.0 ... 0.0 1.0 0.0]
 ['t' 52 0.0 ... 0.0 0.0 1.0]]
Vocab encodings size: (53, 55)


In [30]:
#Define model
class Model(nn.Module):
    """Stacked LSTM followed by a linear layer that maps back to the input width.

    Args:
        input_size: size of the last axis of the input; also the width of the
            linear layer's output.
        hidden_size: number of features in each LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(Model, self).__init__()

        #Model parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        #Model layers
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, dropout=dropout)
        self.linear = nn.Linear(hidden_size, input_size)

    #Define initial hidden and cell states
    def init_states(self, num_layers=None, hidden_size=None, batch_size=1):
        """Return zero-filled initial ``[hidden, cell]`` states for the LSTM.

        Args:
            num_layers: first axis of each state; defaults to ``self.num_layers``.
            hidden_size: last axis of each state; defaults to ``self.hidden_size``.
            batch_size: middle axis of each state (default 1, the value the
                original implementation hard-coded).

        Returns:
            A two-element list of tensors, each of shape
            ``(num_layers, batch_size, hidden_size)``, allocated with the same
            dtype and device as the model's parameters so they can be fed to
            ``forward`` without a further transfer.
        """
        if num_layers is None:
            num_layers = self.num_layers
        if hidden_size is None:
            hidden_size = self.hidden_size

        #Follow the weights: a model moved to GPU / cast to another dtype gets
        #matching states (plain tensors replace the deprecated Variable wrapper).
        reference = next(self.parameters())
        shape = (num_layers, batch_size, hidden_size)
        hidden = [torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                  torch.zeros(shape, dtype=reference.dtype, device=reference.device)]

        return hidden

    #Define forward propagation
    def forward(self, inp, hidden):
        """Run the LSTM on ``inp`` from state ``hidden``, then the linear layer.

        Returns:
            ``(output, hidden)``: the linear layer applied to the LSTM output,
            and the updated state returned by the LSTM.
        """
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)

        return output, hidden

In [41]:
#Network architecture and training hyperparameters
input_size = data.shape[2]  #width of the last axis of the loaded dataset
hidden_size, num_layers = 1024, 3
dropout = 0.2
learning_rate = 1e-3
epochs = 1
seq_length = 75
batch_size = 64

#Sampling settings passed to generate()
char_to_gen = 10000
temperature = 0.7
prime_string = "G"

In [36]:
#Call and load model
model = Model(input_size, hidden_size, num_layers, dropout)
#map_location remaps the stored tensors onto the selected device, so a
#checkpoint that holds CUDA tensors still loads on a CPU-only machine.
state_dict = torch.load("jan15LSTM1024-1.pth", map_location=device)
model.load_state_dict(state_dict)

#Run on the selected device (GPU when available, otherwise CPU)
model = model.to(device)

#Switch to evaluation mode; the trailing ';' suppresses the module repr
model.eval();

In [42]:
def generate(prime_string, char_to_gen, temperature):
    """Sample ``char_to_gen`` characters from the global ``model``.

    Relies on the notebook globals ``model`` (a ``Model`` instance) and
    ``vocab`` (object array whose column 0 holds the symbol and whose columns
    2+ hold that symbol's one-hot row).

    Args:
        prime_string: one or more vocabulary characters used to start the
            sequence. All but the last are fed through the model first to
            warm up its state; the last is the first sampling input.
        char_to_gen: number of characters to sample.
        temperature: positive divisor applied to the logits before sampling;
            lower values concentrate probability on the highest logits.

    Returns:
        ``(prediction, mol)``: ``prediction`` is a float64 tensor of shape
        ``(len(prime_string) + char_to_gen, 1, vocab_width)`` with the one-hot
        row of every prime and sampled character, on the model's device;
        ``mol`` is the prime plus the sampled characters with every "G"
        removed.

    Raises:
        ValueError: if ``prime_string`` is empty or contains a character that
            is not in ``vocab``, or if ``temperature`` is not positive.
    """
    prime = str(prime_string)
    if not prime:
        raise ValueError("prime_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    #Run wherever the model's weights live instead of assuming a GPU
    run_device = next(model.parameters()).device

    #Symbol -> row lookup, and the one-hot table moved to the device once
    row_of = {str(symbol): row for row, symbol in enumerate(vocab[:, 0])}
    one_hot = torch.from_numpy(vocab[:, 2:].astype(float)).to(run_device)

    unknown = [ch for ch in prime if ch not in row_of]
    if unknown:
        raise ValueError("prime_string contains characters missing from vocab: "
                         + repr(unknown))

    #One (1, 1, vocab_width) tensor per character; concatenated once at the end
    steps = [one_hot[row_of[ch]].view(1, 1, -1) for ch in prime]
    sampled = []

    hidden = model.init_states(model.num_layers, model.hidden_size)
    hidden = (hidden[0].to(run_device), hidden[1].to(run_device))

    #Without no_grad the autograd graph would keep growing through `hidden`
    #across every step of the loop.
    with torch.no_grad():
        #Warm up the state on every prime character except the last
        for step in steps[:-1]:
            _, hidden = model(step.float(), hidden)

        for _ in range(char_to_gen):
            #The most recent character is the next input
            output, hidden = model(steps[-1].float(), hidden)

            #Sample from the network as a multinomial distribution.
            #Subtracting the max logit keeps exp() from overflowing and leaves
            #the relative weights (hence the distribution) unchanged.
            logits = output.view(-1)
            weights = ((logits - logits.max()) / temperature).exp()
            top_i = int(torch.multinomial(weights, 1)[0])

            steps.append(one_hot[top_i].view(1, 1, -1))
            sampled.append(str(vocab[top_i, 0]))

    prediction = torch.cat(steps, 0)

    #"G" is stripped from the text, as in the original implementation
    #(presumably it is the start-of-sequence marker - TODO confirm).
    mol = (prime + "".join(sampled)).replace("G", "")

    return prediction, mol

In [43]:
#Sample a sequence with the configured settings and show the decoded text
prediction, mol = generate(
    prime_string=prime_string,
    char_to_gen=char_to_gen,
    temperature=temperature,
)
print(mol)

COc1ccccc1N2CCN(CCN3C=CC(=O)CC3)CC2
CCOC(=O)c1ncn2c1CN(C)C(=O)c3cc(C=C=C)ccc23
Oc1c(I)cc(I)cc1C(=O)Nc2ccc3c(OCc4cccc(Br)cc4)cccc3c2
COc1cccc2NC(=O)\C(=C/c3[nH]c4C(=O)OCCc34)\c12
CN1CCN(CCCNc2ncc3cc(c(NC(=O)NCCNC(=O)OC(C)(C)C)nc3n2)c4c(Cl)ccc4Cl)CC1
CC(=O)c1ccc(NC(=O)CCN2CCN(CC2)c3ccccn3)cc1
Cc1ccccc1NC(=O)NC2CC3CCC(C2)N3C
Nc1ncc2cc(c(N)nc2n1)c3c(Br)cccc3Br
CC(=CCC(N(Cc1ccc(C)cc1)Cc2cc(on2)c3ccccc3)C(=O)N)C
Cc1cccc(NC(=O)CCN2CCN(CC2)c3ccccn3)c1
CC1[C@H]2Cc3ccc(NC=O)cc3[C@]1(C)CN2CC4CC4
COc1cccc(NC(=O)Nc2ccnc3ccccc23)c1
NC(=O)C(Cc1c[nH]cn1)N(Cc2ccc(cc2)C#N)Cc3cc(on3)c4ccccc4
Fc1ccc(cc1)N2CN(CCN3C(=O)CCc4c(Cl)cccc34)CC2
CCOC(=O)c1ncn2c1CN(C)Cc3ccccc3Br
OC(=O)CC(Cc1ccccc1)NC(=O)C(Cc2ccccc2)C(=O)O
COc1cc(cc1OC)N(C)Cc2ccc3nc(N)nc(N)c3n2
COC1=C(N)C(=O)c2c(ccnc2c3ccccn3)C1=O
CC1=C(Sc2ccccn2)N(COCCSc3ccccc3)C(=O)NC1=O
COc1cccc2N(CCCCN3CCN(CC3)C4CCCCC4)C(=NC2=O)SCc5ccc(F)cc5
C[C@H]1[C@H](O)[C@@H](NCc2ccccc2)c3ccccc3N1C(=O)c4ccc(C)cc4
COC(=O)c1c(C)nc2c(c1N)c3ccccc3n2C4CCCC4
CNc1c(C(=O)OC)c(C)n2c1