In [27]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

#Select the compute device: CUDA when PyTorch reports it as available, CPU otherwise
USE_CUDA = torch.cuda.is_available()
cuda = True if USE_CUDA else False
device = torch.device("cuda" if USE_CUDA else "cpu")

print("Device =", device)

#GPU index list (not referenced by the cells shown here)
gpus = [0]

Device = cuda


In [28]:
#Load SMILES data as one-hot encoding
#np.load on a .npz file returns an archive object holding an open file handle;
#the context manager closes it as soon as the array has been read out.
with np.load("ohesmiles.npz") as smiles_archive:
    data = smiles_archive["arr_0"]

#Insert a singleton middle axis: (rows, width) -> (rows, 1, width)
data = torch.from_numpy(data).view(data.shape[0], 1, data.shape[1])

print("Dataset size: " + str(data.size()))

Dataset size: torch.Size([25246858, 1, 53])


In [29]:
#Load vocab dictionary as numpy object array
#The array mixes strings, ints and floats, so it is stored pickled; numpy refuses
#to read such files unless allow_pickle=True is passed explicitly.
#SECURITY: unpickling can execute arbitrary code - only load a vocab.npy you trust.
vocab = np.load("vocab.npy", allow_pickle=True)
print(vocab)
print("Vocab encodings size: " + str(np.shape(vocab)))

[['\n' 0 1.0 ... 0.0 0.0 0.0]
 ['#' 1 0.0 ... 0.0 0.0 0.0]
 ['(' 2 0.0 ... 0.0 0.0 0.0]
 ...
 ['r' 50 0.0 ... 1.0 0.0 0.0]
 ['s' 51 0.0 ... 0.0 1.0 0.0]
 ['t' 52 0.0 ... 0.0 0.0 1.0]]
Vocab encodings size: (53, 55)


In [30]:
#Define model
class Model(nn.Module):
    """Multi-layer LSTM followed by a linear layer mapping back to the input width.

    Args:
        input_size: Width of each input vector; also the width of each output vector.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout value handed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()

        #Model parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout

        #Model layers (attribute names are part of the state_dict keys - keep them stable)
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, dropout=dropout)
        self.linear = nn.Linear(hidden_size, input_size)

    #Define initial hidden and cell states
    def init_states(self, num_layers, hidden_size, batch_size=1):
        """Build zero-filled initial hidden and cell states.

        The tensors are allocated with the same device and dtype as the model's
        parameters, so they can be passed to ``forward`` without a manual transfer.

        Args:
            num_layers: Size of the first state dimension.
            hidden_size: Size of the last state dimension.
            batch_size: Size of the middle state dimension (defaults to 1).

        Returns:
            A two-element list ``[hidden, cell]``, each of shape
            ``(num_layers, batch_size, hidden_size)``.
        """
        reference = self.linear.weight
        shape = (num_layers, batch_size, hidden_size)

        #new_zeros inherits device/dtype from the reference and does not track gradients
        return [reference.new_zeros(shape), reference.new_zeros(shape)]

    #Define forward propagation
    def forward(self, inp, hidden):
        """Run the LSTM on ``inp`` and project every step through the linear layer.

        Args:
            inp: Input tensor passed straight to ``nn.LSTM``.
            hidden: Pair of (hidden, cell) states passed straight to ``nn.LSTM``.

        Returns:
            ``(output, hidden)``: the linear projection of the LSTM output and the
            updated LSTM states.
        """
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)

        return output, hidden

In [31]:
#Initialize model and generation parameters

#Network dimensions: the input width is read from the last axis of the dataset tensor
input_size = data.size(2)
hidden_size, num_layers = 1024, 3
dropout = 0.2

#Training settings
learning_rate = 1e-3
epochs = 1
seq_length = 75
batch_size = 64

#Sampling settings
prime_string = "G"
char_to_gen = 1000
temperature = 0.7

In [32]:
#Call and load model
model = Model(input_size, hidden_size, num_layers, dropout)

#map_location remaps the stored tensors onto the selected device, so a checkpoint
#saved from a GPU session also loads on a CPU-only machine.
#NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
state_dict = torch.load("jan15LSTM1024-1.pth", map_location=device)
model.load_state_dict(state_dict)

#Run on the selected device (GPU when available)
model = model.to(device)

#Switch to evaluation mode for generation; trailing ';' hides the module repr
model.eval();

In [33]:
def generate(prime_string, char_to_gen, temperature):
    """Sample a character sequence from the notebook-level ``model``.

    Uses the notebook-level ``vocab`` array, reading column 0 of each row as the
    character and columns 2 onward as that character's one-hot encoding.
    The model is used as-is; call ``model.eval()`` beforehand if dropout should be off.

    Args:
        prime_string: One or more vocabulary characters that start the sequence.
            All but the last only advance the recurrent state; sampling starts
            from the last one.
        char_to_gen: Number of characters to sample after the prime.
        temperature: Positive divisor applied to the logits before sampling;
            lower values make the sampling greedier.

    Returns:
        ``(prediction, mol)`` where ``prediction`` stacks the one-hot encodings of
        the prime and every sampled character along dimension 0, and ``mol`` is
        the corresponding string (prime followed by the sampled characters).

    Raises:
        ValueError: If ``prime_string`` is empty or contains a character missing
            from ``vocab``, or if ``temperature`` is not positive.
    """
    prime_string = str(prime_string)
    if not prime_string:
        raise ValueError("prime_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    #Follow the model's own device instead of assuming CUDA is present
    run_device = next(model.parameters()).device

    #Character -> row lookup restricted to the character column of vocab
    row_of = {}
    for row, symbol in enumerate(vocab[:, 0]):
        row_of.setdefault(str(symbol), row)

    def encode(row):
        #One-hot encoding of a vocab row as a (1, 1, width) tensor on the run device
        one_hot = vocab[row, 2:].astype(float)
        return torch.from_numpy(one_hot).view(1, 1, -1).to(run_device)

    #One-hot steps of the sequence so far, starting with the prime characters
    steps = []
    for symbol in prime_string:
        if symbol not in row_of:
            raise ValueError("character %r is not in the vocabulary" % symbol)
        steps.append(encode(row_of[symbol]))

    #SMILES character string
    chars = [prime_string]

    hidden = model.init_states(model.num_layers, model.hidden_size)
    hidden = (hidden[0].to(run_device), hidden[1].to(run_device))

    #no_grad keeps the recurrent state from accumulating an autograd graph
    with torch.no_grad():
        #Advance the state through every prime character except the last
        for step in steps[:-1]:
            _, hidden = model(step.float(), hidden)

        inp = steps[-1].float()
        for _ in range(char_to_gen):
            #Run model
            output, hidden = model(inp, hidden)

            #Sample from the network as a multinomial distribution; softmax is the
            #normalised, overflow-safe form of exp(logits / temperature)
            probs = torch.softmax(output.view(-1) / temperature, dim=0)
            top_i = int(torch.multinomial(probs, 1)[0])

            #Record the sampled character and feed it back as the next input
            char = encode(top_i)
            steps.append(char)
            chars.append(str(vocab[top_i, 0]))
            inp = char.float()

    #Concatenate once at the end rather than growing the tensor every step
    prediction = torch.cat(steps, 0)
    mol = "".join(chars)

    return prediction, mol

In [34]:
#Sample a sequence with the configured prime string, length and temperature
prediction, mol = generate(
    prime_string=prime_string,
    char_to_gen=char_to_gen,
    temperature=temperature,
)

#Show the generated SMILES text
print(mol)

GCOc1ccccc1N2CCN(CCN3C=CC(=O)CC3)c4cc(Cl)ccc4l
GCCOC(=O)c1ncn2CCCN(Cc3ccc(cc3)C(=O)N)cc1O
GCCC1(C)CC(=O)N(CCN2CCN(CC2)c3ccccn3)C(=O)C4=C(CCC4)O1
GCCc1ccc2nc3c(cccc3cc2c1)C(=O)NCCN(C)C
GCCCCN(C)C(=O)[C@H](C1CCCCC1)NC(=O)[C@H](CC(C)C)NC(=O)Cc2cc(OC)cc(OC)c2
GCCNCC(O)COc1ccc2C(=O)C=C(Oc2c1)c3ccccc3
GOC(=O)CCNC(=O)NNC(=O)CCCNc1ccccn1
GFc1ccc(cc1)C2OOC3C4CCC(C4)C3(OO2)c5ccccc5
G[I-].C[C@@H]1O[C@H](C[N+](C)(C)C)CS1(=O)=O)C(Cc2ccccc2)NC(=O)NCc3ccccc3
GC[C@@H](Nc1nc(N)c2cnn(c3ccccc3)c2n1)C(=O)OCC(C)(C)C
GCOc1ccc(cc1OC)N(C)Cc2ccc3nc(N)nc(N)c3n2
GCOC1=C(N)C(=O)c2c(ccnc2c3ccccn3)C1=O
GCCCCN(C)C(=O)[C@H](C1CCCCC1)NC(=O)[C@H](CC(C)C)NC(=O)Cc2cccc(C)c2
GCc1cc2c(OC[C@@H](O)CN3CCC(CC3)c4cc5cc(Cl)ccc5s4)cccc2[nH]1
GCCCOc1cccc(c1)N2CCN(CCCN3C(=O)CCc4c(Cl)cccc34)CC2
GCCOC(=O)c1ncc(n1)c2ccc(CCc3ccccc3)cc2
GCCC(CCCN1CCCC(C1)C2OC3CC3)CC2
GOC(=O)c1ccc(cc1)C(=O)Nc2ccc3c(OCc4cccc(Br)cc4)cccc3c2
GCOc1cccc2CN(CCCc3c[nH]c4ccc(F)cc34)CCc12
GOC(=O)CN1C(=O)N(Cc2ccccc2)C(=O)C1=O
GCC(=O)c1ccc(NC(=O)CCN2CCN(CC2)c3ccccn