In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

#Set device
USE_CUDA = torch.cuda.is_available()
if USE_CUDA:
    device = torch.device("cuda")
    cuda = True
else:
    device = torch.device("cpu")
    cuda = False
    
print("Device =", device)
gpus = [0]

Device = cuda


In [14]:
#Load SMILES data as integer labels and as one-hot encoding
data = np.load("ohesmiles.npz")
data = data["arr_0"]

intdata = np.load("intsmiles.npz")
intdata = intdata["arr_0"]

data = torch.from_numpy(data).view(np.shape(data)[0], 1, np.shape(data)[1])
intdata = torch.from_numpy(intdata)

print("Dataset size: " + str(data.size()))
print("Integer dataset size: " + str(intdata.size()))

Dataset size: torch.Size([34131372, 1, 55])
Integer dataset size: torch.Size([34131372])


In [97]:
#Load vocab dictionary as numpy object array
vocab = np.load("vocab.npy")
print(vocab)
print("Vocab encodings size: " + str(np.shape(vocab)))

[['\n' 0 1.0 ... 0.0 0.0 0.0]
 ['#' 1 0.0 ... 0.0 0.0 0.0]
 ['(' 2 0.0 ... 0.0 0.0 0.0]
 ...
 ['u' 52 0.0 ... 1.0 0.0 0.0]
 ['{' 53 0.0 ... 0.0 1.0 0.0]
 ['}' 54 0.0 ... 0.0 0.0 1.0]]
Vocab encodings size: (55, 57)


In [92]:
#Define model
class Model(nn.Module):
    
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super(Model, self).__init__()
        
        #Model parameters
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        
        #Model layers
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = num_layers, dropout = dropout)
        self.linear = nn.Linear(hidden_size, input_size)
        
    #Define initial hidden and cell states
    def init_states(self, num_layers, hidden_size):
        hidden = [Variable(torch.zeros(num_layers, 1, hidden_size)),
                  Variable(torch.zeros(num_layers, 1, hidden_size))]
        
        return hidden
    
    #Define forward propagation
    def forward(self, inp, hidden):
        output, hidden = self.lstm(inp, hidden)
        output = self.linear(output)
        
        return output, hidden
    

In [127]:
#Initialize model and generation parameters
input_size = np.shape(data)[2]
hidden_size = 256
num_layers = 3
dropout = .2
learning_rate = 0.001
epochs = 1
seq_length = 76
batch_size = 128

char_to_gen = 300
predict_len = 1
temperature = 1
prime_string = "{"

In [128]:
#Call and load model
model = Model(input_size, hidden_size, num_layers, dropout)
model.load_state_dict(torch.load("LSTMtrained.pth"))

#Run on GPU
if cuda:
    model = model.cuda()
    
model.eval();

In [131]:
def generate(prime_string, predict_len, temperature):
    #SMILES character string
    mol = "{"
    
    #Get input tensor from prime string
    prediction = torch.from_numpy(vocab[np.where(vocab == str(prime_string))[0], :][:, 2:].astype(float)).view(1,1,-1).cuda()
    
    hidden = model.init_states(num_layers, hidden_size)
    if cuda:
        hidden = (hidden[0].cuda(), hidden[1])
    
    for i in range(char_to_gen):
        #Initialize hidden and cell states
        hidden = model.init_states(num_layers, hidden_size)
            
        #Run on GPU if available
        if cuda:
            hidden = (hidden[0].cuda(), hidden[1].cuda())
            
        #Get input tensor
        inp = prediction[i,:,:].view(1,1,-1).float()
            
        #Run on GPU if available
        if cuda:
            inp = inp.cuda()
                
        #Run model
        output, hidden = model(inp, hidden)

        
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = torch.multinomial(output_dist, 1)[0]
        char = torch.from_numpy(vocab[top_i,2:].astype(float)).view(1,1,-1).cuda()
        
        #Update total prediction with the new character
        prediction = torch.cat((prediction, char), 0)
        
        #Update character string
        smile = vocab[top_i,0]
        mol = mol + str(smile)
        
    return prediction, mol

In [134]:
prediction, mol = generate(prime_string, predict_len, temperature)
print(mol)

{C-\BCCS/FCOF)CCO(\C[CVRM=[CCC(F)OM2p6c5BC+C{[C2S}S/]}7}S-C.CN)c7}\c5/C2/CNK2
.C3O[NV
}\NF}PC[NNpBr
O}\C.Ct)c
S}Fp2.CVu\O
N/Cb}
Zb([N}ZCo2PCAuZKOS}}}
[PSCBBF)8AC1
}\C}F=A-TTc{C]N(r}F}c2}=CaHc2
Cl=Ccl

[Cc27})-PCSc[Cn.O/CO.KSl
o.IrZ/O.CioAC]F
F)\CC[CCPaC\CO
PP}VtAC
}FPCH{C.CO)C
O.C#C}/NPSOob#CO)N}.[nt
