In [1]:
import numpy as np
import torch
from torch import nn
import pyanitools as pya

# CHEM 277B - HW 10: Recurrent Neural Network, LSTM #

## 1. LSTM applied to SMILES string generation ##

Using the SMILES strings from the ANI
dataset with up to 6 heavy atoms, build an LSTM generative model that can generate new SMILES strings
from a given initial character.

## (a) ## 

Process the SMILES strings from the ANI dataset by adding a starting character at the beginning
and an ending character at the end. Look over the dataset to define the vocabulary, then use one-hot
encoding to encode your SMILES strings.

In [2]:
# Open the ANI-1 release file `ani_gdb_s06.h5` (the 6-heavy-atom subset, going by
# the file name) and obtain an iterator over its records.
ani_data = pya.anidataloader('../Final_Project/ANI-1_release/ani_gdb_s06.h5')
data_iter = iter(ani_data)

In [3]:
# Get SMILES strings from ANI dataset.
# The loader itself is iterated (instead of the one-shot `data_iter`) so that
# every execution of this cell starts a fresh pass over the file; an already
# exhausted iterator would silently produce an empty list on re-run.
# NOTE(review): assumes each iteration of `ani_data` restarts from the first
# record, as pyanitools' generator-based __iter__ does — confirm.
# Each record's 'smiles' entry is joined into a single string (it is presumably
# stored as a sequence of characters).
raw_smiles = ["".join(molecule['smiles']) for molecule in ani_data]

# Add starting ('S') and ending ('E') character to smiles strings
ani_smiles = ['S' + s + 'E' for s in raw_smiles]

In [4]:
# Define vocabulary: every distinct character that occurs in the tokenised
# SMILES strings (start/end markers included).
# The characters are sorted because a bare set() has no guaranteed iteration
# order; without sorting, the one-hot column order could differ from one
# kernel session to the next.
vocabulary = sorted(set("".join(ani_smiles)))
len(vocabulary)

17

There are 17 unique characters in the vocabulary, including the starting and ending characters.

In [5]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder whose category order is pinned to `vocabulary`
encoder = OneHotEncoder(categories=[vocabulary])
encoder.fit(np.array(vocabulary).reshape(-1, 1))

# Split every SMILES string into a column vector of single characters,
# shape (len(string), 1), ready to be passed to encoder.transform
smiles_vocab = [np.array(list(s)).reshape(-1, 1) for s in ani_smiles]

In [6]:
def batches_gen(smiles, batchsize, encoder):
    '''Yield (X, y) mini-batches of one-hot encoded, zero-padded SMILES.

    Every string is one-hot encoded, split into an input sequence (all but the
    last character) and a target sequence (all but the first character), and
    zero-padded to the length of the longest sequence in `smiles`.
    Each yielded tensor has shape (batchsize, seq_leng, nchars).

       Arguments
       ---------
       smiles: python list of per-string character arrays to make batches from
       batchsize: Batch size, the number of sequences per batch
       encoder: one hot encoder; its transform(...) result must offer .toarray()

    '''
    # One float tensor of shape (seq_length, nchars) per string
    encoded = []
    for entry in smiles:
        column = np.array(entry).reshape(-1, 1)
        dense = np.array(encoder.transform(column).toarray())
        encoded.append(torch.tensor(dense, dtype=torch.float))

    # Inputs drop the final character; targets are the same string shifted by one.
    # Padding makes every sequence the same length.
    inputs = nn.utils.rnn.pad_sequence([seq[:-1, :] for seq in encoded], batch_first=True)
    targets = nn.utils.rnn.pad_sequence([seq[1:, :] for seq in encoded], batch_first=True)

    # Only full batches are produced; a trailing partial batch is dropped
    # because of the hidden state size constraint.
    n_full = len(encoded) // batchsize
    for start in range(0, n_full * batchsize, batchsize):
        stop = start + batchsize
        yield inputs[start:stop], targets[start:stop]

In [7]:
# Generator of (X, y) batches over the character arrays, 32 sequences per batch
batches = batches_gen(smiles_vocab, batchsize=32, encoder=encoder)

In [8]:
# Pull the first (inputs, targets) pair out of the generator
X_batch, y_batch = next(batches)

# Report the shape of each tensor in the batch
for label, tensor in (('X_batch', X_batch), ('y_batch', y_batch)):
    print(f'{label} shape:', tensor.shape)

X_batch shape: torch.Size([32, 73, 17])
y_batch shape: torch.Size([32, 73, 17])


## (b) ##

Build an LSTM model with 1 recurrent layer. Starting with the starting character, grow
a string character by character using the model's predictions until it reaches an ending character. Look
at the string you have grown: is it a valid SMILES string?

In [9]:
class LSTM(nn.Module):
    """LSTM over one-hot character sequences followed by a linear read-out.

    Arguments:
        hidden_size: width of the LSTM hidden and cell states
        num_layers: number of stacked recurrent layers
        n_chars: length of the one-hot character vectors; used both as the
            LSTM input size and as the number of output scores per time step.
            Defaults to 17.
    """

    def __init__(self, hidden_size, num_layers=1, n_chars=17):
        super().__init__()
        self.n_layers = num_layers
        self.n_hidden = hidden_size
        self.n_chars = n_chars

        self.lstm = nn.LSTM(
            input_size=self.n_chars,
            hidden_size=self.n_hidden,
            num_layers=self.n_layers,
            batch_first=True
        )
        # Maps every hidden vector to one score per character
        self.out = nn.Linear(self.n_hidden, self.n_chars)

    def forward(self, x, h_state):
        """Run the sequence through the LSTM and score every time step.

        Arguments:
            x: (batch, time_step, n_chars)
            h_state: tuple (h, c), each (n_layers, batch, hidden_size)

        Returns:
            outs: (batch, time_step, n_chars) raw linear outputs
            h_state: the updated (h, c) tuple
        """
        # r_out (batch, time_step, hidden_size)
        r_out, h_state = self.lstm(x, h_state)
        outs = self.out(r_out)
        return outs, h_state

    def init_state(self, batch_size):
        """Return an all-zero (hidden state, cell state) pair for `batch_size` sequences.

        The tensors are created with the device and dtype of the model's own
        weights, so the state stays usable after the model is moved or cast.
        """
        reference = self.out.weight
        shape = (self.n_layers, batch_size, self.n_hidden)
        return (reference.new_zeros(shape),  # hidden state
                reference.new_zeros(shape))  # cell state

In [10]:
# Defining a method to generate the next character
def predict(net, inputs, h, top_k=None):
    ''' Given a onehot encoded character, sample the next character.
        Returns the sampled onehot encoded character and the new hidden state.

    The model's outputs for the last time step are used as sampling weights:
    negative values are clipped to zero and the remainder is normalised to
    sum to one. If no weight is positive, the candidates are sampled uniformly.

    Arguments:
        net: the lstm model; called as net(inputs, h) and expected to return
            (scores, new_state) with scores of shape (1, time_step, n_chars)
        inputs: input to the lstm model. shape (batch, time_step/length_of_smiles, input_size) with batchsize of 1
        h: hidden state (h,c)
        top_k: int or None. sample only from the top k scoring characters;
            None samples from all of them

    Returns:
        output: float tensor of shape (1, 1, n_chars), one-hot at the sampled index
        h: the state returned by the model
    '''
    # detach hidden state from history
    h = tuple(each.detach() for each in h)
    # get the output and new hidden state of the model; no gradients are needed
    with torch.no_grad():
        out, h = net(inputs, h)

    # Scores of the last time step for the single sequence in the batch, shape (n_chars,).
    # Clipping keeps the weights non-negative, which np.random.choice requires.
    scores = out.detach()[0, -1, :].clamp(min=0).cpu()
    n_chars = scores.shape[-1]

    # Candidate indices to draw from, with their matching weights
    if top_k is None:
        candidates = np.arange(n_chars)
    else:
        scores, top_idx = scores.topk(min(top_k, n_chars))
        candidates = top_idx.numpy()

    weights = scores.numpy().astype(np.float64)
    total = weights.sum()
    if total > 0:
        probs = weights / total
    else:
        # every candidate was clipped to zero: fall back to a uniform draw
        probs = np.full(len(candidates), 1.0 / len(candidates))

    # select the likely next character with some element of randomness
    char = int(np.random.choice(candidates, p=probs))

    # return the onehot encoded value of the sampled char and the hidden state
    output = torch.zeros(1, 1, n_chars, dtype=torch.float, device=inputs.device)
    output[0, 0, char] = 1.0
    return output, h

# Declaring a method to generate new text
def sample(net, encoder, prime=['SOS'], top_k=None, end_char='E', max_len=500):
    """Generate a smiles string starting from prime.

    The prime characters are fed through the model one at a time; the
    character sampled after the last prime character starts the result.
    Sampling then continues, feeding each sampled character back in, until the
    end character is produced or `max_len` characters have been generated.

    Arguments:
        net: the lstm model (must provide init_state and be usable by predict)
        encoder: fitted one hot encoder; every character in `prime` and
            `end_char` must be one of its categories
        prime: non-empty sequence of starting characters. The default 'SOS'
            (start of string) only works if the encoder knows that token;
            pass your own starting character otherwise.
        top_k: int or None, forwarded to predict
        end_char: the character that terminates generation
        max_len: upper bound on the number of generated characters, so that a
            model which never emits `end_char` cannot loop forever.
            None disables the bound.

    Returns:
        The generated characters joined into one string, without the
        terminating `end_char`.

    Raises:
        ValueError: if `prime` is empty.
    """
    if len(prime) == 0:
        raise ValueError('prime must contain at least one character')

    def _one_hot(ch):
        # (1, 1, n_chars) float tensor for a single character
        encoded = encoder.transform(np.array([ch]).reshape(-1, 1)).toarray()
        return torch.tensor(encoded, dtype=torch.float).reshape(1, 1, -1)

    net.eval()  # eval mode
    # get initial hidden state with batchsize 1
    h = net.init_state(1)

    # First off, run through the prime characters; only the prediction that
    # follows the last prime character is kept
    for ch in prime:
        char, h = predict(net, _one_hot(ch), h, top_k=top_k)
    chars = [char]  # the new smiles string, as one-hot tensors

    end = _one_hot(end_char)
    n_chars = end.shape[-1]

    # Now pass in the previous character and get a new one,
    # until the end character shows up or the length bound is hit
    while not torch.all(end.eq(chars[-1])):
        if max_len is not None and len(chars) >= max_len:
            break
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    one_hots = np.array([c.detach().numpy() for c in chars]).reshape(-1, n_chars)
    decoded = encoder.inverse_transform(one_hots).reshape(-1)
    # decoded should be like ['C', '(', 'H', ')', 'E']; drop the end marker if present
    if decoded[-1] == end_char:
        decoded = decoded[:-1]
    return ''.join(decoded)  # => 'C(H)'

In [11]:
# Training the model
def train(net, input, encoder, optimizer, loss_func, epochs, batch_size, print_every=10):
    """Train `net` on batches produced by batches_gen.

    Arguments:
        net: the model to train; must provide init_state(batch_size) and
            return (output, state) when called
        input: list of per-string character arrays, passed to batches_gen
        encoder: one hot encoder, passed to batches_gen
        optimizer: optimizer holding the parameters of `net`
        loss_func: callable loss_func(output, target) returning a scalar loss
        epochs: number of passes over the data
        batch_size: number of sequences per batch
        print_every: log only on epochs that are a multiple of this value
    """
    net.train()  # make sure the model is in training mode (sample() switches it to eval)

    for epoch in range(epochs):
        for batch, (X, y) in enumerate(batches_gen(input, batch_size, encoder)):
            optimizer.zero_grad()
            # fresh zero state for every batch, sized to the batch actually received
            state = net.init_state(X.shape[0])
            output, _ = net(X, state)
            loss = loss_func(output, y)
            loss.backward()
            optimizer.step()

            # The logging interval over batches reuses batch_size:
            # every batch_size-th batch of every print_every-th epoch is reported.
            if batch % batch_size == 0 and epoch % print_every == 0:
                print(f'Epoch {epoch}, Batch {batch}, Loss: {loss.item()}')

In [12]:
# Seed the torch and numpy generators before the model is built so that a
# Restart & Run All starts from the same random state each time.
torch.manual_seed(0)
np.random.seed(0)

lstm = LSTM(hidden_size=32)
optimizer = torch.optim.Adam(lstm.parameters(), lr=1e-3)
# Mean squared error between the model outputs and the one-hot targets
loss_func = nn.MSELoss()
# Training the model
train(lstm, smiles_vocab, encoder, optimizer, loss_func, epochs=100, batch_size=32, print_every=10)

Epoch 0, Batch 0, Loss: 0.03957372531294823
Epoch 0, Batch 32, Loss: 0.02523517608642578
Epoch 10, Batch 0, Loss: 0.008870486170053482
Epoch 10, Batch 32, Loss: 0.009603436104953289
Epoch 20, Batch 0, Loss: 0.007819849997758865
Epoch 20, Batch 32, Loss: 0.008486508391797543
Epoch 30, Batch 0, Loss: 0.007130241487175226
Epoch 30, Batch 32, Loss: 0.007927707396447659
Epoch 40, Batch 0, Loss: 0.006713942624628544
Epoch 40, Batch 32, Loss: 0.007701043970882893
Epoch 50, Batch 0, Loss: 0.006401564460247755
Epoch 50, Batch 32, Loss: 0.007114299573004246
Epoch 60, Batch 0, Loss: 0.0061504049226641655
Epoch 60, Batch 32, Loss: 0.0068791271187365055
Epoch 70, Batch 0, Loss: 0.00598067557439208
Epoch 70, Batch 32, Loss: 0.006749395281076431
Epoch 80, Batch 0, Loss: 0.005845559295266867
Epoch 80, Batch 32, Loss: 0.006755669601261616
Epoch 90, Batch 0, Loss: 0.005826697684824467
Epoch 90, Batch 32, Loss: 0.006512104067951441


In [23]:
# Grow one SMILES string from the start token 'S', sampling among the two best-scoring characters at each step
sample(net=lstm, encoder=encoder, prime=['S'], top_k=2)

'[H]C([H])([H])N(C1([H])[H])C1([H])C([H])([H])[H]'

![molecule](molecule.png)

The LSTM model was able to produce a valid SMILES string of a molecule!