In [26]:
import os
import pandas as pd
import numpy as np
import pickle
import rdkit
from tqdm import tqdm
import functools

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
#from sklearn.model_selection import train_test_split
#from rdkit.Chem import Draw
# from rdkit.Chem.Draw import IPythonConsole
from rdkit import RDLogger, Chem
from torch.nn.functional import one_hot
from rdkit.Chem import rdMolDescriptors
import torch.utils.data as torch_data


import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Seed every RNG the notebook draws from. Weight initialisation, DataLoader
# shuffling, dropout and torch.multinomial sampling all use PyTorch's
# generator, so seeding NumPy alone does not make runs repeatable.
np.random.seed(1234)
torch.manual_seed(1234)

# Suppress RDKit's own log output (e.g. parse errors for invalid SMILES).
RDLogger.DisableLog('rdApp.*')
print("RDKit: ", rdkit.__version__)


RDKit:  2022.03.5


In [27]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)


cuda


In [5]:
# Read the raw training set, keep only the lines RDKit can parse, and
# rewrite them as canonical SMILES (one per line) to smiles_train.smi.
smiles = []
with open("smiles_train.txt", "r") as f:
    for line in f:
        raw = line.strip()
        if not raw:
            # RDKit parses "" into an empty (non-None) molecule, which would
            # otherwise be written out as a blank training sample.
            continue
        mol = Chem.MolFromSmiles(raw)
        if mol is None:
            continue
        canonical = Chem.MolToSmiles(mol)
        if canonical:
            smiles.append(canonical)


with open("smiles_train.smi", "w") as f:
    f.writelines(s + "\n" for s in smiles)


In [32]:
# Reserved vocabulary ids for the special tokens; character tokens are
# numbered after these.
__encoders__ = {
    0: "<P>", # pad
    1: "<E>", # end
    2: "<S>", # start
}
# Fixed length of every encoded sequence, including <S>, <E> and padding.
max_length = 150
smiles = "smiles_train.smi"
# Read one SMILES per line. Blank lines are skipped, and the last entry is
# kept even when the file does not end with a newline. The context manager
# closes the file handle.
with open(smiles, "r") as f:
    train_smiles = [line.strip() for line in f if line.strip()]


In [47]:
class SMILESDATA(torch_data.Dataset):
    """Character-level SMILES dataset producing fixed-length encoded sequences.

    Every SMILES string is split into single characters. Each distinct
    character gets an integer id placed after the special-token ids in
    ``__encoders__``. Items are laid out as ``<S> chars... <E> <P>...`` and
    padded to ``max_length``.

    This is a map-style ``Dataset`` (``__len__`` / ``__getitem__``) meant to
    be wrapped in a ``DataLoader``; it is not a loader itself.

    Args:
        train_smiles: sequence of SMILES strings.
        max_length: total length of every encoded sequence, including the
            start and end tokens.

    Raises:
        ValueError: if any SMILES string is too long to fit in ``max_length``
            together with the start and end tokens.
    """

    def __init__(self, train_smiles, max_length):
        self.train_smiles = train_smiles
        self.max_length = max_length

        # Two positions are reserved for <S> and <E>. Without this check a
        # longer string gets a negative pad count and yields an over-length
        # sequence that only fails later, when the batch is stacked.
        longest = max((len(smile) for smile in self.train_smiles), default=0)
        if longest > self.max_length - 2:
            raise ValueError(
                f"SMILES of length {longest} does not fit in max_length="
                f"{self.max_length} (2 positions are reserved for <S>/<E>)")

        # Sorted so that token ids are identical on every run; iteration
        # order of a set of strings changes between interpreter sessions.
        tokens = sorted({char for smile in self.train_smiles for char in smile})

        # id -> token, with character ids starting after the special tokens.
        self.idx_map = dict(enumerate(tokens, start=len(__encoders__)))
        self.idx_map.update(__encoders__)
        # token -> id
        self.token_map = {value: key for key, value in self.idx_map.items()}
        self.ints = [torch.LongTensor([self.token_map[char] for char in row])
                     for row in self.train_smiles]
        self.vocsize = len(tokens) + len(__encoders__)

    def __len__(self):
        """Number of SMILES strings in the dataset."""
        return len(self.train_smiles)

    def __getitem__(self, i):
        """Return ``(one_hot, ids)`` for the ``i``-th SMILES.

        ``ids`` is a LongTensor of length ``max_length``; ``one_hot`` is its
        float one-hot encoding of shape ``(max_length, vocsize)``.
        """
        body = self.ints[i]
        pad_count = self.max_length - len(body) - 2
        sequence = torch.cat((
            torch.LongTensor([self.token_map['<S>']]),
            body,
            torch.LongTensor([self.token_map['<E>']]),
            torch.full((pad_count,), self.token_map["<P>"], dtype=torch.long),
        ), dim=0)
        return one_hot(sequence, self.vocsize).float(), sequence

    def decoder(self, indexes):
        """Turn an iterable of token ids back into a string, dropping special tokens.

        Ids may be Python ints, floats or 0-dim tensors; each is converted to
        ``int`` before the lookup.
        """
        ids = (int(idx) for idx in indexes)
        return "".join(self.idx_map[idx] for idx in ids if idx not in __encoders__)


In [56]:

# --- Model architecture ---
hidden_size = 512   # width of each GRU layer
num_layers = 4      # number of stacked GRU layers
dropout = 0.2       # dropout probability between GRU layers and before the output layer

# --- Optimisation ---
num_epochs = 20     # passes over the training set
batch_size = 256    # sequences per mini-batch
lr = 1e-3           # initial Adam learning rate


In [49]:
# Encode the training SMILES and serve them in shuffled mini-batches.
dataset = SMILESDATA(train_smiles, max_length)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=batch_size)


In [52]:
class SimplifiedSMILESGRU(nn.Module):
    """Multi-layer GRU language model over one-hot encoded SMILES tokens.

    Args:
        vocsize: vocabulary size; both the one-hot input width and the number
            of output logits.
        hidden_size: GRU hidden width.
        num_layers: number of stacked GRU layers.
        dropout: dropout probability used between GRU layers and on the GRU
            output before the final linear layer.

    Note: the defaults are read from the notebook-level hyperparameters once,
    when this class definition is executed; later changes to those variables
    do not affect the defaults.
    """

    def __init__(self, vocsize, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocsize = vocsize

        self.gru = nn.GRU(vocsize, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, vocsize)

    def forward(self, x):
        """Map one-hot inputs ``(batch, seq, vocsize)`` to logits of the same shape."""
        out, _ = self.gru(x)
        out = self.dropout(out)
        return self.fc(out)

    @torch.no_grad()  # sampling is inference only; do not build an autograd graph
    def generate_samples(self, num_samples, sequence_length):
        """Sample token-id sequences autoregressively, starting from ``<S>``.

        At every step the next token is drawn from the softmax over the
        model's logits and fed back as the next input. Sampling always runs
        for ``sequence_length`` steps; it does not stop at ``<E>``, so callers
        must cut each row at its first end token.

        Args:
            num_samples: number of sequences to draw in parallel.
            sequence_length: number of tokens to sample per sequence.

        Returns:
            LongTensor of shape ``(num_samples, sequence_length)`` on the CPU
            holding the sampled ids (the initial ``<S>`` is not included).
        """
        start_token_id = next(
            key for key, value in __encoders__.items() if value == "<S>")
        # Follow the model's own device instead of a notebook global, so
        # sampling still works after the model has been moved.
        model_device = self.fc.weight.device

        input_tokens = torch.full(
            (num_samples,), start_token_id, dtype=torch.long, device=model_device)
        hidden_state = torch.zeros(
            self.num_layers, num_samples, self.hidden_size, device=model_device)
        # Token ids are integers; keep them in an integer tensor.
        sequences = torch.zeros(
            num_samples, sequence_length, dtype=torch.long, device=model_device)

        for step in range(sequence_length):
            # (num_samples,) -> (num_samples, 1, vocsize): one time step per call
            step_input = one_hot(input_tokens, self.vocsize).float().unsqueeze(1)
            output_tokens, hidden_state = self.gru(step_input, hidden_state)
            probabilities = F.softmax(self.fc(output_tokens).squeeze(1), dim=1)
            input_tokens = torch.multinomial(
                probabilities, num_samples=1, replacement=True).squeeze(1)
            sequences[:, step] = input_tokens
        return sequences.cpu()


In [57]:
# Build the network on the selected device.
model = SimplifiedSMILESGRU(
    vocsize=dataset.vocsize,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=dropout,
).to(device)

# Per-token cross-entropy over the whole vocabulary (no class is ignored).
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

# Multiply the learning rate by 0.1 once the monitored loss has stopped
# improving for longer than the patience of 2 epochs.
scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2, verbose=True)


In [58]:
for epoch in range(1, num_epochs + 1):
    model.train()
    running_loss = 0
    batches_seen = 0
    progress = tqdm(
        train_loader, desc=f"Training Epoch {epoch}/{num_epochs}", leave=False)

    for batch, target in progress:
        batch = batch.to(device)
        target = target.to(device)

        # (batch, seq, vocab) -> (batch, vocab, seq): CrossEntropyLoss expects
        # the class dimension second.
        logits = model(batch).transpose(2, 1)
        # Next-token objective: the prediction at position t is scored
        # against the token at position t + 1.
        loss = criterion(logits[:, :, :-1], target[:, 1:])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1

    avg_train_loss = running_loss / batches_seen

    print(
        f"Epoch [{epoch}/{num_epochs}], Train Loss: {avg_train_loss:.4f}")

    # The scheduler monitors the mean training loss of the epoch.
    scheduler.step(avg_train_loss)


                                                                        

Epoch [1/20], Train Loss: 0.4699


                                                                        

Epoch [2/20], Train Loss: 0.3456


                                                                        

Epoch [3/20], Train Loss: 0.5964


                                                                        

Epoch [4/20], Train Loss: 0.4207


                                                                        

Epoch [5/20], Train Loss: 0.4832
Epoch 00005: reducing learning rate of group 0 to 1.0000e-04.


                                                                        

Epoch [6/20], Train Loss: 0.3554


                                                                        

Epoch [7/20], Train Loss: 0.3285


                                                                        

Epoch [8/20], Train Loss: 0.3070


                                                                        

Epoch [9/20], Train Loss: 0.2927


                                                                         

Epoch [10/20], Train Loss: 0.2835


                                                                         

Epoch [11/20], Train Loss: 0.2774


                                                                         

Epoch [12/20], Train Loss: 0.2732


                                                                         

Epoch [13/20], Train Loss: 0.2700


                                                                         

Epoch [14/20], Train Loss: 0.2675


                                                                         

Epoch [15/20], Train Loss: 0.2655


                                                                         

Epoch [16/20], Train Loss: 0.2638


                                                                         

Epoch [17/20], Train Loss: 0.2623


                                                                         

Epoch [18/20], Train Loss: 0.2610


                                                                         

Epoch [19/20], Train Loss: 0.2598


                                                                         

Epoch [20/20], Train Loss: 0.2588




In [59]:
# Store the id -> token map together with the whole (pickled) model object.
# Note that model.cpu() moves `model` itself to the CPU, not a copy.
checkpoint = {'tokenizer': dataset.idx_map, 'model': model.cpu()}
torch.save(checkpoint, "gru_model_2.pt")


In [60]:
# Reload the checkpoint saved above: a dict with a 'tokenizer' (id -> token map)
# entry and a 'model' entry holding the full pickled module.
# NOTE(review): torch.load unpickles arbitrary Python objects, so only load
# checkpoint files that come from a trusted source.
trained_model = torch.load('gru_model_2.pt')


In [61]:
def is_valid_smiles(smiles):
    """Return True if ``smiles`` parses with RDKit into a non-empty molecule.

    Empty or whitespace-only strings (and non-string inputs) are rejected
    explicitly: RDKit parses ``""`` into an empty molecule rather than
    returning ``None``, so a bare ``is not None`` check would accept them.
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None and mol.GetNumAtoms() > 0


In [62]:

model, tokenizer = trained_model['model'], trained_model['tokenizer']
model = model.to(device)
model.eval()

# Id of the end-of-sequence marker. Anything sampled after it is not part of
# the molecule and must not be decoded.
end_token_id = next(key for key, value in __encoders__.items() if value == "<E>")

valid_smiles = []
num_samples = 300

# Sampling is inference only, so skip autograd bookkeeping.
with torch.no_grad():
    # Keep sampling batches until at least 10001 valid molecules are collected
    # (the last batch may overshoot).
    while len(valid_smiles) < 10001:
        sequences = model.generate_samples(
            num_samples=num_samples, sequence_length=max_length)
        for token_ids in sequences.tolist():
            chars = []
            for idx in token_ids:
                idx = int(idx)  # ids may come back as floats from a float tensor
                if idx == end_token_id:
                    break  # stop at the first <E>; later tokens are noise
                if idx not in __encoders__:
                    chars.append(tokenizer[idx])
            generated_smiles = "".join(chars)
            # An empty string parses to an empty molecule in RDKit; skip it.
            if generated_smiles and is_valid_smiles(generated_smiles):
                valid_smiles.append(generated_smiles)



In [63]:
# Write the generated molecules, one SMILES per line. The generator variable
# stays local, so the notebook-level `smiles` path variable is not overwritten.
with open("predictions_gru_3.txt", "w") as f:
    f.writelines(generated + '\n' for generated in valid_smiles)
