In [1]:
#from fcd import get_fcd, load_ref_model, canonical_smiles, get_predictions, calculate_frechet_distance
import warnings
import os
import pandas as pd
import numpy as np
import numpy
import pickle
import rdkit
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# Silence RDKit log output and Python warnings
from rdkit import RDLogger, Chem
from rdkit.Chem import rdMolDescriptors
RDLogger.DisableLog('rdApp.*')

warnings.filterwarnings("ignore")

# Seed every random number generator the notebook relies on. Seeding NumPy
# alone is not enough: PyTorch draws the weight initialisation and the
# DataLoader shuffle order from its own generator.
np.random.seed(1234)
torch.manual_seed(1234)

print("RDKit: ", rdkit.__version__)


RDKit:  2022.09.5


In [2]:
# Read the raw training file (no header row) and keep column 0, which holds
# the SMILES strings, as a Series.
data = pd.read_csv("smiles_train.txt", header=None).loc[:, 0]


In [3]:
def to_canonical(smiles):
    """Return the RDKit canonical SMILES for ``smiles``.

    Returns ``None`` when RDKit cannot parse the input.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    return Chem.MolToSmiles(parsed, canonical=True)


In [None]:
# Canonicalize every SMILES, drop the entries RDKit could not parse
# (to_canonical returns None for those) and renumber the index from 0.
data_canonical = (
    data
    .apply(to_canonical)
    .dropna()
    .reset_index(drop=True)
)


In [22]:
# Cache the canonical SMILES on disk: values only, no index column and no
# header row.
data_canonical.to_csv(
    "canonical_smiles_train.txt",
    header=False,
    index=False,
)


In [2]:
# Reload the cached canonical SMILES as a Series. The `squeeze=True` keyword
# of `read_csv` was deprecated in pandas 1.4 and removed in 2.0, so the
# single-column frame is squeezed explicitly instead.
data_canonical = pd.read_csv(
    "canonical_smiles_train.txt", header=None).squeeze("columns")


In [6]:
# Preview the canonical SMILES Series.
data_canonical

0                COc1ccc(N2CCN(C(=O)c3cc4ccccc4[nH]3)CC2)cc1
1            c1ccc(CCCNC2CCCCN(CCOC(c3ccccc3)c3ccccc3)C2)cc1
2                             Nc1nc(O)c(Br)c(-c2cccc(O)c2)n1
3               CCc1nc2ccc(Br)cc2c(=O)n1-c1nc2c(C)cc(C)cc2s1
4            O=c1cnn(-c2ccc(S(=O)(=O)N3CCCCC3)cc2)c(=O)[nH]1
                                 ...                        
1036638                  CCOc1ccc(-n2c(SC)nc3c(c2=O)SCC3)cc1
1036639        Nc1ncnc2c1nc(I)n2C1SC(COC(=O)c2ccccc2)C(O)C1O
1036640              O=C(O)CCc1sc(C=C2NC(=O)CS2)nc1-c1ccccn1
1036641    CN(c1ncnc2[nH]ccc12)C1CC(CS(=O)(=O)N2CCC(C#N)C...
1036642    CCc1ccc(S(=O)(=O)NC2c3cc(C(=O)NCCc4c[nH]cn4)cc...
Name: 0, Length: 1036643, dtype: object

In [3]:
# Hold out 20% of the canonical SMILES for validation (fixed random_state),
# then renumber both parts so their integer labels run from 0.
train_data, val_data = (
    subset.reset_index(drop=True)
    for subset in train_test_split(
        data_canonical, test_size=0.2, random_state=42)
)


In [4]:
# Vocabulary: every distinct character occurring in the corpus, sorted.
charset = sorted(set("".join(data_canonical)))
# Longest SMILES plus one, so that even the longest string still gets one
# padded position when sequences are padded to this length.
max_length = max(map(len, data_canonical)) + 1


In [5]:
def vectorize_smiles(smiles, charset, max_length):
    """Encode a SMILES string as next-character (input, target) index tensors.

    Each character is mapped to its position in ``charset``. The sequence is
    right-padded with index 0 up to ``max_length`` and then split into an
    input (all positions but the last) and a target (all positions but the
    first), so ``target[i]`` is the index that follows ``data[i]``.

    NOTE: the padding value 0 is also the index of ``charset[0]``, so padded
    positions cannot be told apart from that character.

    Args:
        smiles: SMILES string to encode.
        charset: Sequence of single characters defining the vocabulary.
        max_length: Length to pad to. Longer strings are not truncated.

    Returns:
        Tuple ``(data, target)`` of 1-D ``torch.long`` tensors.

    Raises:
        ValueError: If ``smiles`` contains a character missing from ``charset``.
    """
    # Build the lookup once per call instead of scanning ``charset`` for every
    # character. ``setdefault`` keeps the first position of a duplicated
    # character, which is what ``list.index`` returned.
    char_to_index = {}
    for position, char in enumerate(charset):
        char_to_index.setdefault(char, position)

    try:
        indices = [char_to_index[char] for char in smiles]
    except KeyError as exc:
        # Keep the exception type callers saw from ``list.index``.
        raise ValueError(f"{exc.args[0]!r} is not in charset") from None

    # A negative repeat count yields an empty list, so over-long input is
    # left unpadded rather than truncated.
    padded_indices = indices + [0] * (max_length - len(indices))
    data = torch.tensor(padded_indices[:-1], dtype=torch.long)
    target = torch.tensor(padded_indices[1:], dtype=torch.long)
    return data, target


In [6]:
class SMILESDataset(Dataset):
    """Dataset yielding (one-hot input, target indices) pairs for SMILES strings."""

    def __init__(self, smiles_data, charset, max_length):
        """Keep the SMILES collection, the vocabulary and the padded length."""
        self.smiles_data = smiles_data
        self.charset = charset
        self.max_length = max_length

    def __len__(self):
        """Return the number of SMILES strings held by the dataset."""
        return len(self.smiles_data)

    def __getitem__(self, index):
        """Return the one-hot input and the target indices for one SMILES.

        ``index`` wraps around modulo the dataset length.
        """
        n_items = len(self.smiles_data)
        smiles = self.smiles_data[index % n_items]
        token_ids, target = vectorize_smiles(
            smiles, self.charset, self.max_length)

        # One row per position; a single 1 in the column named by the token index.
        one_hot = torch.zeros(token_ids.size(0), len(self.charset))
        column_ids = token_ids.unsqueeze(1)
        one_hot.scatter_(1, column_ids, 1)
        return one_hot, target


In [32]:
# Wrap both splits in datasets and batch them 128 at a time; only the
# training batches are shuffled.
train_dataset = SMILESDataset(
    smiles_data=train_data, charset=charset, max_length=max_length)
val_dataset = SMILESDataset(
    smiles_data=val_data, charset=charset, max_length=max_length)
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)


In [42]:

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Model and training hyperparameters. The network reads and predicts vectors
# over the same vocabulary, so input and output widths are both len(charset).
input_size = output_size = len(charset)
hidden_size = 128  # 512
num_layers = 3
num_epochs = 50
learning_rate = 0.001


cuda


In [43]:
class SimplifiedSMILESGRU(nn.Module):
    """Stacked GRU followed by a linear layer applied at every time step.

    Args:
        input_size: Number of features per time step. The default is bound to
            the notebook-level ``input_size`` when the class is defined.
        hidden_size: Width of each GRU layer.
        output_size: Number of outputs per time step. The default is bound to
            the notebook-level ``output_size`` when the class is defined.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size=input_size, hidden_size=512, output_size=output_size, num_layers=3):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return per-step outputs of shape ``(batch, seq_len, output_size)``.

        Args:
            x: Batch-first input of shape ``(batch, seq_len, input_size)``.
        """
        # Allocate the initial hidden state on the input's own device and
        # dtype instead of the notebook-global ``device``, so the model works
        # wherever it and its input have been moved.
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size,
                         device=x.device, dtype=x.dtype)
        out, _ = self.gru(x, h0)
        out = self.fc(out)
        return out


In [44]:
# Build the network from the hyperparameters above and move it to `device`.
model = SimplifiedSMILESGRU(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
).to(device)

# Per-position classification loss over the vocabulary.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-4)

# Multiply the learning rate by 0.1 once the monitored (minimised) value
# stops improving, with a patience of 2 scheduler steps.
scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2, verbose=True)


In [45]:
# Each epoch: one optimisation pass over train_loader, then one gradient-free
# pass over val_loader; the mean validation loss drives the LR scheduler.
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0  # running sum of per-batch losses
    train_count = 0  # number of batches seen (not samples)
    for one_hot_data, target in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}", leave=False):
        one_hot_data, target = one_hot_data.to(device), target.to(device)

        optimizer.zero_grad()
        # NOTE(review): SMILESDataset builds one_hot_data with torch.zeros, so
        # .float() is presumably a no-op here — confirm before removing.
        output = model(one_hot_data.float())

        # The model is batch-first, so output is (batch, seq, classes); move
        # the class dimension to position 1, where CrossEntropyLoss expects it.
        loss = criterion(output.transpose(1, 2), target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_count += 1

    # Mean of the per-batch losses.
    avg_train_loss = train_loss / train_count

    # Validation
    model.eval()
    val_loss = 0  # running sum of per-batch losses
    val_count = 0  # number of validation batches seen
    with torch.no_grad():
        for one_hot_data, target in tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}/{num_epochs}", leave=False):
            one_hot_data, target = one_hot_data.to(device), target.to(device)
            output = model(one_hot_data.float())
            loss = criterion(output.transpose(1, 2), target)

            val_loss += loss.item()
            val_count += 1

    # Mean of the per-batch validation losses.
    avg_val_loss = val_loss / val_count

    print(
        f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
    
    # The scheduler was created with mode='min', so it monitors the validation loss.
    scheduler.step(avg_val_loss)


Training Epoch 1/50:  57%|█████▋    | 3696/6480 [00:49<00:34, 80.29it/s]

In [37]:
# Persist only the trained weights (the state_dict), not the module object.
# NOTE(review): the file name says "lstm" while `model` is built from
# SimplifiedSMILESGRU in this notebook — confirm the name is intended.
torch.save(obj=model.state_dict(), f="trained_model_lstm_4.pth")


In [33]:
#model.load_state_dict(torch.load("trained_model_lstm_epoch37.pth"))
#model.to(device)


SMILESLSTM(
  (lstm1): LSTM(37, 128, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (lstm2): LSTM(128, 128, batch_first=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (lstm3): LSTM(128, 128, batch_first=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (lstm4): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=37, bias=True)
)

In [38]:
def unvectorize_smiles(vector, charset, eos_index=None):
    """Decode a sequence of charset indices back into a string.

    Decoding stops at the first occurrence of ``eos_index``; that position
    and everything after it are dropped.

    NOTE(review): ``vectorize_smiles`` pads with index 0 and ``charset`` is
    built only from characters found in the data, so the default marker
    ``len(charset) - 1`` is an ordinary vocabulary character rather than a
    dedicated end token. The default is kept for backward compatibility;
    pass ``eos_index`` explicitly once a real end/pad token exists.

    Args:
        vector: Iterable of integer indices into ``charset``.
        charset: Sequence of characters defining the vocabulary.
        eos_index: Index treated as the end-of-sequence marker. ``None``
            (the default) means ``len(charset) - 1``.

    Returns:
        The decoded string; empty if ``vector`` is empty or starts with the
        end marker.
    """
    if eos_index is None:
        eos_index = len(charset) - 1

    # Collect the pieces and join once instead of repeated string concatenation.
    pieces = []
    for index in vector:
        if index == eos_index:  # End of sequence token
            break
        pieces.append(charset[index])
    return "".join(pieces)


In [39]:

def predict(model, input_smiles, charset, max_length):
    """Run ``input_smiles`` through ``model`` and decode the arg-max output.

    The encoded string is fed to the model in a single forward pass, the most
    likely index at every position is decoded with ``unvectorize_smiles``,
    and every "#" is then removed from the decoded text.

    Returns:
        The cleaned string, or ``None`` if it contains "nan" or "inf"
        (case-insensitive).
    """
    model.eval()
    with torch.no_grad():
        token_ids, _ = vectorize_smiles(input_smiles, charset, max_length)
        token_ids = token_ids.to(device)

        # One-hot encode as a batch of one: (1, seq_len, len(charset)).
        one_hot = torch.zeros(1, token_ids.size(0), len(charset)).to(device)
        one_hot.scatter_(2, token_ids[None, :, None], 1)

        logits = model(one_hot.float())
        best_indices = logits.argmax(dim=2).squeeze(0).cpu().tolist()

    decoded = unvectorize_smiles(best_indices, charset)

    # Remove "#" characters
    cleaned_smiles = decoded.replace("#", "")

    # Reject strings containing "nan" or "inf"
    lowered = cleaned_smiles.lower()
    if "nan" in lowered or "inf" in lowered:
        return None
    return cleaned_smiles


In [40]:
def is_valid_smiles(smiles):
    """Return True if ``smiles`` is a non-empty string that RDKit can parse.

    Empty or ``None`` input is rejected before RDKit is called. RDKit parses
    the empty string into a molecule with no atoms instead of returning
    ``None``, so an empty prediction would otherwise be counted as valid and
    written out as a blank line.

    Args:
        smiles: Candidate SMILES string (may be empty or ``None``).

    Returns:
        ``True`` when RDKit parses a non-empty ``smiles``; ``False`` otherwise.
    """
    if not smiles:
        return False
    return Chem.MolFromSmiles(smiles) is not None


In [41]:
# Decode validation SMILES one by one and write those that RDKit accepts,
# stopping once 10001 lines have been written.
valid_count = 0
with open("predictions_lstm_4.txt", "w") as f:
    for input_smiles in val_data:
        predicted_smiles = predict(model, input_smiles, charset, max_length)
        if predicted_smiles is None or not is_valid_smiles(predicted_smiles):
            continue
        f.write(f"{predicted_smiles}\n")
        valid_count += 1
        if valid_count >= 10001:
            break


In [39]:
# NOTE: this redefines `predict` and shadows the earlier definition in this
# notebook; unlike that one, it does not strip "#" from the decoded text.
def predict(model, input_smiles, charset, max_length):
    """Decode the model's arg-max output for ``input_smiles``.

    The encoded string goes through ``model`` in one forward pass and the
    most likely index at each position is decoded with ``unvectorize_smiles``.

    Returns:
        The decoded string, or ``None`` if it contains "nan" or "inf"
        (case-insensitive).
    """
    model.eval()
    with torch.no_grad():
        encoded_input = vectorize_smiles(input_smiles, charset, max_length)[0]
        seq_len = encoded_input.size(0)

        # Batch of one, one-hot over the vocabulary.
        batch = torch.zeros(1, seq_len, len(charset)).to(device)
        scatter_index = encoded_input.to(device).unsqueeze(0).unsqueeze(2)
        batch.scatter_(2, scatter_index, 1)

        scores = model(batch.float())
        top_indices = scores.argmax(dim=2).squeeze(0).cpu().tolist()
        predicted_smiles = unvectorize_smiles(top_indices, charset)

    # Reject strings containing "nan" or "inf"
    lowered = predicted_smiles.lower()
    has_bad_token = "nan" in lowered or "inf" in lowered
    return None if has_bad_token else predicted_smiles


# Smaller run of the same export: stop after 10 RDKit-valid predictions.
with open("predictions_lstm.txt", "w") as f:
    valid_count = 0
    for input_smiles in val_data:
        predicted_smiles = predict(model, input_smiles, charset, max_length)
        if predicted_smiles is None or not is_valid_smiles(predicted_smiles):
            continue
        f.write(f"{predicted_smiles}\n")
        valid_count += 1
        if valid_count >= 10:
            break
