In [1]:
#from fcd import get_fcd, load_ref_model, canonical_smiles, get_predictions, calculate_frechet_distance
import warnings
import os
import pandas as pd
import numpy as np
import numpy
import pickle
import rdkit
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# Silence RDKit's logger and Python warnings so cell output stays readable.
from rdkit import RDLogger, Chem
from rdkit.Chem import rdMolDescriptors
RDLogger.DisableLog('rdApp.*')

warnings.filterwarnings("ignore")

# NOTE: the FCD helpers are not used in this notebook; their import at the top
# of this cell is commented out.

# Seed every random number generator the notebook relies on. NumPy alone is
# not enough: model initialisation and DataLoader shuffling draw from torch's
# generator, so it has to be seeded as well for runs to be repeatable.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)

print("RDKit: ", rdkit.__version__)


RDKit:  2022.03.5


In [5]:
# Read the header-less training file and keep its first column (label 0) as a
# Series of SMILES strings; the bare name on the last line displays it.
data = pd.read_csv("smiles_train.txt", header=None).loc[:, 0]
data


0                COc1ccc(N2CCN(C(=O)c3cc4ccccc4[nH]3)CC2)cc1
1            c1ccc(CCCNC2CCCCN(CCOC(c3ccccc3)c3ccccc3)C2)cc1
2                             Nc1nc(O)c(Br)c(-c2cccc(O)c2)n1
3               CCc1nc2ccc(Br)cc2c(=O)n1-c1nc2c(C)cc(C)cc2s1
4            O=c1cnn(-c2ccc(S(=O)(=O)N3CCCCC3)cc2)c(=O)[nH]1
                                 ...                        
1036638                  CCOc1ccc(-n2c(SC)nc3c(c2=O)SCC3)cc1
1036639        Nc1ncnc2c1nc(I)n2C1SC(COC(=O)c2ccccc2)C(O)C1O
1036640              O=C(O)CCc1sc(C=C2NC(=O)CS2)nc1-c1ccccn1
1036641    CN(c1ncnc2[nH]ccc12)C1CC(CS(=O)(=O)N2CCC(C#N)C...
1036642    CCc1ccc(S(=O)(=O)NC2c3cc(C(=O)NCCc4c[nH]cn4)cc...
Name: 0, Length: 1036643, dtype: object

In [6]:
# Join all SMILES into one space-separated string and replace any newline
# characters with spaces.
text = ' '.join(data.tolist()).replace('\n', ' ')

In [8]:
# Sanity check: show the first 1000 characters of the space-joined SMILES text.
print(text[:1000])


COc1ccc(N2CCN(C(=O)c3cc4ccccc4[nH]3)CC2)cc1 c1ccc(CCCNC2CCCCN(CCOC(c3ccccc3)c3ccccc3)C2)cc1 Nc1nc(O)c(Br)c(-c2cccc(O)c2)n1 CCc1nc2ccc(Br)cc2c(=O)n1-c1nc2c(C)cc(C)cc2s1 O=c1cnn(-c2ccc(S(=O)(=O)N3CCCCC3)cc2)c(=O)[nH]1 CC(CC(=O)Nc1nnc(C(C)(C)C)s1)c1ccccc1 O=c1n(Cc2nnc(S)s2)nnn1-c1ccccc1 Cc1nn(CC(=O)Nc2ccccc2)c(=O)c2ccccc12 CC1CN(CC(=O)N2CCc3ccccc32)CCN1 Cc1cc(O)c(C(=O)O)c(C=Cc2ccc3ccccc3c2)c1 O=C(NCC1(N2CCOCC2)CCCCC1)c1cccc(S(=O)(=O)N2CCc3ccccc3C2)c1 COc1cc(C(=N)NCCCCCCCCN=C(N)c2ccc(OCc3ccccc3)c(OC)c2)ccc1OCc1ccccc1 O=C(Nc1cccc([N+](=O)[O-])c1)C1COc2ccccc2O1 CCCCn1c(SCC(=O)NC(C)(C)C)nnc1-c1ccco1 Oc1ccccc1CCNC(=S)Nc1nccs1 Cc1c(OCCCCOc2ccccc2C(=O)O)ccc(C(=O)CC2CCCC2)c1O CC(N)C(=O)NC(CCCN)C(=O)O CCC12C=CC(O1)C(C(=O)c1ccccc1)C2c1cccc(OC)c1 COC(=O)c1sc2ccccc2c1NC(=O)c1ccc(S(=O)(=O)N2CC(C)CC(C)C2)cc1 COc1ccc(OCCCCOc2cc(C3(C#N)CCC(C(=O)O)CC3)ccc2OC)cc1 CCOC(=O)C1=C(COC(=O)C=Cc2ccc(O)c(OC)c2)NC(=O)NC1C S=C(NN=Cc1cc2ccccc2nc1Cl)NC1CCCCCCC1 Cc1cccc(C)c1-c1cccc(COc2ccc(CCC(=O)O)nn2)c1 NS(=O)(=O)Oc1c

In [3]:
# Hold out one sixth of the molecules for validation; the fixed random_state
# makes the split repeatable.
train_data, val_data = train_test_split(data, random_state=42, test_size=1/6)


In [50]:
# Vocabulary: every distinct character that occurs in any SMILES, sorted so
# that a character's position in the list is its integer id.
charset = sorted(set(''.join(data)))

# One more than the longest SMILES. vectorize_smiles drops one position when
# it builds the shifted input/target pair, so the resulting tensors have
# length max_length - 1, i.e. the length of the longest string.
max_length = 1 + max(map(len, data))


In [6]:
# Training hyperparameters, kept together so they can be tuned in one place.
# By name: number of training epochs, mini-batch size, optimizer learning rate.
epochs = 10
batch_size = 128
lr = 0.001



In [51]:
def vectorize_smiles(smiles, charset, max_length):
    """Encode a SMILES string as a next-character prediction pair.

    Each character is replaced by its position in ``charset`` and the id list
    is right-padded with 0 up to ``max_length`` entries. A string that is
    already at least ``max_length`` long is left unpadded and untruncated.

    Args:
        smiles: The SMILES string to encode.
        charset: Sequence of characters; a character's id is its first index.
        max_length: Length the id list is padded to.

    Returns:
        ``(data, target)``: two ``torch.long`` tensors holding the padded ids
        without the last entry and without the first entry respectively, so
        ``target[i]`` is the id that follows ``data[i]``.

    Raises:
        ValueError: If ``smiles`` contains a character not in ``charset``.

    Note:
        The padding id 0 is also the id of ``charset[0]``, so padding cannot
        be told apart from that character in the encoded form.
    """
    token_ids = list(map(charset.index, smiles))
    # A negative pad count yields an empty list, i.e. no padding is added.
    token_ids.extend([0] * (max_length - len(token_ids)))

    inputs = torch.tensor(token_ids[:-1], dtype=torch.long)
    shifted = torch.tensor(token_ids[1:], dtype=torch.long)
    return inputs, shifted


In [64]:
class SMILESDataset(Dataset):
    """Map-style dataset that yields (input, target) index tensors per SMILES.

    Args:
        smiles: Iterable of SMILES strings (list, tuple, pandas Series, ...).
        charset: Vocabulary passed through to ``vectorize_smiles``.
        max_length: Padding length passed through to ``vectorize_smiles``.
    """

    def __init__(self, smiles, charset, max_length):
        # Materialise into a list so that ``self.smiles[idx]`` is always a
        # positional lookup. A pandas Series coming out of train_test_split
        # keeps the shuffled labels of the original frame, and indexing it
        # with the 0..len-1 integers a DataLoader produces is a label lookup
        # that raises KeyError for every label that landed in the other split.
        self.smiles = list(smiles)
        self.charset = charset
        self.max_length = max_length

    def __len__(self):
        """Return the number of SMILES strings in the dataset."""
        return len(self.smiles)

    def __getitem__(self, idx):
        """Return the ``(data, target)`` tensors for the idx-th SMILES."""
        return vectorize_smiles(self.smiles[idx], self.charset, self.max_length)


In [65]:
# Hand the datasets plain lists rather than the pandas Series returned by
# train_test_split: the Series keep their shuffled original labels, so the
# positional indices a DataLoader generates would be treated as labels and
# raise KeyError.
train_dataset = SMILESDataset(list(train_data), charset, max_length)
val_dataset = SMILESDataset(list(val_data), charset, max_length)

# Use the notebook-level `batch_size` instead of repeating the literal.
# Only the training loader shuffles; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [66]:
class SimpleFFN(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    The linear layers act on the last dimension of the input, so any leading
    dimensions are carried through unchanged.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Width of the hidden layer.
        output_size: Size of the last dimension of the output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names are part of the state_dict keys; keep them stable.
        self.fc1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the un-normalised output scores for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


In [30]:

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# The network consumes one-hot characters and emits one score per character,
# so input and output widths both equal the vocabulary size.
hidden_size = 128
input_size = output_size = len(charset)


cuda


In [70]:
# Build the network on the selected device, with a cross-entropy objective.
model = SimpleFFN(input_size, hidden_size, output_size).to(device)
criterion = nn.CrossEntropyLoss()
# Take the learning rate from the notebook-level `lr` hyperparameter instead
# of repeating the literal, so there is a single place to tune it.
optimizer = optim.Adam(model.parameters(), lr=lr)


In [71]:
# Reuse the notebook-level `epochs` hyperparameter rather than a second literal.
num_epochs = epochs


def _one_hot_batch(indices, num_classes, target_device):
    """One-hot encode a (batch, seq_len) index tensor directly on a device.

    Args:
        indices: Integer tensor of shape (batch, seq_len).
        num_classes: Size of the one-hot dimension.
        target_device: Device the result is allocated on.

    Returns:
        Float tensor of shape (batch, seq_len, num_classes).
    """
    indices = indices.to(target_device)
    # Allocate on the target device so the large one-hot tensor is never
    # built on the CPU and copied over; only the small index tensor moves.
    encoded = torch.zeros(
        indices.size(0), indices.size(1), num_classes, device=target_device)
    encoded.scatter_(2, indices.unsqueeze(2), 1)
    return encoded


for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    # The batch variables are deliberately not named `data`: that would
    # overwrite the notebook-level `data` Series and break any earlier cell
    # that is re-run after training.
    for inputs, targets in tqdm(train_loader):
        one_hot_inputs = _one_hot_batch(inputs, len(charset), device)
        targets = targets.to(device)
        optimizer.zero_grad()
        output = model(one_hot_inputs)
        # CrossEntropyLoss wants the class dimension second:
        # (batch, seq_len, classes) -> (batch, classes, seq_len).
        loss = criterion(output.transpose(1, 2), targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in tqdm(val_loader):
            one_hot_inputs = _one_hot_batch(inputs, len(charset), device)
            targets = targets.to(device)
            output = model(one_hot_inputs)
            loss = criterion(output.transpose(1, 2), targets)
            val_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)

    print(
        f'Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')


100%|██████████| 6749/6749 [00:49<00:00, 136.08it/s]
100%|██████████| 1350/1350 [00:10<00:00, 131.02it/s]


Epoch: 1, Train Loss: 0.8442, Validation Loss: 0.8116


100%|██████████| 6749/6749 [00:47<00:00, 141.13it/s]
100%|██████████| 1350/1350 [00:09<00:00, 140.75it/s]


Epoch: 2, Train Loss: 0.8116, Validation Loss: 0.8115


100%|██████████| 6749/6749 [00:47<00:00, 141.61it/s]
100%|██████████| 1350/1350 [00:10<00:00, 129.22it/s]


Epoch: 3, Train Loss: 0.8116, Validation Loss: 0.8115


100%|██████████| 6749/6749 [00:46<00:00, 144.87it/s]
100%|██████████| 1350/1350 [00:10<00:00, 129.40it/s]


Epoch: 4, Train Loss: 0.8115, Validation Loss: 0.8115


100%|██████████| 6749/6749 [00:47<00:00, 141.72it/s]
100%|██████████| 1350/1350 [00:10<00:00, 132.04it/s]


Epoch: 5, Train Loss: 0.8115, Validation Loss: 0.8114


100%|██████████| 6749/6749 [00:48<00:00, 139.68it/s]
100%|██████████| 1350/1350 [00:09<00:00, 135.15it/s]


Epoch: 6, Train Loss: 0.8115, Validation Loss: 0.8114


100%|██████████| 6749/6749 [00:49<00:00, 137.48it/s]
100%|██████████| 1350/1350 [00:10<00:00, 131.00it/s]


Epoch: 7, Train Loss: 0.8115, Validation Loss: 0.8115


100%|██████████| 6749/6749 [00:47<00:00, 142.83it/s]
100%|██████████| 1350/1350 [00:10<00:00, 133.08it/s]


Epoch: 8, Train Loss: 0.8115, Validation Loss: 0.8113


100%|██████████| 6749/6749 [00:49<00:00, 136.63it/s]
100%|██████████| 1350/1350 [00:10<00:00, 130.80it/s]


Epoch: 9, Train Loss: 0.8115, Validation Loss: 0.8113


100%|██████████| 6749/6749 [00:49<00:00, 136.70it/s]
100%|██████████| 1350/1350 [00:10<00:00, 127.04it/s]

Epoch: 10, Train Loss: 0.8114, Validation Loss: 0.8114





In [72]:
# Persist only the model's state_dict (its parameter tensors), not the module
# object itself, to "trained_model_1.pth" in the working directory.
torch.save(model.state_dict(), "trained_model_1.pth")


In [119]:
def unvectorize_smiles(vector, charset, pad_index=0, eos_index=None):
    """Decode a sequence of character ids back into a SMILES string.

    Args:
        vector: Iterable of integer ids (ints, NumPy ints or 0-d tensors).
        charset: Sequence of characters; ``charset[i]`` is the character for
            id ``i``.
        pad_index: Id used for right-padding. Trailing occurrences are removed
            before decoding; pass ``None`` to keep them. Defaults to 0, the
            value ``vectorize_smiles`` pads with.
        eos_index: Optional id of an explicit end-of-sequence token. When
            given, decoding stops just before its first occurrence. Defaults
            to ``None`` because the vocabulary in this notebook is built from
            the raw characters only and contains no such token.

    Returns:
        The decoded string.

    Raises:
        IndexError: If an id is outside the range of ``charset``.
    """
    indices = [int(index) for index in vector]

    # The last id of the vocabulary is an ordinary character, not an
    # end-of-sequence marker, so decoding is only cut short when the caller
    # names an EOS id explicitly.
    if eos_index is not None and eos_index in indices:
        indices = indices[:indices.index(eos_index)]

    # Strip right-padding by id rather than by a hard-coded character, so it
    # works whichever character happens to sit at the padding position.
    end = len(indices)
    if pad_index is not None:
        while end > 0 and indices[end - 1] == pad_index:
            end -= 1

    return ''.join(charset[index] for index in indices[:end])


In [122]:
def predict(model, input_smiles, charset, max_length):
    """Run the model on one SMILES string and decode its per-position argmax.

    Args:
        model: A ``torch.nn.Module`` taking a one-hot tensor of shape
            (1, seq_len, len(charset)) and returning scores of the same
            leading shape.
        input_smiles: The SMILES string to feed to the model.
        charset: Vocabulary used for encoding and decoding.
        max_length: Padding length passed to ``vectorize_smiles``.

    Returns:
        The string decoded from the highest-scoring id at every position.
    """
    model.eval()
    indices, _ = vectorize_smiles(input_smiles, charset, max_length)

    # Run on whatever device the model lives on instead of relying on a
    # notebook-level `device` variable being defined and in sync with it.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    indices = indices.to(model_device)
    one_hot_data = torch.zeros(
        1, indices.size(0), len(charset), device=model_device)
    one_hot_data.scatter_(2, indices.view(1, -1, 1), 1)

    # Pure inference: skip building an autograd graph.
    with torch.no_grad():
        output = model(one_hot_data)

    pred_indices = output.argmax(dim=2).squeeze(0).tolist()
    return unvectorize_smiles(pred_indices, charset)


In [123]:
# Write one predicted string per validation molecule. Autograd is disabled for
# the whole loop because this is inference only, and tqdm shows progress since
# the loop makes one forward pass per molecule.
with open("predictions_1.txt", "w") as f, torch.no_grad():
    for input_smiles in tqdm(val_data):
        predicted_smiles = predict(model, input_smiles, charset, max_length)
        lowered = predicted_smiles.lower()
        # NOTE(review): this skips any prediction whose text contains the
        # substring "nan" or "inf" (case-insensitive). The reason is not
        # evident from the notebook; confirm it is still wanted.
        if 'nan' in lowered or 'inf' in lowered:
            continue
        f.write(predicted_smiles + "\n")


KeyboardInterrupt: 