In [1]:
#from fcd import get_fcd, load_ref_model, canonical_smiles, get_predictions, calculate_frechet_distance
import warnings
import os
import pandas as pd
import numpy as np
import numpy
import pickle
import rdkit
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from rdkit.Chem import Draw
#from rdkit.Chem.Draw import IPythonConsole


# Ignore some warnings from RDKIT and keras
from rdkit import RDLogger, Chem
from torch.nn.functional import one_hot
import itertools
from functools import reduce
from rdkit.Chem import rdMolDescriptors
import torch.utils.data as torch_data

RDLogger.DisableLog('rdApp.*')

warnings.filterwarnings("ignore")

# Load methods from the FCD library

np.random.seed(1234)

print("RDKit: ", rdkit.__version__)


RDKit:  2022.09.5


In [164]:
data = pd.read_csv("smiles_train.txt", header=None)[0]


In [None]:
with open("smiles_train.txt", "r") as f:
    smiles = []
    for line in f:
        # Convert each line to a SMILES string and append to the list
        mol = Chem.MolFromSmiles(line.strip())
        if mol is not None:
            smiles.append(Chem.MolToSmiles(mol))

# Write the list of SMILES to a new .smi file
with open("smiles_train.smi", "w") as f:
    for s in smiles:
        f.write(s + "\n")


In [3]:
def to_canonical(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return Chem.MolToSmiles(mol, canonical=True)
    else:
        return None


In [None]:
data_canonical = data.apply(to_canonical).dropna().reset_index(drop=True)


In [22]:
data_canonical.to_csv("canonical_smiles_train.txt", index=False, header=False)


In [144]:
data = pd.read_csv(
    "canonical_smiles_train.txt", header=None, squeeze=True)


In [2]:
with open("smiles_train.smi", "r") as f:
    smiles = [line.strip() for line in f]


In [3]:
def get_char2index(smiles):
    char_set = set()
    for smile in smiles:
        char_set.update(list(smile))
    char_set = sorted(list(char_set))
    char_set.extend(["B", "E", " "])  # Add B, E, and space characters
    char2index = {char: i for i, char in enumerate(char_set)}
    return char2index


In [4]:
char2index = get_char2index(smiles)
max_length = max([len(smile) for smile in smiles]) + 2


In [5]:
train_data, val_data = smiles[:int(
    0.9 * len(smiles))], smiles[int(0.9 * len(smiles)):]


In [6]:
def vectorize_smiles(smiles, char2index, max_length):
    target_sequences = []
    for smi in smiles:
        target_sequence = [char2index[c] for c in smi[1:]]
        target_sequence.append(char2index['E'])
        if len(target_sequence) < max_length:
            target_sequence.extend(
                [char2index['P']] * (max_length - len(target_sequence)))
        else:
            target_sequence = target_sequence[:max_length]

        # Check for invalid characters
        if any(x >= len(char2index) for x in target_sequence):
            print(f"Invalid characters found in SMILES: {smi}")
            continue

        target_sequences.append(target_sequence)

    # Use np.int64 instead of np.long
    return np.array(target_sequences, dtype=np.int64)


In [7]:
class SMILESData(torch_data.Dataset):
    def __init__(self, smiles, char2index, max_length):
        self.smiles = smiles
        self.char2index = char2index
        self.max_length = max_length
        self.target_sequences = vectorize_smiles(
            smiles, char2index, max_length)
        bad_indices = np.where((self.target_sequences < 0) | (
            self.target_sequences >= len(char2index)))


        print("Bad indices:", bad_indices)
        print("Bad values:", self.target_sequences[bad_indices])

        assert np.all(self.target_sequences >= 0) and np.all(
            self.target_sequences < len(char2index)), f"Invalid target values. Min: {self.target_sequences.min()}, Max: {self.target_sequences.max()}, Unique values: {np.unique(self.target_sequences)}"



    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, index):
        smile = self.smiles[index]
        one_hot = np.zeros((self.max_length - 1, len(  # Change the padding range to self.max_length - 1
            self.char2index)), dtype=np.float32)
        smile = "B" + smile + "E"

        for j, char in enumerate(smile[:-1]):  # Remove the last character "E"
            one_hot[j, self.char2index[char]] = 1.0

        return one_hot, self.target_sequences[index]


In [8]:
train_dataset = SMILESData(train_data, char2index, max_length)
val_dataset = SMILESData(val_data, char2index, max_length)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


Bad indices: (array([], dtype=int64), array([], dtype=int64))
Bad values: []
Bad indices: (array([], dtype=int64), array([], dtype=int64))
Bad values: []
cuda


In [9]:
input_size = len(char2index)
hidden_size = 512
output_size = len(char2index)
num_layers = 3
num_epochs = 5
learning_rate = 0.001


In [10]:
class SimplifiedSMILESGRU(nn.Module):
    def __init__(self, input_size=input_size, hidden_size=512, output_size=output_size, num_layers=3, dropout=0.2):
        super(SimplifiedSMILESGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.gru = nn.GRU(input_size, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0),
                         self.hidden_size).to(device)
        out, _ = self.gru(x, h0)
        out = self.dropout(out)
        out = self.fc(out)
        return out


In [11]:
# Model initialization
model = SimplifiedSMILESGRU(input_size, hidden_size,
                            output_size, num_layers).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-4)

# Learning rate scheduler
scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2, verbose=True)



In [12]:
# Training loop
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    train_count = 0
    for one_hot_data, target in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}", leave=False):
        
        one_hot_data, target = one_hot_data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(one_hot_data.float())

        print("Output shape:", output.shape)
        print("Target shape:", target.shape)

        loss = criterion(output.transpose(1, 2), target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        train_loss += loss.item()
        train_count += 1

    avg_train_loss = train_loss / train_count

    # Validation
    model.eval()
    val_loss = 0
    val_count = 0
    with torch.no_grad():
        for one_hot_data, target in tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}/{num_epochs}", leave=False):
            one_hot_data, target = one_hot_data.to(device), target.to(device)
            output = model(one_hot_data.float())
            loss = criterion(output.transpose(1, 2), target)

            val_loss += loss.item()
            val_count += 1

    avg_val_loss = val_loss / val_count

    print(
        f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    scheduler.step(avg_val_loss)


Training Epoch 1/5:   0%|          | 0/7289 [00:00<?, ?it/s]

One-hot data shape: torch.Size([128, 102, 39])
Target data shape: torch.Size([128, 103])
One-hot data min, max values: tensor(0.) tensor(1.)
Target data min, max values: tensor(0) tensor(38)
Output shape: torch.Size([128, 102, 39])
Target shape: torch.Size([128, 103])


                                                            

RuntimeError: Expected target size [128, 102], got [128, 103]

In [156]:
def vectorize_smiles(smiles, char2index, max_length):
    target_sequences = []

    for smile in smiles:
        smile = "<<BOS>>" + smile + "<<EOS>>"
        target_sequence = [char2index[char] for char in smile[1:]] + \
            [char2index["<<PAD>>"]] * (max_length - len(smile))
        target_sequences.append(target_sequence)

    return np.array(target_sequences, dtype=np.long)


In [157]:
class SMILESData(torch_data.Dataset):
    def __init__(self, smiles, char2index, max_length):
        self.smiles = smiles
        self.char2index = char2index
        self.max_length = max_length
        self.target_sequences = vectorize_smiles(
            smiles, char2index, max_length)

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, index):
        smile = self.smiles[index]
        one_hot = np.zeros((self.max_length, len(
            self.char2index)), dtype=np.float32)
        smile = "<<BOS>>" + smile + "<<EOS>>"

        for j, char in enumerate(smile):
            one_hot[j, self.char2index[char]] = 1.0

        return one_hot, self.target_sequences[index]


In [158]:
train_dataset = SMILESData(train_data, char2index, max_length)
val_dataset = SMILESData(val_data, char2index, max_length)
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=False)


KeyError: '<'

In [123]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

input_size = len(char2index)
hidden_size = 512
output_size = len(char2index)
num_layers = 3
num_epochs = 5
learning_rate = 0.001


cuda


In [73]:
class SimplifiedSMILESGRU(nn.Module):
    def __init__(self, input_size=input_size, hidden_size=512, output_size=output_size, num_layers=3, dropout=0.2):
        super(SimplifiedSMILESGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.gru = nn.GRU(input_size, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0),
                         self.hidden_size).to(device)
        out, _ = self.gru(x, h0)
        out = self.dropout(out)
        out = self.fc(out)
        return out


In [124]:
# model = SimplifiedSMILESGRU(input_size, hidden_size,
#                             output_size, num_layers).to(device)
model = SimplifiedSMILESLSTM(input_size, hidden_size,
                             output_size, num_layers).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(), lr=learning_rate, weight_decay=1e-4)

scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2, verbose=True)


In [125]:
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    train_count = 0
    for one_hot_data, target in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{num_epochs}", leave=False):
        one_hot_data, target = one_hot_data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(one_hot_data.float())

        loss = criterion(output.transpose(1, 2), target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        train_loss += loss.item()
        train_count += 1

    avg_train_loss = train_loss / train_count

    # Validation
    model.eval()
    val_loss = 0
    val_count = 0
    with torch.no_grad():
        for one_hot_data, target in tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}/{num_epochs}", leave=False):
            one_hot_data, target = one_hot_data.to(device), target.to(device)
            output = model(one_hot_data.float())
            loss = criterion(output.transpose(1, 2), target)

            val_loss += loss.item()
            val_count += 1

    avg_val_loss = val_loss / val_count

    print(
        f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
    
    scheduler.step(avg_val_loss)


                                                            

KeyError: '>'

In [43]:
torch.save(model.state_dict(), "trained_model_lstm_10.pth")


In [13]:
model.load_state_dict(torch.load("trained_model_lstm_8.pth"))
model.to(device)


SimplifiedSMILESGRU(
  (gru): GRU(40, 512, num_layers=3, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=512, out_features=40, bias=True)
)

In [14]:
def vectorize_smiles(smiles, token2index, max_length):
    vector = [1]  # <BOS>
    for char in smiles:
        vector.append(token2index[char])
    vector.append(2)  # <EOS>
    while len(vector) < max_length:
        vector.append(0)  # <PAD>
    return vector[:max_length]


In [15]:
def unvectorize_smiles(vector, index2token):
    smiles = ""
    for index in vector:
        if index in __special__:
            continue
        smiles += index2token[index]
    return smiles


In [16]:
def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


In [45]:
def predict(model, input_smiles, charset, max_length, temperature=1.6):
    model.eval()
    with torch.no_grad():
        data = vectorize_smiles(input_smiles, charset, max_length)
        one_hot_data = torch.zeros(1, data.size(0), len(charset)).to(device)
        one_hot_data.scatter_(2, data.to(device).unsqueeze(0).unsqueeze(2), 1)
        output = model(one_hot_data.float())

        # Apply temperature
        output.div_(temperature)
        probabilities = F.softmax(output, dim=2)

        # Remove batch dimension
        probabilities = probabilities.squeeze(0)

        # Sample from the distribution
        sampled_indices = torch.multinomial(probabilities, 1).squeeze(1)
        output_smiles = unvectorize_smiles(sampled_indices.tolist(), charset)
        if not output_smiles:
            return None
        return output_smiles


In [46]:
valid_count = 0
smiles_gen = set()

with open("predictions_lstm_10.txt", "w") as f:
    while valid_count < 10001:
        input_smiles = sample_random_seed(list(data))
        predicted_smiles = predict(model, input_smiles, charset, max_length)
        if predicted_smiles is not None and is_valid_smiles(predicted_smiles):
            smiles_gen.add(predicted_smiles)
            valid_count += 1


In [21]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs


In [24]:
def get_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprint(mol, 2)
    return fp


threshold = 0.8

valid_count = 0
smiles_gen = set()
with open("predictions_lstm_test.txt", "w") as f:
    for input_smiles in val_data:
        predicted_smiles = predict(model, input_smiles, charset, max_length)
        # print(f"{predicted_smiles}")
        if predicted_smiles is not None and is_valid_smiles(predicted_smiles):
            if predicted_smiles not in data_canonical:
                # Compute the fingerprint of the predicted SMILES
                fp_pred = get_fingerprint(predicted_smiles)
                # Check if the predicted SMILES is similar to any of the previously generated SMILES
                is_similar = False
                for smiles in smiles_gen:
                    fp = get_fingerprint(smiles)
                    similarity = DataStructs.TanimotoSimilarity(fp, fp_pred)
                    if similarity > threshold:
                        is_similar = True
                        break
                if not is_similar:
                    smiles_gen.add(predicted_smiles)
                    f.write(predicted_smiles + "\n")
                    valid_count += 1
                if valid_count >= 10001:
                    break


KeyboardInterrupt: 