In [10]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset


In [11]:
dataset_path = 'antibiotic_dataset.csv'  
data = pd.read_csv(dataset_path)


In [12]:
smiles_data = data.iloc[:, 0].values


In [13]:
vocab = set("".join(smiles_data))  # Generate unique characters
vocab_size = len(vocab)
char_to_idx = {char: idx for idx, char in enumerate(vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}


In [14]:
def encode_smiles(smiles):
    return [char_to_idx[char] for char in smiles]


In [15]:
sequences = [encode_smiles(smiles) for smiles in smiles_data]


In [16]:
max_length = max(len(seq) for seq in sequences)
sequences = [seq + [0] * (max_length - len(seq)) for seq in sequences]

In [17]:
sequences = torch.tensor(sequences)


In [18]:
class SMILESDataset(Dataset):
    def __init__(self, sequences, seq_length):
        self.sequences = sequences
        self.seq_length = seq_length
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])  # Predict next character


In [19]:
class SMILES_RNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, output_size):
        super(SMILES_RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out)
        return out


In [20]:
# Model hyperparameters
embed_size = 128
hidden_size = 256
batch_size = 64
epochs = 10
lr = 0.001

In [21]:
dataset = SMILESDataset(sequences, max_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [22]:
model = SMILES_RNN(vocab_size, embed_size, hidden_size, vocab_size)


In [23]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [24]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for i, (input_seq, target_seq) in enumerate(dataloader):
        optimizer.zero_grad()
        
        # Forward pass
        output = model(input_seq)
        
        # Compute loss
        loss = criterion(output.view(-1, vocab_size), target_seq.view(-1))
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}')


Epoch [1/10], Loss: 2.2951
Epoch [2/10], Loss: 2.0890
Epoch [3/10], Loss: 1.8981
Epoch [4/10], Loss: 1.7237
Epoch [5/10], Loss: 1.5705
Epoch [6/10], Loss: 1.4419
Epoch [7/10], Loss: 1.3383
Epoch [8/10], Loss: 1.2568
Epoch [9/10], Loss: 1.1934
Epoch [10/10], Loss: 1.1436


  return torch.tensor(seq[:-1]), torch.tensor(seq[1:])  # Predict next character


In [30]:
def generate_smiles(model, seed, max_length=100):
    model.eval()
    # Convert seed SMILES to tensor
    input_seq = torch.tensor([char_to_idx[c] for c in seed]).unsqueeze(0)  # Shape: (1, seq_len)
    generated = seed
    
    with torch.no_grad():
        for _ in range(max_length):
            # Get the model's output for the input sequence
            output = model(input_seq)  # Output shape: (batch_size=1, seq_len, vocab_size)
            
            # Get the predicted index for the last position in the sequence
            _, predicted_idx = torch.max(output[:, -1], dim=-1)  # Shape: (1,)
            
            # Convert predicted index to character
            predicted_char = idx_to_char[predicted_idx.item()]
            
            # Append the predicted character to the generated string
            generated += predicted_char
            
            # Update the input sequence with the predicted character
            # Ensure that predicted_idx is reshaped to (1, 1) for concatenation
            predicted_idx = predicted_idx.unsqueeze(0)  # Shape: (1, 1)
            
            # Concatenate the predicted index to the input sequence
            input_seq = torch.cat((input_seq, predicted_idx), dim=1)  # New shape: (1, seq_len + 1)

    return generated


In [31]:
seed = "CCO"  # Example seed SMILES
new_smiles = generate_smiles(model, seed)
print("Generated SMILES:", new_smiles)


Generated SMILES: CCO((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((((
