In [35]:
import os
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm


import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet50
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from sklearn.model_selection import train_test_split



import warnings
warnings.filterwarnings("ignore")

np.random.seed(1234)




In [1]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)


NameError: name 'torch' is not defined

In [36]:
data = pd.read_csv('y_train.csv')

In [39]:
train_data, val_data = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['cell_line'])


In [40]:
train_data.to_csv("train_data.csv", index=False)
val_data.to_csv("val_data.csv", index=False)


In [82]:
class CellLineDataset(Dataset):
    def __init__(self, img_dir, labels_file=None, transform=None):
        self.img_dir = img_dir
        self.transform = transform
        if labels_file:
            self.labels_df = pd.read_csv(labels_file)
            self.has_labels = True
            # Create a dictionary mapping class names to integers
            self.class_to_idx = {class_name: i for i, class_name in enumerate(
                self.labels_df["cell_line"].unique())}
        else:
            self.has_labels = False

    def __len__(self):
        # because there are 3 images per sample
        return len(os.listdir(self.img_dir)) // 3

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if self.has_labels:
            row = self.labels_df.iloc[idx]
            sample_id = row['file_id']
            img_paths = [os.path.join(self.img_dir, f"{str(sample_id).zfill(5)}_{color}.png") for color in [
                "blue", "red", "yellow"]]
            imgs = [Image.open(img_path) for img_path in img_paths]
            img = torch.stack(
                [torchvision.transforms.functional.to_tensor(im) for im in imgs]).squeeze(1)

            if self.transform:
                img = self.transform(img)

            # Convert label to integer
            label = self.class_to_idx[row['cell_line']]
            return img, label
        else:
            return img


In [20]:
def calculate_mean_std(loader):
    mean = 0.
    std = 0.
    nb_samples = 0.
    for data, _ in tqdm(loader):
        batch_samples = data.size(0)
        data = data.view(batch_samples, data.size(1), -1)
        mean += data.mean(2).sum(0)
        std += data.std(2).sum(0)
        nb_samples += batch_samples

    mean /= nb_samples
    std /= nb_samples
    return mean, std

In [13]:
epochs = 10
batch_size = 32
lr = 0.001

In [83]:
raw_train_data = CellLineDataset(
    img_dir="images_train/images_train/", labels_file="y_train.csv")
raw_train_loader = DataLoader(raw_train_data, batch_size=batch_size, shuffle=True)

mean, std = calculate_mean_std(raw_train_loader)




100%|██████████| 301/301 [00:13<00:00, 21.85it/s]


In [84]:
transform = transforms.Compose([
    transforms.Normalize(mean=mean, std=std)
])


In [85]:
train_dataset = CellLineDataset(
    img_dir="images_train/images_train/", labels_file="train_data.csv", transform=transform)
val_dataset = CellLineDataset(
    img_dir="images_train/images_train/", labels_file='val_data.csv', transform=transform)


In [86]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [51]:
model = resnet50(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, 9)  # output layer for 9 classes
model = model.to(device)


In [52]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


In [87]:
for epoch in range(1, epochs +1):
    model.train()
    running_loss = 0.0
    pbar = tqdm(train_loader)
    for inputs, labels in pbar:
        # Move inputs and labels to device
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Update running loss
        running_loss += loss.item() * inputs.size(0)

    # Calculate average losses
    epoch_loss = running_loss / len(train_loader.dataset)
    print('Training Loss: {:.4f}'.format(epoch_loss))

    # Validation on the test set
    model.eval()  # Set the model to evaluation mode
    running_loss = 0.0
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        with torch.no_grad():
            outputs = model(inputs)
            loss = criterion(outputs, labels)

        running_loss += loss.item() * inputs.size(0)

    val_loss = running_loss / len(test_loader.dataset)
    print('Validation Loss: {:.4f}'.format(val_loss))


  0%|          | 0/301 [00:00<?, ?it/s]


IndexError: single positional indexer is out-of-bounds

In [5]:
class SMILESDATA(DataLoader):
    def __init__(self, train_smiles, max_length):
        #self.smiles = open(smiles, 'r').read().split("\n")[:-1]
        self.train_smiles = train_smiles
        self.max_length = max_length
        
        tokens = functools.reduce(
            lambda acc, smile: acc.union(set(smile)), self.train_smiles, set())

        self.idx_map = dict(enumerate(tokens, start=3))
        self.idx_map.update(__encoders__)
        self.token_map = {value: key for key, value in self.idx_map.items()}
        self.ints = [torch.LongTensor([self.token_map[smile] for smile in row]) for row in
                     self.train_smiles]
        self.vocsize = len(tokens) + len(__encoders__)

    def __len__(self):
        return len(self.train_smiles)
    
    def __getitem__(self, i):
        sequence = torch.cat((torch.LongTensor([self.token_map['<S>']]), 
                              self.ints[i], torch.LongTensor([self.token_map['<E>']]),
                              torch.LongTensor([self.token_map["<P>"]]*(self.max_length-len(self.ints[i])-2))), dim=0)
        return one_hot(sequence, self.vocsize).float(), sequence
    
    def decoder(self, indexes):
        return "".join([self.idx_map[idx] for idx in indexes if idx not in __encoders__])


In [11]:

hidden_size = 512
num_layers = 4
num_epochs = 200
lr = 0.0001
batch_size = 256
dropout = 0.2


In [7]:
dataset = SMILESDATA(train_smiles=train_smiles, max_length=max_length)
train_loader = DataLoader(
    dataset, batch_size=batch_size, shuffle=True)


In [8]:
class SimplifiedSMILESGRU(nn.Module):
    def __init__(self, vocsize, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.vocsize = vocsize

        self.gru = nn.GRU(vocsize, hidden_size, num_layers,
                          batch_first=True, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size, vocsize)

    def forward(self, x):
        out = self.gru(x)[0]
        out = self.dropout(out)
        out = self.fc(out)
        return out
    
    def generate_samples(self, num_samples, sequence_length):
        start_token_id = [key for key,
                          value in __encoders__.items() if value == "<S>"][0]
        input_tokens = torch.LongTensor([start_token_id]*num_samples)
        hidden_state = torch.zeros((self.num_layers, num_samples,
                        self.hidden_size)).to(device)
        sequences = torch.zeros(num_samples, sequence_length)
        for i in range(sequence_length):
            input_tokens_one_hot = one_hot(
                input_tokens, self.vocsize).float().unsqueeze(1).to(device)
            output_tokens, hidden_state = self.gru(
                input_tokens_one_hot, hidden_state)
            next_token = F.softmax(
                self.fc(output_tokens).squeeze(1), dim=1)
            input_tokens = torch.multinomial(next_token, num_samples=1,
                                  replacement=True).squeeze(1)
            sequences[:, i] = input_tokens
        return sequences


In [12]:
model = SimplifiedSMILESGRU(dataset.vocsize, hidden_size, num_layers, dropout).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()

scheduler = ReduceLROnPlateau(
    optimizer, mode='min', factor=0.1, patience=2, verbose=True)


In [13]:
for epoch in range(1, num_epochs+1):
    model.train()
    train_loss = 0
    train_count = 0
    for i, (batch, target) in enumerate(tqdm(train_loader, desc=f"Training Epoch {epoch}/{num_epochs}", leave=False)):

        batch, target = batch.to(device), target.to(device)
        output = model(batch)
 
        #print("Output shape:", output.shape)
        #print("Target shape:", target.shape)
        output = output.transpose(2, 1)
        loss = criterion(output[:, :, :-1], target[:, 1:])
        optimizer.zero_grad()
        loss.backward()
        #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        optimizer.step()

        train_loss += loss.item()
        train_count += 1

    avg_train_loss = train_loss / train_count

    print(
        f"Epoch [{epoch}/{num_epochs}], Train Loss: {avg_train_loss:.4f}")

    scheduler.step(avg_train_loss)


                                                                         

Epoch [1/200], Train Loss: 0.4513


                                                                         

Epoch [2/200], Train Loss: 0.2965


                                                                         

Epoch [3/200], Train Loss: 0.2777


                                                                         

Epoch [4/200], Train Loss: 0.2686


                                                                         

Epoch [5/200], Train Loss: 0.2633


                                                                         

Epoch [6/200], Train Loss: 0.2592


                                                                         

Epoch [7/200], Train Loss: 0.2564


                                                                         

Epoch [8/200], Train Loss: 0.2541


                                                                         

Epoch [9/200], Train Loss: 0.2524


                                                                          

Epoch [10/200], Train Loss: 0.2508


                                                                          

Epoch [11/200], Train Loss: 0.2576


                                                                          

Epoch [12/200], Train Loss: 0.2508


                                                                          

Epoch [13/200], Train Loss: 0.2480


                                                                          

Epoch [14/200], Train Loss: 0.2472


                                                                          

Epoch [15/200], Train Loss: 0.2461


                                                                          

Epoch [16/200], Train Loss: 0.2451


                                                                          

Epoch [17/200], Train Loss: 0.2445


                                                                          

Epoch [18/200], Train Loss: 0.2435


                                                                          

Epoch [19/200], Train Loss: 0.2434


                                                                          

Epoch [20/200], Train Loss: 0.2423


                                                                          

Epoch [21/200], Train Loss: 0.2420


                                                                          

Epoch [22/200], Train Loss: 0.2415


                                                                          

Epoch [23/200], Train Loss: 0.2411


                                                                          

Epoch [24/200], Train Loss: 0.2405


                                                                          

Epoch [25/200], Train Loss: 0.2399


                                                                          

Epoch [26/200], Train Loss: 0.2398


                                                                          

Epoch [27/200], Train Loss: 0.2392


                                                                          

Epoch [28/200], Train Loss: 0.2387


                                                                          

Epoch [29/200], Train Loss: 0.2386


                                                                          

Epoch [30/200], Train Loss: 0.2383


                                                                          

Epoch [31/200], Train Loss: 0.2378


                                                                          

Epoch [32/200], Train Loss: 0.2374


                                                                          

Epoch [33/200], Train Loss: 0.2373


                                                                          

Epoch [34/200], Train Loss: 0.2370


                                                                          

Epoch [35/200], Train Loss: 0.2368


                                                                          

Epoch [36/200], Train Loss: 0.2467


                                                                          

Epoch [37/200], Train Loss: 0.2368


                                                                          

Epoch [38/200], Train Loss: 0.2363


                                                                          

Epoch [39/200], Train Loss: 0.2364


                                                                          

Epoch [40/200], Train Loss: 0.2359


                                                                          

Epoch [41/200], Train Loss: 0.2361


                                                                          

Epoch [42/200], Train Loss: 0.2355


                                                                          

Epoch [43/200], Train Loss: 0.2355


                                                                          

Epoch [44/200], Train Loss: 0.2371


                                                                          

Epoch [45/200], Train Loss: 0.2351


                                                                          

Epoch [46/200], Train Loss: 0.2348


                                                                          

Epoch [47/200], Train Loss: 0.2346


                                                                          

Epoch [48/200], Train Loss: 0.2344


                                                                          

Epoch [49/200], Train Loss: 0.2345


                                                                          

Epoch [50/200], Train Loss: 0.2343


                                                                          

Epoch [51/200], Train Loss: 0.2340


                                                                          

Epoch [52/200], Train Loss: 0.2369


                                                                          

Epoch [53/200], Train Loss: 0.2359


                                                                          

Epoch [54/200], Train Loss: 0.2341
Epoch 00054: reducing learning rate of group 0 to 1.0000e-05.


                                                                          

Epoch [55/200], Train Loss: 0.2311


                                                                          

Epoch [56/200], Train Loss: 0.2309


                                                                          

Epoch [57/200], Train Loss: 0.2308


                                                                          

Epoch [58/200], Train Loss: 0.2307


                                                                          

Epoch [59/200], Train Loss: 0.2306


                                                                          

Epoch [60/200], Train Loss: 0.2305


                                                                          

Epoch [61/200], Train Loss: 0.2305


                                                                          

Epoch [62/200], Train Loss: 0.2304


                                                                          

Epoch [63/200], Train Loss: 0.2304


                                                                          

Epoch [64/200], Train Loss: 0.2303


                                                                          

Epoch [65/200], Train Loss: 0.2303


                                                                          

Epoch [66/200], Train Loss: 0.2302


                                                                          

Epoch [67/200], Train Loss: 0.2302


                                                                          

Epoch [68/200], Train Loss: 0.2301


                                                                          

Epoch [69/200], Train Loss: 0.2301


                                                                          

Epoch [70/200], Train Loss: 0.2301


                                                                          

Epoch [71/200], Train Loss: 0.2300


                                                                          

Epoch [72/200], Train Loss: 0.2300


                                                                          

Epoch [73/200], Train Loss: 0.2300


                                                                          

Epoch [74/200], Train Loss: 0.2299


                                                                          

Epoch [75/200], Train Loss: 0.2299


                                                                          

Epoch [76/200], Train Loss: 0.2299


                                                                          

Epoch [77/200], Train Loss: 0.2298


                                                                          

Epoch [78/200], Train Loss: 0.2298


                                                                          

Epoch [79/200], Train Loss: 0.2298


                                                                          

Epoch [80/200], Train Loss: 0.2297


                                                                          

Epoch [81/200], Train Loss: 0.2297


                                                                          

Epoch [82/200], Train Loss: 0.2296


                                                                          

Epoch [83/200], Train Loss: 0.2296


                                                                          

Epoch [84/200], Train Loss: 0.2296


                                                                          

Epoch [85/200], Train Loss: 0.2296


                                                                          

Epoch [86/200], Train Loss: 0.2296


                                                                         

KeyboardInterrupt: 

In [14]:
torch.save({'tokenizer': dataset.idx_map,
            'model': model.cpu()}, "gru_model_3.pt")


In [15]:
trained_model = torch.load('gru_model_3.pt')


In [16]:
def is_valid_smiles(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None


In [17]:

model, tokenizer = trained_model['model'], trained_model['tokenizer']
model = model.to(device)
model.eval()

valid_smiles = []
num_samples = 300

while len(valid_smiles) < 10001:
    sequences = model.generate_samples(
        num_samples=num_samples, sequence_length=max_length)
    for i in range(sequences.size(0)):
        generated_smiles = "".join([tokenizer[idx]
                          for idx in sequences[i].tolist() if idx not in __encoders__])
        if is_valid_smiles(generated_smiles):
            valid_smiles.append(generated_smiles)



In [18]:
with open("predictions_gru_4.txt", "w") as f:
    for smiles in valid_smiles:
        f.write(smiles + '\n')
