In [161]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

from torch.utils.data import Dataset
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [64]:
data = pd.read_csv("data_train.csv", index_col=0).reset_index(drop=True)
data


Unnamed: 0,smiles,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,CC(=O)N(C)c1cccc(-c2ccnc3c(C(=O)c4cccs4)cnn23)c1,0,0,0,0,0,0,0,-1,0,0,0
1,COc1cc(N)c(Cl)cc1C(=O)OCCCN1CCCCC1.Cl,0,0,0,0,0,0,0,-1,0,0,0
2,CCCCNc1c(C(=O)OCC)cnc2c1cnn2CC,0,0,0,0,0,0,0,0,0,1,0
3,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1.Cl,0,0,0,0,0,0,0,-1,0,0,1
4,CC1OC2(CCCCC2Oc2cccc(Cl)c2)N=C1O,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CC(C)(C)NC[C@@H](O)COc1nsnc1N1CCOCC1,0,0,0,0,0,0,0,-1,0,0,0
11996,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...,0,0,0,0,0,0,0,-1,0,0,0
11997,N=C(O)c1cnc(C2CC2)[nH]1,0,0,0,-1,0,0,0,0,0,0,0
11998,CN=C=O,0,0,0,0,0,0,-1,0,0,0,0


In [120]:
y = data[data.columns[1:]].to_numpy()
y


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [49]:
fp_length = 1024
fps = np.zeros((len(data), fp_length))

# Calculate Morgan fingerprints and convert to numpy array
for i, smiles in enumerate(tqdm(data['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps[i] = arr


100%|██████████| 12000/12000 [00:07<00:00, 1702.94it/s]


In [89]:
X_train, Y_train,x_testm, y_test=train_test_split(fps, y,
                 test_size=0.2, random_state=1234)


In [148]:
class SMILESDataset(Dataset):
    def __init__(self, csv_file):
        self.data = pd.read_csv(csv_file)

        # Calculate Morgan fingerprints and convert to numpy array
        self.fp_length = 2048
        self.fps = np.zeros((len(self.data), self.fp_length))
        for i, smiles in enumerate(self.data["smiles"]):
            mol = Chem.MolFromSmiles(smiles)
            fp_vec = AllChem.GetMorganFingerprintAsBitVect(
                mol, radius=3, nBits=self.fp_length)
            arr = np.zeros((1,))
            DataStructs.ConvertToNumpyArray(fp_vec, arr)
            self.fps[i] = arr

        self.y = self.data.iloc[:, 1:]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.fps[idx], self.y.iloc[idx].to_numpy()


In [149]:
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.fc1(x)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        return x


In [133]:
epochs = 20
lr=0.001
batch_size=64
hidden_size=512
output_size=11

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [170]:
dataset = SMILESDataset("data_train.csv")


train_idx, test_idx = train_test_split(
    np.arange(len(dataset)), test_size=0.2, random_state=42)

train_dataset = torch.utils.data.Subset(dataset, train_idx)
for i in range(len(train_dataset)):
    sample = train_dataset[i]
    print(sample)

    
test_dataset = torch.utils.data.Subset(dataset, test_idx)

# Create PyTorch DataLoader objects for training and testing
batch_size = 32


def custom_collate(batch):
    fps = []
    labels = []

    for sample in batch:
        fp = sample[0]
        if isinstance(fp, np.ndarray) and fp.dtype == object:

            continue  # skip this sample as it contains missing values
        fps.append(torch.from_numpy(fp).float())
        labels.append(torch.from_numpy(sample[1]).float())
    
    return torch.stack(fps), torch.stack(labels)

train_loader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate)
test_loader = DataLoader(test_dataset, batch_size=batch_size,
                         shuffle=False, collate_fn=custom_collate)






(array([0., 0., 0., ..., 0., 0., 0.]), array(['O=P(O)(OCc1ccccc1)OCc1ccccc1', 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0],
      dtype=object))
(array([0., 0., 0., ..., 0., 0., 0.]), array(['Cc1cccc(O)c1N', 0, 0, 0, 0, 0, 0, 1, 0, -1, 0, 0], dtype=object))
(array([0., 0., 0., ..., 0., 0., 0.]), array(['OCCOCC#CCOCCO', 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0], dtype=object))
(array([0., 1., 0., ..., 0., 0., 0.]), array(['Cc1cc(OCc2nc(-c3ccccc3Cl)cs2)ccc1OCC(=O)O', 0, 0, 0, 0, 0, 0, 0,
       -1, 0, 0, 0], dtype=object))
(array([0., 0., 0., ..., 0., 0., 0.]), array(['CC(C)(C)CC(=O)OCC(=O)[C@@]12OC(C)(C)O[C@@H]1C[C@H]1[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C@@H](O)C[C@@]12C',
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, -1], dtype=object))
(array([0., 0., 0., ..., 0., 0., 0.]), array(['CC(/C=C1\\SC(=S)N(CC(=O)O)C1=O)=C\\c1ccccc1', 0, 0, 0, 0, 0, -1,
       0, -1, 0, 0, 0], dtype=object))
(array([0., 0., 0., ..., 0., 0., 0.]), array(['O=C(c1ccccc1)c1ccccc1', 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, 0],
      dtype=object)

KeyboardInterrupt: 

In [164]:
model = MLP(input_size=dataset.fp_length,
            hidden_size=hidden_size, output_size=output_size).to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [165]:
for epoch in range(epochs):
    train_loss = []
    model.train()
    for batch_x, batch_y in tqdm(train_loader):
        optimizer.zero_grad()
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        output = model(batch_x)
        l = criterion(output, batch_y)
        train_loss.append(l.item())
        l.backward()
        optimizer.step()

    print("Epoch {} - Training loss: {:.3f}".format(epoch +
          1, torch.tensor(train_loss).mean()))

    with torch.no_grad():
        model.eval()
        test_loss = []
        for batch_x, batch_y in test_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            output = model(batch_x)
            l = criterion(output, batch_y)
            test_loss.append(l.item())

        print("Epoch {} - Test loss: {:.3f}".format(epoch+1, torch.tensor(test_loss).mean()))


  0%|          | 0/300 [00:00<?, ?it/s]


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.