In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

import pandas as pd
import numpy as np

import optuna


import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.metrics import roc_auc_score

from torch.utils.data import Dataset
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [60]:
data = pd.read_csv("data_train.csv", index_col=0).reset_index(drop=True)
data


Unnamed: 0,smiles,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,CC(=O)N(C)c1cccc(-c2ccnc3c(C(=O)c4cccs4)cnn23)c1,0,0,0,0,0,0,0,-1,0,0,0
1,COc1cc(N)c(Cl)cc1C(=O)OCCCN1CCCCC1.Cl,0,0,0,0,0,0,0,-1,0,0,0
2,CCCCNc1c(C(=O)OCC)cnc2c1cnn2CC,0,0,0,0,0,0,0,0,0,1,0
3,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1.Cl,0,0,0,0,0,0,0,-1,0,0,1
4,CC1OC2(CCCCC2Oc2cccc(Cl)c2)N=C1O,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CC(C)(C)NC[C@@H](O)COc1nsnc1N1CCOCC1,0,0,0,0,0,0,0,-1,0,0,0
11996,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...,0,0,0,0,0,0,0,-1,0,0,0
11997,N=C(O)c1cnc(C2CC2)[nH]1,0,0,0,-1,0,0,0,0,0,0,0
11998,CN=C=O,0,0,0,0,0,0,-1,0,0,0,0


In [3]:
y = data[data.columns[1:]].fillna(0)
y=y.to_numpy()
y


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [7]:
fp_length = 1024
fps = np.zeros((len(data), fp_length))

# Calculate Morgan fingerprints and convert to numpy array
for i, smiles in enumerate(tqdm(data['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps[i] = arr


100%|██████████| 12000/12000 [00:06<00:00, 1769.51it/s]


In [8]:
class MoleculeDataset(Dataset):
    def __init__(self, fps, labels):
        self.fps = fps
        self.labels = labels

    def __len__(self):
        return len(self.fps)

    def __getitem__(self, index):
        return torch.FloatTensor(self.fps[index]), torch.FloatTensor(self.labels[index])


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



In [70]:
batch_size = 57
input_size = 1024
sequence_length = 1
output_size = 11
#num_epochs = 100


In [71]:
fps = fps.reshape(-1, sequence_length, input_size)

X_train, X_val, Y_train, Y_val = train_test_split(
    fps, y, test_size=0.2, random_state=42)


train_dataset = MoleculeDataset(X_train, Y_train)
val_dataset = MoleculeDataset(X_val, Y_val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [72]:
print("Outputs shape:", outputs.shape)
print("Labels shape:", labels.shape)


Outputs shape: torch.Size([48, 11])
Labels shape: torch.Size([48, 11])


In [65]:

class SimpleGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout_rate, device):
        super(SimpleGRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = device

        self.gru = nn.GRU(input_size, hidden_size, num_layers,
                          batch_first=True, dropout=dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(
            0), self.hidden_size).to(self.device)
        out, _ = self.gru(x, h0)
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out


In [56]:
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.5):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(
            0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(
            0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.dropout(out)
        out = self.fc(out[:, -1, :])
        return out


In [68]:
def objective(trial):
    # Suggest hyperparameters using the trial object
    hidden_size = trial.suggest_int("hidden_size", 32, 512)
    num_layers = trial.suggest_int("num_layers", 1, 5)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.8)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-6, 1e-2)
    #weight_decay = trial.suggest_loguniform("weight_decay", 1e-5, 1e-1)
    #num_epochs = trial.suggest_int("num_epochs", 10, 100)
    batch_size = trial.suggest_int("batch_size", 32, 128)


    # Initialize your model using the suggested hyperparameters
    #model = LSTMModel(input_size, hidden_size, num_layers,
    #                  output_size, dropout_rate).to(device)

    model = SimpleGRU(input_size, hidden_size,
                      num_layers, dropout_rate, device).to(device)

    # Define your loss function and optimizer
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)#, weight_decay=weight_decay)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    num_epochs=50

    for epoch in range(1, num_epochs+1):
        # Training
        model.train()
        train_losses = []
        for i, (fps, labels) in enumerate(train_loader):
            inputs = fps.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)

            mask = labels != 0
            masked_outputs = outputs[mask]
            masked_labels = labels[mask]
            #loss = criterion(outputs, labels)
            loss = criterion(masked_outputs, masked_labels)
            train_losses.append(loss.item())

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        train_loss = np.mean(train_losses)

        # Validation
        model.eval()
        val_losses = []
        with torch.no_grad():
            for fps, labels in val_loader:
                inputs = fps.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                mask = labels != 0
                masked_outputs = outputs[mask]
                masked_labels = labels[mask]
                loss = criterion(masked_outputs, masked_labels)
                val_losses.append(loss.item())

        val_loss = np.mean(val_losses)

        print(
            f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Return the validation loss (or another metric you want to minimize)
    return val_loss


In [69]:
study = optuna.create_study(direction="minimize")
# You can adjust the number of trials depending on your computational resources
study.optimize(objective, n_trials=50)


[32m[I 2023-04-17 23:56:29,528][0m A new study created in memory with name: no-name-b4932160-1058-4228-a348-72d13676f095[0m


Epoch [2/50], Train Loss: 0.9876, Validation Loss: 0.9667
Epoch [3/50], Train Loss: 0.9276, Validation Loss: 0.9118
Epoch [4/50], Train Loss: 0.8758, Validation Loss: 0.8623
Epoch [5/50], Train Loss: 0.8280, Validation Loss: 0.8175
Epoch [6/50], Train Loss: 0.7854, Validation Loss: 0.7767
Epoch [7/50], Train Loss: 0.7459, Validation Loss: 0.7396
Epoch [8/50], Train Loss: 0.7117, Validation Loss: 0.7062
Epoch [9/50], Train Loss: 0.6762, Validation Loss: 0.6768
Epoch [10/50], Train Loss: 0.6495, Validation Loss: 0.6503
Epoch [11/50], Train Loss: 0.6260, Validation Loss: 0.6275
Epoch [12/50], Train Loss: 0.6065, Validation Loss: 0.6079
Epoch [13/50], Train Loss: 0.5833, Validation Loss: 0.5911
Epoch [14/50], Train Loss: 0.5705, Validation Loss: 0.5767
Epoch [15/50], Train Loss: 0.5620, Validation Loss: 0.5644
Epoch [16/50], Train Loss: 0.5475, Validation Loss: 0.5540
Epoch [17/50], Train Loss: 0.5400, Validation Loss: 0.5453
Epoch [18/50], Train Loss: 0.5300, Validation Loss: 0.5372
Epoch

[32m[I 2023-04-17 23:56:51,858][0m Trial 0 finished with value: 0.4388754225295523 and parameters: {'hidden_size': 323, 'num_layers': 1, 'dropout_rate': 0.10746702576237721, 'learning_rate': 6.872819879249902e-06, 'batch_size': 109}. Best is trial 0 with value: 0.4388754225295523.[0m


Epoch [51/50], Train Loss: 0.4060, Validation Loss: 0.4389
Epoch [2/50], Train Loss: 0.9842, Validation Loss: 0.9635
Epoch [3/50], Train Loss: 0.9376, Validation Loss: 0.9027
Epoch [4/50], Train Loss: 0.8301, Validation Loss: 0.7219
Epoch [5/50], Train Loss: 0.5982, Validation Loss: 0.5220
Epoch [6/50], Train Loss: 0.5069, Validation Loss: 0.4927
Epoch [7/50], Train Loss: 0.4868, Validation Loss: 0.4800
Epoch [8/50], Train Loss: 0.4741, Validation Loss: 0.4715
Epoch [9/50], Train Loss: 0.4672, Validation Loss: 0.4655
Epoch [10/50], Train Loss: 0.4616, Validation Loss: 0.4604
Epoch [11/50], Train Loss: 0.4548, Validation Loss: 0.4559
Epoch [12/50], Train Loss: 0.4527, Validation Loss: 0.4522
Epoch [13/50], Train Loss: 0.4456, Validation Loss: 0.4486
Epoch [14/50], Train Loss: 0.4431, Validation Loss: 0.4457
Epoch [15/50], Train Loss: 0.4407, Validation Loss: 0.4428
Epoch [16/50], Train Loss: 0.4376, Validation Loss: 0.4402
Epoch [17/50], Train Loss: 0.4340, Validation Loss: 0.4377
Epoch

[32m[I 2023-04-17 23:57:27,374][0m Trial 1 finished with value: 0.3993268189461608 and parameters: {'hidden_size': 272, 'num_layers': 4, 'dropout_rate': 0.2528314317083498, 'learning_rate': 8.9891219200738e-06, 'batch_size': 64}. Best is trial 1 with value: 0.3993268189461608.[0m


Epoch [51/50], Train Loss: 0.3743, Validation Loss: 0.3993
Epoch [2/50], Train Loss: 0.9817, Validation Loss: 0.9440
Epoch [3/50], Train Loss: 0.8248, Validation Loss: 0.5877
Epoch [4/50], Train Loss: 0.5083, Validation Loss: 0.4829
Epoch [5/50], Train Loss: 0.4724, Validation Loss: 0.4649
Epoch [6/50], Train Loss: 0.4606, Validation Loss: 0.4548
Epoch [7/50], Train Loss: 0.4512, Validation Loss: 0.4481
Epoch [8/50], Train Loss: 0.4452, Validation Loss: 0.4429
Epoch [9/50], Train Loss: 0.4421, Validation Loss: 0.4386
Epoch [10/50], Train Loss: 0.4349, Validation Loss: 0.4345
Epoch [11/50], Train Loss: 0.4331, Validation Loss: 0.4312
Epoch [12/50], Train Loss: 0.4280, Validation Loss: 0.4279
Epoch [13/50], Train Loss: 0.4250, Validation Loss: 0.4248
Epoch [14/50], Train Loss: 0.4201, Validation Loss: 0.4220
Epoch [15/50], Train Loss: 0.4177, Validation Loss: 0.4197
Epoch [16/50], Train Loss: 0.4130, Validation Loss: 0.4181
Epoch [17/50], Train Loss: 0.4111, Validation Loss: 0.4166
Epoch

[32m[I 2023-04-17 23:58:12,967][0m Trial 2 finished with value: 0.3810829439240953 and parameters: {'hidden_size': 296, 'num_layers': 5, 'dropout_rate': 0.3457489972885964, 'learning_rate': 1.2103690759157509e-05, 'batch_size': 53}. Best is trial 2 with value: 0.3810829439240953.[0m


Epoch [51/50], Train Loss: 0.3420, Validation Loss: 0.3811
Epoch [2/50], Train Loss: 0.9938, Validation Loss: 0.9897
Epoch [3/50], Train Loss: 0.9863, Validation Loss: 0.9826
Epoch [4/50], Train Loss: 0.9790, Validation Loss: 0.9755
Epoch [5/50], Train Loss: 0.9722, Validation Loss: 0.9682
Epoch [6/50], Train Loss: 0.9646, Validation Loss: 0.9606
Epoch [7/50], Train Loss: 0.9565, Validation Loss: 0.9527
Epoch [8/50], Train Loss: 0.9479, Validation Loss: 0.9442
Epoch [9/50], Train Loss: 0.9395, Validation Loss: 0.9349
Epoch [10/50], Train Loss: 0.9293, Validation Loss: 0.9248
Epoch [11/50], Train Loss: 0.9184, Validation Loss: 0.9134
Epoch [12/50], Train Loss: 0.9064, Validation Loss: 0.9004
Epoch [13/50], Train Loss: 0.8922, Validation Loss: 0.8851
Epoch [14/50], Train Loss: 0.8754, Validation Loss: 0.8671
Epoch [15/50], Train Loss: 0.8553, Validation Loss: 0.8453
Epoch [16/50], Train Loss: 0.8311, Validation Loss: 0.8188
Epoch [17/50], Train Loss: 0.8018, Validation Loss: 0.7869
Epoch

[32m[I 2023-04-17 23:59:23,663][0m Trial 3 finished with value: 0.4555260548988978 and parameters: {'hidden_size': 250, 'num_layers': 5, 'dropout_rate': 0.3311338482325642, 'learning_rate': 1.041724203907712e-06, 'batch_size': 32}. Best is trial 2 with value: 0.3810829439240953.[0m


Epoch [51/50], Train Loss: 0.4533, Validation Loss: 0.4555
Epoch [2/50], Train Loss: 0.5034, Validation Loss: 0.4154
Epoch [3/50], Train Loss: 0.4162, Validation Loss: 0.3978
Epoch [4/50], Train Loss: 0.3827, Validation Loss: 0.3737
Epoch [5/50], Train Loss: 0.3487, Validation Loss: 0.3767
Epoch [6/50], Train Loss: 0.3290, Validation Loss: 0.3789
Epoch [7/50], Train Loss: 0.3157, Validation Loss: 0.3908
Epoch [8/50], Train Loss: 0.2955, Validation Loss: 0.3851
Epoch [9/50], Train Loss: 0.2826, Validation Loss: 0.3828
Epoch [10/50], Train Loss: 0.2686, Validation Loss: 0.3909
Epoch [11/50], Train Loss: 0.2636, Validation Loss: 0.4006
Epoch [12/50], Train Loss: 0.2503, Validation Loss: 0.4092
Epoch [13/50], Train Loss: 0.2410, Validation Loss: 0.3934
Epoch [14/50], Train Loss: 0.2308, Validation Loss: 0.4032
Epoch [15/50], Train Loss: 0.2239, Validation Loss: 0.3990
Epoch [16/50], Train Loss: 0.2152, Validation Loss: 0.4051
Epoch [17/50], Train Loss: 0.2115, Validation Loss: 0.4186
Epoch

[32m[I 2023-04-17 23:59:55,781][0m Trial 4 finished with value: 0.44420358041922253 and parameters: {'hidden_size': 104, 'num_layers': 5, 'dropout_rate': 0.5164924089667202, 'learning_rate': 0.001505473986603293, 'batch_size': 82}. Best is trial 2 with value: 0.3810829439240953.[0m


Epoch [51/50], Train Loss: 0.1331, Validation Loss: 0.4442




Epoch [2/50], Train Loss: 0.5930, Validation Loss: 0.4714
Epoch [3/50], Train Loss: 0.4263, Validation Loss: 0.4255
Epoch [4/50], Train Loss: 0.3667, Validation Loss: 0.3956
Epoch [5/50], Train Loss: 0.3293, Validation Loss: 0.3788
Epoch [6/50], Train Loss: 0.3003, Validation Loss: 0.3711
Epoch [7/50], Train Loss: 0.2787, Validation Loss: 0.3659
Epoch [8/50], Train Loss: 0.2634, Validation Loss: 0.3642
Epoch [9/50], Train Loss: 0.2458, Validation Loss: 0.3662
Epoch [10/50], Train Loss: 0.2302, Validation Loss: 0.3684
Epoch [11/50], Train Loss: 0.2183, Validation Loss: 0.3701
Epoch [12/50], Train Loss: 0.2094, Validation Loss: 0.3748
Epoch [13/50], Train Loss: 0.2020, Validation Loss: 0.3795
Epoch [14/50], Train Loss: 0.1916, Validation Loss: 0.3830
Epoch [15/50], Train Loss: 0.1836, Validation Loss: 0.3865
Epoch [16/50], Train Loss: 0.1789, Validation Loss: 0.3893
Epoch [17/50], Train Loss: 0.1717, Validation Loss: 0.3948
Epoch [18/50], Train Loss: 0.1655, Validation Loss: 0.3992
Epoch

[32m[I 2023-04-18 00:00:33,201][0m Trial 5 finished with value: 0.44441847388560957 and parameters: {'hidden_size': 313, 'num_layers': 1, 'dropout_rate': 0.36848092665225707, 'learning_rate': 0.0002222952575912816, 'batch_size': 94}. Best is trial 2 with value: 0.3810829439240953.[0m


Epoch [51/50], Train Loss: 0.0737, Validation Loss: 0.4444
Epoch [2/50], Train Loss: 0.6935, Validation Loss: 0.4524
Epoch [3/50], Train Loss: 0.4362, Validation Loss: 0.4311
Epoch [4/50], Train Loss: 0.4238, Validation Loss: 0.4270
Epoch [5/50], Train Loss: 0.4111, Validation Loss: 0.4163
Epoch [6/50], Train Loss: 0.4008, Validation Loss: 0.4105
Epoch [7/50], Train Loss: 0.3842, Validation Loss: 0.3962
Epoch [8/50], Train Loss: 0.3565, Validation Loss: 0.3817
Epoch [9/50], Train Loss: 0.3382, Validation Loss: 0.3790
Epoch [10/50], Train Loss: 0.3202, Validation Loss: 0.3843
Epoch [11/50], Train Loss: 0.3155, Validation Loss: 0.3821
Epoch [12/50], Train Loss: 0.3040, Validation Loss: 0.3849
Epoch [13/50], Train Loss: 0.2959, Validation Loss: 0.3908
Epoch [14/50], Train Loss: 0.2911, Validation Loss: 0.3950
Epoch [15/50], Train Loss: 0.2828, Validation Loss: 0.3967
Epoch [16/50], Train Loss: 0.2756, Validation Loss: 0.3975
Epoch [17/50], Train Loss: 0.2683, Validation Loss: 0.3988
Epoch

[32m[I 2023-04-18 00:01:11,749][0m Trial 6 finished with value: 0.47294067839781445 and parameters: {'hidden_size': 169, 'num_layers': 5, 'dropout_rate': 0.14460653351800662, 'learning_rate': 0.00012512605908290072, 'batch_size': 80}. Best is trial 2 with value: 0.3810829439240953.[0m


Epoch [51/50], Train Loss: 0.1437, Validation Loss: 0.4729
Epoch [2/50], Train Loss: 0.4570, Validation Loss: 0.3592
Epoch [3/50], Train Loss: 0.3356, Validation Loss: 0.3533
Epoch [4/50], Train Loss: 0.2888, Validation Loss: 0.3557
Epoch [5/50], Train Loss: 0.2459, Validation Loss: 0.3721
Epoch [6/50], Train Loss: 0.2048, Validation Loss: 0.3773
Epoch [7/50], Train Loss: 0.1721, Validation Loss: 0.3886
Epoch [8/50], Train Loss: 0.1431, Validation Loss: 0.3979
Epoch [9/50], Train Loss: 0.1229, Validation Loss: 0.4033
Epoch [10/50], Train Loss: 0.1086, Validation Loss: 0.4042
Epoch [11/50], Train Loss: 0.0932, Validation Loss: 0.4020
Epoch [12/50], Train Loss: 0.0829, Validation Loss: 0.4082
Epoch [13/50], Train Loss: 0.0760, Validation Loss: 0.4104
Epoch [14/50], Train Loss: 0.0664, Validation Loss: 0.4109
Epoch [15/50], Train Loss: 0.0632, Validation Loss: 0.4170
Epoch [16/50], Train Loss: 0.0599, Validation Loss: 0.4158
Epoch [17/50], Train Loss: 0.0552, Validation Loss: 0.4153
Epoch

[32m[I 2023-04-18 00:01:34,365][0m Trial 7 finished with value: 0.4205220192670822 and parameters: {'hidden_size': 193, 'num_layers': 3, 'dropout_rate': 0.38443388026129977, 'learning_rate': 0.001991542993319189, 'batch_size': 112}. Best is trial 2 with value: 0.3810829439240953.[0m


Epoch [51/50], Train Loss: 0.0272, Validation Loss: 0.4205
Epoch [2/50], Train Loss: 0.9384, Validation Loss: 0.8664
Epoch [3/50], Train Loss: 0.7063, Validation Loss: 0.5409
Epoch [4/50], Train Loss: 0.5057, Validation Loss: 0.4890
Epoch [5/50], Train Loss: 0.4787, Validation Loss: 0.4739
Epoch [6/50], Train Loss: 0.4615, Validation Loss: 0.4633
Epoch [7/50], Train Loss: 0.4533, Validation Loss: 0.4553
Epoch [8/50], Train Loss: 0.4451, Validation Loss: 0.4488
Epoch [9/50], Train Loss: 0.4376, Validation Loss: 0.4421
Epoch [10/50], Train Loss: 0.4283, Validation Loss: 0.4361
Epoch [11/50], Train Loss: 0.4214, Validation Loss: 0.4303
Epoch [12/50], Train Loss: 0.4158, Validation Loss: 0.4251
Epoch [13/50], Train Loss: 0.4100, Validation Loss: 0.4199
Epoch [14/50], Train Loss: 0.4038, Validation Loss: 0.4155
Epoch [15/50], Train Loss: 0.3978, Validation Loss: 0.4115
Epoch [16/50], Train Loss: 0.3928, Validation Loss: 0.4082
Epoch [17/50], Train Loss: 0.3883, Validation Loss: 0.4045
Epoch

[32m[I 2023-04-18 00:02:09,615][0m Trial 8 finished with value: 0.3616165483281726 and parameters: {'hidden_size': 247, 'num_layers': 3, 'dropout_rate': 0.23483425101345998, 'learning_rate': 1.9500472104762218e-05, 'batch_size': 58}. Best is trial 8 with value: 0.3616165483281726.[0m


Epoch [51/50], Train Loss: 0.2799, Validation Loss: 0.3616
Epoch [2/50], Train Loss: 0.5309, Validation Loss: 0.4228
Epoch [3/50], Train Loss: 0.4362, Validation Loss: 0.4132
Epoch [4/50], Train Loss: 0.4139, Validation Loss: 0.4002
Epoch [5/50], Train Loss: 0.3782, Validation Loss: 0.3780
Epoch [6/50], Train Loss: 0.3500, Validation Loss: 0.3793
Epoch [7/50], Train Loss: 0.3360, Validation Loss: 0.3750
Epoch [8/50], Train Loss: 0.3131, Validation Loss: 0.3798
Epoch [9/50], Train Loss: 0.2953, Validation Loss: 0.3813
Epoch [10/50], Train Loss: 0.2908, Validation Loss: 0.3908
Epoch [11/50], Train Loss: 0.2794, Validation Loss: 0.3900
Epoch [12/50], Train Loss: 0.2731, Validation Loss: 0.3913
Epoch [13/50], Train Loss: 0.2623, Validation Loss: 0.4003
Epoch [14/50], Train Loss: 0.2575, Validation Loss: 0.4014
Epoch [15/50], Train Loss: 0.2509, Validation Loss: 0.4027
Epoch [16/50], Train Loss: 0.2486, Validation Loss: 0.4009
Epoch [17/50], Train Loss: 0.2450, Validation Loss: 0.3961
Epoch

[32m[I 2023-04-18 00:03:57,764][0m Trial 9 finished with value: 0.42760222578701906 and parameters: {'hidden_size': 34, 'num_layers': 5, 'dropout_rate': 0.37160933535776874, 'learning_rate': 0.0006973503455462049, 'batch_size': 33}. Best is trial 8 with value: 0.3616165483281726.[0m


Epoch [51/50], Train Loss: 0.1716, Validation Loss: 0.4276
Epoch [2/50], Train Loss: 0.8291, Validation Loss: 0.5335
Epoch [3/50], Train Loss: 0.4981, Validation Loss: 0.4650
Epoch [4/50], Train Loss: 0.4658, Validation Loss: 0.4487
Epoch [5/50], Train Loss: 0.4515, Validation Loss: 0.4374
Epoch [6/50], Train Loss: 0.4392, Validation Loss: 0.4284
Epoch [7/50], Train Loss: 0.4295, Validation Loss: 0.4222
Epoch [8/50], Train Loss: 0.4199, Validation Loss: 0.4157
Epoch [9/50], Train Loss: 0.4105, Validation Loss: 0.4102
Epoch [10/50], Train Loss: 0.4034, Validation Loss: 0.4050
Epoch [11/50], Train Loss: 0.3958, Validation Loss: 0.3996
Epoch [12/50], Train Loss: 0.3845, Validation Loss: 0.3929
Epoch [13/50], Train Loss: 0.3770, Validation Loss: 0.3859
Epoch [14/50], Train Loss: 0.3649, Validation Loss: 0.3797
Epoch [15/50], Train Loss: 0.3545, Validation Loss: 0.3734
Epoch [16/50], Train Loss: 0.3476, Validation Loss: 0.3691
Epoch [17/50], Train Loss: 0.3393, Validation Loss: 0.3662
Epoch

[32m[I 2023-04-18 00:04:36,751][0m Trial 10 finished with value: 0.3697468807299932 and parameters: {'hidden_size': 468, 'num_layers': 3, 'dropout_rate': 0.6624623622842303, 'learning_rate': 4.770022464285832e-05, 'batch_size': 54}. Best is trial 8 with value: 0.3616165483281726.[0m


Epoch [51/50], Train Loss: 0.2427, Validation Loss: 0.3697
Epoch [2/50], Train Loss: 0.9297, Validation Loss: 0.7299
Epoch [3/50], Train Loss: 0.5558, Validation Loss: 0.4842
Epoch [4/50], Train Loss: 0.4891, Validation Loss: 0.4607
Epoch [5/50], Train Loss: 0.4690, Validation Loss: 0.4477
Epoch [6/50], Train Loss: 0.4603, Validation Loss: 0.4381
Epoch [7/50], Train Loss: 0.4465, Validation Loss: 0.4311
Epoch [8/50], Train Loss: 0.4370, Validation Loss: 0.4249
Epoch [9/50], Train Loss: 0.4308, Validation Loss: 0.4203
Epoch [10/50], Train Loss: 0.4231, Validation Loss: 0.4163
Epoch [11/50], Train Loss: 0.4180, Validation Loss: 0.4129
Epoch [12/50], Train Loss: 0.4117, Validation Loss: 0.4099
Epoch [13/50], Train Loss: 0.4110, Validation Loss: 0.4075
Epoch [14/50], Train Loss: 0.4001, Validation Loss: 0.4046
Epoch [15/50], Train Loss: 0.3991, Validation Loss: 0.4016
Epoch [16/50], Train Loss: 0.3921, Validation Loss: 0.3981
Epoch [17/50], Train Loss: 0.3900, Validation Loss: 0.3952
Epoch

[32m[I 2023-04-18 00:05:11,982][0m Trial 11 finished with value: 0.3623266306727432 and parameters: {'hidden_size': 479, 'num_layers': 3, 'dropout_rate': 0.7079701185167607, 'learning_rate': 4.00050004590533e-05, 'batch_size': 57}. Best is trial 8 with value: 0.3616165483281726.[0m


Epoch [51/50], Train Loss: 0.2810, Validation Loss: 0.3623
Epoch [2/50], Train Loss: 0.5196, Validation Loss: 0.3932
Epoch [3/50], Train Loss: 0.4414, Validation Loss: 0.3864
Epoch [4/50], Train Loss: 0.4147, Validation Loss: 0.3944
Epoch [5/50], Train Loss: 0.4080, Validation Loss: 0.3829
Epoch [6/50], Train Loss: 0.3970, Validation Loss: 0.3989
Epoch [7/50], Train Loss: 0.3960, Validation Loss: 0.4010
Epoch [8/50], Train Loss: 0.3914, Validation Loss: 0.4085
Epoch [9/50], Train Loss: 0.3960, Validation Loss: 0.4079
Epoch [10/50], Train Loss: 0.3940, Validation Loss: 0.4038
Epoch [11/50], Train Loss: 0.3965, Validation Loss: 0.4142
Epoch [12/50], Train Loss: 0.3992, Validation Loss: 0.4141
Epoch [13/50], Train Loss: 0.3934, Validation Loss: 0.4053
Epoch [14/50], Train Loss: 0.3951, Validation Loss: 0.4089
Epoch [15/50], Train Loss: 0.3884, Validation Loss: 0.4078
Epoch [16/50], Train Loss: 0.3891, Validation Loss: 0.4052
Epoch [17/50], Train Loss: 0.3979, Validation Loss: 0.4143
Epoch

[33m[W 2023-04-18 00:05:38,020][0m Trial 12 failed with parameters: {'hidden_size': 505, 'num_layers': 2, 'dropout_rate': 0.7999060329557599, 'learning_rate': 0.005926707695993871, 'batch_size': 64} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "d:\Miniconda\envs\ails\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\agneg\AppData\Local\Temp\ipykernel_9176\2231237009.py", line 37, in objective
    outputs = model(inputs)
  File "d:\Miniconda\envs\ails\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\agneg\AppData\Local\Temp\ipykernel_9176\3529936515.py", line 14, in forward
    h0 = torch.zeros(self.num_layers, x.size(
KeyboardInterrupt
[33m[W 2023-04-18 00:05:38,021][0m Trial 12 failed with value None.[0m


Epoch [46/50], Train Loss: 0.3725, Validation Loss: 0.3942


KeyboardInterrupt: 

In [162]:
best_params = study.best_params
print("Best hyperparameters:", best_params)


Best hyperparameters: {'hidden_size': 433, 'num_layers': 2, 'dropout_rate': 0.5442089330486015, 'learning_rate': 0.006091997080097445, 'weight_decay': 1.4045570090893667e-05}


In [73]:
best_hidden_size = 479  # 433
best_num_layers = 3 #2
best_dropout_rate = 0.7079701185167607  # 0.5442089330486015
best_learning_rate = 4.00050004590533e-05  # 0.006091997080097445
#best_weight_decay = #1.4045570090893667e-05



In [75]:
model = SimpleGRU(input_size, best_hidden_size,
                  best_num_layers, best_dropout_rate, device).to(device)


criterion = nn.MSELoss()

optimizer = optim.Adam(
    model.parameters(), lr=best_learning_rate) #, weight_decay=best_weight_decay)




In [76]:
patience = 30
min_delta = 0.001
num_epochs = 100

best_validation_loss = float("inf")
counter = 0


In [77]:
for epoch in range(1, num_epochs+1):
     # Training
    model.train()
    train_losses = []
    for i, (fps, labels) in enumerate(train_loader):
            inputs = fps.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)

            mask = labels != 0
            masked_outputs = outputs[mask]
            masked_labels = labels[mask]
            # loss = criterion(outputs, labels)
            loss = criterion(masked_outputs, masked_labels)
            train_losses.append(loss.item())

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    train_loss = np.mean(train_losses)

        # Validation
    model.eval()
    val_losses = []
    with torch.no_grad():
            for fps, labels in val_loader:
                inputs = fps.to(device)
                labels = labels.to(device)
                outputs = model(inputs)
                mask = labels != 0
                masked_outputs = outputs[mask]
                masked_labels = labels[mask]
                loss = criterion(masked_outputs, masked_labels)
                val_losses.append(loss.item())

    val_loss = np.mean(val_losses)

    print(
            f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

    if val_loss < best_validation_loss - min_delta:
        best_validation_loss = val_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break


Epoch [2/100], Train Loss: 0.9229, Validation Loss: 0.7500
Epoch [3/100], Train Loss: 0.5646, Validation Loss: 0.4794
Epoch [4/100], Train Loss: 0.4857, Validation Loss: 0.4577
Epoch [5/100], Train Loss: 0.4697, Validation Loss: 0.4462
Epoch [6/100], Train Loss: 0.4564, Validation Loss: 0.4379
Epoch [7/100], Train Loss: 0.4501, Validation Loss: 0.4305
Epoch [8/100], Train Loss: 0.4371, Validation Loss: 0.4245
Epoch [9/100], Train Loss: 0.4323, Validation Loss: 0.4196
Epoch [10/100], Train Loss: 0.4261, Validation Loss: 0.4153
Epoch [11/100], Train Loss: 0.4179, Validation Loss: 0.4118
Epoch [12/100], Train Loss: 0.4108, Validation Loss: 0.4083
Epoch [13/100], Train Loss: 0.4085, Validation Loss: 0.4059
Epoch [14/100], Train Loss: 0.4043, Validation Loss: 0.4025
Epoch [15/100], Train Loss: 0.3979, Validation Loss: 0.3999
Epoch [16/100], Train Loss: 0.3914, Validation Loss: 0.3959
Epoch [17/100], Train Loss: 0.3845, Validation Loss: 0.3917
Epoch [18/100], Train Loss: 0.3814, Validation L

In [54]:
patience = 30
min_delta = 0.001
num_epochs = 100 

best_validation_loss = float("inf")
counter = 0

for epoch in range(num_epochs):
    # Training loop
    model.train()
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    with torch.no_grad():
        valid_loss = 0
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()

        valid_loss /= len(val_loader)

    print(
        f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Validation Loss: {valid_loss:.4f}")
    
    if valid_loss < best_validation_loss - min_delta:
        best_validation_loss = valid_loss
        counter = 0
    else:
        counter += 1
        if counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break


Epoch [1/100], Train Loss: 0.1514, Validation Loss: 0.1304
Epoch [2/100], Train Loss: 0.1539, Validation Loss: 0.1314
Epoch [3/100], Train Loss: 0.1583, Validation Loss: 0.1339
Epoch [4/100], Train Loss: 0.1539, Validation Loss: 0.1319
Epoch [5/100], Train Loss: 0.1574, Validation Loss: 0.1320
Epoch [6/100], Train Loss: 0.1566, Validation Loss: 0.1322
Epoch [7/100], Train Loss: 0.1587, Validation Loss: 0.1335
Epoch [8/100], Train Loss: 0.1513, Validation Loss: 0.1340
Epoch [9/100], Train Loss: 0.1565, Validation Loss: 0.1340
Epoch [10/100], Train Loss: 0.1564, Validation Loss: 0.1351
Epoch [11/100], Train Loss: 0.1583, Validation Loss: 0.1354
Epoch [12/100], Train Loss: 0.1604, Validation Loss: 0.1362
Epoch [13/100], Train Loss: 0.1623, Validation Loss: 0.1388
Epoch [14/100], Train Loss: 0.1613, Validation Loss: 0.1372
Epoch [15/100], Train Loss: 0.1657, Validation Loss: 0.1376
Epoch [16/100], Train Loss: 0.1613, Validation Loss: 0.1366
Epoch [17/100], Train Loss: 0.1600, Validation Lo

In [78]:
test_data = pd.read_csv("smiles_test.csv", index_col=0).reset_index(drop=True)
test_data


Unnamed: 0,smiles
0,OC(COc1ccc(Cl)cc1)=N[C@H]1CC[C@H](N=C(O)COc2cc...
1,CCCO/N=C(/C)c1cc(C(O)=NC(Cc2cc(F)cc(F)c2)[C@@H...
2,COc1cc(Cl)ccc1Cl
3,COc1cc(C(O)=NCc2ccc(OCCN(C)C)cc2)cc(OC)c1OC
4,CCC(=O)O[C@@]1(C(=O)CCl)[C@@H](C)C[C@H]2[C@@H]...
...,...
5891,N#Cc1cc(NC(=O)C(=O)O)c(Cl)c(NC(=O)C(=O)O)c1.NC...
5892,O=c1cccc2n1C[C@@H]1CNC[C@H]2C1
5893,CSCC[C@H](N=C(O)[C@H](Cc1ccccc1)N=C(O)CN=C(O)C...
5894,CCn1cc2c3c(cc(C(O)=NC(Cc4ccccc4)[C@H](O)C[NH2+...


In [79]:
fp_length = 1024

test_fps = np.zeros((len(test_data), fp_length))
for i, smiles in enumerate(test_data['smiles']):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    Chem.DataStructs.ConvertToNumpyArray(fp_vec, arr)
    test_fps[i] = arr




In [80]:
model.eval()

x_test_tensor = torch.tensor(
    test_fps, dtype=torch.float).unsqueeze(1).to(device)


with torch.no_grad():
    test_preds = model(x_test_tensor)

# Convert the predictions back to numpy
test_preds_np = test_preds.cpu().numpy()
# Save the predictions to a CSV file
predictions_df = pd.DataFrame(test_preds_np, columns=[
                              "task1", "task2", "task3", "task4", "task5", "task6", "task7", "task8", "task9", "task10", "task11"])

predictions_df.to_csv("test_predictions_6.csv")


In [169]:
predictions_df


Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,-0.102386,-0.056833,-0.063949,-0.101871,0.017161,-0.100909,-0.248593,-0.153750,-0.438664,0.270830,-0.056658
1,1.022583,-0.046084,-0.003668,0.000650,0.037067,0.039401,0.006188,-0.006683,0.057424,0.013478,-0.021712
2,-0.001237,0.083306,0.081864,-0.066646,-0.088987,-0.024911,-0.985238,0.184275,-0.454025,0.019616,-0.130998
3,0.015700,0.071019,0.067633,-0.181052,-0.073512,-0.210930,-0.141984,-0.493230,-0.219230,0.077950,0.022317
4,0.027253,-0.030939,0.005519,-0.004438,0.032031,-0.012964,-0.191509,0.013752,0.424474,1.003188,-0.276660
...,...,...,...,...,...,...,...,...,...,...,...
5891,0.006398,0.000837,-0.129234,-0.000288,0.027554,0.096568,-0.800623,-0.342583,-0.462321,-0.270477,-0.021117
5892,-0.209650,0.097711,0.058675,0.038899,-0.113364,-0.182386,-0.001555,-0.642442,0.122331,0.111838,0.225398
5893,-0.155359,0.033007,0.112743,0.028786,-0.065611,-0.092091,0.039802,-0.684746,-0.020961,0.181185,-0.021307
5894,0.972080,0.011116,-0.037574,0.014832,0.002973,0.051173,-0.027172,-0.084397,0.005172,-0.189104,0.021973


In [170]:
score_data = pd.read_csv("sample_submission.csv",
                         index_col=0).reset_index(drop=True)
score_data

Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,0.965388,0.669021,0.364129,0.248534,0.082723,0.101662,0.894853,0.099291,0.931158,0.132221,0.617906
1,0.972610,0.986971,0.060073,0.286885,0.865854,0.805776,0.481583,0.715330,0.388927,0.998184,0.378946
2,0.366591,0.275695,0.063553,0.966171,0.442205,0.969089,0.509688,0.540241,0.441256,0.164225,0.070570
3,0.475604,0.490168,0.755998,0.477857,0.371955,0.947405,0.280805,0.872361,0.513712,0.570384,0.990165
4,0.034529,0.669413,0.480047,0.011377,0.747641,0.272674,0.322530,0.330088,0.929216,0.492997,0.496907
...,...,...,...,...,...,...,...,...,...,...,...
5891,0.841416,0.832933,0.144299,0.092632,0.860756,0.797975,0.407141,0.819184,0.808753,0.693338,0.253581
5892,0.634844,0.643848,0.698586,0.211566,0.791034,0.462967,0.498234,0.265715,0.171268,0.524664,0.046151
5893,0.161446,0.419693,0.310739,0.977375,0.632457,0.645635,0.952371,0.000913,0.391865,0.986964,0.953342
5894,0.630445,0.798230,0.842443,0.188696,0.407885,0.308575,0.523217,0.240382,0.564827,0.343042,0.005972


In [171]:
target = pd.read_csv("data_train.csv", index_col=0).reset_index(drop=True)
target = target.iloc[:5896, 1:]
target = (target + 1)/2
target[target == 0.5] = -1

target


Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
5891,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0
5892,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5893,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
5894,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [172]:
auc_per_task = []
for j in range(target.shape[1]):
    y_score = predictions_df.iloc[:, j]
    y_true = target.iloc[:, j]
    idx = (y_true != (-1))
    y_true_filtered = y_true[idx]
    y_score_filtered = y_score[idx]

    if len(np.unique(y_true_filtered)) >= 2:
        auc_per_task.append(roc_auc_score(y_true_filtered, y_score_filtered))
    else:
        auc_per_task.append(np.nan)

avg_auc = np.nanmean(auc_per_task)
print(avg_auc)


0.4993757843211411
