In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm

import pandas as pd
import numpy as np

import optuna


import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

from torch.utils.data import Dataset
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [2]:
data = pd.read_csv("data_train.csv", index_col=0).reset_index(drop=True)
data


Unnamed: 0,smiles,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,CC(=O)N(C)c1cccc(-c2ccnc3c(C(=O)c4cccs4)cnn23)c1,0,0,0,0,0,0,0,-1,0,0,0
1,COc1cc(N)c(Cl)cc1C(=O)OCCCN1CCCCC1.Cl,0,0,0,0,0,0,0,-1,0,0,0
2,CCCCNc1c(C(=O)OCC)cnc2c1cnn2CC,0,0,0,0,0,0,0,0,0,1,0
3,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1.Cl,0,0,0,0,0,0,0,-1,0,0,1
4,CC1OC2(CCCCC2Oc2cccc(Cl)c2)N=C1O,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CC(C)(C)NC[C@@H](O)COc1nsnc1N1CCOCC1,0,0,0,0,0,0,0,-1,0,0,0
11996,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...,0,0,0,0,0,0,0,-1,0,0,0
11997,N=C(O)c1cnc(C2CC2)[nH]1,0,0,0,-1,0,0,0,0,0,0,0
11998,CN=C=O,0,0,0,0,0,0,-1,0,0,0,0


In [25]:

y = data[data.columns[1:]].to_numpy()
y


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [4]:
fp_length = 1024
fps = np.zeros((len(data), fp_length))

# Calculate Morgan fingerprints and convert to numpy array
for i, smiles in enumerate(tqdm(data['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps[i] = arr


100%|██████████| 12000/12000 [00:06<00:00, 1790.61it/s]


In [5]:
print("fps shape:", fps.shape)
print("y shape:", y.shape)


fps shape: (12000, 1024)
y shape: (12000, 11)


In [6]:
X_train, X_val, Y_train, Y_val = train_test_split(
    fps, y, test_size=1/6, random_state=42)

In [165]:
def train_rf(X_train, y_train, X_test):
    seed = 1234
    n_tasks = y_train.shape[1]
    y_hats_proba = np.empty((X_test.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_proba)

    # Train RF per task
    for j in tqdm(range(n_tasks)):
        rf_model = RandomForestClassifier(n_estimators=1000,min_samples_split=5, random_state=seed)
        # Mask out unknown samples
        # True where label is 0 or 1, False for unknowns
        idx = (y_train[:, j] != (0))
        # Train model
        rf_model.fit(X_train[idx], y_train[idx, j])
        # Predict class probabilities (select only values for positiv class with index 1)
        y_hats_proba[:, j] = rf_model.predict_proba(X_test)[:, 1]
        # Predict class
        y_hats_class[:, j] = rf_model.predict(X_test)
    return y_hats_proba, y_hats_class


In [33]:
def calc_masked_AUC_per_task(prediction, target):
    auc_per_task = []
    for j in range(target.shape[1]):
                y_score = prediction[:, j]
                y_true = target[:, j]
                # Mask out unknown samples
                idx = (y_true != (0))
                # Calculate AUC per task
                auc_per_task.append(roc_auc_score(y_true[idx], y_score[idx]))
    return auc_per_task


In [34]:
def train_rf_with_grid_search(X_train, y_train):
    seed = 1234
    n_tasks = y_train.shape[1]
    y_hats_proba = np.empty((X_val.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_proba)

    best_params_list = []

    for j in tqdm(range(n_tasks)):
        # Define the classifier with the search space
        rf_model = RandomForestClassifier(random_state=seed)
        param_grid = {
            'n_estimators': [1000],
            'max_depth': [10, 20, 30, 40, 50, None],
            'min_samples_split': [2, 5, 10, 15, 20],
            'min_samples_leaf': [1, 2, 4, 6, 8, 10],
        }

        # Mask out unknown samples
        idx = (y_train[:, j] != (0))

        # Perform grid search with cross-validation
        grid_search = GridSearchCV(
            rf_model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
        grid_search.fit(X_train[idx], y_train[idx, j])

        # Train the model with the best parameters
        best_rf_model = grid_search.best_estimator_
        best_rf_model.fit(X_train[idx], y_train[idx, j])

        # Predict class probabilities (select only values for positive class with index 1)
        y_hats_proba[:, j] = best_rf_model.predict_proba(X_val)[:, 1]
        # Predict class
        y_hats_class[:, j] = best_rf_model.predict(X_val)

        print(f"Task {j} - Best parameters: {grid_search.best_params_}")
        best_params_list.append(grid_search.best_params_)

    return y_hats_proba, y_hats_class, best_params_list


y_hats_proba, y_hats_class, best_params_list = train_rf_with_grid_search(
    X_train, Y_train)

auc_per_task = calc_masked_AUC_per_task(y_hats_proba, Y_val)
print("Mean AUC:", np.mean(auc_per_task))


  9%|▉         | 1/11 [02:36<26:01, 156.15s/it]

Task 0 - Best parameters: {'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 1000}


 18%|█▊        | 2/11 [04:19<18:43, 124.85s/it]

Task 1 - Best parameters: {'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}


 27%|██▋       | 3/11 [06:51<18:20, 137.60s/it]

Task 2 - Best parameters: {'max_depth': 30, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 1000}


 36%|███▋      | 4/11 [08:16<13:37, 116.79s/it]

Task 3 - Best parameters: {'max_depth': 20, 'min_samples_leaf': 8, 'min_samples_split': 20, 'n_estimators': 1000}


 45%|████▌     | 5/11 [09:53<10:56, 109.41s/it]

Task 4 - Best parameters: {'max_depth': 20, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 1000}


 55%|█████▍    | 6/11 [12:16<10:04, 120.92s/it]

Task 5 - Best parameters: {'max_depth': 20, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 1000}


 64%|██████▎   | 7/11 [52:41<58:17, 874.26s/it]

Task 6 - Best parameters: {'max_depth': 40, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 1000}


 73%|███████▎  | 8/11 [1:16:24<52:27, 1049.05s/it]

Task 7 - Best parameters: {'max_depth': 30, 'min_samples_leaf': 6, 'min_samples_split': 2, 'n_estimators': 1000}


 82%|████████▏ | 9/11 [1:40:47<39:16, 1178.34s/it]

Task 8 - Best parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}


 91%|█████████ | 10/11 [1:44:26<14:42, 882.15s/it]

Task 9 - Best parameters: {'max_depth': 30, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 1000}


100%|██████████| 11/11 [1:46:36<00:00, 581.49s/it]

Task 10 - Best parameters: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 1000}
Mean AUC: 0.7454036210982008





In [166]:
y_hats_proba, y_hats_class = train_rf(X_train, Y_train, X_val)


100%|██████████| 11/11 [01:40<00:00,  9.16s/it]


In [167]:
def calc_masked_AUC_per_task(prediction, target):
    auc_per_task = []
    for j in range(target.shape[1]):
        y_score = prediction[:, j]
        y_true = target[:, j]
        # Mask out unknown samples
        idx = (y_true != (0))
        # Calculate AUC per task
        auc_per_task.append(roc_auc_score(y_true[idx], y_score[idx]))
    return auc_per_task

auc_per_task = calc_masked_AUC_per_task(y_hats_proba, Y_val)

np.mean(auc_per_task)


0.744468769502772

In [None]:
1000: 0.741506020607747 



In [36]:
test_data = pd.read_csv("smiles_test.csv", index_col=0).reset_index(drop=True)
test_data


Unnamed: 0,smiles
0,OC(COc1ccc(Cl)cc1)=N[C@H]1CC[C@H](N=C(O)COc2cc...
1,CCCO/N=C(/C)c1cc(C(O)=NC(Cc2cc(F)cc(F)c2)[C@@H...
2,COc1cc(Cl)ccc1Cl
3,COc1cc(C(O)=NCc2ccc(OCCN(C)C)cc2)cc(OC)c1OC
4,CCC(=O)O[C@@]1(C(=O)CCl)[C@@H](C)C[C@H]2[C@@H]...
...,...
5891,N#Cc1cc(NC(=O)C(=O)O)c(Cl)c(NC(=O)C(=O)O)c1.NC...
5892,O=c1cccc2n1C[C@@H]1CNC[C@H]2C1
5893,CSCC[C@H](N=C(O)[C@H](Cc1ccccc1)N=C(O)CN=C(O)C...
5894,CCn1cc2c3c(cc(C(O)=NC(Cc4ccccc4)[C@H](O)C[NH2+...


In [37]:
fp_length = 1024

test_fps = np.zeros((len(test_data), fp_length))
for i, smiles in enumerate(test_data['smiles']):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    Chem.DataStructs.ConvertToNumpyArray(fp_vec, arr)
    test_fps[i] = arr




In [38]:
y_hats_proba, y_hats_class = train_rf_with_best_params(
    X_train, Y_train, test_fps, best_params_list)


In [142]:
y_hats_proba, y_hats_class = train_rf(X_train, Y_train, test_fps)

100%|██████████| 11/11 [02:03<00:00, 11.24s/it]


In [39]:
predictions = pd.DataFrame(y_hats_proba, columns=data.columns[1:])
predictions.index.name = 'ID'
predictions.reset_index(inplace=True)


In [40]:
predictions.to_csv('predictions_rf_search_2.csv', index=False)


In [146]:
def calc_masked_AUC_per_task(prediction, target):
    auc_per_task = []
    for j in range(target.shape[1]):
        y_score = prediction[:, j]
        y_true = target[:, j]
        # Mask out unknown samples
        idx = (y_true != (0))
        # Calculate AUC per task
        auc_per_task.append(roc_auc_score(y_true[idx], y_score[idx]))
    return auc_per_task


auc_per_task = calc_masked_AUC_per_task(y_hats_proba, Y_val)
auc_per_task

np.mean(auc_per_task)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 5896 but corresponding boolean dimension is 2000

In [170]:
score_data = pd.read_csv("sample_submission.csv",
                         index_col=0).reset_index(drop=True)
score_data

Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,0.965388,0.669021,0.364129,0.248534,0.082723,0.101662,0.894853,0.099291,0.931158,0.132221,0.617906
1,0.972610,0.986971,0.060073,0.286885,0.865854,0.805776,0.481583,0.715330,0.388927,0.998184,0.378946
2,0.366591,0.275695,0.063553,0.966171,0.442205,0.969089,0.509688,0.540241,0.441256,0.164225,0.070570
3,0.475604,0.490168,0.755998,0.477857,0.371955,0.947405,0.280805,0.872361,0.513712,0.570384,0.990165
4,0.034529,0.669413,0.480047,0.011377,0.747641,0.272674,0.322530,0.330088,0.929216,0.492997,0.496907
...,...,...,...,...,...,...,...,...,...,...,...
5891,0.841416,0.832933,0.144299,0.092632,0.860756,0.797975,0.407141,0.819184,0.808753,0.693338,0.253581
5892,0.634844,0.643848,0.698586,0.211566,0.791034,0.462967,0.498234,0.265715,0.171268,0.524664,0.046151
5893,0.161446,0.419693,0.310739,0.977375,0.632457,0.645635,0.952371,0.000913,0.391865,0.986964,0.953342
5894,0.630445,0.798230,0.842443,0.188696,0.407885,0.308575,0.523217,0.240382,0.564827,0.343042,0.005972


In [171]:
target = pd.read_csv("data_train.csv", index_col=0).reset_index(drop=True)
target = target.iloc[:5896, 1:]
target = (target + 1)/2
target[target == 0.5] = -1

target


Unnamed: 0,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...
5891,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0
5892,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
5893,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0
5894,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [172]:
auc_per_task = []
for j in range(target.shape[1]):
    y_score = predictions_df.iloc[:, j]
    y_true = target.iloc[:, j]
    idx = (y_true != (0))
    y_true_filtered = y_true[idx]
    y_score_filtered = y_score[idx]

    if len(np.unique(y_true_filtered)) >= 2:
        auc_per_task.append(roc_auc_score(y_true_filtered, y_score_filtered))
    else:
        auc_per_task.append(np.nan)

avg_auc = np.nanmean(auc_per_task)
print(avg_auc)


0.4993757843211411
