In [2]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm

import pandas as pd
import numpy as np


import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from sklearn.metrics import roc_auc_score



import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


In [3]:
data = pd.read_csv("data_train.csv", index_col=0).reset_index(drop=True)
data


Unnamed: 0,smiles,task1,task2,task3,task4,task5,task6,task7,task8,task9,task10,task11
0,CC(=O)N(C)c1cccc(-c2ccnc3c(C(=O)c4cccs4)cnn23)c1,0,0,0,0,0,0,0,-1,0,0,0
1,COc1cc(N)c(Cl)cc1C(=O)OCCCN1CCCCC1.Cl,0,0,0,0,0,0,0,-1,0,0,0
2,CCCCNc1c(C(=O)OCC)cnc2c1cnn2CC,0,0,0,0,0,0,0,0,0,1,0
3,C#Cc1cccc(Nc2ncnc3cc(OCCOC)c(OCCOC)cc23)c1.Cl,0,0,0,0,0,0,0,-1,0,0,1
4,CC1OC2(CCCCC2Oc2cccc(Cl)c2)N=C1O,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
11995,CC(C)(C)NC[C@@H](O)COc1nsnc1N1CCOCC1,0,0,0,0,0,0,0,-1,0,0,0
11996,CCC[C@@]1(CCc2ccccc2)CC(O)=C([C@H](CC)c2cccc(N...,0,0,0,0,0,0,0,-1,0,0,0
11997,N=C(O)c1cnc(C2CC2)[nH]1,0,0,0,-1,0,0,0,0,0,0,0
11998,CN=C=O,0,0,0,0,0,0,-1,0,0,0,0


In [126]:
y = data[data.columns[1:]].to_numpy()
y


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [113]:
fp_length = 1024
fps = np.zeros((len(data), fp_length))

# Calculate Morgan fingerprints and convert to numpy array
for i, smiles in enumerate(tqdm(data['smiles'])):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_vec, arr)
    fps[i] = arr


100%|██████████| 12000/12000 [00:07<00:00, 1701.26it/s]


In [114]:
print("fps shape:", fps.shape)
print("y shape:", y.shape)


fps shape: (12000, 1024)
y shape: (12000, 11)


In [115]:
X_train, X_val, Y_train, Y_val = train_test_split(
    fps, y, test_size=1/6, random_state=42)

In [123]:
def train_rf(X_train, y_train, X_test):
    seed = 1234
    n_tasks = y_train.shape[1]
    y_hats_proba = np.empty((X_test.shape[0], n_tasks))
    y_hats_class = np.empty_like(y_hats_proba)

    # Train RF per task
    for j in tqdm(range(n_tasks)):
        rf_model = RandomForestClassifier(n_estimators=1000, random_state=seed)
        # Mask out unknown samples
        # True where label is 0 or 1, False for unknowns
        idx = (y_train[:, j] != (0))
        # Train model
        rf_model.fit(X_train[idx], y_train[idx, j])
        # Predict class probabilities (select only values for positiv class with index 1)
        y_hats_proba[:, j] = rf_model.predict_proba(X_test)[:, 1]
        # Predict class
        y_hats_class[:, j] = rf_model.predict(X_test)
    return y_hats_proba, y_hats_class


In [124]:
y_hats_proba, y_hats_class = train_rf(X_train, Y_train, X_val)


100%|██████████| 11/11 [01:45<00:00,  9.58s/it]


In [127]:
test_data = pd.read_csv("smiles_test.csv", index_col=0).reset_index(drop=True)
test_data


Unnamed: 0,smiles
0,OC(COc1ccc(Cl)cc1)=N[C@H]1CC[C@H](N=C(O)COc2cc...
1,CCCO/N=C(/C)c1cc(C(O)=NC(Cc2cc(F)cc(F)c2)[C@@H...
2,COc1cc(Cl)ccc1Cl
3,COc1cc(C(O)=NCc2ccc(OCCN(C)C)cc2)cc(OC)c1OC
4,CCC(=O)O[C@@]1(C(=O)CCl)[C@@H](C)C[C@H]2[C@@H]...
...,...
5891,N#Cc1cc(NC(=O)C(=O)O)c(Cl)c(NC(=O)C(=O)O)c1.NC...
5892,O=c1cccc2n1C[C@@H]1CNC[C@H]2C1
5893,CSCC[C@H](N=C(O)[C@H](Cc1ccccc1)N=C(O)CN=C(O)C...
5894,CCn1cc2c3c(cc(C(O)=NC(Cc4ccccc4)[C@H](O)C[NH2+...


In [128]:
fp_length = 1024

test_fps = np.zeros((len(test_data), fp_length))
for i, smiles in enumerate(test_data['smiles']):
    mol = Chem.MolFromSmiles(smiles)
    fp_vec = AllChem.GetMorganFingerprintAsBitVect(
        mol, radius=3, nBits=fp_length)
    arr = np.zeros((1,))
    Chem.DataStructs.ConvertToNumpyArray(fp_vec, arr)
    test_fps[i] = arr




In [142]:
y_hats_proba, y_hats_class = train_rf(X_train, Y_train, test_fps)

100%|██████████| 11/11 [02:03<00:00, 11.24s/it]


In [143]:
predictions = pd.DataFrame(y_hats_proba, columns=data.columns[1:])
predictions.index.name = 'ID'
predictions.reset_index(inplace=True)


In [145]:
predictions.to_csv('predictions_rf.csv', index=False)


In [146]:
def calc_masked_AUC_per_task(prediction, target):
    auc_per_task = []
    for j in range(target.shape[1]):
        y_score = prediction[:, j]
        y_true = target[:, j]
        # Mask out unknown samples
        idx = (y_true != (0))
        # Calculate AUC per task
        auc_per_task.append(roc_auc_score(y_true[idx], y_score[idx]))
    return auc_per_task


auc_per_task = calc_masked_AUC_per_task(y_hats_proba, Y_val)
auc_per_task

np.mean(auc_per_task)


IndexError: boolean index did not match indexed array along dimension 0; dimension is 5896 but corresponding boolean dimension is 2000