In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from wrapMordred import mordredWrapper

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import optuna
import joblib

In [2]:
np.random.seed(1234)

In [3]:
# endpoint = 'skin-sensitization'
endpoint = 'eye-irritation'

loc = r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data'
endpoint_loc = os.path.join(loc, endpoint)
model = r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\src\models'
model_loc = os.path.join(model, endpoint)

In [4]:
filename = 'train.csv'
df_train = pd.read_csv(os.path.join(endpoint_loc, filename))

In [5]:
df_train.shape

(3101, 2)

In [6]:
df_train.head()

Unnamed: 0,SMILES,Activity
0,CCOP(OC1=C(C=C(C=C1C)C(C)(C)C)C(C)(C)C)OC1=C(C...,0
1,CCCCCCCCCCCCCCCCSC1NC2=CC(=CC=C2N=1)S(O)(=O)=O,1
2,O=C(CC(=O)CC1=CC(F)=C(F)C=C1F)N1CC2=NN=C(N2CC1...,0
3,CC1C=C(N)N(N=1)C1C=CC=CC=1,0
4,CC1CCCC(C)(C)C=1C(=O)C=CC,0


In [7]:
filename = 'val.csv'
df_val = pd.read_csv(os.path.join(endpoint_loc, filename))

In [8]:
df_val.shape

(776, 2)

In [9]:
df_val.head()

Unnamed: 0,SMILES,Activity
0,COC1=CC=C(CC2CC2)C=C1,1
1,CC(C)OP(=O)(OC(C)C)SCC1C=CC=CC=1,1
2,NNC(N)=O,1
3,OS(=O)(=O)C1C=CC=C2C=C(C=C(NC3C=CC=CC=3)C2=1)N...,1
4,CCCC1COC(CC1)C1C=CC(O)=CC=1,1


In [10]:
train_smiles = df_train['SMILES'].to_numpy()
train_labels = df_train['Activity'].to_numpy()
val_smiles = df_val['SMILES'].to_numpy()
val_labels = df_val['Activity'].to_numpy()

In [11]:
print('train size smiles :', train_smiles.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_labels[train_labels == 1].shape)
print('neg samples in train size :', train_labels[train_labels == 0].shape)
print('val size smiles :', val_smiles.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_labels[val_labels == 1].shape)
print('neg samples in val size :', val_labels[val_labels == 0].shape)

train size smiles : (3101,)
train size labels : (3101,)
pos samples in train size : (2121,)
neg samples in train size : (980,)
val size smiles : (776,)
val size labels : (776,)
pos samples in val size : (531,)
neg samples in val size : (245,)


In [12]:
mordred = mordredWrapper(np.concatenate((train_smiles,val_smiles)))

[17:47:56] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[17:47:56] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 18 19 20 21 22 23
[17:47:56] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[17:47:56] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[17:47:56] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 15 16 17 19 20 21 25 27 28
[17:47:56] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
  1%|          | 21/3871 [00:02<05:34, 11.52it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|▎         | 131/3871 [00:05<02:29, 24.93it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 164/3871 [00:05<01:56, 31.80it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  6%|▌         | 235/3871 [00:07<01:07, 53.47it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  6%|▋         | 251/3871 [00:08<01:48, 33.27it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  8%|▊         | 303/3871 [00:09<02:00, 29.49it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 865/3871 [00:24<01:59, 25.08it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|███▏      | 1228/3871 [00:35<01:46, 24.75it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3871/3871 [02:10<00:00, 29.77it/s]


In [13]:
train_fingerprints, train_labels = mordred.get_fingerprints(train_smiles, train_labels)
val_fingerprints, val_labels = mordred.get_fingerprints(val_smiles, val_labels)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
[17:51:25] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[17:52:05] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 18 19 20 21 22 23
[17:53:11] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[17:53:31] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[17:53:39] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 15 16 17 19 20 21 25 27 28
[17:56:49] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15


In [14]:
print('train size fingerprints :', train_fingerprints.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_labels[train_labels == 1].shape)
print('neg samples in train size :', train_labels[train_labels == 0].shape)
print('val size fingerprints :', val_fingerprints.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_labels[val_labels == 1].shape)
print('neg samples in val size :', val_labels[val_labels == 0].shape)

train size fingerprints : (3096, 945)
train size labels : (3096,)
pos samples in train size : (2116,)
neg samples in train size : (980,)
val size fingerprints : (775, 945)
val size labels : (775,)
pos samples in val size : (530,)
neg samples in val size : (245,)


In [15]:
def objective(trial, xtrain, ytrain):
    n = trial.suggest_int('n_estimators', 50, 250)
    rf = RandomForestClassifier(n_estimators = n)

    scores = cross_validate(rf, xtrain, ytrain, cv=5, scoring='roc_auc')
    mean_roc = scores['test_score'].mean()

    return 1/(mean_roc + 1e-6)

In [16]:
study = optuna.create_study(study_name='rf_study_mordred', storage='sqlite:///rf_study_mordred.db')  # Create a new study.
study.optimize(lambda trial: objective(trial, train_fingerprints, train_labels), n_trials=20)  # Invoke optimization of the objective function.

[I 2024-01-30 17:58:00,914] A new study created in RDB with name: rf_study_mordred
[I 2024-01-30 17:58:12,520] Trial 0 finished with value: 1.2800709803278167 and parameters: {'n_estimators': 55}. Best is trial 0 with value: 1.2800709803278167.
[I 2024-01-30 17:58:44,051] Trial 1 finished with value: 1.2688421434964787 and parameters: {'n_estimators': 152}. Best is trial 1 with value: 1.2688421434964787.
[I 2024-01-30 17:59:01,291] Trial 2 finished with value: 1.274100012755968 and parameters: {'n_estimators': 85}. Best is trial 1 with value: 1.2688421434964787.
[I 2024-01-30 17:59:45,764] Trial 3 finished with value: 1.2654108499778598 and parameters: {'n_estimators': 226}. Best is trial 3 with value: 1.2654108499778598.
[I 2024-01-30 18:00:17,415] Trial 4 finished with value: 1.2681920028195828 and parameters: {'n_estimators': 162}. Best is trial 3 with value: 1.2654108499778598.
[I 2024-01-30 18:00:49,297] Trial 5 finished with value: 1.265993842611653 and parameters: {'n_estimators

In [17]:
study.best_params

{'n_estimators': 192}

In [18]:
rf = RandomForestClassifier(n_estimators = 192, random_state=1234)
rf.fit(train_fingerprints, train_labels)

In [19]:
# performing predictions on the test dataset 
y_pred = rf.predict(train_fingerprints)

In [20]:
print('Train accuracy = ', (y_pred == train_labels).sum()/len(train_labels))

Train accuracy =  0.9996770025839793


In [21]:
y_pred = rf.predict(val_fingerprints)

In [22]:
print('Val accuracy = ', (y_pred == val_labels).sum()/len(val_labels))

Val accuracy =  0.7651612903225806


In [23]:
tn, fp, fn, tp = confusion_matrix(y_pred, val_labels).ravel()

In [24]:
(tn, fp, fn, tp)

(113, 50, 132, 480)

In [25]:
ACC = (tp + tn)/(tp + tn + fn + fp)
SEN = tp/(tp + fn)
SPE = tn/(tn + fp)

In [26]:
print(f'Accuracy = {ACC}')
print(f'Sensitivity = {SEN}')
print(f'Specificity = {SPE}')

Accuracy = 0.7651612903225806
Sensitivity = 0.7843137254901961
Specificity = 0.6932515337423313


#### Saving model

In [27]:
model_name = 'rf-Mordred.joblib'
joblib.dump(rf, os.path.join(model_loc, model_name))

['D:\\School\\Semester3\\Seminar - Reproducibility\\seminar-toxicity\\src\\models\\eye-irritation\\rf-Mordred.joblib']