In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from wrapMordred import mordredWrapper

from sklearn import svm
from sklearn.metrics import confusion_matrix
import optuna
import joblib

In [2]:
np.random.seed(1234)

In [3]:
# endpoint = 'skin-sensitization'
endpoint = 'eye-irritation'

loc = r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data'
endpoint_loc = os.path.join(loc, endpoint)
model = r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\src\models'
model_loc = os.path.join(model, endpoint)

In [4]:
filename = 'train.csv'
df_train = pd.read_csv(os.path.join(endpoint_loc, filename))

In [5]:
df_train.shape

(3101, 2)

In [6]:
df_train.head()

Unnamed: 0,SMILES,Activity
0,CCOP(OC1=C(C=C(C=C1C)C(C)(C)C)C(C)(C)C)OC1=C(C...,0
1,CCCCCCCCCCCCCCCCSC1NC2=CC(=CC=C2N=1)S(O)(=O)=O,1
2,O=C(CC(=O)CC1=CC(F)=C(F)C=C1F)N1CC2=NN=C(N2CC1...,0
3,CC1C=C(N)N(N=1)C1C=CC=CC=1,0
4,CC1CCCC(C)(C)C=1C(=O)C=CC,0


In [7]:
filename = 'val.csv'
df_val = pd.read_csv(os.path.join(endpoint_loc, filename))

In [8]:
df_val.shape

(776, 2)

In [9]:
df_val.head()

Unnamed: 0,SMILES,Activity
0,COC1=CC=C(CC2CC2)C=C1,1
1,CC(C)OP(=O)(OC(C)C)SCC1C=CC=CC=1,1
2,NNC(N)=O,1
3,OS(=O)(=O)C1C=CC=C2C=C(C=C(NC3C=CC=CC=3)C2=1)N...,1
4,CCCC1COC(CC1)C1C=CC(O)=CC=1,1


In [10]:
train_smiles = df_train['SMILES'].to_numpy()
train_labels = df_train['Activity'].to_numpy()
val_smiles = df_val['SMILES'].to_numpy()
val_labels = df_val['Activity'].to_numpy()

In [11]:
print('train size smiles :', train_smiles.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_labels[train_labels == 1].shape)
print('neg samples in train size :', train_labels[train_labels == 0].shape)
print('val size smiles :', val_smiles.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_labels[val_labels == 1].shape)
print('neg samples in val size :', val_labels[val_labels == 0].shape)

train size smiles : (3101,)
train size labels : (3101,)
pos samples in train size : (2121,)
neg samples in train size : (980,)
val size smiles : (776,)
val size labels : (776,)
pos samples in val size : (531,)
neg samples in val size : (245,)


In [12]:
mordred = mordredWrapper(np.concatenate((train_smiles,val_smiles)))

[14:48:42] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[14:48:42] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 18 19 20 21 22 23
[14:48:42] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[14:48:42] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[14:48:42] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 15 16 17 19 20 21 25 27 28
[14:48:42] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15
  0%|          | 18/3871 [00:02<06:15, 10.26it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  3%|▎         | 131/3871 [00:06<03:01, 20.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  4%|▍         | 164/3871 [00:07<02:25, 25.47it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  6%|▋         | 250/3871 [00:10<02:22, 25.43it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  7%|▋         | 271/3871 [00:10<01:49, 32.78it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 861/3871 [00:31<03:17, 15.27it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 32%|███▏      | 1228/3871 [00:46<02:08, 20.57it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 47%|████▋     | 1802/3871 [01:13<02:21, 14.57it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 3871/3871 [02:46<00:00, 23.31it/s]


In [13]:
train_fingerprints, train_labels = mordred.get_fingerprints(train_smiles, train_labels)
val_fingerprints, val_labels = mordred.get_fingerprints(val_smiles, val_labels)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
[14:53:08] Can't kekulize mol.  Unkekulized atoms: 0 1 2 3 4
[14:53:56] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 14 15 16 18 19 20 21 22 23
[14:55:13] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[14:55:35] Can't kekulize mol.  Unkekulized atoms: 5 6 7 9 10 11 12 13 14 15 16 17 18
[14:55:45] Can't kekulize mol.  Unkekulized atoms: 10 11 12 13 15 16 17 19 20 21 25 27 28
[14:58:53] Can't kekulize mol.  Unkekulized atoms: 1 2 3 4 5 6 7 9 10 11 13 14 15


In [14]:
print('train size fingerprints :', train_fingerprints.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_labels[train_labels == 1].shape)
print('neg samples in train size :', train_labels[train_labels == 0].shape)
print('val size fingerprints :', val_fingerprints.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_labels[val_labels == 1].shape)
print('neg samples in val size :', val_labels[val_labels == 0].shape)

train size fingerprints : (3096, 945)
train size labels : (3096,)
pos samples in train size : (2116,)
neg samples in train size : (980,)
val size fingerprints : (775, 945)
val size labels : (775,)
pos samples in val size : (530,)
neg samples in val size : (245,)


In [15]:
# def objective(trial, xtrain, ytrain):
#     n = trial.suggest_int('n_estimators', 50, 250)
#     rf = RandomForestClassifier(n_estimators = n)

#     scores = cross_validate(rf, xtrain, ytrain, cv=5, scoring='roc_auc')
#     mean_roc = scores['test_score'].mean()

#     return 1/(mean_roc + 1e-6)

In [16]:
# study = optuna.create_study(study_name='rf_study_mordred', storage='sqlite:///rf_study_mordred.db')  # Create a new study.
# study.optimize(lambda trial: objective(trial, train_fingerprints, train_labels), n_trials=20)  # Invoke optimization of the objective function.

In [17]:
# study.best_params

In [18]:
clf = svm.SVC()
clf.fit(train_fingerprints, train_labels)

In [19]:
# performing predictions on the test dataset 
y_pred = clf.predict(train_fingerprints)

In [20]:
print('Train accuracy = ', (y_pred == train_labels).sum()/len(train_labels))

Train accuracy =  0.6847545219638242


In [21]:
y_pred = clf.predict(val_fingerprints)

In [22]:
print('Val accuracy = ', (y_pred == val_labels).sum()/len(val_labels))

Val accuracy =  0.6812903225806451


In [23]:
tn, fp, fn, tp = confusion_matrix(y_pred, val_labels).ravel()

In [24]:
(tn, fp, fn, tp)

(1, 3, 244, 527)

In [25]:
ACC = (tp + tn)/(tp + tn + fn + fp)
SEN = tp/(tp + fn)
SPE = tn/(tn + fp)

In [26]:
print(f'Accuracy = {ACC}')
print(f'Sensitivity = {SEN}')
print(f'Specificity = {SPE}')

Accuracy = 0.6812903225806451
Sensitivity = 0.6835278858625162
Specificity = 0.25


#### Saving model

In [27]:
model_name = 'svm-Mordred.joblib'
joblib.dump(clf, os.path.join(model_loc, model_name))

['D:\\School\\Semester3\\Seminar - Reproducibility\\seminar-toxicity\\src\\models\\eye-irritation\\svm-Mordred.joblib']