In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
from rdkit import DataStructs

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier 
import optuna

In [2]:
# Fix NumPy's global RNG seed so np.random-based draws repeat across runs.
np.random.seed(seed=1234)

In [3]:
# Endpoint to process; its name is used as the sub-directory under the data root.
# endpoint = 'skin-sensitization'
endpoint = 'eye-irritation'

# Data root: read from the SEMINAR_TOXICITY_DATA environment variable when it is set,
# so the notebook can run on other machines. The fallback is the original
# hard-coded location, which keeps existing runs unchanged.
loc = os.environ.get(
    'SEMINAR_TOXICITY_DATA',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data',
)
endpoint_loc = os.path.join(loc, endpoint)

In [4]:
# Read the full dataset for the selected endpoint from its directory.
filename = 'data.csv'
data_path = os.path.join(endpoint_loc, filename)
df = pd.read_csv(data_path)

In [5]:
# Peek at the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,CASRN,SMILES,Activity
0,51581-32-9,CN(C)C(=O)OC1C=CC=NC=1,1
1,35155-28-3,CN1C=C2CC3N(C)CC(CO)CC3(OC)C3C=CC=C1C=32,1
2,289-95-2,C1N=CC=CN=1,1
3,77-78-1,COS(=O)(=O)OC,1
4,80-73-9,CN1CCN(C)C1=O,1


In [6]:
# Class balance: total row count, then rows with Activity == 1 (pos) and == 0 (neg).
activity = df['Activity']
print('size of dataset:', len(df))
print('size of pos samples:', len(df[activity == 1]))
print('size of neg samples:', len(df[activity == 0]))

size of dataset: 3877
size of pos samples: 2652
size of neg samples: 1225


In [7]:
# Extract the SMILES column and the Activity column as NumPy arrays.
smiles, labels = df['SMILES'].to_numpy(), df['Activity'].to_numpy()

In [8]:
# Sanity check: display both array shapes side by side.
(smiles.shape, labels.shape)

((3877,), (3877,))

In [9]:
# Splitter configured for a single split that holds out 20% of the samples, with a fixed seed.
sss = StratifiedShuffleSplit(
    random_state=1234,
    test_size=0.2,
    n_splits=1,
)

In [10]:
# Iterate over the splitter's (train_index, test_index) pairs. The names bound on
# the last iteration remain in scope after the loop and are used by the next cell.
for i, split in enumerate(sss.split(smiles, labels)):
    train_index, test_index = split
    print(i)

0


In [11]:
# Slice the full arrays with the split indices; the "test" indices form the validation set.
train_smiles, train_labels = smiles[train_index], labels[train_index]
val_smiles, val_labels = smiles[test_index], labels[test_index]

In [12]:
# Report the shape of each partition and of its label-1 (pos) / label-0 (neg) subsets.
train_pos = train_labels[train_labels == 1]
train_neg = train_labels[train_labels == 0]
val_pos = val_labels[val_labels == 1]
val_neg = val_labels[val_labels == 0]

print('train size smiles :', train_smiles.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_pos.shape)
print('neg samples in train size :', train_neg.shape)
print('val size smiles :', val_smiles.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_pos.shape)
print('neg samples in val size :', val_neg.shape)

train size smiles : (3101,)
train size labels : (3101,)
pos samples in train size : (2121,)
neg samples in train size : (980,)
val size smiles : (776,)
val size labels : (776,)
pos samples in val size : (531,)
neg samples in val size : (245,)


In [13]:
# Build the training frame column-by-column so each column keeps its own dtype.
# Stacking the SMILES and label arrays into one 2-D array first (np.concatenate)
# forces a single common dtype onto both columns, which drops the numeric dtype
# of 'Activity'.
df_train = pd.DataFrame({'SMILES': train_smiles, 'Activity': train_labels})

In [14]:
# Peek at the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,SMILES,Activity
0,CCOP(OC1=C(C=C(C=C1C)C(C)(C)C)C(C)(C)C)OC1=C(C...,0
1,CCCCCCCCCCCCCCCCSC1NC2=CC(=CC=C2N=1)S(O)(=O)=O,1
2,O=C(CC(=O)CC1=CC(F)=C(F)C=C1F)N1CC2=NN=C(N2CC1...,0
3,CC1C=C(N)N(N=1)C1C=CC=CC=1,0
4,CC1CCCC(C)(C)C=1C(=O)C=CC,0


In [15]:
# Build the validation frame column-by-column so each column keeps its own dtype.
# Stacking the SMILES and label arrays into one 2-D array first (np.concatenate)
# forces a single common dtype onto both columns, which drops the numeric dtype
# of 'Activity'.
df_val = pd.DataFrame({'SMILES': val_smiles, 'Activity': val_labels})

In [16]:
# Peek at the first five rows of the validation frame.
df_val.head(n=5)

Unnamed: 0,SMILES,Activity
0,COC1=CC=C(CC2CC2)C=C1,1
1,CC(C)OP(=O)(OC(C)C)SCC1C=CC=CC=1,1
2,NNC(N)=O,1
3,OS(=O)(=O)C1C=CC=C2C=C(C=C(NC3C=CC=CC=3)C2=1)N...,1
4,CCCC1COC(CC1)C1C=CC(O)=CC=1,1


In [17]:
# Write the training partition into the endpoint directory, omitting the index column.
fname = 'train.csv'
train_csv_path = os.path.join(endpoint_loc, fname)
df_train.to_csv(train_csv_path, index=False)

In [18]:
# Write the validation partition into the endpoint directory, omitting the index column.
fname = 'val.csv'
val_csv_path = os.path.join(endpoint_loc, fname)
df_val.to_csv(val_csv_path, index=False)