In [1]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
from rdkit import DataStructs

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier 
import optuna

In [2]:
# Seed NumPy's global random state so np.random-based draws repeat between runs.
np.random.seed(seed=1234)

In [3]:
# Endpoint to process; its name is also the sub-directory name under the data root.
endpoint = 'skin-sensitization'
# endpoint = 'eye-irritation'

# Root data directory. The original machine-specific location stays as the default so
# existing runs behave exactly as before, but it can now be overridden without editing
# the notebook by setting the SEMINAR_TOXICITY_DATA environment variable.
loc = os.environ.get(
    'SEMINAR_TOXICITY_DATA',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data',
)
endpoint_loc = os.path.join(loc, endpoint)

In [4]:
# Load the endpoint's full dataset from its directory.
filename = 'data.csv'
data_path = os.path.join(endpoint_loc, filename)
df = pd.read_csv(data_path)

In [5]:
# Preview the first five rows of the loaded data.
df.head(n=5)

Unnamed: 0,CASRN,SMILES,Activity
0,1655500-83-6,CC(CC=C(C)C)C1CC1(C)CO,1
1,2082-81-7,CC(=C)C(=O)OCCCCOC(=O)C(C)=C,1
2,75-33-2,CC(C)S,1
3,16958-92-2,CCCCCCCCCCCCCOC(=O)CCCCC(=O)OCCCCCCCCCCCCC,1
4,106-26-3,CC(C)=CCCC(C)=CC=O,1


In [6]:
# Report the total row count and how many rows carry each Activity label (1 / 0).
is_pos = df['Activity'] == 1
is_neg = df['Activity'] == 0
print('size of dataset:', len(df))
print('size of pos samples:', int(is_pos.sum()))
print('size of neg samples:', int(is_neg.sum()))

size of dataset: 3695
size of pos samples: 2021
size of neg samples: 1674


In [7]:
# Extract the SMILES strings and Activity labels as NumPy arrays for splitting.
smiles, labels = (df[column].to_numpy() for column in ('SMILES', 'Activity'))

In [8]:
# Sanity check: both arrays should have the same length.
(np.shape(smiles), np.shape(labels))

((3695,), (3695,))

In [9]:
# A single stratified shuffle split holding out 20% of the samples, with a fixed seed.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=1234,
)

In [10]:
# Run the splitter. The loop variables (train_index, test_index) deliberately stay in
# scope after the loop and are used by the following cells; with n_splits=1 the loop
# body executes exactly once.
split_iter = sss.split(smiles, labels)
for i, (train_index, test_index) in enumerate(split_iter):
    print(i)

0


In [11]:
# Materialise the train / validation partitions from the split indices.
train_smiles, train_labels = smiles[train_index], labels[train_index]
val_smiles, val_labels = smiles[test_index], labels[test_index]

In [12]:
# Per-class subsets, computed once so the report below reads cleanly.
train_pos = train_labels[train_labels == 1]
train_neg = train_labels[train_labels == 0]
val_pos = val_labels[val_labels == 1]
val_neg = val_labels[val_labels == 0]

# Training partition: sizes and class balance.
print('train size smiles :', train_smiles.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_pos.shape)
print('neg samples in train size :', train_neg.shape)

# Validation partition: sizes and class balance.
print('val size smiles :', val_smiles.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_pos.shape)
print('neg samples in val size :', val_neg.shape)

train size smiles : (2956,)
train size labels : (2956,)
pos samples in train size : (1617,)
neg samples in train size : (1339,)
val size smiles : (739,)
val size labels : (739,)
pos samples in val size : (404,)
neg samples in val size : (335,)


In [13]:
# Assemble the training table column-by-column. Building from a dict keeps each
# column's own dtype (integer Activity labels stay integers), whereas concatenating the
# string and integer arrays first forces a single common object dtype on both columns.
# Column names and order ('SMILES', 'Activity') are unchanged.
df_train = pd.DataFrame({'SMILES': train_smiles, 'Activity': train_labels})

In [14]:
# Preview the first five rows of the training table.
df_train.head(n=5)

Unnamed: 0,SMILES,Activity
0,CC(=O)CC(=O)NC1=CC(Cl)=CC=C1OC,0
1,CC1=NN2C(=NC3C=CC=CC=3C2=O)C1N=NC1C=CC(Cl)=CC=...,0
2,CCCCCCCCCCCCCCCCCCN(CCCCCCCCCCCCCCCCCC)C(=O)C1...,0
3,ClC1C=C(C=CC=1)OC1C=CC(Cl)=CC=1,0
4,OCCN1CCN(CCS(O)(=O)=O)CC1,0


In [15]:
# Assemble the validation table column-by-column. Building from a dict keeps each
# column's own dtype (integer Activity labels stay integers), whereas concatenating the
# string and integer arrays first forces a single common object dtype on both columns.
# Column names and order ('SMILES', 'Activity') are unchanged.
df_val = pd.DataFrame({'SMILES': val_smiles, 'Activity': val_labels})

In [16]:
# Preview the first five rows of the validation table.
df_val.head(n=5)

Unnamed: 0,SMILES,Activity
0,CCC1(CO)COCOC1,0
1,CC(=O)C(=CC1C=CC=CC=1[N+]([O-])=O)C(=O)OC,1
2,[O-][N+](=O)C1C(=NN2C=CC=CC2=1)OCCO,0
3,CCCCCCCCCCCC(=O)OC1CC(C)(C)NC(C)(C)C1,0
4,CC(C)(C)CC(C)CC(=O)OOC(C)(C)C,1


In [17]:
# Persist the training partition next to the source data, without the index column.
fname = 'train.csv'
train_path = os.path.join(endpoint_loc, fname)
df_train.to_csv(train_path, index=False)

In [18]:
# Persist the validation partition next to the source data, without the index column.
# Fix: this previously wrote df_train, which made val.csv a copy of the training set.
fname = 'val.csv'
df_val.to_csv(os.path.join(endpoint_loc, fname), index=False)