In [None]:
import numpy as np
import pandas as pd
import os

from rdkit import Chem
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator
from rdkit import DataStructs

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import optuna

In [None]:
# Seed NumPy's global random number generator with a fixed value so that
# code drawing from that global generator behaves the same on every run.
np.random.seed(1234)

In [None]:
# Endpoint to model; switch by (un)commenting.
# endpoint = 'skin-sensitization'
endpoint = 'eye-irritation'

# Base directories. The defaults are the original machine-specific locations;
# set TOXICITY_DATA_DIR / TOXICITY_MODEL_DIR to run the notebook elsewhere
# without editing this cell.
loc = os.environ.get(
    'TOXICITY_DATA_DIR',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\data',
)
endpoint_loc = os.path.join(loc, endpoint)
model = os.environ.get(
    'TOXICITY_MODEL_DIR',
    r'D:\School\Semester3\Seminar - Reproducibility\seminar-toxicity\src\models',
)
model_loc = os.path.join(model, endpoint)

In [None]:
# Read train.csv from the selected endpoint's data directory into a DataFrame.
filename = 'train.csv'
df_train = pd.read_csv(os.path.join(endpoint_loc, filename))

In [None]:
# (rows, columns) of the frame read from train.csv.
df_train.shape

In [None]:
# Preview the first rows of the frame read from train.csv.
df_train.head()

In [None]:
# Read val.csv from the selected endpoint's data directory into a DataFrame.
# Note: this reuses (overwrites) the `filename` variable.
filename = 'val.csv'
df_val = pd.read_csv(os.path.join(endpoint_loc, filename))

In [None]:
# (rows, columns) of the frame read from val.csv.
df_val.shape

In [None]:
# Preview the first rows of the frame read from val.csv.
df_val.head()

In [None]:
# Pull the SMILES strings and the activity labels out of each frame as
# NumPy arrays (one array per column, in row order).
train_smiles, train_labels = (
    df_train[column].to_numpy() for column in ('SMILES', 'Activity')
)
val_smiles, val_labels = (
    df_val[column].to_numpy() for column in ('SMILES', 'Activity')
)

In [None]:
# Report the array shapes of the raw SMILES/labels and how many labels equal
# 1 ("pos") and 0 ("neg") in each split.
print('train size smiles :', train_smiles.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_labels[train_labels == 1].shape)
print('neg samples in train size :', train_labels[train_labels == 0].shape)
print('val size smiles :', val_smiles.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_labels[val_labels == 1].shape)
print('neg samples in val size :', val_labels[val_labels == 0].shape)

In [None]:
def get_MAACS(smiles_array, labels):
    """Compute MACCS-key fingerprints for every parsable SMILES string.

    Parameters
    ----------
    smiles_array : iterable of str
        SMILES strings, one per molecule.
    labels : iterable
        Labels aligned one-to-one with ``smiles_array``.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        The fingerprints (one entry per molecule that RDKit could parse) and
        the labels of those same molecules, both in input order.

    Raises
    ------
    ValueError
        If ``smiles_array`` and ``labels`` do not have the same length.
        Without this check ``zip`` would silently truncate to the shorter
        input and could pair molecules with the wrong labels.
    """
    import warnings  # local import: only needed to report skipped entries

    smiles_list = list(smiles_array)
    label_list = list(labels)
    if len(smiles_list) != len(label_list):
        raise ValueError(
            'smiles_array and labels must have the same length, got '
            f'{len(smiles_list)} and {len(label_list)}'
        )

    fps = []
    y = []
    n_skipped = 0
    for smiles, label in zip(smiles_list, label_list):
        # Non-string entries (e.g. NaN from an empty CSV cell) cannot be
        # parsed; treat them like any other invalid SMILES.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            n_skipped += 1
            continue
        fps.append(np.array(MACCSkeys.GenMACCSKeys(mol)))
        y.append(label)

    if n_skipped:
        # Make the data loss visible instead of dropping rows silently.
        warnings.warn(
            f'get_MAACS skipped {n_skipped} of {len(smiles_list)} entries '
            'that could not be parsed as SMILES'
        )

    return np.array(fps), np.array(y)

In [None]:
# Take the labels straight from the DataFrames rather than from
# `train_labels` / `val_labels`: this cell overwrites those names with the
# filtered labels, so feeding them back in on a re-run would pair the full
# SMILES arrays with already-filtered (shorter, misaligned) label arrays.
train_fingerprints, train_labels = get_MAACS(train_smiles, df_train['Activity'].to_numpy())
val_fingerprints, val_labels = get_MAACS(val_smiles, df_val['Activity'].to_numpy())

In [None]:
# Report the shapes of the arrays returned by get_MAACS and how many of the
# retained labels equal 1 ("pos") and 0 ("neg") in each split.
print('train size fingerprints :', train_fingerprints.shape)
print('train size labels :', train_labels.shape)
print('pos samples in train size :', train_labels[train_labels == 1].shape)
print('neg samples in train size :', train_labels[train_labels == 0].shape)
print('val size fingerprints :', val_fingerprints.shape)
print('val size labels :', val_labels.shape)
print('pos samples in val size :', val_labels[val_labels == 1].shape)
print('neg samples in val size :', val_labels[val_labels == 0].shape)

In [None]:
def objective(trial, xtrain, ytrain, random_state=1234):
    """Optuna objective: score a random forest by 5-fold cross-validated ROC AUC.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators`` (integer in [2, 200]).
    xtrain : array-like
        Training features passed to ``cross_validate``.
    ytrain : array-like
        Training labels passed to ``cross_validate``.
    random_state : int or None, default 1234
        Seed for the forest, so that a given ``n_estimators`` always yields
        the same score and trials are comparable across runs. Pass ``None``
        to fall back to NumPy's global random state.

    Returns
    -------
    float
        ``1 / (mean_auc + 1e-6)``: a value that decreases as the mean AUC
        increases, so a minimizing study maximizes AUC. The small constant
        avoids division by zero.
    """
    n = trial.suggest_int('n_estimators', 2, 200)
    rf = RandomForestClassifier(n_estimators=n, random_state=random_state)

    scores = cross_validate(rf, xtrain, ytrain, cv=5, scoring='roc_auc')
    mean_roc = scores['test_score'].mean()

    return 1/(mean_roc + 1e-6)

In [None]:
# Total number of trials the study should contain.
N_TRIALS = 20

# The study is persisted in a local SQLite file. `load_if_exists=True` lets
# the cell be re-run (e.g. after a kernel restart) without failing because
# the study already exists; the seeded sampler makes the search repeatable.
study = optuna.create_study(
    study_name='rf_study',
    storage='sqlite:///rf_study.db',
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=1234),
    load_if_exists=True,
)

# Only run the trials that are still missing so re-runs do not keep growing
# the stored study.
n_remaining = max(0, N_TRIALS - len(study.trials))
if n_remaining:
    study.optimize(lambda trial: objective(trial, train_fingerprints, train_labels), n_trials=n_remaining)

In [None]:
# Parameters of the best trial recorded in the study.
study.best_params

In [None]:
# Train the final model with the tree count selected by the study instead of
# a hand-copied number, so the model always matches the tuning result for
# the currently selected endpoint.
rf = RandomForestClassifier(n_estimators=study.best_params['n_estimators'], random_state=1234)
rf.fit(train_fingerprints, train_labels)

In [None]:
# performing predictions on the training dataset
y_pred = rf.predict(train_fingerprints)

In [None]:
# Accuracy: fraction of entries where y_pred equals the training label.
print('Train accuracy = ', (y_pred == train_labels).sum()/len(train_labels))

In [None]:
# Predict on the validation fingerprints. This rebinds `y_pred`, replacing
# the training-set predictions stored under the same name.
y_pred = rf.predict(val_fingerprints)

In [None]:
# Accuracy: fraction of entries where y_pred equals the validation label.
print('Val accuracy = ', (y_pred == val_labels).sum()/len(val_labels))

In [None]:
# Confusion matrix of the validation predictions (true labels first, then
# predictions), replacing the placeholder literals that were left here.
confusion_matrix(val_labels, y_pred)