In [4]:
import os
import numpy as np
import pandas as pd
from helper.load_dataset import load_bace_classification
from helper.preprocess import split_train_valid_test
from helper.features import smi_ecfp
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from hyperopt import hp, tpe, fmin, Trials, space_eval
from hyperopt.pyll import scope

In [8]:
# Load dataset
bace_class = load_bace_classification()

# Split dataset
train, valid, test = split_train_valid_test(bace_class)
merge = pd.concat((train, valid))

# Generate fingerprint
train_smis = train['SMILES']
valid_smis = valid['SMILES']
test_smis = test['SMILES']
merge_smis = merge['SMILES']
X_train = [smi_ecfp(smi) for smi in train_smis]
X_valid = [smi_ecfp(smi) for smi in valid_smis]
X_test = [smi_ecfp(smi) for smi in test_smis]
X_merge = [smi_ecfp(smi) for smi in merge_smis]


# Target defined
y_train = train['Class']
y_valid = valid['Class']
y_test = test['Class']
y_merge = merge['Class']

# Hyperparameters tuning with Hyperopt
trials = Trials()

rf_search_space = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 5, 100, 5)),
    'max_depth': scope.int(hp.quniform('max_depth', 2, 20, 1)),
    'max_features': scope.int(hp.quniform('max_features', 5, 150, 5)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 2, 20, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 20, 1))
}

def rf_objective(params):
    model = RandomForestClassifier(
        n_estimators=params['n_estimators'], 
        max_depth=params['max_depth'], 
        max_features=params['max_features'], 
        min_samples_leaf=params['min_samples_leaf'], 
        min_samples_split=params['min_samples_split'],
        n_jobs=1
    )
    model.fit(X_train, y_train)
    y_valid_hat = model.predict(X_valid)
    f1 = f1_score(y_valid, y_valid_hat)
    return -f1

best_rf_params = fmin(
    fn=rf_objective,
    space=rf_search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=trials
)

best_rf_params = space_eval(rf_search_space, best_rf_params)

model = RandomForestClassifier(**best_rf_params)
model.fit(X_merge, y_merge)
y_train_pred = model.predict(X_merge)
y_test_pred = model.predict(X_test)

roc_train = roc_auc_score(y_merge, y_train_pred)
roc_test = roc_auc_score(y_test, y_test_pred)

print('AUC-ROC scores: ')
print(f'Train: {roc_train}')
print(f'Test: {roc_test}')

100%|██████████| 20/20 [00:04<00:00,  4.05trial/s, best loss: -0.8352941176470589]
AUC-ROC scores: 
Train: 0.8658335840648153
Test: 0.8162770562770563
