# Machine Learning Model Selection and Hyperparameter Optimization

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')
from helper_modules.plotting_metrics import PlotMetric

## Docking Results

In [2]:
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
file_name = '../4_Ensemble_docking_results/df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
df_dk_res = pd.read_pickle(file_name)

# Response variable: the 'activity' column
y = df_dk_res['activity']
# Feature matrix: every remaining column (the docking scores)
X = df_dk_res.drop(columns = ['activity'])

# Show the (rows, columns) of the feature matrix
X.shape

(6233, 136)

***
<h3 style='color: black; background-color: #F9E5AB; padding: 5px;'>
    Train-test split
</h3>

In [3]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [4]:
# Hold out 25% of the molecules as a test set. Stratifying on `y` keeps the
# same proportion of actives in both subsets; the fixed seed makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify     = y,
    test_size    = 0.25,
    random_state = 42,
)

In [5]:
# Verify stratified splitting: the ratio of actives (R_a) should be the
# same in the train and test subsets.
n_train_mols, n_test_mols       = len(y_train), len(y_test)
n_train_actives, n_test_actives = y_train.sum(), y_test.sum()

print(f'No. of molecules in train set: {n_train_mols} '
      f'with {n_train_actives} actives. '
      f'R_a = {n_train_actives / n_train_mols:.2f}')
print(f'No. of molecules in test set: {n_test_mols} '
      f'with {n_test_actives} actives. '
      f'R_a = {n_test_actives / n_test_mols:.2f}')

No. of molecules in train set: 4674 with 225 actives. R_a = 0.05
No. of molecules in test set: 1559 with 75 actives. R_a = 0.05


In [6]:
# Helper that reports the results of a fitted hyperparameter search (grid or randomized)
def GSCV_report(gs_fitted, X_train, X_test, y_train, y_test):
    """Print a short ROC-AUC summary for a fitted hyperparameter search.

    Parameters
    ----------
    gs_fitted : fitted search object
        Must expose `predict_proba`, `best_score_` and `best_params_`.
    X_train, X_test : feature matrices
        Passed to `gs_fitted.predict_proba`.
    y_train, y_test : true labels
        Compared against the predicted scores with `roc_auc_score`.

    Returns
    -------
    None
        The summary is only printed.
    """
    # The second `predict_proba` column is used as the ranking score
    proba_train = gs_fitted.predict_proba(X_train)
    proba_test  = gs_fitted.predict_proba(X_test)

    auc_rows = (
        ('- Mean CV ROC-AUC:\t', gs_fitted.best_score_),
        ('- Train ROC-AUC:  \t', roc_auc_score(y_train, proba_train[:, 1])),
        ('- Test ROC-AUC:   \t', roc_auc_score(y_test, proba_test[:, 1])),
    )
    chosen_params = gs_fitted.best_params_

    # Print results
    stars = '*' * 10
    print(stars, 'GRID SEARCH RESULTS', stars)
    for label, auc in auc_rows:
        print(f'{label}{auc:.3f}')
    print('- Best hyperparameters', chosen_params)
    print('*' * 42)
    print()

***
<h3 style='color: black; background-color: #F9E5AB; padding: 5px;'>
    Grid Search: Logistic Regression
</h3>

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
%%time
scoring          = 'roc_auc'
estimator_name   = 'LogisticRegression'
estimator        = LogisticRegression(
                         max_iter = 400
                   )
estimator_hyprms = {
    'C'       : np.geomspace(1e-6, 1e2, 5),
    'penalty' : ['l1', 'l2', None]
}

# Grid search
gs_lr = GridSearchCV(
        estimator    = estimator,
        param_grid   = estimator_hyprms,
        cv           = 5,
        scoring      = scoring,
        n_jobs       = 4
)

# Fit and evaluate the estimator
gs_lr.fit(X_train, y_train)

# Report the results
GSCV_report(gs_lr, 
            X_train, X_test, 
            y_train, y_test)

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.826
- Train ROC-AUC:  	0.870
- Test ROC-AUC:   	0.805
- Best hyperparameters {'C': 0.01, 'penalty': 'l2'}
******************************************

CPU times: user 1.71 s, sys: 120 ms, total: 1.83 s
Wall time: 6.08 s


In [9]:
# Hyperparameter combination selected by the logistic regression grid search
gs_lr.best_params_

{'C': 0.01, 'penalty': 'l2'}

In [10]:
# Logistic regression estimator configured with the selected hyperparameters
gs_lr.best_estimator_

LogisticRegression(C=0.01, max_iter=400)

***
<h3 style='color: black; background-color: #F9E5AB; padding: 5px;'>
    Randomized Search: Gradient Boosting Trees
</h3>

In [11]:
from xgboost import XGBClassifier

In [12]:
%%time
scoring          = 'roc_auc'
estimator_name   = 'XGB_tree'
estimator = XGBClassifier(use_label_encoder=False)

estimator_hyprms = {
   'n_estimators'    : [200, 300, 500],
   'max_depth'       : [3, 5, 10, 20],
   'gamma'           : [0.01, 0.1, 0.5, 1],
   'learning_rate'   : [0.05, 0.1],
   'subsample'       : [0.3, 0.5, 0.6],
   'alpha'           : [0.01, 0.1, 0.5, 1],
   'colsample_bytree': [0.3, 0.5, 1]
}

# Grid search
gs_gbt = RandomizedSearchCV(
  estimator             = estimator,
  param_distributions   = estimator_hyprms,
  cv                    = 5,
  scoring               = scoring,
  n_jobs                = 4,
  n_iter                = 50
)

# Fit and evaluate the estimator
gs_gbt.fit(X_train, y_train)

# Report the results
GSCV_report(gs_gbt, 
            X_train, X_test, 
            y_train, y_test)

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC:	0.857
- Train ROC-AUC:  	1.000
- Test ROC-AUC:   	0.855
- Best hyperparameters {'subsample': 0.5, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 'gamma': 1, 'colsample_bytree': 1, 'alpha': 0.5}
******************************************

CPU times: user 24.5 s, sys: 469 ms, total: 25 s
Wall time: 2min 18s


In [13]:
# Hyperparameter combination selected by the gradient boosting randomized search
gs_gbt.best_params_

{'subsample': 0.5,
 'n_estimators': 200,
 'max_depth': 10,
 'learning_rate': 0.1,
 'gamma': 1,
 'colsample_bytree': 1,
 'alpha': 0.5}

In [14]:
# XGBoost estimator configured with the selected hyperparameters
gs_gbt.best_estimator_

XGBClassifier(alpha=0.5, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0.5, reg_lambda=1, scale_pos_weight=1, subsample=0.5,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

Finished!