# Recursive Feature Elimination using XGB

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')

In [2]:
%run ./1_Helper_functions.ipynb

  **kwargs
  **kwargs


In [3]:
# Target protein label (interpolated into the cache file names used further down)
protein_name = 'cdk2'

# Pickled DataFrame holding the ensemble docking results.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
file_name = (
    '../4_Ensemble_docking_results/'
    'df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
)
df_dk_res = pd.read_pickle(file_name)

# Response variable: the 'activity' column
y = df_dk_res['activity'].values
# Feature matrix: every remaining column (docking scores)
X = df_dk_res.drop(columns = 'activity').values

### Set a Gradient Boosted Trees (GBT) model as the base estimator

In [4]:
from xgboost import XGBClassifier

# Hyperparameters of the gradient boosted trees base estimator.
# NOTE(review): 'alpha' is not a named XGBClassifier argument (the documented
# scikit-learn API name is `reg_alpha`); it is presumably forwarded through
# **kwargs -- confirm it is applied as intended.
hyparams = dict(
    subsample        = 0.5,
    n_estimators     = 200,
    max_depth        = 20,
    learning_rate    = 0.05,
    alpha            = 0.01,
    gamma            = 0.01,
    colsample_bytree = 0.5,
)

# Base estimator shared by the RFE and RFECV runs below
xgb = XGBClassifier(**hyparams)

### Recursive Feature Elimination to rank protein conformations 

- The feature ranking produced by this RFE run was used to rank the protein conformations.

In [5]:
from sklearn.feature_selection import RFE

In [6]:
# RFE_wrapper lets us run the recursive feature elimination (RFE)
# and save the results to `filename`
# (the recorded output of this cell reports that the file was loaded).
xgb_slector_ranks = RFE_wrapper(
    estimator = xgb,
    X = X,
    y = y,
    n_features_to_select = 1,
    step = 1,
    verbose = 2,
    filename = f'./cachedir/rfe_selectors/RFE_xgb_{protein_name}.joblib'
)

File loaded: ./cachedir/rfe_selectors/RFE_xgb_cdk2.joblib


In [7]:
# Show the rank assigned to each feature (one per docking-score column) by the RFE run.
# NOTE(review): presumably a fitted sklearn RFE, where rank 1 marks the
# selected feature -- confirm against RFE_wrapper.
xgb_slector_ranks.ranking_

array([123, 227, 277, 105, 319, 119,  70, 146, 161, 133,  94, 396,  27,
       293, 343,  61, 131, 297, 308, 128,  73, 356, 148, 393, 142, 151,
       326, 216,  91, 248, 113, 189, 118, 314, 190, 269, 108, 310, 249,
       364,  36, 338, 125,  55,  17, 402, 299, 226, 271,  85, 309, 177,
       153,  93,  98,  75, 349, 273, 260, 112, 186, 211, 306,  97,  22,
       362,  50, 187,  23, 259, 115, 195, 240, 221,  44, 228, 238,  40,
       307, 191, 192, 231, 178, 160, 291,  32, 132,  89, 374, 209, 156,
       224, 104, 173, 168, 109, 136, 111, 274, 253, 255, 246, 210,  81,
       230, 201, 233, 179, 371, 316,  80, 245, 158, 311, 335, 262, 182,
       281, 317, 251, 286, 169,   5,  49,  34, 196,   7, 302, 235, 165,
       348, 263, 166, 154, 399, 137,  28, 124, 193,   2, 205, 330, 337,
       351, 324, 354, 358, 219, 239,  45, 176, 303, 385,  11,  65, 318,
       340, 252, 171, 398, 143,  88, 298, 295, 203, 222, 236,  18, 296,
       368, 301, 223, 101, 167, 140, 242, 350, 339, 315,   9, 37

### Instantiate the RFE with Cross Validation

In [8]:
from sklearn.feature_selection import RFECV

In [9]:
# REFCV_wrapper lets us run the RFE with cross validation (RFECV)
# and save the results to `filename`
# (the recorded output of this cell reports that the file was loaded).
xgb_selector = REFCV_wrapper(
    estimator = xgb,
    X_train = X,
    y_train = y,
    min_features_to_select = 1,
    step = 2,
    cv = 4,
    scoring = 'roc_auc',
    n_jobs = 4,
    verbose = 2,
    filename = f'./cachedir/rfe_selectors/RFECV_xgb_{protein_name}.joblib'
)

File loaded: ./cachedir/rfe_selectors/RFECV_xgb_cdk2.joblib


In [10]:
# Print the RFECV rank of every conformation, followed by
# the number of conformations that received rank 1
print('Conformations ranking (RFECV):',
      xgb_selector.ranking_,
      sep = '\n')
print('Number of conformations with rank = 1:',
      (xgb_selector.ranking_ == 1).sum(),
      sep = '\n')

Conformations ranking (RFECV):
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  4  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 12  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  9  1  6  1  1 11
  1  1  1  1  3  1  1  1  5  1  1  1  1  1  1  1  1  1  3  1  1  7  1  1
  1  1  1  1  1  1  1  1  1  1  2  1  1  1  1  1  1  1  1  1 10  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1 11  1  1  1  1  9  1 12  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1 10  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  8  1  1  1  1  1  1  1  1  1  1  1  1  8  1
  7  1  1  1  1  1  1  5  1  1  1  1  1  1  2  1  1  1  1  1  1  1  1  1
  1  1  1  6  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  