# Recursive Feature Elimination using XGB

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')

In [2]:
# Execute the shared helper notebook so the names it defines are loaded into
# this notebook's namespace.
# NOTE(review): RFE_wrapper and REFCV_wrapper, called further down, are not
# defined in this notebook -- presumably they come from this helper; confirm.
%run ../../helper_modules/Helper_functions_for_nRepeats_x_kCV.ipynb

In [3]:
# Target protein identifier; reused below to build the cached selector paths
protein_name = 'fxa'

# Location of the pickled docking-results table
file_name = (
    '../4_Ensemble_docking_results/'
    + 'df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
)
df_dk_res = pd.read_pickle(file_name)

# Response variable: the 'activity' column
y = df_dk_res['activity'].values
# Feature matrix (docking scores): every column except 'activity'
X = df_dk_res.drop(columns = 'activity').values

### Set a gradient boosted trees (GBT) model as the base estimator

In [4]:
from xgboost import XGBClassifier

# Hyperparameters for the XGBoost classifier that serves as the base
# estimator of the RFE / RFECV selectors below
hyparams = dict(
    subsample         = 0.5,
    n_estimators      = 200,
    max_depth         = 10,
    learning_rate     = 0.1,
    alpha             = 0.5,
    gamma             = 1,
    colsample_bytree  = 1,
    use_label_encoder = False,
)

# Unfitted estimator instance built from the settings above
xgb = XGBClassifier(**hyparams)

### Recursive Feature Elimination to rank protein conformations 

- This result was used to rank protein conformations.

In [5]:
from sklearn.feature_selection import RFE

In [6]:
# RFE_wrapper lets us run the RFE and save the results to a file.
# Features are removed one at a time (step = 1) until a single one
# is left (n_features_to_select = 1).
xgb_slector_ranks = RFE_wrapper(
    estimator            = xgb,
    X                    = X,
    y                    = y,
    n_features_to_select = 1,
    step                 = 1,
    verbose              = 2,
    filename             = f'./cachedir/rfe_selectors/RFE_xgb_{protein_name}.joblib',
)

File loaded: ./cachedir/rfe_selectors/RFE_xgb_fxa.joblib


In [7]:
# Display the rank the RFE run above assigned to each feature (column of X).
# NOTE(review): assumes RFE_wrapper returns a fitted scikit-learn RFE selector,
# whose `ranking_` holds one rank per feature with 1 marking a selected
# feature -- confirm against the helper notebook.
xgb_slector_ranks.ranking_

array([ 85,  42,  89,  33,  11,  84,  72,  48,  37,  41,   1,  31,  53,
        28,  64,  90,  30,  95,  44,  68, 105,  15,   2,  19,  52,  39,
        80,  60,  59,  69, 130,  14, 125,  56,  25,  99,  86,   8, 107,
        13,  24, 111,  78,  87, 118,  97,  77,  58,  62,  18,  55, 134,
        82, 114,  38,  61,  71,  20,  17,  40,   3,  96, 109,  74, 129,
       136, 110, 113,   6, 122, 106,  67,  98, 121, 108, 133,  75,   7,
         5,  81, 102, 128,  49, 112,  10,  35,  88,  29,   9, 104, 103,
        65, 126,  76,  46, 135, 127, 123,  73, 132, 124, 119, 115, 120,
        16,  21,  45,  51,  27,  93,  91, 101,  70,  36,  63,  92,  57,
       116, 117,  79,  26,  54, 131,  12,   4,  94,  47,  66,  23, 100,
        83,  43,  32,  22,  50,  34])

### Instantiate the RFE with Cross Validation

In [8]:
from sklearn.feature_selection import RFECV

In [9]:
# REFCV_wrapper lets us run the RFECV and save the results to a file.
# Uses 4-fold cross validation scored with ROC-AUC, dropping two
# features per step.
xgb_selector = REFCV_wrapper(
    estimator              = xgb,
    X_train                = X,
    y_train                = y,
    min_features_to_select = 1,
    step                   = 2,
    cv                     = 4,
    scoring                = 'roc_auc',
    n_jobs                 = 4,
    verbose                = 2,
    filename               = f'./cachedir/rfe_selectors/RFECV_xgb_{protein_name}.joblib',
)

File loaded: ./cachedir/rfe_selectors/RFECV_xgb_fxa.joblib


In [10]:
# Show the RFECV rank given to each conformation
print('Conformations ranking (RFECV):', xgb_selector.ranking_, sep = '\n')
# Count how many conformations received rank 1
print('Number of conformations with rank = 1:',
      (xgb_selector.ranking_ == 1).sum(),
      sep = '\n')

Conformations ranking (RFECV):
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  9  1  1  1
  1  1  7  1  1  9  8  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
 11  1  1 11  1  8  1  1  1  1  1  3  1  1  1  1  3 12  2  1  1  6  6  1
  1  1 12  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  4 10  1  1  7  1  1  1  1  1  1  1  1  1  5  1  1  1  1  2  1
  1  1  1  1  1  4  1  1  1 10  5  1  1  1  1  1]
Number of conformations with rank = 1:
114
