# Recursive Feature Elimination using XGB

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../..')

In [2]:
%run ../../helper_modules/Helper_functions_for_nRepeats_x_kCV.ipynb

In [3]:
protein_name = 'hsp90'

# Pickled DataFrame holding the ensemble-docking results.
# NOTE(review): unpickling executes arbitrary code, so this file
# should only ever come from a trusted source.
file_name = ('../4_Ensemble_docking_results/'
             'df_DkSc_results_COCRYS_DEKOIS_DUD.pkl')
df_dk_res = pd.read_pickle(file_name)

# Response variable: the 'activity' column
y = df_dk_res['activity'].values
# Feature matrix: every remaining column (docking scores)
X = df_dk_res.drop(columns = 'activity').values

### Set a gradient-boosted trees (GBT) model as the base estimator

In [4]:
from xgboost import XGBClassifier

# Fixed hyperparameters forwarded as keyword arguments
# to the XGBoost classifier constructor
hyparams = dict(
    subsample         = 0.6,
    n_estimators      = 500,
    max_depth         = 5,
    learning_rate     = 0.05,
    gamma             = 0.01,
    colsample_bytree  = 0.5,
    alpha             = 0.1,
    eval_metric       = 'logloss',
    use_label_encoder = False,
)

# Base estimator used by the feature-elimination steps below
xgb = XGBClassifier(**hyparams)

### Recursive Feature Elimination to rank protein conformations 

- This result was used to rank protein conformations.

In [5]:
from sklearn.feature_selection import RFE

In [6]:
# RFE_wrapper is not defined in this notebook (presumably it comes
# from the helper notebook executed with %run above). It lets us
# run the RFE and save the results to the file given by `filename`.
xgb_slector_ranks = RFE_wrapper(
    X = X,
    y = y,
    estimator = xgb,
    n_features_to_select = 1,
    step = 1,
    verbose = 2,
    filename = f'./cachedir/rfe_selectors/RFE_xgb_{protein_name}.joblib'
)

Fitting estimator with 64 features.
Fitting estimator with 63 features.
Fitting estimator with 62 features.
Fitting estimator with 61 features.
Fitting estimator with 60 features.
Fitting estimator with 59 features.
Fitting estimator with 58 features.
Fitting estimator with 57 features.
Fitting estimator with 56 features.
Fitting estimator with 55 features.
Fitting estimator with 54 features.
Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 fe

In [7]:
# Display the per-feature ranking from the RFE run above.
# Assumes RFE_wrapper returns a fitted scikit-learn RFE selector,
# in which rank 1 marks the selected feature — TODO confirm.
xgb_slector_ranks.ranking_

array([62,  4, 41, 52, 31, 29, 26,  7, 11, 36, 28, 43,  5, 58, 25, 47, 18,
       23, 51, 35, 30, 24, 61, 64, 19, 33, 59, 22, 50, 63,  3, 21, 20,  2,
       53, 39, 37, 40, 17, 27, 45, 57, 44,  1,  8,  9, 15, 54, 56,  6, 14,
       49, 55, 16, 34, 13, 32, 48, 42, 12, 46, 60, 38, 10])

### Run the RFE with Cross-Validation (RFECV)

In [8]:
from sklearn.feature_selection import RFECV

In [9]:
# REFCV_wrapper is not defined in this notebook (presumably it comes
# from the helper notebook executed with %run above). It lets us
# run the RFECV and save the results to the file given by `filename`.
xgb_selector = REFCV_wrapper(
    X_train = X,
    y_train = y,
    estimator = xgb,
    min_features_to_select = 1,
    step = 2,
    cv = 4,
    scoring = 'roc_auc',
    n_jobs = 4,
    verbose = 2,
    filename = f'./cachedir/rfe_selectors/RFECV_xgb_{protein_name}.joblib'
)

Fitting estimator with 64 features.
Fitting estimator with 62 features.
Fitting estimator with 60 features.
Fitting estimator with 58 features.
File saved: ./cachedir/rfe_selectors/RFECV_xgb_hsp90.joblib


In [10]:
# Report the RFECV rank of every conformation, followed by the
# number of conformations that received rank 1
print('Conformations ranking (RFECV):',
      xgb_selector.ranking_,
      sep = '\n')
print('Number of conformations with rank = 1:',
      (xgb_selector.ranking_ == 1).sum(),
      sep = '\n')

Conformations ranking (RFECV):
[3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 5 5 1 1 1 1 1 4 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 1 1 1 1 2 1 1 1 2 1 1]
Number of conformations with rank = 1:
56
