## CDK2: Conformational Selection using Recursive Feature Elimination

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')

### Load the data

In [2]:
# Load the merged results table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column as the label vector
y_true_merged = X_merged_dksc['activity']
# Drop the label column so X_merged_dksc holds only the remaining (feature) columns
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
# Cell output: (rows, columns) of the table after removing 'activity'
X_merged_dksc.shape

(3466, 402)

#### Function and data to perform Scaffold Train-Test splitting

In [1]:

# Make the helper module located in ../2_Docking_analysis/ importable.
# NOTE: this cell needs `sys` and `pd` from the first import cell; the NameError
# saved as this cell's output presumably comes from running it before that cell.
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Load the dataframe containing the Generic Murcko Scaffolds
# NOTE(review): pd.read_pickle unpickles the file; only load files from a trusted source.
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
# Cell output: (rows, columns) of the scaffold dataframe
df_scff_murcko.shape

NameError: name 'sys' is not defined

In [4]:
# Run-or-load decorator
from modules.run_or_load_decorator import run_or_load_joblib

### Functions to perform RFE with cross-validation

In [5]:
from sklearn.feature_selection import RFECV

import joblib
import os

def _run_RFECV(estimator, X_train, y_train, cv=5, scoring='roc_auc', **kwargs):
    '''Build an RFECV selector around `estimator` and fit it on the training data.

    Parameters
    ----------
    estimator : base estimator handed to RFECV.
    X_train, y_train : training features and labels passed to `fit`.
    cv : cross-validation setting forwarded to RFECV (default 5).
    scoring : scoring setting forwarded to RFECV (default 'roc_auc').
    **kwargs : any further keyword arguments forwarded unchanged to RFECV.

    Returns
    -------
    The object returned by `RFECV(...).fit(X_train, y_train)`.
    '''
    rfecv = RFECV(estimator, cv=cv, scoring=scoring, **kwargs)
    return rfecv.fit(X_train, y_train)

def get_selected_features_per_step(fitted_selector, X):
    '''List, for every subset size k, the k best-ranked features of a fitted selector.

    Parameters
    ----------
    fitted_selector : object exposing a `ranking_` sequence, one entry per column of X.
    X : DataFrame whose columns are the ranked features.

    Returns
    -------
    DataFrame with a single column 'list_of_confs_rfe'. Row i holds a list with the
    first i + 1 index labels of the ranking table after sorting by 'rfe_ranking'.
    The labels are the integer positions of the columns in X, not the 'pdb_id'
    values themselves.
    '''
    # One row per feature: its column name and its RFE rank
    ranking_table = pd.DataFrame({'pdb_id': X.columns,
                                  'rfe_ranking': fitted_selector.ranking_})
    # Index labels ordered from best to worst rank
    ordered_labels = ranking_table.sort_values('rfe_ranking').index
    # Growing prefixes of the ordered labels: sizes 1, 2, ..., n
    nested_subsets = []
    for subset_size in range(1, len(ordered_labels) + 1):
        nested_subsets.append(ordered_labels[:subset_size].tolist())
    return pd.DataFrame({'list_of_confs_rfe': nested_subsets})

@run_or_load_joblib
def REFCV_wrapper(filename, estimator, X_train, y_train,
                  cv=5, scoring='roc_auc', **kwargs):
    '''Run RFECV through `_run_RFECV`, wrapped by the `run_or_load_joblib` decorator.

    Parameters
    ----------
    filename : not used in this body; presumably consumed by the
        `run_or_load_joblib` decorator as the cache path (confirm in its source).
    estimator : base estimator for the recursive feature elimination.
    X_train, y_train : training features and labels.
    cv : cross-validation setting forwarded to `_run_RFECV` (default 5).
    scoring : scoring setting forwarded to `_run_RFECV` (default 'roc_auc').
    **kwargs : extra keyword arguments forwarded to `_run_RFECV`.

    Returns
    -------
    The fitted selector returned by `_run_RFECV`.
    '''
    # Forward the caller's `cv` and `scoring`; they were previously replaced by
    # hard-coded values (5 and 'roc_auc'), so non-default arguments were ignored.
    selector = _run_RFECV(estimator, X_train, y_train,
                          cv=cv, scoring=scoring, **kwargs)
    return selector


In [6]:
from sklearn.model_selection import train_test_split
from scaffold_splitter import train_test_scaffold_split
from sklearn.metrics import roc_auc_score

#### Define X and y

In [7]:
# Full feature matrix and labels (not yet split; the train/test split happens in later cells)
X = X_merged_dksc
y = y_true_merged

## Random Splitting

#### RANDOM split: Logistic Regression as base estimator

In [8]:
%%time
from sklearn.linear_model import LogisticRegression

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'LogReg'
split = 'random'
# Path handed to REFCV_wrapper as its first argument (the selector's .joblib file)
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Seed for the random split so reruns produce the same train/test partition
RANDOM_SEED = 42
# Base Estimator Parameters
hyparams = {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter':200}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# Without a fixed random_state every rerun draws a different partition, while the
# selector below may be loaded from disk and so belong to an older partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,
                                                    random_state=RANDOM_SEED)

#*********************************************************************************
# Run RFE
# NOTE: a selector file saved before the split was seeded was fit on a different
# partition; delete that file to refit on the seeded split.
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_LogReg_MERGED_random_split.joblib
CPU times: user 12.8 ms, sys: 20.1 ms, total: 32.9 ms
Wall time: 32 ms


#### RANDOM split: Random Forest as base estimator 

In [9]:
%%time
from sklearn.ensemble import RandomForestClassifier

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'random'
# Path handed to REFCV_wrapper as its first argument (the selector's .joblib file)
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Seed shared by the split and the forest so reruns are reproducible
RANDOM_SEED = 42
# Base Estimator Parameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.05, 
            'min_samples_split': 0.1, 'n_estimators': 400}
# random_state is passed separately so `hyparams` keeps only the tuned values
estimator = RandomForestClassifier(random_state=RANDOM_SEED, **hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# Without a fixed random_state every rerun draws a different partition, while the
# selector below may be loaded from disk and so belong to an older partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,
                                                    random_state=RANDOM_SEED)

#*********************************************************************************
# Run RFE
# NOTE: a selector file saved before the seeds were added was fit on a different
# partition; delete that file to refit on the seeded split.
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_RandomForest_MERGED_random_split.joblib
CPU times: user 221 ms, sys: 48.1 ms, total: 269 ms
Wall time: 262 ms


#### RANDOM split: XGBoost Classifier as base estimator 

In [10]:
%%time
from xgboost import XGBClassifier 

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'random'
# Path handed to REFCV_wrapper as its first argument (the selector's .joblib file)
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Seed shared by the split and the booster so reruns are reproducible
RANDOM_SEED = 42
# Base Estimator Parameters
hyparams = {'subsample': 0.3, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 
            'gamma': 0.5, 'colsample_bytree': 1, 'alpha': 1}
# random_state is passed separately so `hyparams` keeps only the tuned values;
# with subsample < 1 the model draws random row subsets, hence the seed.
estimator = XGBClassifier(random_state=RANDOM_SEED, **hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# Without a fixed random_state every rerun draws a different partition, while the
# selector below may be loaded from disk and so belong to an older partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,
                                                    random_state=RANDOM_SEED)

#*********************************************************************************
# Run RFE
# NOTE: a selector file saved before the seeds were added was fit on a different
# partition; delete that file to refit on the seeded split.
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_XGB_tree_MERGED_random_split.joblib
CPU times: user 163 ms, sys: 27.9 ms, total: 191 ms
Wall time: 185 ms


## Scaffold Splitting

#### Define X and y

In [11]:
# Full feature matrix and labels (not yet split; the scaffold split happens in later cells)
X = X_merged_dksc
y = y_true_merged
# 'scff_generic' column of the Murcko scaffold dataframe, passed to
# train_test_scaffold_split as `scaffold_series` in the cells below
scaffold_series = df_scff_murcko['scff_generic']

#### SCAFFOLD split: Logistic Regression as base estimator

In [12]:
%%time
from sklearn.linear_model import LogisticRegression

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'LogReg'
split = 'scaffold'
# Path handed to REFCV_wrapper as its first argument (the selector's .joblib file)
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
# NOTE(review): the saved output of this cell shows lbfgs convergence warnings with
# max_iter=200; consider a larger max_iter or scaled features, as the warning suggests.
hyparams = {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 200}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
# NOTE(review): no seed is passed here; whether train_test_scaffold_split is
# deterministic cannot be told from this notebook — confirm in scaffold_splitter.
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)
# Cell output: the selector's cross-validation scores
# NOTE(review): `grid_scores_` is not available on recent scikit-learn releases
# (replaced by `cv_results_`) — confirm against the installed version.
selector.grid_scores_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


File saved: ./ml_models/RFECV_selector_LogReg_MERGED_scaffold_split.joblib
CPU times: user 47.2 s, sys: 18.2 s, total: 1min 5s
Wall time: 4min 44s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([0.65722357, 0.69108759, 0.701354  , 0.6965847 , 0.71788471,
       0.72216953, 0.73508885, 0.74457074, 0.75593613, 0.75935385,
       0.7563164 , 0.7575412 , 0.75524042, 0.75485701, 0.76058068,
       0.76820342, 0.77339964, 0.77934975, 0.7815374 , 0.78506295,
       0.78714513, 0.79233217, 0.79840348, 0.80350879, 0.80784316,
       0.80977092, 0.81327822, 0.81338271, 0.8122264 , 0.81552557,
       0.81622914, 0.81517384, 0.81583698, 0.81452338, 0.81542583,
       0.81590096, 0.81619092, 0.81822326, 0.81697479, 0.81876271,
       0.81927206, 0.82062653, 0.82049733, 0.81914431, 0.82067852,
       0.81922986, 0.82072369, 0.82078974, 0.82103182, 0.82308158,
       0.82324342, 0.82606471, 0.82680716, 0.82878217, 0.83079878,
       0.82965717, 0.82861377, 0.8287116 , 0.8293335 , 0.8305899 ,
       0.83137507, 0.83169224, 0.83228613, 0.83278828, 0.83350612,
       0.83397026, 0.83446053, 0.83542128, 0.83605962, 0.83645111,
       0.83615389, 0.83645142, 0.83611991, 0.83663694, 0.83803

#### SCAFFOLD split: Random Forest as base estimator

In [13]:
%%time
from sklearn.ensemble import RandomForestClassifier

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'scaffold'
# Path handed to REFCV_wrapper as its first argument (the selector's .joblib file)
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
# NOTE: n_estimators is 300 here versus 400 in the random-split Random Forest cell.
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.05,
            'min_samples_split': 0.1, 'n_estimators': 300}
# NOTE(review): no random_state is set, so the forest differs between runs.
estimator = RandomForestClassifier(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
# NOTE(review): no seed is passed here; whether train_test_scaffold_split is
# deterministic cannot be told from this notebook — confirm in scaffold_splitter.
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)
# Cell output: the selector's cross-validation scores
# NOTE(review): `grid_scores_` is not available on recent scikit-learn releases
# (replaced by `cv_results_`) — confirm against the installed version.
selector.grid_scores_

File saved: ./ml_models/RFECV_selector_RandomForest_MERGED_scaffold_split.joblib
CPU times: user 6min 43s, sys: 2.19 s, total: 6min 46s
Wall time: 24min 41s


array([0.64212408, 0.66291433, 0.68697324, 0.69792746, 0.71863323,
       0.73187589, 0.74308404, 0.74614224, 0.7487489 , 0.76199379,
       0.75706105, 0.76082452, 0.76066966, 0.75977247, 0.75810547,
       0.75957848, 0.76213157, 0.7619874 , 0.76203302, 0.76527011,
       0.76601414, 0.76746526, 0.76577492, 0.77140024, 0.77309967,
       0.77143201, 0.76960577, 0.77073659, 0.77160262, 0.77065182,
       0.76926318, 0.77457056, 0.77364837, 0.77206542, 0.77435316,
       0.77736179, 0.77931305, 0.77633251, 0.77492096, 0.7753026 ,
       0.7734453 , 0.77533459, 0.77746165, 0.77150659, 0.77380162,
       0.77286793, 0.77457959, 0.77507993, 0.77406348, 0.77456677,
       0.77437239, 0.77424819, 0.77521393, 0.77581298, 0.77776849,
       0.77312788, 0.7732394 , 0.77535896, 0.77634406, 0.77731802,
       0.77619298, 0.77483357, 0.77755705, 0.77859316, 0.77515667,
       0.77680362, 0.77558552, 0.77748782, 0.77728795, 0.77371672,
       0.77536769, 0.77699864, 0.77401847, 0.77654182, 0.77563

#### SCAFFOLD split: XGBoost Classifier as base estimator 

In [14]:
%%time
from xgboost import XGBClassifier 

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'scaffold'
# Path handed to REFCV_wrapper as its first argument (the selector's .joblib file)
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
# NOTE: gamma (0.1) and colsample_bytree (0.3) differ from the random-split XGBoost cell.
hyparams = {'subsample': 0.3, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 
            'gamma': 0.1, 'colsample_bytree': 0.3, 'alpha': 1}
# NOTE(review): no random_state is set although subsample/colsample_bytree are below 1.
estimator = XGBClassifier(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
# NOTE(review): no seed is passed here; whether train_test_scaffold_split is
# deterministic cannot be told from this notebook — confirm in scaffold_splitter.
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
# NOTE: n_jobs is 6 here, while the other RFE cells use 4.
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=6)

# selector.grid_scores_

File saved: ./ml_models/RFECV_selector_XGB_tree_MERGED_scaffold_split.joblib
CPU times: user 23.7 s, sys: 4.53 s, total: 28.2 s
Wall time: 18min 8s


In [15]:
# Inline version of the steps in get_selected_features_per_step(selector, X).
# Uses the `selector` currently in the kernel (the scaffold-split XGBoost one when
# the cells are run in order).
# Get the features ranking
df_ = pd.DataFrame({'pdb_id': X.columns, 'rfe_ranking': selector.ranking_})
# Sort features by ranking
df_.sort_values('rfe_ranking', inplace = True)
# Entry i holds the index labels (integer positions of the columns in X, not the
# pdb_id strings) of the i + 1 best-ranked features
list_of_confs_per_k = [ df_.index[:i+1].tolist() for i in range(len(df_))]
# One row per subset size, with the list of labels in 'list_of_confs_rfe'
df_confs_per_k = pd.DataFrame({'list_of_confs_rfe': list_of_confs_per_k})
df_confs_per_k

Unnamed: 0,list_of_confs_rfe
0,[0]
1,"[0, 272]"
2,"[0, 272, 271]"
3,"[0, 272, 271, 270]"
4,"[0, 272, 271, 270, 269]"
...,...
397,"[0, 272, 271, 270, 269, 268, 267, 266, 265, 26..."
398,"[0, 272, 271, 270, 269, 268, 267, 266, 265, 26..."
399,"[0, 272, 271, 270, 269, 268, 267, 266, 265, 26..."
400,"[0, 272, 271, 270, 269, 268, 267, 266, 265, 26..."
