## Conformational Selection using Recursive Feature Elimination

In [2]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')

### Load the data

In [3]:
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column aside as the label vector
y_true_merged = X_merged_dksc['activity']
# Every remaining column stays in the feature matrix
X_merged_dksc = X_merged_dksc.drop(columns='activity')
X_merged_dksc.shape

(6233, 136)

#### Function and data to perform Scaffold Train-Test splitting

In [4]:
from scaffold_splitter import train_test_scaffold_split

# Load the dataframe containing the Generic Murcko Scaffolds.
# This cell only loads it; the file is assumed to have been computed beforehand.
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

# NOTE: read_pickle unpickles arbitrary Python objects -- only use it on trusted files.
df_scff_murcko = pd.read_pickle(file)

In [5]:
# Run-or-load decorator
from modules.run_or_load_decorator import run_or_load_joblib

### Functions to perform RFE with cross-validation

In [6]:
from sklearn.feature_selection import RFECV

import joblib
import os

def _run_RFECV(estimator, X_train, y_train, cv=5, scoring='roc_auc', **kwargs):
    '''Simply runs a RFECV procedure'''
    # Intance and fit the rfe selector
    selector = RFECV(estimator,  cv = cv,
                     scoring = scoring, **kwargs)
    
    selector = selector.fit(X_train, y_train)
    return selector

def get_selected_features_per_step(fitted_selector, X):
    '''List, for every subset size k, the k best-ranked features of a fitted selector.

    Parameters
    ----------
    fitted_selector : object exposing a `ranking_` attribute with one rank
        per column of `X`.
    X : DataFrame whose columns are the ranked features.

    Returns
    -------
    DataFrame with a single column 'list_of_confs_rfe'. Row i holds the
    positions (integer positions in `X.columns`, not the column labels) of
    the i+1 features with the lowest `ranking_` values.
    '''
    ranking_table = pd.DataFrame(
        {'pdb_id': X.columns, 'rfe_ranking': fitted_selector.ranking_}
    ).sort_values('rfe_ranking')
    # After sorting, the index still carries each feature's original position
    ordered_positions = ranking_table.index.tolist()
    nested_subsets = [ordered_positions[:k]
                      for k in range(1, len(ordered_positions) + 1)]
    return pd.DataFrame({'list_of_confs_rfe': nested_subsets})

@run_or_load_joblib
def REFCV_wrapper(filename, estimator, X_train, y_train,
                  cv=5, scoring='roc_auc', **kwargs):
    '''Fit an RFECV selector; the call is wrapped by `run_or_load_joblib`.

    Parameters
    ----------
    filename : str
        Not used inside this body. Presumably consumed by the
        `run_or_load_joblib` decorator as the path of the cached result --
        confirm against `modules.run_or_load_decorator`.
    estimator : base estimator handed to `_run_RFECV`.
    X_train, y_train : training data forwarded to `_run_RFECV`.
    cv : cross-validation setting forwarded to `_run_RFECV` (default 5).
    scoring : scoring setting forwarded to `_run_RFECV` (default 'roc_auc').
    **kwargs : extra options forwarded to `_run_RFECV`.

    Returns
    -------
    The fitted selector returned by `_run_RFECV`.
    '''
    # Forward the caller's `cv` and `scoring`. They used to be silently
    # replaced by the hard-coded values cv=5 and scoring='roc_auc'.
    selector = _run_RFECV(estimator, X_train, y_train,
                          cv=cv, scoring=scoring, **kwargs)
    return selector


In [7]:
from sklearn.model_selection import train_test_split
from scaffold_splitter import train_test_scaffold_split
from sklearn.metrics import roc_auc_score

#### Define X and y

In [8]:
# Full feature matrix and labels; the train/test partition is made in each experiment cell
X, y = X_merged_dksc, y_true_merged

## Random Splitting

#### RANDOM split: Logistic Regression as base estimator

In [9]:
%%time
from sklearn.linear_model import LogisticRegression

# File where the fitted RFECV selector is saved
dataset = 'MERGED'
model_name = 'LogReg'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 150}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# The seed is fixed because the selector is cached on disk through `filename`:
# without it every run draws a different partition, and X_test may contain rows
# the cached selector was fitted on. Selectors cached before the seed was
# introduced should be regenerated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,
                                                    random_state=42)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_LogReg_MERGED_random_split.joblib
CPU times: user 31.2 ms, sys: 0 ns, total: 31.2 ms
Wall time: 53.5 ms


#### RANDOM split: Random Forest as base estimator 

In [10]:
%%time
from sklearn.ensemble import RandomForestClassifier

# File where the fitted RFECV selector is saved
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02,
            'min_samples_split': 0.1, 'n_estimators': 300}
estimator = RandomForestClassifier(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# The seed is fixed because the selector is cached on disk through `filename`:
# without it every run draws a different partition, and X_test may contain rows
# the cached selector was fitted on. Selectors cached before the seed was
# introduced should be regenerated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,
                                                    random_state=42)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_RandomForest_MERGED_random_split.joblib
CPU times: user 254 ms, sys: 0 ns, total: 254 ms
Wall time: 617 ms


#### RANDOM split: XGBoost Classifier as base estimator 

In [11]:
%%time
from xgboost import XGBClassifier 

# File where the fitted RFECV selector is saved
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'subsample': 0.3, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1,
            'gamma': 0.5, 'colsample_bytree': 1, 'alpha': 1}
estimator = XGBClassifier(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# The seed is fixed because the selector is cached on disk through `filename`:
# without it every run draws a different partition, and X_test may contain rows
# the cached selector was fitted on. Selectors cached before the seed was
# introduced should be regenerated.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y,
                                                    random_state=42)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# selector.grid_scores_

File saved: ./ml_models/RFECV_selector_XGB_tree_MERGED_random_split.joblib
CPU times: user 1min 38s, sys: 9.54 s, total: 1min 47s
Wall time: 10min 29s


## Scaffold Splitting

#### Define X and y

In [13]:
# Full feature matrix and labels; the train/test partition is made in each experiment cell
X, y = X_merged_dksc, y_true_merged
# Generic Murcko scaffold of every molecule, passed to the scaffold splitter below
scaffold_series = df_scff_murcko['scff_generic']

#### SCAFFOLD split: Logistic Regression as base estimator

In [21]:
%%time
from sklearn.linear_model import LogisticRegression

# File where the fitted RFECV selector is saved
dataset = 'MERGED'
model_name = 'LogReg'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'C': 100, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 150}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE (REFCV_wrapper is decorated with run_or_load_joblib and receives `filename`)
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)
# NOTE(review): `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `cv_results_` -- confirm the scikit-learn version this notebook targets.
selector.grid_scores_

File saved: ./ml_models/RFECV_selector_LogReg_MERGED_scaffold_split.joblib
CPU times: user 1min 21s, sys: 1.3 s, total: 1min 23s
Wall time: 20min 3s


array([0.66428692, 0.66549177, 0.6993898 , 0.70941584, 0.7192033 ,
       0.74192861, 0.73487809, 0.73388821, 0.74811534, 0.74278356,
       0.75642433, 0.75178342, 0.75878336, 0.75366434, 0.75811005,
       0.76068315, 0.77266108, 0.77099344, 0.77757806, 0.77607828,
       0.77768882, 0.77807311, 0.77899918, 0.77992081, 0.78022081,
       0.78243483, 0.78260477, 0.78473415, 0.78886859, 0.79369802,
       0.79307664, 0.79235934, 0.79586249, 0.79229095, 0.7903834 ,
       0.79031039, 0.79256823, 0.79133774, 0.79514669, 0.79556705,
       0.7958663 , 0.79754487, 0.79856415, 0.79976898, 0.80050597,
       0.79930084, 0.79876064, 0.80068526, 0.80212661, 0.80344725,
       0.80305814, 0.80240535, 0.80176369, 0.80318348, 0.80201042,
       0.80154713, 0.80378967, 0.80419501, 0.8036534 , 0.80326458,
       0.80455309, 0.80510083, 0.80484228, 0.80443207, 0.80516801,
       0.8058352 , 0.80673495, 0.80654478, 0.80878567, 0.8086385 ,
       0.8086391 , 0.80990669, 0.81064812, 0.81032194, 0.81012

#### SCAFFOLD split: Random Forest as base estimator

In [19]:
%%time
from sklearn.ensemble import RandomForestClassifier

# File where the fitted RFECV selector is saved
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02, 
            'min_samples_split': 0.1, 'n_estimators': 400}
estimator = RandomForestClassifier(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE (REFCV_wrapper is decorated with run_or_load_joblib and receives `filename`)
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)
# NOTE(review): `grid_scores_` was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of `cv_results_` -- confirm the scikit-learn version this notebook targets.
selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_RandomForest_MERGED_scaffold_split.joblib
CPU times: user 373 ms, sys: 16 ms, total: 389 ms
Wall time: 426 ms


array([0.62807247, 0.65434801, 0.68180049, 0.71914251, 0.70185898,
       0.70949886, 0.71749916, 0.7168106 , 0.71777319, 0.73021456,
       0.73064559, 0.73845956, 0.73222367, 0.73472117, 0.73014669,
       0.7303391 , 0.73304695, 0.73231804, 0.73266971, 0.7320845 ,
       0.73012843, 0.73215795, 0.73131385, 0.74233804, 0.74040013,
       0.74264437, 0.73952067, 0.74240573, 0.74052518, 0.7428915 ,
       0.74201735, 0.74108235, 0.73729055, 0.7396859 , 0.74209018,
       0.74045662, 0.73914572, 0.73733552, 0.73797667, 0.7341799 ,
       0.73532397, 0.73809966, 0.74164115, 0.73717583, 0.73732063,
       0.73850945, 0.74305161, 0.74007098, 0.73950101, 0.73785406,
       0.73780903, 0.73992409, 0.7355391 , 0.74218836, 0.74176609,
       0.73612555, 0.7408378 , 0.74455828, 0.73729774, 0.73457117,
       0.73558999, 0.74013687, 0.73814917, 0.74163249, 0.73870498,
       0.73937818, 0.74002351, 0.74002363, 0.73821912, 0.73507202,
       0.73543098, 0.74156108, 0.7385141 , 0.74026242, 0.73733

In [None]:
#### SCAFFOLD split: XGBoost Classifier as base estimator 

In [14]:
%%time
from xgboost import XGBClassifier 

# File where the fitted RFECV selector is saved
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'subsample': 0.3, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1, 
            'gamma': 0.1, 'colsample_bytree': 0.3, 'alpha': 1}
estimator = XGBClassifier(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE (REFCV_wrapper is decorated with run_or_load_joblib and receives `filename`)
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=6)

# selector.grid_scores_

File saved: ./ml_models/RFECV_selector_XGB_tree_MERGED_scaffold_split.joblib
CPU times: user 1min 19s, sys: 24.9 s, total: 1min 44s
Wall time: 3min 44s


In [13]:
# Pair every column of X with the rank the selector assigned to it, ordered by ascending rank
df_ = (
    pd.DataFrame({'pdb_id': X.columns, 'rfe_ranking': selector.ranking_})
      .sort_values('rfe_ranking')
)
# Nested subsets of original column positions: entry k-1 holds the k lowest-ranked positions
list_of_confs_per_k = [df_.index[:k].tolist() for k in range(1, len(df_) + 1)]
df_confs_per_k = pd.DataFrame({'list_of_confs_rfe': list_of_confs_per_k})
df_confs_per_k

Unnamed: 0,list_of_confs_rfe
0,[135]
1,"[135, 61]"
2,"[135, 61, 60]"
3,"[135, 61, 60, 58]"
4,"[135, 61, 60, 58, 57]"
...,...
131,"[135, 61, 60, 58, 57, 56, 55, 62, 54, 50, 113,..."
132,"[135, 61, 60, 58, 57, 56, 55, 62, 54, 50, 113,..."
133,"[135, 61, 60, 58, 57, 56, 55, 62, 54, 50, 113,..."
134,"[135, 61, 60, 58, 57, 56, 55, 62, 54, 50, 113,..."
