## CDK2: Conformational Selection using Recursive Feature Elimination

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('..')

### Load the data

In [2]:
# Pickled table holding the docking-score columns plus an 'activity' column
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column aside as the label vector
y_true_merged = X_merged_dksc.loc[:, 'activity']
# Remove the label column from X_merged_dksc so only the remaining columns are used as features
X_merged_dksc = X_merged_dksc.drop(columns=['activity'])
X_merged_dksc.shape

(3466, 402)

#### Function and data to perform Scaffold Train-Test splitting

In [3]:
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Load the pickled dataframe containing the Generic Murcko Scaffolds
# (this cell only loads the file; nothing is recomputed here).
# NOTE(review): unpickling can execute arbitrary code, so only load files
# produced by this project.
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
# Show (n_rows, n_columns) as the cell output
df_scff_murcko.shape

(3466, 3)

In [4]:
# Run-or-load decorator
from modules.run_or_load_decorator import run_or_load_joblib

### Functions to perform RFE with cross-validation

In [5]:
from sklearn.feature_selection import RFECV

import joblib
import os

def _run_RFECV(estimator, X_train, y_train, cv=5, scoring='roc_auc', **kwargs):
    '''Build an RFECV selector around `estimator`, fit it and return it.

    Parameters
    ----------
    estimator : base estimator handed to RFECV as its first argument.
    X_train, y_train : training data passed to the selector's `fit`.
    cv, scoring : forwarded to RFECV under the same keyword names.
    **kwargs : any further keyword arguments, forwarded to RFECV unchanged.

    Returns
    -------
    Whatever `RFECV(...).fit(X_train, y_train)` returns (the fitted selector).
    '''
    # Gather every RFECV option in one place before instantiating
    rfecv_options = dict(cv=cv, scoring=scoring, **kwargs)
    return RFECV(estimator, **rfecv_options).fit(X_train, y_train)

def get_selected_features_per_step(fitted_selector, X):
    '''Build the nested feature subsets implied by an RFE ranking.

    Parameters
    ----------
    fitted_selector : object exposing a `ranking_` sequence with one entry
        per column of `X`.
    X : DataFrame whose columns were ranked.

    Returns
    -------
    DataFrame with a single column 'list_of_confs_rfe'; row k holds the
    list of the k+1 best-ranked features. Features are identified by their
    integer row label in the ranking table (i.e. the position of the column
    in `X.columns`), not by the column name itself.
    '''
    # One row per feature: its name and its RFE rank
    ranking_table = pd.DataFrame({'pdb_id': X.columns,
                                  'rfe_ranking': fitted_selector.ranking_})
    # Row labels ordered from best to worst rank
    ordered_labels = ranking_table.sort_values('rfe_ranking').index.tolist()
    # Growing prefixes of that ordering: sizes 1, 2, ..., n
    nested_subsets = [ordered_labels[:size]
                      for size in range(1, len(ordered_labels) + 1)]
    return pd.DataFrame({'list_of_confs_rfe': nested_subsets})

@run_or_load_joblib
def REFCV_wrapper(filename, estimator, X_train, y_train,
                  cv=5, scoring='roc_auc', **kwargs):
    '''Run `_run_RFECV` under the `run_or_load_joblib` decorator.

    Parameters
    ----------
    filename : not used in this body; presumably consumed by the
        `run_or_load_joblib` decorator as the cache path -- TODO confirm
        against the decorator's implementation.
    estimator : base estimator forwarded to `_run_RFECV`.
    X_train, y_train : training data forwarded to `_run_RFECV`.
    cv, scoring : cross-validation setting and metric forwarded to
        `_run_RFECV`.
    **kwargs : extra keyword arguments forwarded to `_run_RFECV`.

    Returns
    -------
    The object returned by `_run_RFECV`.
    '''
    # Forward the caller's cv/scoring; they used to be overridden here by the
    # hard-coded values 5 and 'roc_auc', so non-default arguments were ignored.
    return _run_RFECV(estimator, X_train, y_train,
                      cv=cv, scoring=scoring, **kwargs)


In [6]:
from sklearn.model_selection import train_test_split
from scaffold_splitter import train_test_scaffold_split
from sklearn.metrics import roc_auc_score

#### Define X and y

In [10]:
# Train and test sets
# Feature table and labels used by the splitting cells
# (plain aliases of the loaded objects, not copies)
X = X_merged_dksc
y = y_true_merged

## Random Splitting

#### RANDOM split: Logistic Regression as estimator

In [17]:
%%time
from sklearn.linear_model import LogisticRegression

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'LogReg'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
hyparams = {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter':200}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# A fixed seed makes the stratified split reproducible, so X_test/y_test are the
# same rows on every run. Without it, a selector reloaded from `filename` may have
# been fitted on rows that now fall into X_test.
# NOTE(review): selectors cached before this seed was introduced were fitted on an
# unknown split; delete the .joblib file and re-run to regenerate them.
random_seed = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=random_seed)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# NOTE(review): `grid_scores_` is deprecated in recent scikit-learn releases in
# favour of `cv_results_`; kept as-is for the version this notebook was run with.
selector.grid_scores_

File loaded: ./ml_models/RFECV_selector_LogReg_MERGED_random_split.joblib
CPU times: user 16.5 ms, sys: 24 µs, total: 16.5 ms
Wall time: 13.2 ms


array([0.67011665, 0.67715281, 0.7040832 , 0.710093  , 0.74021461,
       0.74754837, 0.75589301, 0.75987528, 0.77063537, 0.78068747,
       0.79123316, 0.79249419, 0.80006845, 0.80155663, 0.80530172,
       0.80964641, 0.80874967, 0.81589292, 0.81668554, 0.81824319,
       0.81965424, 0.82260578, 0.82409629, 0.82928885, 0.83103938,
       0.83246193, 0.83436865, 0.83640605, 0.83635533, 0.83490017,
       0.83450479, 0.83630974, 0.8370822 , 0.83673954, 0.83835229,
       0.83882684, 0.83952881, 0.84117628, 0.84246203, 0.84411607,
       0.84478983, 0.8455481 , 0.84491106, 0.84517844, 0.84617807,
       0.84637883, 0.84579565, 0.84652299, 0.84742123, 0.84760125,
       0.84805018, 0.84878909, 0.84935683, 0.85025962, 0.8508776 ,
       0.85270809, 0.85350448, 0.85378172, 0.85565537, 0.85719625,
       0.85672321, 0.85804307, 0.85974706, 0.85985665, 0.85918628,
       0.85997895, 0.86037923, 0.86082375, 0.86133477, 0.86086708,
       0.86064585, 0.86068103, 0.86139934, 0.8617972 , 0.86183

#### RANDOM split: Random Forest as estimator 

In [12]:
%%time
from sklearn.ensemble import RandomForestClassifier

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Seed shared by the forest and the split so the whole cell is reproducible
random_seed = 42
# Base Estimator Parameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.05, 
            'min_samples_split': 0.1, 'n_estimators': 400}
# The forest is stochastic (bootstrap samples, feature sub-sampling); seeding it
# keeps the feature importances, and therefore the RFE ranking, repeatable.
estimator = RandomForestClassifier(random_state=random_seed, **hyparams)

#*********************************************************************************
# Do the RANDOM splitting
# A fixed seed makes the stratified split reproducible, so X_test/y_test are the
# same rows on every run. Without it, a selector reloaded from `filename` may have
# been fitted on rows that now fall into X_test.
# NOTE(review): selectors cached before this seed was introduced were fitted on an
# unknown split; delete the .joblib file and re-run to regenerate them.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=random_seed)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

# NOTE(review): `grid_scores_` is deprecated in recent scikit-learn releases in
# favour of `cv_results_`; kept as-is for the version this notebook was run with.
selector.grid_scores_

File saved: ./ml_models/RFECV_selector_RandomForest_MERGED_random_split.joblib
CPU times: user 7min 27s, sys: 2.37 s, total: 7min 30s
Wall time: 28min 36s


array([0.67046537, 0.66931672, 0.67968767, 0.69444232, 0.69593437,
       0.69601733, 0.6944973 , 0.72677674, 0.72850933, 0.74444737,
       0.74834906, 0.74629443, 0.74733067, 0.74880909, 0.75500555,
       0.76242449, 0.76045895, 0.76073367, 0.75959666, 0.76075046,
       0.76293949, 0.76074038, 0.76127403, 0.7614514 , 0.76324952,
       0.7673282 , 0.76748708, 0.7688164 , 0.77044948, 0.76919773,
       0.77016655, 0.77162826, 0.77212371, 0.77198736, 0.77086406,
       0.77562489, 0.77326824, 0.77619211, 0.77600594, 0.7781361 ,
       0.77738331, 0.77834064, 0.77736513, 0.7767135 , 0.78067728,
       0.77945651, 0.77705892, 0.77765322, 0.77972506, 0.77882275,
       0.7773634 , 0.77882123, 0.77612102, 0.77925157, 0.77863561,
       0.77700855, 0.77833295, 0.77645326, 0.77759818, 0.77987968,
       0.77615821, 0.77639223, 0.7803966 , 0.78117259, 0.77834529,
       0.77826235, 0.78122283, 0.78077641, 0.77895318, 0.78025785,
       0.78004573, 0.78065877, 0.78018108, 0.7795236 , 0.77706

## Scaffold Splitting

#### Define X and y

In [13]:
# Train and test sets
# Feature table and labels (plain aliases of the loaded objects, not copies)
X = X_merged_dksc
y = y_true_merged
# Generic Murcko scaffold of each row, passed to the scaffold splitter
scaffold_series = df_scff_murcko['scff_generic']

#### SCAFFOLD split: Logistic Regression as estimator

In [14]:
%%time
from sklearn.linear_model import LogisticRegression

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'LogReg'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
# NOTE(review): the recorded run of this cell printed repeated lbfgs
# "TOTAL NO. of ITERATIONS REACHED LIMIT" warnings, i.e. max_iter=200 was not
# enough for the solver to converge. Consider raising max_iter or scaling the
# features, then regenerating the cached selector.
hyparams = {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 200}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train, 
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)
# Display the per-step cross-validation scores as the cell output
selector.grid_scores_

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

File saved: ./ml_models/RFECV_selector_LogReg_MERGED_scaffold_split.joblib
CPU times: user 39.3 s, sys: 16.6 s, total: 55.9 s
Wall time: 4min 39s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array([0.66543564, 0.70128459, 0.70932294, 0.70655382, 0.71683842,
       0.71779068, 0.72213063, 0.73570172, 0.7477938 , 0.75094776,
       0.76036244, 0.76089622, 0.76231569, 0.7689154 , 0.77064964,
       0.77330091, 0.77818862, 0.78324134, 0.78927825, 0.79281485,
       0.79624182, 0.79762519, 0.80058426, 0.80170714, 0.80264277,
       0.80475013, 0.80693679, 0.80771934, 0.80900271, 0.81223655,
       0.81412058, 0.81646417, 0.81600004, 0.81429212, 0.81673961,
       0.81740931, 0.81790244, 0.81878554, 0.82098742, 0.82262003,
       0.82240959, 0.82006142, 0.8218635 , 0.82226644, 0.82384154,
       0.82388242, 0.82483451, 0.82789387, 0.82922939, 0.82769554,
       0.82969008, 0.8293335 , 0.83170026, 0.82991942, 0.83251243,
       0.8320589 , 0.83351072, 0.83264893, 0.83225214, 0.83329766,
       0.83375454, 0.83420314, 0.83524989, 0.83532789, 0.83373545,
       0.83476052, 0.83660441, 0.83791382, 0.83670746, 0.83640291,
       0.83759777, 0.83938621, 0.83991273, 0.8394703 , 0.84004

#### SCAFFOLD split: Random Forest as estimator 

In [15]:
%%time
from sklearn.ensemble import RandomForestClassifier

# RFECV object to be saved
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.05,
            'min_samples_split': 0.1, 'n_estimators': 300}
# The forest is stochastic (bootstrap samples, feature sub-sampling); seeding it
# keeps the feature importances, and therefore the RFE ranking, repeatable.
# NOTE(review): a selector cached before the seed was introduced was fitted with
# an unseeded forest; delete the .joblib file and re-run to regenerate it.
random_seed = 42
estimator = RandomForestClassifier(random_state=random_seed, **hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
# NOTE(review): whether train_test_scaffold_split is itself deterministic cannot
# be told from this notebook -- confirm in scaffold_splitter.
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y,
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(filename, estimator, X_train, y_train,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)
# Display the per-step cross-validation scores as the cell output
selector.grid_scores_

File saved: ./ml_models/RFECV_selector_RandomForest_MERGED_scaffold_split.joblib
CPU times: user 6min 7s, sys: 2.5 s, total: 6min 9s
Wall time: 20min 57s


array([0.67513924, 0.71809133, 0.71536183, 0.71475892, 0.71544668,
       0.73237921, 0.7351029 , 0.76884955, 0.77623329, 0.77672801,
       0.77423083, 0.77649056, 0.77598003, 0.77124974, 0.77618085,
       0.7800509 , 0.78229706, 0.78269801, 0.77953291, 0.78376889,
       0.78546081, 0.78464491, 0.78329961, 0.78716274, 0.78546008,
       0.78745937, 0.78382689, 0.7858644 , 0.78814887, 0.78709351,
       0.78642181, 0.78679943, 0.78505242, 0.78452473, 0.78477269,
       0.78740753, 0.78505781, 0.78683644, 0.78396421, 0.78757711,
       0.78861397, 0.78988913, 0.78796702, 0.7918741 , 0.79318943,
       0.79347194, 0.79147347, 0.79045672, 0.78927537, 0.7912939 ,
       0.79184044, 0.78990408, 0.79268214, 0.79115712, 0.78826246,
       0.78940423, 0.79006548, 0.79408264, 0.79058632, 0.78687615,
       0.7856438 , 0.78690754, 0.79063308, 0.79089381, 0.78619645,
       0.79024609, 0.78849584, 0.78878287, 0.79115612, 0.79012205,
       0.78961409, 0.7870492 , 0.79227794, 0.78930041, 0.79084

In [16]:
# Get the features ranking
# Ranking table: one row per column of X with the rank assigned by the selector,
# ordered from best to worst rank
df_ = pd.DataFrame({'pdb_id': X.columns,
                    'rfe_ranking': selector.ranking_}).sort_values('rfe_ranking')
# Row labels of the ranking table in ranked order (integer labels, not pdb ids)
ranked_labels = df_.index.tolist()
# Nested subsets: the best 1, best 2, ..., best n features
list_of_confs_per_k = [ranked_labels[:k] for k in range(1, len(ranked_labels) + 1)]
df_confs_per_k = pd.DataFrame({'list_of_confs_rfe': list_of_confs_per_k})
df_confs_per_k

Unnamed: 0,list_of_confs_rfe
0,[311]
1,"[311, 259]"
2,"[311, 259, 263]"
3,"[311, 259, 263, 265]"
4,"[311, 259, 263, 265, 268]"
...,...
397,"[311, 259, 263, 265, 268, 277, 285, 293, 303, ..."
398,"[311, 259, 263, 265, 268, 277, 285, 293, 303, ..."
399,"[311, 259, 263, 265, 268, 277, 285, 293, 303, ..."
400,"[311, 259, 263, 265, 268, 277, 285, 293, 303, ..."
