## Conformational Selection using Recursive Feature Elimination

In [1]:
import pandas as pd
import numpy as np
import sys

### Load the data

In [2]:
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
# NOTE: unpickling executes code stored in the file; only load trusted pickles
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column aside as the target vector
y_true_merged = X_merged_dksc['activity']
# Remove the target so that X_merged_dksc no longer contains it
X_merged_dksc = X_merged_dksc.drop(columns='activity')
X_merged_dksc.shape

(5839, 136)

#### Function and data to perform Scaffold Train-Test splitting

In [4]:
from scaffold_splitter import train_test_scaffold_split

# Load the dataframe containing the Generic Murcko Scaffolds.
# Only the loading path is implemented in this cell: the pickle below must
# already exist on disk (nothing here computes the scaffolds).
# NOTE: unpickling executes code stored in the file; only load trusted pickles.
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.pkl'

df_scff_murcko = pd.read_pickle(file)

### Function to perform RFE with cross-validation

In [62]:
from sklearn.feature_selection import RFECV
from functools import wraps
import joblib
import os

def _run_RFECV(estimator, X_train, y_train, cv=5, scoring='roc_auc',
               filename=None, **kwargs):
    '''Build an RFECV selector around `estimator`, fit it and return it.

    Parameters
    ----------
    estimator : base estimator handed to RFECV as its first argument.
    X_train, y_train : data forwarded unchanged to `RFECV.fit`.
    cv, scoring : forwarded to the RFECV constructor.
    filename : accepted but not used by this function.
    **kwargs : any additional keyword arguments for the RFECV constructor.

    Returns
    -------
    The object returned by `RFECV(...).fit(X_train, y_train)`.
    '''
    # Instantiate the selector and fit it in a single expression
    return RFECV(estimator, cv=cv, scoring=scoring, **kwargs).fit(X_train, y_train)

def get_selected_features_per_step(fitted_selector, X):
    '''List the nested feature subsets implied by an RFE ranking.

    A table with one row per column of `X` is built from `X.columns` and
    `fitted_selector.ranking_`, then sorted by ascending ranking. Row `k` of
    the result holds the row labels of the first `k + 1` entries of that
    sorted table.

    Parameters
    ----------
    fitted_selector : object exposing a `ranking_` attribute with one value
        per column of `X`.
    X : DataFrame whose columns are the ranked features.

    Returns
    -------
    DataFrame with a single column 'list_of_confs_rfe'; each cell is a list
    of labels, growing by one element per row.
    '''
    ranking_table = pd.DataFrame({
        'pdb_id': X.columns,
        'rfe_ranking': fitted_selector.ranking_,
    })
    # Labels of the ranking table ordered from best to worst ranking
    ordered_labels = ranking_table.sort_values('rfe_ranking').index
    # Prefixes of size 1..n of the ordered labels
    nested_subsets = [ordered_labels[:size].tolist()
                      for size in range(1, len(ordered_labels) + 1)]
    return pd.DataFrame({'list_of_confs_rfe': nested_subsets})
        
def REFCV_wrapper(estimator, X_train, y_train, filename, 
                  cv=5, scoring='roc_auc', **kwargs):
    '''Return an RFECV selector, loading it from `filename` when available.

    If `filename` exists, the object stored there is loaded with joblib and
    returned as is. Otherwise a new selector is fitted through `_run_RFECV`,
    written to `filename` and returned.

    The cache is keyed on `filename` only: when the file exists, `estimator`,
    `X_train`, `y_train`, `cv`, `scoring` and `kwargs` are not consulted, so
    the caller must make sure the file matches the current data and settings.

    Parameters
    ----------
    estimator : base estimator forwarded to `_run_RFECV`.
    X_train, y_train : training data forwarded to `_run_RFECV`.
    filename : path of the joblib file used as cache.
    cv, scoring : forwarded to `_run_RFECV` (previously these were ignored
        and 5 / 'roc_auc' were always used).
    **kwargs : extra keyword arguments forwarded to `_run_RFECV`.

    Returns
    -------
    The loaded or newly fitted selector.
    '''
    if os.path.isfile(filename):
        # NOTE: joblib.load unpickles the file; only load trusted files
        return joblib.load(filename)

    # Create the target directory before the (potentially long) fit so that
    # a missing folder fails early instead of after the computation
    out_dir = os.path.dirname(filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    selector = _run_RFECV(estimator, X_train, y_train,
                          cv=cv, scoring=scoring, **kwargs)
    # Save to disk (the bare name `dump` was never imported; use joblib.dump)
    joblib.dump(selector, filename)
    return selector


In [88]:
from sklearn.model_selection import train_test_split
from scaffold_splitter import train_test_scaffold_split
from sklearn.metrics import roc_auc_score

#### Define X and y

In [63]:
# Full feature matrix and labels (not yet split into train and test):
# X is the merged dataframe without the 'activity' column, y is that column
X = X_merged_dksc
y = y_true_merged

(5839, 136)

## Random Splitting

#### RANDOM split: Logistic Regression as estimator

In [100]:
%%time
from sklearn.linear_model import LogisticRegression

# RFECV object to be save 
dataset = 'MERGED'
model_name = 'LogReg'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
hyparams = {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 150}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(estimator, X_train, y_train, 
                         filename=filename,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

selector.grid_scores_

CPU times: user 13.7 ms, sys: 4.27 ms, total: 17.9 ms
Wall time: 14.4 ms


array([0.62716433, 0.64917252, 0.68705849, 0.70946982, 0.72798752,
       0.73703063, 0.75010978, 0.75391707, 0.7597195 , 0.76207597,
       0.76611985, 0.77098865, 0.77256657, 0.77546558, 0.77558032,
       0.77879425, 0.77893427, 0.77867573, 0.78100244, 0.78564726,
       0.79028027, 0.79185206, 0.7928754 , 0.7916826 , 0.79488738,
       0.79490302, 0.79831524, 0.80043408, 0.80380874, 0.80172766,
       0.80036368, 0.80193767, 0.80491196, 0.80604662, 0.80752347,
       0.80795042, 0.80722439, 0.81157952, 0.81014592, 0.81078167,
       0.80964124, 0.81018151, 0.81324093, 0.81332715, 0.81592818,
       0.81697687, 0.81698715, 0.81639278, 0.8165272 , 0.81550482,
       0.81755424, 0.8192236 , 0.81942178, 0.82023568, 0.82015542,
       0.81967402, 0.81977549, 0.81951307, 0.81944866, 0.81996167,
       0.82074363, 0.82058367, 0.82022013, 0.81987756, 0.81927803,
       0.81829346, 0.81631373, 0.81592847, 0.81612072, 0.8160681 ,
       0.8159877 , 0.81706812, 0.81692374, 0.81689676, 0.81770

#### RANDOM split: Random Forest as estimator 

In [85]:
%%time
from sklearn.ensemble import RandomForestClassifier

# RFECV object to be save 
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02, 
            'min_samples_split': 0.1, 'n_estimators': 300}
estimator = RandomForestClassifier(**hyparams)

#*********************************************************************************
# Do the RANDOM splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(estimator, X_train, y_train, 
                         filename=filename,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

CPU times: user 1min 58s, sys: 772 ms, total: 1min 59s
Wall time: 7min 28s


## Scaffold Splitting

#### Define X and y

In [90]:
# Full feature matrix and labels (not yet split into train and test):
# X is the merged dataframe without the 'activity' column, y is that column
X = X_merged_dksc
y = y_true_merged
# 'scff_generic' column of the Murcko scaffold dataframe; passed as
# `scaffold_series` to train_test_scaffold_split in the cells below
scaffold_series = df_scff_murcko['scff_generic']

#### SCAFFOLD split: Logistic Regression as estimator

In [95]:
%%time
from sklearn.linear_model import LogisticRegression

# Identify the RFECV selector that is cached on disk under `filename`
dataset = 'MERGED'
model_name = 'LogReg'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base estimator hyperparameters
hyparams = {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear', 'max_iter': 150}
estimator = LogisticRegression(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
# NOTE(review): whether train_test_scaffold_split is deterministic is not
# visible here; REFCV_wrapper reloads the selector at `filename` without
# checking X_train, so confirm the cached file matches this split
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(estimator, X_train, y_train, 
                         filename=filename,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

CPU times: user 241 ms, sys: 83.5 ms, total: 324 ms
Wall time: 320 ms


#### SCAFFOLD split: Random Forest as estimator 

In [97]:
%%time
from sklearn.ensemble import RandomForestClassifier

# RFECV object to be save 
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Base Estimator Parameters
hyparams = {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 0.02, 
            'min_samples_split': 0.1, 'n_estimators': 400}
estimator = RandomForestClassifier(**hyparams)

#*********************************************************************************
# Do the SCAFFOLD splitting
X_train, X_test, y_train, y_test = train_test_scaffold_split(X, y, 
                                    scaffold_series = scaffold_series,
                                    test_size=0.25, stratify=y)

#*********************************************************************************
# Run RFE
selector = REFCV_wrapper(estimator, X_train, y_train, 
                         filename=filename,
                         cv=5, step=1, min_features_to_select=1, n_jobs=4)

CPU times: user 2min 11s, sys: 799 ms, total: 2min 12s
Wall time: 10min 7s


In [98]:
# Cross-validation scores of whichever `selector` is currently bound in the
# kernel (it is reassigned by every RFECV cell above, so the result depends
# on which of those cells ran last).
# NOTE(review): `grid_scores_` was deprecated in scikit-learn 1.0 in favour of
# `cv_results_`; confirm the installed version before upgrading
selector.grid_scores_

array([0.62807247, 0.65434801, 0.68180049, 0.71914251, 0.70185898,
       0.70949886, 0.71749916, 0.7168106 , 0.71777319, 0.73021456,
       0.73064559, 0.73845956, 0.73222367, 0.73472117, 0.73014669,
       0.7303391 , 0.73304695, 0.73231804, 0.73266971, 0.7320845 ,
       0.73012843, 0.73215795, 0.73131385, 0.74233804, 0.74040013,
       0.74264437, 0.73952067, 0.74240573, 0.74052518, 0.7428915 ,
       0.74201735, 0.74108235, 0.73729055, 0.7396859 , 0.74209018,
       0.74045662, 0.73914572, 0.73733552, 0.73797667, 0.7341799 ,
       0.73532397, 0.73809966, 0.74164115, 0.73717583, 0.73732063,
       0.73850945, 0.74305161, 0.74007098, 0.73950101, 0.73785406,
       0.73780903, 0.73992409, 0.7355391 , 0.74218836, 0.74176609,
       0.73612555, 0.7408378 , 0.74455828, 0.73729774, 0.73457117,
       0.73558999, 0.74013687, 0.73814917, 0.74163249, 0.73870498,
       0.73937818, 0.74002351, 0.74002363, 0.73821912, 0.73507202,
       0.73543098, 0.74156108, 0.7385141 , 0.74026242, 0.73733

In [76]:
# One row per column of X, paired with the ranking held by the current selector
df_ = pd.DataFrame({'pdb_id': X.columns, 'rfe_ranking': selector.ranking_})
# Order the rows from best to worst ranking
df_ = df_.sort_values('rfe_ranking')
# Nested subsets: the first 1, 2, ..., n row labels of the ordered table
list_of_confs_per_k = [df_.index[:size].tolist() for size in range(1, len(df_) + 1)]
df_confs_per_k = pd.DataFrame({'list_of_confs_rfe': list_of_confs_per_k})
df_confs_per_k

Unnamed: 0,list_of_confs_rfe
0,[32]
1,"[32, 113]"
2,"[32, 113, 73]"
3,"[32, 113, 73, 31]"
4,"[32, 113, 73, 31, 98]"
...,...
131,"[32, 113, 73, 31, 98, 33, 34, 58, 36, 37, 92, ..."
132,"[32, 113, 73, 31, 98, 33, 34, 58, 36, 37, 92, ..."
133,"[32, 113, 73, 31, 98, 33, 34, 58, 36, 37, 92, ..."
134,"[32, 113, 73, 31, 98, 33, 34, 58, 36, 37, 92, ..."
