# Conformational Selection using Recursive Feature Elimination

In [61]:
import pandas as pd
import numpy as np
import sys
from glob import glob
import matplotlib.pyplot as plt
import seaborn as sns
sys.path.append('..')
# Helper functions
%run ./helper_functions_S6.ipynb

#### Load the data

In [62]:
# Dictionary of docking results
scores_dic = get_docking_scores_dict()

In [63]:
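# Description: RFECV (recursive feature elimination with cross-validation, from scikit-learn)
# is used to rank the features of each training set; every feature is one column of X
# (a conformation, labelled by its PDB id).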

In [64]:
from sklearn.feature_selection import RFECV

In [65]:
### Function to perform RFECV

In [66]:
def RFE_wrapper(ml_model, model_name, hyparms, scores_dic, train_key, score_type,
                n_jobs=4, cv=5, scoring='roc_auc', file_sufix='',
                file_path='../data/ml_evaluations/ml_models/'):
    """Rank features with RFECV and cache the nested feature subsets as JSON.

    If the cache file already exists it is loaded and returned; otherwise
    RFECV is fitted, the result is written to the cache file and returned.

    Parameters
    ----------
    ml_model : callable
        Estimator class (or factory); instantiated as ``ml_model(**hyparms)``.
    model_name : str
        Label used only to build the cache file name.
    hyparms : dict
        Keyword arguments forwarded to ``ml_model``.
    scores_dic : dict
        Nested mapping read as ``scores_dic[train_key][score_type]['X']`` and
        ``[...]['y']``. ``X`` must expose ``.columns`` (one column per feature).
    train_key, score_type : str
        Keys into ``scores_dic``; both are also part of the cache file name
        (underscores are stripped from ``score_type``).
    n_jobs, cv, scoring
        Forwarded unchanged to ``RFECV``.
    file_sufix : str
        Optional text inserted before the ``.json`` extension.
    file_path : str
        Prefix concatenated verbatim in front of the file name, so it must
        end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Single column ``'list_of_confs_rfe'``; row ``i`` holds the positional
        column indices of ``X`` for the ``i + 1`` best-ranked features.
    """
    # Imported locally so the function does not rely on `os` being leaked into
    # the notebook namespace by the `%run` helper notebook.
    import os

    file_name = (
        f"{file_path}{train_key}_{model_name}_RFE_"
        f"{score_type.replace('_', '')}{file_sufix}.json"
    )

    # Cache hit: reuse the stored result instead of refitting.
    if os.path.isfile(file_name):
        df_confs_per_k = pd.read_json(file_name)
        df_confs_per_k.sort_index(inplace=True)
        return df_confs_per_k

    # Make sure the output directory exists *before* the expensive fit, so a
    # missing directory cannot discard the result at save time.
    out_dir = os.path.dirname(file_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Instantiate the external estimator used to rank the features
    estimator_ref = ml_model(**hyparms)
    X = scores_dic[train_key][score_type]['X']
    y = scores_dic[train_key][score_type]['y']

    # Perform the RFE with cross-validation
    selector = RFECV(estimator_ref, step=1, cv=cv, scoring=scoring, n_jobs=n_jobs)
    selector = selector.fit(X, y)

    # Feature ranking; the default integer index is the column position in X
    df_ = pd.DataFrame({'pdb_id': X.columns, 'rfe_ranking': selector.ranking_})
    # NOTE(review): features tied on the same rank are ordered by pandas'
    # default sort algorithm, which is presumably not stable -- confirm
    # whether tie order matters downstream.
    ranked_positions = df_.sort_values('rfe_ranking').index.tolist()

    # Nested subsets of ranked features: top-1, top-2, ..., top-n
    list_of_confs_per_k = [
        ranked_positions[:k] for k in range(1, len(ranked_positions) + 1)
    ]
    df_confs_per_k = pd.DataFrame({'list_of_confs_rfe': list_of_confs_per_k})

    # *** Save the file ***
    df_confs_per_k.to_json(file_name)
    return df_confs_per_k

## Linear SVC

In [67]:
from sklearn.svm import SVC

In [68]:
%%time
# Run (or load from cache) the RFE ranking with a linear-kernel SVC for every
# training set / score type combination.
ml_model = SVC
model_name = 'LinearSVC'
# NOTE(review): probability=True makes SVC.fit slower (extra internal
# cross-validation for probability estimates) -- confirm it is needed here.
hyparms = {'kernel': 'linear', 'probability': True, 'C': 0.1}
list_train_keys = ['CSAR', 'DUD', 'DEKOIS']
list_score_types = ['Dk_sc', 'Dk_lef']

for train_key in list_train_keys:
    for score_type in list_score_types:
        rfe_sel_conf = RFE_wrapper(
            ml_model, model_name, hyparms,
            scores_dic, train_key, score_type,
            n_jobs=4, cv=5, scoring='roc_auc',
        )
        # Progress trace: one line per finished combination
        print(model_name, train_key, score_type)

LinearSVC CSAR Dk_sc
LinearSVC CSAR Dk_lef
LinearSVC DUD Dk_sc
LinearSVC DUD Dk_lef
LinearSVC DEKOIS Dk_sc
LinearSVC DEKOIS Dk_lef
CPU times: user 11min 55s, sys: 4min 3s, total: 15min 58s
Wall time: 42min 26s


In [69]:
from sklearn.linear_model import LogisticRegression

In [74]:
%%time
# Run (or load from cache) the RFE ranking with logistic regression for every
# training set / score type combination.
ml_model = LogisticRegression
model_name = 'LogRg'
hyparms = {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter': 200}
list_train_keys = ['CSAR', 'DUD', 'DEKOIS']
list_score_types = ['Dk_sc', 'Dk_lef']

for train_key in list_train_keys:
    for score_type in list_score_types:
        rfe_sel_conf = RFE_wrapper(
            ml_model, model_name, hyparms,
            scores_dic, train_key, score_type,
            n_jobs=4, cv=5, scoring='roc_auc',
        )
        # Progress trace: one line per finished combination
        print(model_name, train_key, score_type)

LogRg CSAR Dk_sc
LogRg CSAR Dk_lef
LogRg DUD Dk_sc
LogRg DUD Dk_lef
LogRg DEKOIS Dk_sc
LogRg DEKOIS Dk_lef
CPU times: user 67.5 ms, sys: 5.54 ms, total: 73.1 ms
Wall time: 67.9 ms


In [75]:
from sklearn.tree import DecisionTreeClassifier

In [76]:
%%time
# Run (or load from cache) the RFE ranking with a shallow decision tree for
# every training set / score type combination.
ml_model = DecisionTreeClassifier
model_name = 'Tree'
hyparms = {'criterion': 'gini', 'max_depth': 2}
list_train_keys = ['CSAR', 'DUD', 'DEKOIS']
list_score_types = ['Dk_sc', 'Dk_lef']

for train_key in list_train_keys:
    for score_type in list_score_types:
        rfe_sel_conf = RFE_wrapper(
            ml_model, model_name, hyparms,
            scores_dic, train_key, score_type,
            n_jobs=4, cv=5, scoring='roc_auc',
        )
        # Progress trace: one line per finished combination
        print(model_name, train_key, score_type)

Tree CSAR Dk_sc
Tree CSAR Dk_lef
Tree DUD Dk_sc
Tree DUD Dk_lef
Tree DEKOIS Dk_sc
Tree DEKOIS Dk_lef
CPU times: user 1min 8s, sys: 247 ms, total: 1min 8s
Wall time: 3min 23s
