# CDK2: Evaluating ML models with k selected conformations

In [1]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the merged results table from its pickle file
file_name = './df_DkSc_results_COCRYS_CSAR_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' labels aside ...
y_true_merged = X_merged_dksc['activity']
# ... and leave only the remaining columns in X_merged_dksc
X_merged_dksc = X_merged_dksc.drop(columns=['activity'])
X_merged_dksc.shape

(3466, 402)

In [3]:
# Short aliases used by the rest of the notebook: features and labels
X, y = X_merged_dksc, y_true_merged

## Read the RFE Selectors

### Random and Scaffold Splitting 
The conformations preselected with RFE are loaded by running the following helper notebook:

In [4]:
%run ./5_Helper_get_RFE_preselected_conformations.ipynb

### Start the analysis

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [6]:
# Inputs:

In [7]:
import os
import sys
import pickle
sys.path.append('..')
from modules.run_or_load_decorator import run_or_load

In [8]:
#*************************************************
# Functions to compute stratified scaffold splitting
#*************************************************
# The sibling folder must be on sys.path before the splitter can be imported from it
sys.path.append('../2_Docking_analysis/')
from scaffold_splitter import train_test_scaffold_split

# Load the pickled dataframe (this cell only loads it; nothing is computed here).
# Judging by the file name it holds the generic Murcko scaffolds as SMILES;
# its 'scff_generic' column is used later as the scaffold series for splitting.
file = '../2_Docking_analysis/df_COCRYS_CSAR_DUD_DEKOIS_Murcko_Scaffolds_SMILES.obj'

df_scff_murcko = pd.read_pickle(file)
df_scff_murcko.shape

(3466, 3)

In [9]:
# ********************
def conf_picking_ML(estimator, X_train, X_test, y_train, y_test, df_preselected_confs):
    '''Fit `estimator` once per preselected conformation subset and collect
    the predicted probabilities obtained on the test set.

    Every row of `df_preselected_confs` provides, in its 'confs_idx_per_k'
    column, the positional column indices used to subset `X_train`/`X_test`.

    Returns a dictionary with the test labels ('y_true_index' and 'y_true')
    plus one entry per row, keyed by the row index + 1, holding column 1 of
    `predict_proba` for the model trained on that subset.'''
    results = {'y_true_index': y_test.index, 'y_true': y_test.values}
    selections = df_preselected_confs['confs_idx_per_k']

    for row_label, col_positions in selections.items():
        # Scaler and classifier are fitted together on the training subset only
        model = Pipeline([('scaler', StandardScaler()), ('estimator', estimator)])
        model.fit(X_train.iloc[:, col_positions], y_train)

        # Store the probabilities of the second class for this subset
        test_subset = X_test.iloc[:, col_positions]
        results[row_label + 1] = model.predict_proba(test_subset)[:, 1]

    return results

@run_or_load
def randsel_ML(filename, estimator, X, y,
               split_type='randSplit', scaffold_series=None,
               test_size=0.25, nreps=10, verbose=False):
    '''Evaluate `estimator` on randomly selected conformations (columns of X).

    For each repetition, one random subset of k columns is drawn (without
    replacement) for every k from 1 to the number of columns of X, and
    `conf_picking_ML` is run over those subsets.

    Parameters
    ----------
    filename : not used in the body; presumably consumed by the
        `run_or_load` decorator to cache the result (verify in
        modules.run_or_load_decorator).
    estimator : classifier exposing `fit` and `predict_proba`.
    X, y : feature table and labels.
    split_type : 'randSplit' (a new stratified random split on every
        repetition) or 'scffSplit' (a single scaffold split reused by all
        repetitions).
    scaffold_series : scaffolds passed to `train_test_scaffold_split`;
        required when split_type is 'scffSplit'.
    test_size : fraction of samples assigned to the test set.
    nreps : number of repetitions.
    verbose : if True, print the repetition number after each one.

    Returns
    -------
    dict mapping 'rep_{i}' to the dictionary returned by `conf_picking_ML`.

    Raises
    ------
    ValueError if `split_type` is unknown, or if it is 'scffSplit' and no
    `scaffold_series` was given (previously this failed later with an
    unbound-variable error because no split had been made).'''
    if split_type not in ('randSplit', 'scffSplit'):
        raise ValueError(
            f"split_type must be 'randSplit' or 'scffSplit', got {split_type!r}")

    if split_type == 'scffSplit':
        if scaffold_series is None:
            raise ValueError("scaffold_series is required when split_type='scffSplit'")
        #** If scaffold splitting: just do once!
        X_train, X_test, y_train, y_test = train_test_scaffold_split(
            X, y, scaffold_series=scaffold_series, test_size=test_size, stratify=y)

    n_confs = X.shape[1]
    dic_results = {}

    for rep in range(nreps):
        # ***** Select conformations randomly: one subset of size k for each k
        confs_per_k = [np.random.choice(a=range(n_confs), size=k, replace=False)
                       for k in range(1, n_confs + 1)]
        df_preselected_confs = pd.DataFrame({'confs_idx_per_k': confs_per_k})

        #** If random split: Randomly split on each rep
        if split_type == 'randSplit':
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, stratify=y)

        dic_results[f'rep_{rep}'] = conf_picking_ML(
            estimator, X_train, X_test, y_train, y_test, df_preselected_confs)

        if verbose:
            print(f'rep: {rep}')

    return dic_results
        
@run_or_load
def presel_ML(filename, estimator, X, y, df_preselected_confs,
              split_type='randSplit', scaffold_series=None,
              test_size=0.25, nreps=5, verbose=False):
    '''Evaluate `estimator` on preselected conformations (columns of X).

    Parameters
    ----------
    filename : not used in the body; presumably consumed by the
        `run_or_load` decorator to cache the result (verify in
        modules.run_or_load_decorator).
    estimator : classifier exposing `fit` and `predict_proba`.
    X, y : feature table and labels.
    df_preselected_confs : DataFrame whose 'confs_idx_per_k' column holds
        the column positions to use in each evaluation (see
        `conf_picking_ML`).
    split_type : 'randSplit' (a new stratified random split on every
        repetition) or 'scffSplit' (a single scaffold split).
    scaffold_series : scaffolds passed to `train_test_scaffold_split`;
        required when split_type is 'scffSplit'.
    test_size : fraction of samples assigned to the test set.
    nreps : number of repetitions; forced to 1 for 'scffSplit', because
        both the split and the preselected columns are fixed.
    verbose : if True, print the repetition number after each one.

    Returns
    -------
    dict mapping 'rep_{i}' to the dictionary returned by `conf_picking_ML`.

    Raises
    ------
    ValueError if `split_type` is unknown, or if it is 'scffSplit' and no
    `scaffold_series` was given (previously this failed later with an
    unbound-variable error because no split had been made).'''
    if split_type not in ('randSplit', 'scffSplit'):
        raise ValueError(
            f"split_type must be 'randSplit' or 'scffSplit', got {split_type!r}")

    if split_type == 'scffSplit':
        if scaffold_series is None:
            raise ValueError("scaffold_series is required when split_type='scffSplit'")
        #** If scaffold splitting: just do once!
        X_train, X_test, y_train, y_test = train_test_scaffold_split(
            X, y, scaffold_series=scaffold_series, test_size=test_size, stratify=y)
        # override nreps
        nreps = 1 # Only for preselected features

    dic_results = {}

    for rep in range(nreps):
        #** If random split: Randomly split on each rep
        if split_type == 'randSplit':
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size, stratify=y)

        dic_results[f'rep_{rep}'] = conf_picking_ML(
            estimator, X_train, X_test, y_train, y_test, df_preselected_confs)

        if verbose:
            print(f'rep: {rep}')

    return dic_results


In [10]:
### File naming pattern
# {score type}_{train DB}-{test DB}_{split type}_{estimator}_{selection method}_nreps{n}.obj

In [11]:
# Labels used to build the names of the cached result files
TRAIN_DB = 'MergedDB'
TEST_DB = 'MergedDB'
SCORE_TYPE = 'DkSc'
# Scaffold of each molecule, used by the scaffold-based train/test split
scaffold_series = df_scff_murcko['scff_generic']

# Common prefix of every result file: {score type}_{train DB}-{test DB}.
# The test database label is now taken from TEST_DB (it was TRAIN_DB twice);
# the resulting path is unchanged while both labels are 'MergedDB'.
FILEPATH = f'./ml_models/conf_selection_evaluation/{SCORE_TYPE}_{TRAIN_DB}-{TEST_DB}'

def format_filename(*args):
    '''Join the given string parts with underscores and append '.obj'.'''
    stem = '_'.join(args)
    return f'{stem}.obj'

## Logistic Regression
- *Logistic Regression as classifier*

In [12]:
%%time
from sklearn.linear_model import LogisticRegression

estimator_name = 'LogReg'
hyparams = {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs', 'max_iter':200}
estimator = LogisticRegression(**hyparams)
nreps=15

#*******************
# Splitting: Random
#*******************
split_type = 'randSplit'
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_rds = randsel_ML(filename, estimator, X, y, nreps=nreps, verbose = True)
#-------------------------
# RFE Selection (Log. Reg selector)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest selector)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)



#*********************
# Splitting: Scaffold
#*********************
split_type = 'scffSplit'
hyparams = {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
# FIX: rebuild the estimator so the scaffold-split hyperparameters are actually
# used. Before, `hyparams` was reassigned but the random-split estimator was
# still the one passed to every call below.
# NOTE(review): result files cached under these names before this change were
# produced with the random-split estimator -- regenerate them if needed.
estimator = LogisticRegression(**hyparams)
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_scff = randsel_ML(filename, estimator, X, y, 
                   split_type=split_type, scaffold_series=scaffold_series, nreps=nreps, verbose=True)
# The presel_ML calls below do not pass nreps: presel_ML runs a single
# repetition for scaffold splitting, even though the file name carries nreps15.
#-------------------------
# RFE Selection (Log. Reg selector)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest selector)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_scff # Confs selected by RFE with RandForest base estimator
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)


File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_LogReg_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_LogReg_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_LogReg_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_LogReg_rfeSel-XGB_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_LogReg_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_LogReg_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_LogReg_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_LogReg_rfeSel-XGB_nreps15.obj
CPU times: user 454 ms, sys: 705 ms, total: 1.16 s
Wall time: 3.01 s


## RBF SVM

In [13]:
%%time
from sklearn.svm import SVC

estimator_name = 'rbfSVC'
# probability=True is required because predict_proba is called on the fitted model
hyparams = {'kernel': 'rbf', 'probability': True, 'C': 10.0, 'gamma': 0.01}
estimator = SVC(**hyparams)
nreps=15

#*******************
# Splitting: Random
#*******************
split_type = 'randSplit'
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_rds = randsel_ML(filename, estimator, X, y, nreps=nreps, verbose = True)
#-------------------------
# RFE Selection (Log. Reg base estimator)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest base estimator)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)



#*********************
# Splitting: Scaffold
#*********************
split_type = 'scffSplit'
# FIX: this section used to reassign `hyparams` to LogisticRegression options
# ('penalty', 'solver', 'max_iter'), which SVC does not accept and which were
# never applied anyway. The scaffold runs below reuse the SVC built above,
# exactly as before.
# TODO(review): if scaffold-specific SVC hyperparameters exist, set them here
# and rebuild `estimator` with SVC(**hyparams).
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_scff = randsel_ML(filename, estimator, X, y, 
                   split_type=split_type, scaffold_series=scaffold_series, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Log. Reg base estimator)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest base estimator)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_scff # Confs selected by RFE with RandForest base estimator
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)


File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_rbfSVC_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_rbfSVC_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_rbfSVC_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_rbfSVC_rfeSel-XGB_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_rbfSVC_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_rbfSVC_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_rbfSVC_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_rbfSVC_rfeSel-XGB_nreps15.obj
CPU times: user 616 ms, sys: 513 ms, total: 1.13 s
Wall time: 3.41 s


## 1NN Classifier

In [14]:
%%time
from sklearn.neighbors import KNeighborsClassifier

estimator_name = '1NN'
hyparams ={'n_neighbors': 1, 'n_jobs': 7}
estimator = KNeighborsClassifier(**hyparams)
nreps=15

#*******************
# Splitting: Random
#*******************
split_type = 'randSplit'
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_rds = randsel_ML(filename, estimator, X, y, nreps=nreps, verbose = True)
#-------------------------
# RFE Selection (Log. Reg base estimator)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest base estimator)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)



#*********************
# Splitting: Scaffold
#*********************
split_type = 'scffSplit'
hyparams = {'n_neighbors': 1}
# FIX: rebuild the estimator from the scaffold-split settings; before, the
# reassigned `hyparams` was never applied. Only `n_jobs` differs from the
# random-split estimator here.
estimator = KNeighborsClassifier(**hyparams)
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_scff = randsel_ML(filename, estimator, X, y, 
                   split_type=split_type, scaffold_series=scaffold_series, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Log. Reg base estimator)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest base estimator)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_scff # Confs selected by RFE with RandForest base estimator
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)

File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_1NN_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_1NN_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_1NN_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_1NN_rfeSel-XGB_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_1NN_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_1NN_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_1NN_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_1NN_rfeSel-XGB_nreps15.obj
CPU times: user 582 ms, sys: 395 ms, total: 977 ms
Wall time: 3.28 s


## XGBoost

In [15]:
%%time
from xgboost import XGBClassifier 

estimator_name = 'XGB_tree'
hyparams ={'subsample': 0.5, 'n_estimators': 200, 'max_depth': 20, 'learning_rate': 0.05, 
           'gamma': 0.01, 'colsample_bytree': 0.5, 'alpha': 0.01}
estimator = XGBClassifier(**hyparams)
nreps=15

#*******************
# Splitting: Random
#*******************
split_type = 'randSplit'
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_rds = randsel_ML(filename, estimator, X, y, nreps=nreps, verbose = True)
#-------------------------
# RFE Selection (Log. Reg base estimator)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest base estimator)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_rd
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_rds = presel_ML(filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)



#*********************
# Splitting: Scaffold
#*********************
split_type = 'scffSplit'
hyparams = {'subsample': 0.5, 'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1,
            'gamma': 0.1, 'colsample_bytree': 1, 'alpha': 1}
# FIX: rebuild the estimator so the scaffold-split hyperparameters are actually
# used. Before, `hyparams` was reassigned but the random-split estimator was
# still the one passed to every call below.
# NOTE(review): result files cached under these names before this change were
# produced with the random-split estimator -- regenerate them if needed.
estimator = XGBClassifier(**hyparams)
#------------------
# Random Selection
#------------------
selection_type = 'randSel'
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
randSel_res_scff = randsel_ML(filename, estimator, X, y, 
                   split_type=split_type, scaffold_series=scaffold_series, nreps=nreps, verbose=True)
#-------------------------
# RFE Selection (Log. Reg base estimator)
#-------------------------
selection_type = 'rfeSel-LR'
df_preselected_confs = df_sel_confs_RFE_LR_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_LR_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (Rand. Forest base estimator)
#-------------------------
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_scff # Confs selected by RFE with RandForest base estimator
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_RF_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)
#-------------------------
# RFE Selection (XGBoost Selector)
#-------------------------
selection_type = 'rfeSel-XGB'
df_preselected_confs = df_sel_confs_RFE_XGB_scff
filename = format_filename(FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}') 
rfe_XGB_res_scff = presel_ML(filename, estimator, X, y, df_preselected_confs, 
                                split_type=split_type, scaffold_series=scaffold_series, verbose=True)

File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_XGB_tree_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_XGB_tree_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_XGB_tree_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_XGB_tree_rfeSel-XGB_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_XGB_tree_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_XGB_tree_rfeSel-LR_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_XGB_tree_rfeSel-RF_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_XGB_tree_rfeSel-XGB_nreps15.obj
CPU times: user 381 ms, sys: 228 ms, total: 609 ms
Wall time: 1.47 s


## Random Forest 

In [19]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator_name = 'RandForest'
nreps = 15

# ==================================================
# Random splitting (100 trees)
# ==================================================
split_type = 'randSplit'
hyparams = {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4,
            'max_features': 'sqrt', 'max_depth': 10}
estimator = RandomForestClassifier(**hyparams)

# --- Randomly selected conformations
selection_type = 'randSel'
filename = format_filename(
    FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}')
randSel_res_rds = randsel_ML(filename, estimator, X, y, nreps=nreps, verbose=True)

# --- Conformations preselected by RFE with a Random Forest base estimator
# (the runs with the Log. Reg and XGBoost selectors are disabled for this classifier)
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_rd
filename = format_filename(
    FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}')
rfe_RF_res_rds = presel_ML(
    filename, estimator, X, y, df_preselected_confs, nreps=nreps, verbose=True)


# ==================================================
# Scaffold splitting (300 trees, estimator rebuilt)
# ==================================================
split_type = 'scffSplit'
hyparams = {'n_estimators': 300, 'min_samples_split': 10, 'min_samples_leaf': 4,
            'max_features': 'sqrt', 'max_depth': 10}
estimator = RandomForestClassifier(**hyparams)

# --- Randomly selected conformations
selection_type = 'randSel'
filename = format_filename(
    FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}')
randSel_res_scff = randsel_ML(
    filename, estimator, X, y,
    split_type=split_type, scaffold_series=scaffold_series,
    nreps=nreps, verbose=True)

# --- Conformations preselected by RFE with a Random Forest base estimator
# (the runs with the Log. Reg and XGBoost selectors are disabled for this classifier)
selection_type = 'rfeSel-RF'
df_preselected_confs = df_sel_confs_RFE_RF_scff
filename = format_filename(
    FILEPATH, split_type, estimator_name, selection_type, f'nreps{nreps}')
rfe_RF_res_scff = presel_ML(
    filename, estimator, X, y, df_preselected_confs,
    split_type=split_type, scaffold_series=scaffold_series, verbose=True)

File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_RandForest_randSel_nreps15.obj
File loaded: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_randSplit_RandForest_rfeSel-RF_nreps15.obj
rep: 0
rep: 1
rep: 2
rep: 3
rep: 4
rep: 5
rep: 6
rep: 7
rep: 8
rep: 9
rep: 10
rep: 11
rep: 12
rep: 13
rep: 14
File saved: ./ml_models/conf_selection_evaluation/DkSc_MergedDB-MergedDB_scffSplit_RandForest_randSel_nreps15.obj


KeyboardInterrupt: 