# Get preselected conformations from RFE objects

In [1]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pickled results table from disk
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Split off the 'activity' column as the label vector; pop() removes it from
# the frame, so X_merged_dksc keeps only the remaining columns
y_true_merged = X_merged_dksc.pop('activity')

In [3]:
# RFE Evaluation

In [4]:
# Function to get the dataframe with the selected k conformations using RFE
def selected_confs_from_RFE(rfe_selector, X):
    '''Return the cumulative, rank-ordered column selections of X.

    Parameters
    ----------
    rfe_selector : object
        Selector exposing a ``ranking_`` attribute holding one rank per
        column of ``X``; lower ranks are placed first.
    X : pandas.DataFrame
        Matrix whose columns (conformations) were ranked by the selector.

    Returns
    -------
    pandas.DataFrame
        A single column, ``confs_idx_per_k``, with one row per column of
        ``X``. Row ``k - 1`` contains the list of positional column indices
        of the ``k`` best-ranked columns, best first. Columns sharing the
        same rank keep their original left-to-right order.

    Raises
    ------
    ValueError
        Raised by pandas while building the ranking table when
        ``ranking_`` and ``X.columns`` differ in length.
    '''
    # Pair every column with its rank; the default RangeIndex of this frame
    # is the positional index of each column in X
    df_ranks = pd.DataFrame({'pdb_id': X.columns, 'rfe_ranking': rfe_selector.ranking_})
    # Use a stable sort: rankings contain ties, and the default quicksort
    # does not guarantee a reproducible order among tied entries
    df_ranks = df_ranks.sort_values('rfe_ranking', kind='mergesort')
    ranked_idx = df_ranks.index.tolist()
    # Row k-1 holds the first k indices of the ranked list
    confs_per_k = [ranked_idx[:k] for k in range(1, len(ranked_idx) + 1)]
    return pd.DataFrame({'confs_idx_per_k': confs_per_k})

In [5]:
# Short aliases for the feature matrix and the label vector
X, y = X_merged_dksc, y_true_merged

## Read the RFE Selectors

In [12]:
# Collects one preselection per model/split key; it is used later for the
# consensus scoring computation
rfe_preselections = dict()

### Random Splitting

In [6]:
# Identify which persisted selector to load (model / dataset / split)
dataset, model_name, split = 'MERGED', 'LogReg', 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Selector saved for LogReg on the random split
rfe_selector_LR_rd = joblib.load(filename)

#**************************
# Cumulative conformation selections ordered by the selector's ranking
df_sel_confs_RFE_LR_rd = selected_confs_from_RFE(rfe_selector_LR_rd, X)

# Store the last row (the list covering every conformation) under a key made
# of the model name's capital letters plus the split tag, i.e. 'LR_rand'
key_name = ''.join(filter(str.isupper, model_name)) + '_rand'
rfe_preselections[key_name] = df_sel_confs_RFE_LR_rd.iloc[-1, 0]

In [7]:
# Identify which persisted selector to load (model / dataset / split)
dataset, model_name, split = 'MERGED', 'RandomForest', 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Selector saved for RandomForest on the random split
rfe_selector_RF_rd = joblib.load(filename)

#**************************
# Cumulative conformation selections ordered by the selector's ranking
df_sel_confs_RFE_RF_rd = selected_confs_from_RFE(rfe_selector_RF_rd, X)

# Store the last row (the list covering every conformation) under a key made
# of the model name's capital letters plus the split tag, i.e. 'RF_rand'
key_name = ''.join(filter(str.isupper, model_name)) + '_rand'
rfe_preselections[key_name] = df_sel_confs_RFE_RF_rd.iloc[-1, 0]

In [8]:
# Identify which persisted selector to load (model / dataset / split)
dataset, model_name, split = 'MERGED', 'XGB_tree', 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Selector saved for XGB_tree on the random split
rfe_selector_XGB_rd = joblib.load(filename)

#**************************
# Cumulative conformation selections ordered by the selector's ranking
df_sel_confs_RFE_XGB_rd = selected_confs_from_RFE(rfe_selector_XGB_rd, X)

# Store the last row (the list covering every conformation) under a key made
# of the model name's capital letters plus the split tag, i.e. 'XGB_rand'
key_name = ''.join(filter(str.isupper, model_name)) + '_rand'
rfe_preselections[key_name] = df_sel_confs_RFE_XGB_rd.iloc[-1, 0]

### Scaffold Splitting

In [9]:
# Open the persisted selector for this model / dataset / split combination
dataset = 'MERGED'
model_name = 'LogReg'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Selector saved for LogReg on the scaffold split. It gets its own `_scff`
# name so the random-split selector `rfe_selector_LR_rd` is not overwritten.
rfe_selector_LR_scff = joblib.load(filename)

#**************************
# Cumulative conformation selections ordered by the selector's ranking
df_sel_confs_RFE_LR_scff = selected_confs_from_RFE(rfe_selector_LR_scff, X)

# Add the last row (the list covering every conformation) to the dict under
# the model name's capital letters plus the split tag, i.e. 'LR_scff'
key_name = ''.join([c for c in model_name if c.isupper()]) + '_scff'
rfe_preselections[key_name] = df_sel_confs_RFE_LR_scff.iloc[-1].values[0]

In [10]:
# Open the persisted selector for this model / dataset / split combination
dataset = 'MERGED'
model_name = 'RandomForest'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Selector saved for RandomForest on the scaffold split. It gets its own
# `_scff` name so the random-split selector `rfe_selector_RF_rd` is not
# overwritten.
rfe_selector_RF_scff = joblib.load(filename)

#**************************
# Cumulative conformation selections ordered by the selector's ranking
df_sel_confs_RFE_RF_scff = selected_confs_from_RFE(rfe_selector_RF_scff, X)

# Add the last row (the list covering every conformation) to the dict under
# the model name's capital letters plus the split tag, i.e. 'RF_scff'
key_name = ''.join([c for c in model_name if c.isupper()]) + '_scff'
rfe_preselections[key_name] = df_sel_confs_RFE_RF_scff.iloc[-1].values[0]

In [11]:
# Open the persisted selector for this model / dataset / split combination
dataset = 'MERGED'
model_name = 'XGB_tree'
split = 'scaffold'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'
# Selector saved for XGB_tree on the scaffold split. It gets its own `_scff`
# name so the random-split selector `rfe_selector_XGB_rd` is not overwritten.
rfe_selector_XGB_scff = joblib.load(filename)

#**************************
# Cumulative conformation selections ordered by the selector's ranking
df_sel_confs_RFE_XGB_scff = selected_confs_from_RFE(rfe_selector_XGB_scff, X)

# Add the last row (the list covering every conformation) to the dict under
# the model name's capital letters plus the split tag, i.e. 'XGB_scff'
key_name = ''.join([c for c in model_name if c.isupper()]) + '_scff'
rfe_preselections[key_name] = df_sel_confs_RFE_XGB_scff.iloc[-1].values[0]

In [13]:
# Consensus Scoring Functions