In [8]:
import pandas as pd
import numpy as np
import joblib

In [3]:
# Read the pickled merged data frame from disk.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use files from a trusted source.
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column aside as the target vector ...
y_true_merged = X_merged_dksc.loc[:, 'activity']
# ... and leave every other column in the feature matrix
X_merged_dksc = X_merged_dksc.drop(columns=['activity'])
X_merged_dksc.shape

(5839, 136)

In [4]:
# RFE Evaluation

In [7]:
!ls ml_models/

RFECV_selector_LogReg_MERGED_random_split.joblib
RFECV_selector_LogReg_MERGED_scaffold_split.joblib
RFECV_selector_RandomForest_MERGED_random_split.joblib
RFECV_selector_RandomForest_MERGED_scaffold_split.joblib


In [18]:
# Function to get the dataframe with the selected k conformations using RFE
def selected_confs_from_RFE(rfe_selector, X):
    '''Build the nested "top-k" column subsets implied by an RFE ranking.

    Parameters
    ----------
    rfe_selector : object
        Any object exposing a 1-D ``ranking_`` attribute with one entry per
        column of ``X`` (lower value = better ranked).
    X : pandas.DataFrame
        Feature matrix whose columns were ranked; only its columns are used.

    Returns
    -------
    pandas.DataFrame
        One column, ``list_of_confs_rfe``, with one row per column of ``X``.
        Row ``i`` (0-based) holds a list with the positional indices of the
        ``i + 1`` best-ranked columns, so the lists are suitable for
        ``X.iloc[:, indices]``.

    Raises
    ------
    ValueError
        If ``ranking_`` is not 1-D or its length differs from the number of
        columns of ``X``.
    '''
    ranking = np.asarray(rfe_selector.ranking_)
    n_features = len(X.columns)
    if ranking.ndim != 1 or len(ranking) != n_features:
        raise ValueError(
            f'ranking_ must be 1-D with one entry per column of X '
            f'(got shape {ranking.shape} for {n_features} columns)')

    # A stable sort keeps tied rankings in their original column order, so the
    # resulting subsets are deterministic even when many features share a rank.
    order = np.argsort(ranking, kind='stable').tolist()

    # Prefix of length k = positional indices of the k best-ranked columns
    confs_per_k = [order[:k] for k in range(1, n_features + 1)]
    return pd.DataFrame({'list_of_confs_rfe': confs_per_k})

In [14]:
# Short aliases for the feature matrix and the target vector
X, y = X_merged_dksc, y_true_merged

In [20]:
# Identify which serialized RFECV selector to open
model_name = 'LogReg'
dataset = 'MERGED'
split = 'random'
filename = f'./ml_models/RFECV_selector_{model_name}_{dataset}_{split}_split.joblib'

# Deserialize the selector (LogReg, random split).
# NOTE(review): joblib.load unpickles the file; only load files from a trusted source.
rfe_selector_LR_rd = joblib.load(filename)

# Nested lists with the k best-ranked conformations, one row per k
df_sel_confs_RFE_LR_rd = selected_confs_from_RFE(rfe_selector_LR_rd, X)

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

In [41]:
# Inputs:

In [47]:
%%time
from sklearn.linear_model import LogisticRegression

# Hyperparameters of the logistic-regression estimator fitted on every subset
hyparams = {'C': 1.0, 'penalty': 'l1', 'solver': 'liblinear', 'max_iter': 150}

estimator = LogisticRegression(**hyparams)

# Data frame whose 'list_of_confs_rfe' column holds the nested top-k index lists
df_preselected_confs = df_sel_confs_RFE_LR_rd

# Base seed for the repeated random splits: each repetition gets its own,
# fixed seed so that re-running the cell reproduces the same ROC-AUC values.
split_seed = 42

# Maps 'rep_<i>' -> list of test ROC-AUC values, one per evaluated subset
dic_roc_auc_reps = {}
for rep in range(5):  # Just for random splitting
    roc_auc_values = []

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, stratify=y, random_state=split_seed + rep)

    for k in df_preselected_confs.list_of_confs_rfe:
        # Only subsets with an odd number of conformations are evaluated
        if len(k) % 2 == 0:
            continue
        # 1) Subset X_train and X_test to the selected conformations
        X_train_sub = X_train.iloc[:, k]
        X_test_sub = X_test.iloc[:, k]

        # 2) Model training (scaling is fitted on the training subset only)
        pipe = Pipeline([('scaler', StandardScaler()), ('estimator', estimator)])
        pipe.fit(X_train_sub, y_train)

        # 3) Prediction: probability of the positive class
        y_pred_proba = pipe.predict_proba(X_test_sub)[:, 1]

        # 4) Evaluation
        roc_auc = roc_auc_score(y_test, y_pred_proba)

        # 5) Append the ROC-AUC to the list
        roc_auc_values.append(roc_auc)
        print(len(k), round(roc_auc, 3))

    dic_roc_auc_reps[f'rep_{rep}'] = roc_auc_values
    


1 0.618
3 0.664
5 0.674
7 0.689
9 0.699
11 0.701
13 0.709
15 0.71
17 0.729
19 0.765
21 0.788
23 0.79
25 0.799
27 0.802
29 0.797
31 0.798
33 0.796
35 0.798
37 0.801
39 0.803
41 0.814
43 0.833
45 0.832
47 0.832
49 0.827
51 0.852
53 0.859
55 0.85
57 0.85
59 0.85
61 0.858
63 0.858
65 0.857
67 0.854
69 0.853
71 0.853
73 0.853
75 0.854
77 0.856
79 0.855
81 0.853
83 0.851
85 0.854
87 0.852
89 0.853
91 0.853
93 0.853
95 0.856
97 0.853
99 0.853
101 0.853
103 0.851
105 0.851
107 0.851
109 0.848
111 0.847
113 0.847
115 0.847
117 0.848
119 0.847
121 0.847
123 0.847
125 0.844
127 0.843
129 0.843
131 0.841
133 0.841
135 0.84
1 0.545
3 0.625
5 0.654
7 0.719
9 0.722
11 0.729
13 0.729
15 0.74
17 0.748
19 0.787
21 0.785
23 0.785
25 0.79
27 0.786
29 0.788
31 0.79
33 0.788
35 0.787
37 0.797
39 0.791
41 0.786
43 0.798
45 0.796
47 0.797
49 0.797
51 0.796
53 0.818
55 0.818
57 0.817
59 0.818
61 0.822
63 0.821
65 0.819
67 0.813
69 0.807
71 0.807
73 0.809
75 0.809
77 0.811
79 0.811
81 0.81
83 0.811
85 0.808
87 

{'rep_0': [0.6181323706377857,
  0.6638941034897713,
  0.6737617328519856,
  0.6885487364620939,
  0.6988592057761733,
  0.7011889290012033,
  0.7086883273164861,
  0.7103152827918171,
  0.7288471720818291,
  0.7648808664259927,
  0.788072202166065,
  0.7904693140794222,
  0.7993646209386281,
  0.8016654632972322,
  0.7973429602888087,
  0.7982478941034897,
  0.7963802647412757,
  0.7981997593261132,
  0.8006642599277979,
  0.8031576413959085,
  0.8143152827918172,
  0.8327412755716005,
  0.832057761732852,
  0.8320962695547534,
  0.8274079422382671,
  0.8517352587244285,
  0.8587821901323706,
  0.8500794223826715,
  0.8497424789410349,
  0.8496077015643803,
  0.8578580024067388,
  0.8577039711191335,
  0.8572129963898917,
  0.8543345367027677,
  0.8527749699157641,
  0.8529771359807461,
  0.8533910950661853,
  0.8537569193742479,
  0.8556149217809867,
  0.8551239470517449,
  0.8530252707581227,
  0.8514753309265944,
  0.8536895306859206,
  0.8516774969915765,
  0.8526787003610108,
  0

Unnamed: 0,rep_0,rep_1,rep_2,rep_3,rep_4
0,0.618132,0.544529,0.533497,0.558700,0.610758
1,0.663894,0.625410,0.624082,0.650051,0.657324
2,0.673762,0.653795,0.658445,0.675726,0.682272
3,0.688549,0.719451,0.679740,0.742286,0.700496
4,0.698859,0.721993,0.676448,0.749458,0.716236
...,...,...,...,...,...
63,0.843225,0.782854,0.834349,0.854046,0.823933
64,0.843187,0.782835,0.830662,0.854527,0.824279
65,0.840510,0.782623,0.829613,0.854546,0.824279
66,0.840501,0.781333,0.827764,0.853892,0.824010
