# Model Selection

In [1]:
import pandas as pd
import numpy as np
import glob, sys, os
sys.path.append('..')

In [2]:
from modules.plotting_metrics import PlotMetric
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white', context='talk', font_scale=0.8)

In [3]:
## Run the helper functions from the local CDK2 directory
# %run ./helper_1_load_data.ipynb

In [4]:
# Helper function to train and test
# %run ./helper_functions_S6.ipynb

In [5]:
# Load the pickled results table. NOTE: pd.read_pickle can execute arbitrary
# code while unpickling, so only point this at files you produced yourself.
file_name = './df_DkSc_results_COCRYS_DEKOIS_DUD.pkl'
X_merged_dksc = pd.read_pickle(file_name)
# Keep the 'activity' column aside as the target vector
y_true_merged = X_merged_dksc['activity']
# Drop the target column so X_merged_dksc holds only the feature columns
X_merged_dksc = X_merged_dksc.drop('activity', axis=1)
# Display (n_rows, n_feature_columns) as the cell output
X_merged_dksc.shape

(5839, 136)

### Timeout Decorator

In [6]:
import signal
from functools import wraps

def timeout(n_seconds=7*60):
    '''Stop a decorated function after `n_seconds` of wall-clock time.

    A SIGALRM handler that raises ``TimeoutError`` is installed for the
    duration of each call. Without such a handler the alarm would take the
    signal's default action and terminate the whole process instead of
    reaching the ``except`` clause below.

    Parameters
    ----------
    n_seconds: int or float
        Time budget in seconds. Fractions are accepted; 0 disables the timer.

    Returns
    -------
    A decorator. The wrapped function returns whatever the original returns,
    or None (after printing a message) when the time budget is exceeded.

    Notes
    -----
    Relies on SIGALRM, so it only works on Unix and only when called from
    the main thread of the main interpreter.
    '''
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            def _raise_timeout(signum, frame):
                raise TimeoutError(
                    f'{func.__name__} exceeded {n_seconds} seconds')

            # Install our handler and remember the one that was active
            previous_handler = signal.signal(signal.SIGALRM, _raise_timeout)
            if previous_handler is None:
                # Handler was not installed from Python; fall back to default
                previous_handler = signal.SIG_DFL
            # Arm the timer for n seconds
            signal.setitimer(signal.ITIMER_REAL, n_seconds)
            try:
                # Call decorated func
                return func(*args, **kwargs)
            except TimeoutError as e:
                print(f'Execution finished after {n_seconds}:', e)
                return None
            finally:
                # Cancel the timer first, then restore the previous handler
                signal.setitimer(signal.ITIMER_REAL, 0)
                signal.signal(signal.SIGALRM, previous_handler)
        return wrapper
    return decorator

## Scaffold Splitting

In [7]:
#*************************************************
# Functions to compute stratified scaffold splitting
#*************************************************
from scaffold_splitter import train_test_scaffold_split

In [8]:
%%time
# Load the pickled dataframe containing the Generic Murcko Scaffolds
# (nothing is computed here; the file must already exist). Its
# 'scff_generic' column is read later to drive the scaffold splitting.
file = './df_COCRYS_DUD_DEKOIS_Murcko_Scaffolds_SMILES.pkl'

df_scff_murcko = pd.read_pickle(file)

CPU times: user 367 ms, sys: 360 ms, total: 726 ms
Wall time: 724 ms


## Train/test on the same dataset 

### Learning Curves

In [9]:
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import learning_curve

In [10]:
def plot_learning_curves(estimator, X, y, title, ylim=[0.5,1], axes=None,
                         cv=3, train_sizes=np.linspace(0.1, 1.0, 10), 
                         scoring='roc_auc', n_jobs=4):
    '''
    Plot estimator performance on the training and validation
    sets as a function of the training set size.

    Parameters
    ----------
    estimator: sklearn estimator object type
        Object passed to sklearn's `learning_curve`; it must be compatible
        with the chosen `scoring`.
    X: array-like, shape (m_samples, n_features)
        Training array with m_samples and n_features.
    y: array-like, shape (m_samples)
        Target array relative to X with m labels.
    title: str
        Title of the plot.
    ylim: pair of floats
        Lower and upper limits of the y axis.
    axes: matplotlib Axes, optional
        Single Axes to draw on. A new 5x5 inch figure is created when None.
    cv: int or cross-validation splitter
        Forwarded to `learning_curve`.
    train_sizes: array-like
        Training set sizes (fractions or absolute counts) forwarded to
        `learning_curve`.
    scoring: str
        Scoring name forwarded to `learning_curve`; also shown on the y label.
    n_jobs: int
        Number of parallel jobs used by `learning_curve`.

    Returns
    -------
    None. The curves are drawn on `axes`.
    '''
    # Identity check: `== None` would be evaluated element-wise on array-likes
    if axes is None:
        _, axes = plt.subplots(1, 1, figsize=(5, 5))

    axes.set(title=title, ylim=ylim, xlabel='Training examples', ylabel=f'Metric: {scoring}')
    # Use learning_curve function from sklearn (fit times are not needed here)
    train_sizes, train_scores, test_scores = \
        learning_curve(estimator, X, y, scoring=scoring,
                       cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    # Mean and spread across the CV folds for every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot the learning curve: shaded bands show +/- one standard deviation
    axes.grid()
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                      train_scores_mean + train_scores_std, alpha=0.1, color='r')
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                      test_scores_mean + test_scores_std, alpha=0.1, color='g')
    axes.plot(train_sizes, train_scores_mean, 'o-', color='r',
              label='Train score')
    axes.plot(train_sizes, test_scores_mean, 'o-', color='g',
              label='Cross-validation score')
    axes.legend(loc='lower right')
    

### Hyperparameter tuning: Grid Search

In [11]:
@timeout(300) #Finish execution after 5 mins
def run_grid_search(estimator, X_train, y_train, X_test, y_test, hyperparams,  cv_value=5, 
                    scoring='roc_auc', splitting='random'):
    '''
    Grid-search `hyperparams` for `estimator` behind a StandardScaler and
    print the CV score, the train/test ROC-AUC and the best parameters.

    Parameters
    ----------
    estimator: sklearn classifier
        Must expose `predict_proba`, which is used for the train/test scores.
    X_train, y_train, X_test, y_test:
        Train and test partitions; the `y` objects must offer `.shape`
        and `.sum()`.
    hyperparams: dict
        Maps estimator parameter names to lists of candidate values.
    cv_value: int
        Forwarded to GridSearchCV as `cv`.
    scoring: str
        Forwarded to GridSearchCV as `scoring`.
    splitting: str
        Accepted for signature compatibility; not used inside the function.

    Returns
    -------
    None. Results are only printed.
    '''
    # Prefix every key so GridSearchCV routes it to the 'estimator' step
    prefixed_grid = {}
    for param_name, candidates in hyperparams.items():
        prefixed_grid['estimator__' + param_name] = candidates

    # Scaler and model are chained so both are fitted together
    scaled_model = Pipeline(steps=[('scaler', StandardScaler()),
                                   ('estimator', estimator)])

    # refit=True retrains the best configuration on the whole training set
    search = GridSearchCV(estimator=scaled_model, param_grid=prefixed_grid,
                          cv=cv_value, scoring=scoring, n_jobs=6, refit=True)
    search.fit(X_train, y_train)

    # Class probabilities of the refitted best model
    proba_train = search.predict_proba(X_train)
    proba_test = search.predict_proba(X_test)

    # Report set sizes and number of actives
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')
    print('')
    print('*'*10, 'GRID SEARCH RESULTS', '*'*10)
    print('- Mean CV ROC-AUC score: {:.3f}'.format(search.best_score_))
    # Column 1 holds the probability of the second class
    auc_train = roc_auc_score(y_train, proba_train[:, 1])
    print('- Train ROC-AUC: {:.3f}'.format(auc_train))
    auc_test = roc_auc_score(y_test, proba_test[:, 1])
    print('- Test ROC-AUC: {:.3f}'.format(auc_test))
    print('- Best hyperparameters', search.best_params_)
    print('**'*21)
    print('')

In [12]:
def run_cross_val_bias_var_tradeoff(estimator, X_train, y_train, X_test, y_test, cv_value=5, scoring='roc_auc',
                                   score_type='Dk_sc', pred_prob = True):
    '''
    Cross-validate and fit `estimator` behind a StandardScaler, then print
    the mean CV score and the train/test ROC-AUC.

    Parameters
    ----------
    estimator: sklearn estimator
        Needs `predict_proba` when `pred_prob` is True, otherwise `predict`.
    X_train, y_train, X_test, y_test:
        Train and test partitions; the `y` objects must offer `.shape`
        and `.sum()`.
    cv_value: int
        Forwarded to `cross_val_score` as `cv`.
    scoring: str
        Forwarded to `cross_val_score` as `scoring`.
    score_type: str
        Accepted for signature compatibility; not used inside the function.
    pred_prob: bool
        True: score with the probability of the second class
        (`predict_proba(...)[:, 1]`). False: score with the output of
        `predict` directly.

    Returns
    -------
    None. Results are only printed.
    '''
    # Print some values
    print(f'No. of molecules in train set: {y_train.shape[0]}, with {y_train.sum()} actives.')
    print(f'No. of molecules in test set: {y_test.shape[0]}, with {y_test.sum()} actives.')

    # Scaler
    scaler = StandardScaler()

    # Set the pipe
    pipe = Pipeline([('scaler', scaler), ('estimator', estimator)])

    # Set the CV score
    roc_cv = cross_val_score(pipe, X_train, y_train, cv=cv_value, scoring=scoring, n_jobs= -1)

    pipe.fit(X_train, y_train)

    if pred_prob:
        # predict_proba is 2-D: keep the column of the second class
        y_score_train = pipe.predict_proba(X_train)[:, 1]
        y_score_test = pipe.predict_proba(X_test)[:, 1]
    else:
        # predict is already 1-D; indexing it with [:, 1] would raise IndexError
        y_score_train = pipe.predict(X_train)
        y_score_test = pipe.predict(X_test)

    # CV Mean ROC-AUC
    print('CV={} ROC-AUC: {:.4f}'.format(cv_value, roc_cv.mean()))

    # Training ROC-AUC:
    print('Train ROC-AUC: {:.4f}'.format(roc_auc_score(y_train, y_score_train)))

    # Test ROC-AUC:
    print('Test ROC-AUC: {:.4f}'.format(roc_auc_score(y_test, y_score_test)))

### Function to report the Best Conformation's ROC-AUC for a given subset of samples

In [13]:
#************************************************************************
# Reports the best conformation's ROC-AUC value of a given subset X and y
#************************************************************************
def _get_best_conf_roc_auc(X_train, y_train, X_test, y_test, verbose=True):
    '''
    Print the best and median per-column ROC-AUC of the raw scores.

    Every column of `X_train` / `X_test` is scored on its own against the
    matching labels (presumably one column per protein conformation; confirm
    against the data). Values are negated before scoring, so lower raw
    values rank as more likely positives.

    Nothing is returned; with `verbose=False` nothing is printed either.
    '''
    def _column_aucs(scores, labels):
        # One ROC-AUC per column, using the negated column as the ranking score
        return scores.apply(
            lambda column: roc_auc_score(y_true= labels, y_score= -column), axis=0)

    aucs_train = _column_aucs(X_train, y_train)
    aucs_test = _column_aucs(X_test, y_test)

    if not verbose:
        return

    separator = '--'*30
    print(separator)
    print("**** Best Conformation's ROC-AUC using docking scores ****")
    print(f"> Train best conf. ROC-AUC: {round(aucs_train.max(), 3)}" +
          f" \t> Median ROC-AUC: {round(aucs_train.median(), 3)}")
    print(f"> Test best conf. ROC-AUC: {round(aucs_test.max(), 3)}" +
          f" \t> Median ROC-AUC: {round(aucs_test.median(), 3)}")
    print(separator)
        
        
def split_and_gs(X, y, estimator, hyperparams, splitting='random', 
                 test_size=0.25, scaffold_series=None, random_state=None):
    '''
    Split (X, y) into train/test, run the grid search and report the
    per-column baseline ROC-AUC on the same partitions.

    Parameters
    ----------
    X, y:
        Features and labels; `y` is also used to stratify the split.
    estimator: sklearn classifier
        Forwarded to `run_grid_search`.
    hyperparams: dict
        Maps estimator parameter names to candidate values.
    splitting: {'random', 'scaffold'}
        'random' uses sklearn's `train_test_split`; 'scaffold' uses
        `train_test_scaffold_split` with `scaffold_series`.
    test_size: float
        Forwarded to the chosen splitter.
    scaffold_series:
        Forwarded to `train_test_scaffold_split`; only used when
        splitting == 'scaffold'.
    random_state: int, RandomState or None
        Seed for the 'random' split only; None keeps the previous
        unseeded behaviour. It is not forwarded to the scaffold splitter.

    Raises
    ------
    ValueError
        If `splitting` is neither 'random' nor 'scaffold'.
    '''
    if splitting == 'scaffold':
        X_train, X_test, y_train, y_test = \
            train_test_scaffold_split(X, y, scaffold_series = scaffold_series,
                test_size=test_size, stratify=y)
    elif splitting == 'random':
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=test_size, stratify=y,
                             random_state=random_state)
    else:
        # Fail early instead of hitting an unbound X_train further down
        raise ValueError(
            f"Unknown splitting {splitting!r}; expected 'random' or 'scaffold'.")

    run_grid_search(estimator, 
                    X_train, y_train, X_test, y_test, 
                    hyperparams = hyperparams)

    _get_best_conf_roc_auc(X_train, y_train, X_test, y_test)

##  Grid Search
### Hyperparameter Tuning
#### DEKOIS Library

In [14]:
# Library whose molecules are used in this section
library = 'DEKOIS'

# Train and test over DEKOIS only: `.loc[library]` keeps the rows stored
# under the 'DEKOIS' index label (presumably the outer level of a
# MultiIndex keyed by library; confirm against the pickled frames)
X = X_merged_dksc.loc[library]
y = y_true_merged.loc[library]
# Generic Murcko scaffolds of the same molecules, used by the scaffold split
scaffold_series = df_scff_murcko['scff_generic'].loc[library]

### GS: Linear SVM 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [15]:
%%time
from sklearn.svm import SVC
# probability=True is required because run_grid_search calls predict_proba
estimator = SVC(kernel = 'linear', probability=True)
# Four log-spaced C values: 1e-15, 1e-12, 1e-9, 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.881
- Train ROC-AUC: 0.893
- Test ROC-AUC: 0.875
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.894 	> Median ROC-AUC: 0.839
> Test best conf. ROC-AUC: 0.942 	> Median ROC-AUC: 0.82
------------------------------------------------------------
CPU times: user 628 ms, sys: 298 ms, total: 927 ms
Wall time: 2.16 s


In [16]:
%%time
# from sklearn.svm import SVC
# Same estimator and grid as the random-split cell; only the split differs
estimator = SVC(kernel = 'linear', probability=True)
# Four log-spaced C values: 1e-15, 1e-12, 1e-9, 1e-6
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.858
- Train ROC-AUC: 0.867
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__C': 1e-12}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 545 ms, sys: 76.1 ms, total: 621 ms
Wall time: 693 ms


### GS: Radial Basis Function SVM
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [17]:
%%time
# SVC must already be imported by an earlier cell.
# probability=True is required because run_grid_search calls predict_proba
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.887
- Train ROC-AUC: 0.902
- Test ROC-AUC: 0.866
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 0.0001}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.906 	> Median ROC-AUC: 0.839
> Test best conf. ROC-AUC: 0.896 	> Median ROC-AUC: 0.816
------------------------------------------------------------
CPU times: user 871 ms, sys: 134 ms, total: 1 s
Wall time: 2.05 s


In [18]:
%%time
# Same estimator and grid as the random-split cell; only the split differs
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.853
- Train ROC-AUC: 0.867
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__C': 1e-08, 'estimator__gamma': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 838 ms, sys: 132 ms, total: 970 ms
Wall time: 1.91 s


### GS: Logistic Regression
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [19]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

# 5 x 2 x 2 = 20 grid points.
# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so the ('l1', 'lbfgs') points cannot be fitted; confirm how the
# installed GridSearchCV version scores those failures.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.882
- Train ROC-AUC: 0.983
- Test ROC-AUC: 0.916
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.885 	> Median ROC-AUC: 0.809
> Test best conf. ROC-AUC: 0.959 	> Median ROC-AUC: 0.895
------------------------------------------------------------
CPU times: user 1.05 s, sys: 540 ms, total: 1.59 s
Wall time: 1.45 s


In [20]:
%%time
# from sklearn.linear_model import LogisticRegression
# Same estimator and grid as the random-split cell; only the split differs
estimator = LogisticRegression()

# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so the ('l1', 'lbfgs') points cannot be fitted; confirm how the
# installed GridSearchCV version scores those failures.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.862
- Train ROC-AUC: 0.868
- Test ROC-AUC: 0.958
- Best hyperparameters {'estimator__C': 0.0001, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.11 s, sys: 539 ms, total: 1.64 s
Wall time: 1.22 s


### GS: K-Neighbors Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [21]:
%%time
from sklearn.neighbors import KNeighborsClassifier 
estimator = KNeighborsClassifier()

# 4 x 2 = 8 grid points; p is the Minkowski power
# (1 = Manhattan distance, 2 = Euclidean distance)
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.885
- Train ROC-AUC: 0.890
- Test ROC-AUC: 0.872
- Best hyperparameters {'estimator__n_neighbors': 225, 'estimator__p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.902 	> Median ROC-AUC: 0.837
> Test best conf. ROC-AUC: 0.92 	> Median ROC-AUC: 0.829
------------------------------------------------------------
CPU times: user 888 ms, sys: 134 ms, total: 1.02 s
Wall time: 1.94 s


In [22]:
%%time
from sklearn.neighbors import KNeighborsClassifier 
# Same estimator and grid as the random-split cell; only the split differs
estimator = KNeighborsClassifier()

# 4 x 2 = 8 grid points; p is the Minkowski power
# (1 = Manhattan distance, 2 = Euclidean distance)
hyperparams = {'n_neighbors': [25, 55, 125, 225], 
               'p': [1, 2]
             }

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.838
- Train ROC-AUC: 0.862
- Test ROC-AUC: 0.956
- Best hyperparameters {'estimator__n_neighbors': 225, 'estimator__p': 2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 930 ms, sys: 83.1 ms, total: 1.01 s
Wall time: 1.53 s


### GS: Decision Tree Classifier
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [23]:
%%time
from sklearn.tree import DecisionTreeClassifier

# No random_state is set, so runs with a max_features subset are not repeatable
estimator = DecisionTreeClassifier(splitter='best')
# 2 x 3 x 3 x 3 = 54 grid points; float min_samples_split values are
# interpreted by scikit-learn as fractions of the training samples
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.815
- Train ROC-AUC: 0.900
- Test ROC-AUC: 0.857
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 3, 'estimator__max_features': 'log2', 'estimator__min_samples_split': 0.2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.883 	> Median ROC-AUC: 0.813
> Test best conf. ROC-AUC: 0.981 	> Median ROC-AUC: 0.884
------------------------------------------------------------
CPU times: user 1.26 s, sys: 275 ms, total: 1.53 s
Wall time: 2.21 s


In [24]:
%%time
from sklearn.tree import DecisionTreeClassifier

# Same estimator and grid as the random-split cell; only the split differs
estimator = DecisionTreeClassifier(splitter='best')
# 2 x 3 x 3 x 3 = 54 grid points; float min_samples_split values are
# interpreted by scikit-learn as fractions of the training samples
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.802
- Train ROC-AUC: 0.903
- Test ROC-AUC: 0.888
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 5, 'estimator__max_features': 'log2', 'estimator__min_samples_split': 0.25}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 1.39 s, sys: 278 ms, total: 1.66 s
Wall time: 2.11 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [55]:
%%time
from sklearn.ensemble import BaggingClassifier

# Base learner: distance-weighted k-NN with Manhattan distance (p=1).
# KNeighborsClassifier must already be imported by an earlier cell.
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; confirm against the installed version.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

# Single-point grid: only n_estimators=300 is evaluated
hyperparams = {'n_estimators': [300]}

# NOTE(review): the call below is commented out, so this cell only builds
# the estimator and the grid; nothing is fitted or reported.
# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 0 ns, sys: 758 µs, total: 758 µs
Wall time: 793 µs


In [56]:
%%time
# from sklearn.ensemble import BaggingClassifier

# Base learner: distance-weighted k-NN with Manhattan distance (p=1).
# BaggingClassifier and KNeighborsClassifier must already be imported.
knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; confirm against the installed version.
estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=6, oob_score=True)

# Single-point grid: only n_estimators=300 is evaluated
hyperparams = {'n_estimators': [300]}

# NOTE(review): the call below is commented out, so this cell only builds
# the estimator and the grid; nothing is fitted or reported.
# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 50 µs, sys: 5 µs, total: 55 µs
Wall time: 62.7 µs


### GS: Random Forest
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [27]:
%%time
from sklearn.ensemble import RandomForestClassifier

# No random_state is set, so the forest differs between runs
estimator = RandomForestClassifier()

# 2 x 2 x 2 x 2 = 16 grid points; float min_samples_leaf values are
# interpreted by scikit-learn as fractions of the training samples
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 914, with 29 actives.
No. of molecules in test set: 305, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.898
- Train ROC-AUC: 0.933
- Test ROC-AUC: 0.840
- Best hyperparameters {'estimator__max_depth': 5, 'estimator__max_features': 'log2', 'estimator__min_samples_leaf': 0.1, 'estimator__n_estimators': 300}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.914 	> Median ROC-AUC: 0.848
> Test best conf. ROC-AUC: 0.861 	> Median ROC-AUC: 0.795
------------------------------------------------------------
CPU times: user 2.63 s, sys: 683 ms, total: 3.32 s
Wall time: 22.9 s


In [28]:
%%time
from sklearn.ensemble import RandomForestClassifier

# Same estimator and grid as the random-split cell; only the split differs
estimator = RandomForestClassifier()

# 2 x 2 x 2 x 2 = 16 grid points; float min_samples_leaf values are
# interpreted by scikit-learn as fractions of the training samples
hyperparams = {'n_estimators': [300, 500],
             'max_depth': [2,  5],
             'min_samples_leaf': [0.1, 0.2],
             'max_features': ['log2', 'sqrt']
            }

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 885, with 29 actives.
No. of molecules in test set: 334, with 10 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.853
- Train ROC-AUC: 0.895
- Test ROC-AUC: 0.941
- Best hyperparameters {'estimator__max_depth': 2, 'estimator__max_features': 'log2', 'estimator__min_samples_leaf': 0.2, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.879 	> Median ROC-AUC: 0.81
> Test best conf. ROC-AUC: 0.968 	> Median ROC-AUC: 0.898
------------------------------------------------------------
CPU times: user 2.78 s, sys: 246 ms, total: 3.02 s
Wall time: 20.4 s


##  Grid Search
### Hyperparameter Tuning
#### Merged Libraries 

In [29]:
# Train and test over the merged libraries: the full frames are used, with
# no per-library `.loc` selection. This rebinds the notebook-level X, y and
# scaffold_series that the cells below rely on.
X = X_merged_dksc
y = y_true_merged
scaffold_series = df_scff_murcko['scff_generic']

### GS: Linear SVM 
#### Merged libraries - Random and Stratified Scaffold Splitting

In [30]:
%%time
from sklearn.svm import SVC
# probability=True is required because run_grid_search calls predict_proba
estimator = SVC(kernel = 'linear', probability=True)
# Four log-spaced C values: 1e-15, 1e-12, 1e-9, 1e-6
# NOTE(review): in the recorded run the selected C (1e-06) is the largest
# value of the grid; consider extending the range upwards.
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.773
- Train ROC-AUC: 0.801
- Test ROC-AUC: 0.748
- Best hyperparameters {'estimator__C': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.686 	> Median ROC-AUC: 0.608
> Test best conf. ROC-AUC: 0.674 	> Median ROC-AUC: 0.591
------------------------------------------------------------
CPU times: user 3.36 s, sys: 117 ms, total: 3.48 s
Wall time: 16.1 s


In [31]:
%%time
# from sklearn.svm import SVC
# Same estimator and grid as the random-split cell; only the split differs
estimator = SVC(kernel = 'linear', probability=True)
# Four log-spaced C values: 1e-15, 1e-12, 1e-9, 1e-6
# NOTE(review): in the recorded run the selected C (1e-06) is the largest
# value of the grid; consider extending the range upwards.
hyperparams = {'C': np.geomspace(1e-15, 1e-6, 4)}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.770
- Train ROC-AUC: 0.823
- Test ROC-AUC: 0.699
- Best hyperparameters {'estimator__C': 1e-06}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 4.15 s, sys: 162 ms, total: 4.31 s
Wall time: 19.1 s


### GS: Radial Basis Function SVM
#### Merged libraries - Random and Stratified Scaffold Splitting

In [32]:
%%time
# probability=True is required because run_grid_search calls predict_proba
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
# NOTE(review): in the recorded run the selected C (0.01) is the largest
# value of the grid; consider extending the range upwards.
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.850
- Train ROC-AUC: 0.971
- Test ROC-AUC: 0.830
- Best hyperparameters {'estimator__C': 0.01, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.684 	> Median ROC-AUC: 0.603
> Test best conf. ROC-AUC: 0.681 	> Median ROC-AUC: 0.611
------------------------------------------------------------
CPU times: user 7.93 s, sys: 194 ms, total: 8.13 s
Wall time: 1min 35s


In [33]:
%%time
# Same estimator and grid as the random-split cell; only the split differs
estimator = SVC(kernel = 'rbf', probability=True)

# 4 x 4 grid: C in {1e-8, 1e-6, 1e-4, 1e-2}, gamma in {1e-6, 1e-4, 1e-2, 1}
# NOTE(review): in the recorded run the selected C (0.01) is the largest
# value of the grid; consider extending the range upwards.
hyperparams = {'C': np.geomspace(1e-8, 1e-2, 4), 
               'gamma': np.geomspace(1e-6, 1e0, 4)}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.815
- Train ROC-AUC: 0.974
- Test ROC-AUC: 0.705
- Best hyperparameters {'estimator__C': 0.01, 'estimator__gamma': 0.01}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 8.25 s, sys: 265 ms, total: 8.52 s
Wall time: 1min 30s


### GS: Logistic Regression
#### Merged libraries - Random and Stratified Scaffold Splitting

In [34]:
%%time
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()

# 5 x 2 x 2 = 20 grid points.
# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so the ('l1', 'lbfgs') points cannot be fitted; confirm how the
# installed GridSearchCV version scores those failures.
# NOTE(review): in the recorded run the selected C (0.1) is the largest
# value of the grid; consider extending the range upwards.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# RANDOM Train test splitting (stratified on y, unseeded)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.825
- Train ROC-AUC: 0.881
- Test ROC-AUC: 0.827
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.697 	> Median ROC-AUC: 0.612
> Test best conf. ROC-AUC: 0.672 	> Median ROC-AUC: 0.584
------------------------------------------------------------
CPU times: user 3.13 s, sys: 563 ms, total: 3.69 s
Wall time: 4.66 s


In [35]:
%%time
# from sklearn.linear_model import LogisticRegression
# Same estimator and grid as the random-split cell; only the split differs
estimator = LogisticRegression()

# NOTE(review): scikit-learn's 'lbfgs' solver does not support the 'l1'
# penalty, so the ('l1', 'lbfgs') points cannot be fitted; confirm how the
# installed GridSearchCV version scores those failures.
# NOTE(review): in the recorded run the selected C (0.1) is the largest
# value of the grid; consider extending the range upwards.
hyperparams = {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1], 
               'penalty': ['l1', 'l2'], 
               'solver': ['lbfgs', 'liblinear']}

# SCAFFOLD Train test splitting (driven by scaffold_series)
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.823
- Train ROC-AUC: 0.903
- Test ROC-AUC: 0.741
- Best hyperparameters {'estimator__C': 0.1, 'estimator__penalty': 'l2', 'estimator__solver': 'liblinear'}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 3.21 s, sys: 478 ms, total: 3.69 s
Wall time: 4.57 s


### GS: Decision Tree Classifier
#### Merged libraries - Random and Stratified Scaffold Splitting

In [36]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.709
- Train ROC-AUC: 0.791
- Test ROC-AUC: 0.716
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 5, 'estimator__max_features': None, 'estimator__min_samples_split': 0.2}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.682 	> Median ROC-AUC: 0.613
> Test best conf. ROC-AUC: 0.679 	> Median ROC-AUC: 0.579
------------------------------------------------------------
CPU times: user 5.35 s, sys: 297 ms, total: 5.64 s
Wall time: 6.26 s


In [37]:
%%time
from sklearn.tree import DecisionTreeClassifier

estimator = DecisionTreeClassifier(splitter='best')
hyperparams = {'criterion': ['gini', 'entropy'], 
               'max_depth': [2, 3, 5],
               'min_samples_split': [0.2, 0.25, 0.3],
               'max_features': [None, 'sqrt', 'log2']}

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.717
- Train ROC-AUC: 0.767
- Test ROC-AUC: 0.586
- Best hyperparameters {'estimator__criterion': 'entropy', 'estimator__max_depth': 5, 'estimator__max_features': 'sqrt', 'estimator__min_samples_split': 0.3}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 5.24 s, sys: 288 ms, total: 5.53 s
Wall time: 6.04 s


### GS: Bagging Classifier (k-NN as base estimator) 
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [57]:
%%time
from sklearn.ensemble import BaggingClassifier

knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=3, oob_score=True)

hyperparams = {'n_estimators': [300]}

# RANDOM Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='random', test_size=0.25, 
#              scaffold_series=None)

CPU times: user 44 µs, sys: 5 µs, total: 49 µs
Wall time: 55.8 µs


In [58]:
%%time
# from sklearn.ensemble import BaggingClassifier

knn = KNeighborsClassifier(n_neighbors=125, p=1, 
                           weights='distance')

estimator = BaggingClassifier(base_estimator=knn, 
                              n_jobs=3, oob_score=True)

hyperparams = {'n_estimators': [300]}

# SCAFFOLD Train test splitting
# split_and_gs(X, y, estimator, hyperparams, 
#              splitting='scaffold', test_size=0.25, 
#              scaffold_series=scaffold_series)

CPU times: user 45 µs, sys: 4 µs, total: 49 µs
Wall time: 58.7 µs


### GS: Random Forest
#### DEKOIS database - Random and Stratified Scaffold Splitting

In [52]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 500],
             'max_depth': [3,  5, 7],
             'min_samples_leaf': [0.03, 0.05, 0.1],
             'max_features': ['sqrt']
            }

# RANDOM Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='random', test_size=0.25, 
             scaffold_series=None)

No. of molecules in train set: 4379, with 225 actives.
No. of molecules in test set: 1460, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.779
- Train ROC-AUC: 0.875
- Test ROC-AUC: 0.788
- Best hyperparameters {'estimator__max_depth': 7, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.688 	> Median ROC-AUC: 0.606
> Test best conf. ROC-AUC: 0.672 	> Median ROC-AUC: 0.599
------------------------------------------------------------
CPU times: user 5.8 s, sys: 116 ms, total: 5.92 s
Wall time: 55.1 s


In [53]:
%%time
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()

hyperparams = {'n_estimators': [300, 500],
             'max_depth': [3,  5, 7],
             'min_samples_leaf': [0.03, 0.05, 0.1],
             'max_features': ['sqrt']
            }

# SCAFFOLD Train test splitting
split_and_gs(X, y, estimator, hyperparams, 
             splitting='scaffold', test_size=0.25, 
             scaffold_series=scaffold_series)

No. of molecules in train set: 4154, with 225 actives.
No. of molecules in test set: 1685, with 75 actives.

********** GRID SEARCH RESULTS **********
- Mean CV ROC-AUC score: 0.755
- Train ROC-AUC: 0.874
- Test ROC-AUC: 0.672
- Best hyperparameters {'estimator__max_depth': 7, 'estimator__max_features': 'sqrt', 'estimator__min_samples_leaf': 0.03, 'estimator__n_estimators': 500}
******************************************

------------------------------------------------------------
**** Best Conformation's ROC-AUC using docking scores ****
> Train best conf. ROC-AUC: 0.683 	> Median ROC-AUC: 0.588
> Test best conf. ROC-AUC: 0.704 	> Median ROC-AUC: 0.641
------------------------------------------------------------
CPU times: user 5.72 s, sys: 127 ms, total: 5.85 s
Wall time: 55.3 s
