In [None]:
import numpy as np

In [None]:
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, RepeatedStratifiedKFold

def _validate(clf, clf_name,
              X_test, y_test,
              metric_name, metric_params):
    """Score one estimator on a test split and return a single metric value.

    The way predictions are obtained depends on the prefix of ``clf_name``:

    * ``'ml_'``: ``clf.predict_proba(X_test)[:, 1]`` is used as the score.
    * ``'cs_'``: ``clf`` is called directly on ``pd.DataFrame(X_test)``.

    Any other prefix prints a notice and the function returns ``None``.

    Parameters
    ----------
    clf : object
        Estimator exposing ``predict_proba`` (``'ml_'``) or a callable
        (``'cs_'``).
    clf_name : str
        Name of the estimator; only its prefix is inspected here.
    X_test, y_test
        Test features and labels of the current split.
    metric_name : str
        Not used inside this function; ``metric_params`` is expected to
        already carry the metric name.
    metric_params : dict
        Keyword arguments forwarded to
        ``PlotMetric.format_metric_results``.

    Returns
    -------
    The element ``[0][0]`` of the ``.values`` of the formatted metric
    results, or ``None`` when the prefix is not recognised.
    """
    uses_proba = clf_name.startswith('ml_')

    # Guard clause: unknown estimator families are skipped, not raised.
    if not uses_proba and not clf_name.startswith('cs_'):
        print(clf_name, 'not found. Ommited.')
        return None

    if uses_proba:
        # Second column of predict_proba is taken as the score.
        scores = clf.predict_proba(X_test)[:, 1]
    else:
        # Consensus strategies are plain callables over a DataFrame.
        scores = clf(pd.DataFrame(X_test))

    # PlotMetric is defined outside this block; presumably
    # format_metric_results returns a DataFrame-like object -- confirm.
    evaluator = PlotMetric(y_test, {'': scores}, decreasing=False)
    formatted = evaluator.format_metric_results(rounded=5, **metric_params)

    return formatted.values[0][0]


def _train_cfl(clf, X_train, y_train):
    """Fit ``clf`` on the training split and return that same object.

    The value returned by ``clf.fit`` is ignored; the estimator passed in
    is what the caller gets back.
    """
    clf.fit(X_train, y_train)
    return clf
    

def _do_replicates(splits,
                   estimators, X, y,
                   metrics):
    """Evaluate every estimator on every split with every metric.

    Parameters
    ----------
    splits : iterable of (train, test)
        Index pairs used to slice ``X`` and ``y``. Any iterable is
        accepted, including a one-shot generator.
    estimators : dict
        Maps an estimator name to the estimator. Names starting with
        ``'ml_'`` are re-fitted on the training part of each split; all
        other estimators are passed to ``_validate`` untouched.
    X, y
        Feature and label containers indexable with the split indices.
    metrics : dict
        Maps a metric name to the parameters forwarded to ``_validate``.

    Returns
    -------
    dict
        ``{clf_name: [value, ...]}`` where the list is ordered split by
        split and, inside each split, in the iteration order of
        ``metrics``.
    """
    # Materialize once: the splits are replayed for each estimator, and a
    # one-shot iterator would be exhausted after the first one.
    splits = list(splits)

    results = {}
    for clf_name, clf in estimators.items():
        needs_fit = clf_name.startswith('ml_')
        folds = []
        for train, test in splits:
            # Fit the ml classifier once per fold and score with the
            # estimator handed back by the training helper.
            if needs_fit:
                fitted = _train_cfl(clf, X[train], y[train])
            else:
                fitted = clf

            # The test slice does not depend on the metric; index it once.
            X_test, y_test = X[test], y[test]

            for metric_name, metric_params in metrics.items():
                folds.append(
                    _validate(fitted, clf_name,
                              X_test, y_test,
                              metric_name, metric_params)
                )

        # Add to the results dictionary
        results[clf_name] = folds

    return results

def _format_results_to_df(metrics, results, n):
    """Arrange the flat per-estimator result lists into a DataFrame.

    Parameters
    ----------
    metrics : dict
        Only its keys (the metric names) and its length are used.
    results : dict
        Maps a column name to a flat list of values. The index built here
        labels the rows replicate by replicate, cycling through the metric
        names inside each replicate, so the lists are expected to follow
        that order.
    n : int
        Number of replicates (folds / repetitions).

    Returns
    -------
    pandas.DataFrame
        One column per key of ``results``, indexed by a two-level
        ``(metric name, replicate number)`` MultiIndex and sorted by it.
    """
    # Level 0: the metric names, cycled once per replicate.
    name_level = [*metrics.keys()] * n
    # Level 1: each replicate number repeated once per metric.
    replicate_level = np.repeat(range(n), len(metrics))

    row_index = pd.MultiIndex.from_tuples(zip(name_level, replicate_level))

    return pd.DataFrame(results, index=row_index).sort_index()


@cached()
def k_cross_validation(
          estimators, X, y,
          metrics,
          n_splits=5,
          random_state=None,
          shuffle=True):
    """Evaluate the estimators with stratified k-fold cross-validation.

    Parameters
    ----------
    estimators : dict
        Estimator name -> estimator, forwarded to ``_do_replicates``.
    X, y
        Features and labels; used to build the splits and forwarded to
        ``_do_replicates``.
    metrics : dict
        Metric name -> parameters, forwarded to ``_do_replicates`` and
        ``_format_results_to_df``.
    n_splits, random_state, shuffle
        Passed straight to ``StratifiedKFold``.

    Returns
    -------
    The DataFrame produced by ``_format_results_to_df`` with
    ``n=n_splits``.
    """
    splitter = StratifiedKFold(n_splits=n_splits,
                               random_state=random_state,
                               shuffle=shuffle)
    # Materialize the folds so they can be reused for every estimator.
    fold_indices = list(splitter.split(X, y))

    raw_scores = _do_replicates(fold_indices, estimators, X, y, metrics)

    return _format_results_to_df(metrics, raw_scores, n=n_splits)


@cached()
def n_hold_out_validation(
          estimators, X, y,
          metrics,
          n_reps=5, test_size=0.25,
          random_state=None):
    """Evaluate the estimators on repeated stratified hold-out splits.

    Parameters
    ----------
    estimators : dict
        Estimator name -> estimator, forwarded to ``_do_replicates``.
    X, y
        Features and labels; used to build the splits and forwarded to
        ``_do_replicates``.
    metrics : dict
        Metric name -> parameters, forwarded to ``_do_replicates`` and
        ``_format_results_to_df``.
    n_reps : int
        Passed to ``StratifiedShuffleSplit`` as ``n_splits``.
    test_size, random_state
        Passed straight to ``StratifiedShuffleSplit``.

    Returns
    -------
    The DataFrame produced by ``_format_results_to_df`` with ``n=n_reps``.
    """
    splitter = StratifiedShuffleSplit(n_splits=n_reps,
                                      test_size=test_size,
                                      random_state=random_state)
    # Materialize the hold-out splits so every estimator sees the same ones.
    holdout_indices = list(splitter.split(X, y))

    raw_scores = _do_replicates(holdout_indices, estimators, X, y, metrics)

    return _format_results_to_df(metrics, raw_scores, n=n_reps)


@cached()
def nk_rep_cross_validation(
          estimators, X, y,
          metrics,
          n_splits=2,
          n_repeats=5,
          random_state=None,
          shuffle=True):
    """Evaluate the estimators with repeated stratified k-fold CV.

    Parameters
    ----------
    estimators : dict
        Estimator name -> estimator, forwarded to ``_do_replicates``.
    X, y
        Features and labels; used to build the splits and forwarded to
        ``_do_replicates``.
    metrics : dict
        Metric name -> parameters, forwarded to ``_do_replicates`` and
        ``_format_results_to_df``.
    n_splits, n_repeats, random_state
        Passed straight to ``RepeatedStratifiedKFold``.
    shuffle : bool
        Accepted but not used: it is not forwarded to the splitter.

    Returns
    -------
    The DataFrame produced by ``_format_results_to_df`` with
    ``n=n_splits * n_repeats``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=n_splits,
                                       n_repeats=n_repeats,
                                       random_state=random_state)
    # Materialize all repeated folds so they can be reused per estimator.
    fold_indices = list(splitter.split(X, y))

    raw_scores = _do_replicates(fold_indices, estimators, X, y, metrics)

    total_replicates = n_splits * n_repeats
    return _format_results_to_df(metrics, raw_scores, n=total_replicates)

In [None]:
# Memoization of the validation entry points through joblib's Memory.
from joblib import Memory
# Directory handed to joblib.Memory as the cache location; the path is
# relative, so it resolves against the current working directory.
location = './cachedir'
memory = Memory(location, verbose=0)

# Rebind each public validation function to its memory.cache-wrapped version.
# NOTE(review): these three functions also carry a `@cached()` decorator at
# their definitions -- confirm that both caching layers are intended.
n_hold_out_validation = memory.cache(n_hold_out_validation) 

k_cross_validation = memory.cache(k_cross_validation) 

nk_rep_cross_validation = memory.cache(nk_rep_cross_validation) 