In [1]:
import numpy as np
import scipy as scp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed

## Setup

In [20]:
## IRIS dataset
# X_iris, y_iris = datasets.load_iris(return_X_y=True)
# mask = y_iris < 2
# X_iris, y_iris = X_iris[mask], y_iris[mask]

In [2]:
# Load the feature matrix (X) and label vector (y) from .npy files, resolved
# relative to the current working directory.
# NOTE(review): these files are not generated anywhere in this notebook --
# presumably pre-built synthetic data; confirm provenance and shapes.
X = np.load("random_data_X.npy")
y = np.load("random_data_y.npy")

In [3]:
# Shared experiment configuration, reused by every test below.
# Number of label shuffles used to build the null distribution.
n_permutations = 5000
# Shuffled 3-fold splitter; the fixed seed makes the fold assignment repeatable,
# so later cells that call `cv.split(...)` see the same folds.
cv = KFold(n_splits=3, random_state=0, shuffle=True)
# Logistic regression with built-in CV over 4 regularisation strengths and
# class-balanced sample weighting.
estimator = LogisticRegressionCV(Cs=4, class_weight='balanced')

## Pre-training permutation

In [4]:
# Pre-training permutation test: the estimator is cross-validated on the real
# labels and on `n_permutations` shuffled copies of `y`, scored by ROC AUC.
# Returns the real score, the permuted scores, and the p-value.
score, null, p = permutation_test_score(
    estimator,
    X,
    y,
    scoring='roc_auc',
    cv=cv,
    n_permutations=n_permutations,
    n_jobs=-1,  # use every available core
    verbose=True,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 720 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 1170 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 1720 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 2370 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 3120 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 3970 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 4920 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  2.0min finished


In [5]:
# Display the pre-training permutation p-value. The smallest value this test
# can report is 1 / (n_permutations + 1), reached when no permuted score
# matches or beats the real one.
p

0.0001999600079984003

## Post-training permutation

In [6]:
from permutation_helpers import post_hoc_permuation

In [7]:
# Out-of-fold class probabilities from a fresh clone of the estimator, using
# the same `cv` splitter as the pre-training test. One row per sample; the
# post-hoc test below reads column 1.
y_pred = cross_val_predict(
    clone(estimator),
    X,
    y,
    method='predict_proba',
    cv=cv,
)
# post_score, post_null, post_p = post_hoc_permuation(y, y_pred[:, 1], n_jobs=-1, verbose=True, n_permutations=n_permutations)

In [8]:
# Per-fold ROC AUC of a fresh clone of the estimator on the real labels,
# shown as the cell output for reference.
cross_val_score(
    clone(estimator),
    X,
    y,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.1s finished


array([0.91790977, 0.92486726, 0.91869629])

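The approach above permutes labels across CV folds, i.e. over the pooled out-of-fold predictions. To permute within CV folds instead, run the permutation function separately on each fold's held-out samples and then aggregate the per-fold results, as in the function below.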

In [9]:
def post_hoc_permuation_cv(y_true, y_pred):
    """Run the post-hoc permutation test within each CV hold-out fold, then aggregate.

    Hold-out indices are regenerated from the notebook-level ``cv`` splitter, so
    ``cv`` must reproduce the folds that generated ``y_pred`` (for a shuffled
    splitter this requires a fixed ``random_state``).

    Parameters
    ----------
    y_true : np.ndarray
        True labels, indexable with an integer index array.
    y_pred : np.ndarray
        2-D array of per-sample predictions. Column 1 is forwarded to
        ``post_hoc_permuation`` as the score -- presumably the positive-class
        probability from ``predict_proba``; confirm against the caller.

    Returns
    -------
    score : float
        Mean of the per-fold observed scores.
    avg_null : np.ndarray
        Element-wise mean of the per-fold null distributions (one entry per
        permutation).
    pvalue : float
        ``(count(avg_null >= score) + 1) / (n_permutations + 1)``.

    Raises
    ------
    ValueError
        If ``cv`` yields no folds, or the per-fold null distributions have
        different lengths (raised by ``np.vstack``).
    """
    fold_scores = []
    fold_nulls = []
    for _, holdout_idx in cv.split(y_true):
        # The per-fold p-value is discarded; a single p-value is recomputed
        # from the aggregated score and null distribution below.
        fold_score, fold_null, _ = post_hoc_permuation(
            y_true[holdout_idx], y_pred[holdout_idx, 1], n_jobs=-1, verbose=True
        )
        fold_scores.append(fold_score)
        fold_nulls.append(fold_null)

    score = np.mean(fold_scores)
    # Stack to (n_folds, n_permutations), then average over folds.
    avg_null = np.vstack(fold_nulls).mean(axis=0)
    # The permutation count comes from the averaged array; the original read
    # `.shape` off the Python list of nulls, which raises AttributeError.
    n_permutations_run = avg_null.shape[0]
    pvalue = (np.sum(score <= avg_null) + 1.) / (n_permutations_run + 1.)
    return score, avg_null, pvalue

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 166 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2048 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 9126 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:    2.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 2064 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 7664 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 9854 tasks      | elapsed:    2.8s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:    2.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 126 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 1168 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 3968 tasks      | elapsed: 

In [211]:
from functools import partial, wraps

## decorator factory for simulation
def simulate(parameter_range):
    """Decorator factory that evaluates a function once per parameter value.

    Parameters
    ----------
    parameter_range : iterable
        Values to sweep over. Each value is passed to the decorated function
        as the keyword argument ``param`` and is also used as a key of the
        result dict, so values must be hashable.

    Returns
    -------
    callable
        A decorator. The decorated function accepts the original positional
        and keyword arguments (except ``param``, which is supplied by the
        sweep) and returns ``{value: function(*args, param=value, **kwargs)}``.
    """
    # Snapshot once so a one-shot iterable (e.g. a generator) is not exhausted
    # after the first call of the decorated function.
    parameters = tuple(parameter_range)

    def decorator(function):
        # `wraps` must be applied as a decorator; calling it bare (as before)
        # discarded its result and left the wrapper without the wrapped
        # function's __name__ / __doc__.
        @wraps(function)
        def wrapper(*args, **kwargs):
            return {p: function(*args, param=p, **kwargs) for p in parameters}
        return wrapper
    return decorator

In [214]:
@simulate(parameter_range=np.arange(10))
def scale(x, param=None, intercept=None):
    """Toy example for ``simulate``: the affine map ``param * x + intercept``.

    ``param`` is supplied by the ``simulate`` sweep (one call per value in
    ``np.arange(10)``); callers pass ``x`` and ``intercept``.
    """
    scaled = param * x
    return scaled + intercept

In [215]:
# Demo: the decorated `scale` is evaluated once per value in its parameter
# range and returns a dict keyed by that value.
scale(np.array([1, 2, 4]), intercept=2)

{0: array([2, 2, 2]),
 1: array([3, 4, 6]),
 2: array([ 4,  6, 10]),
 3: array([ 5,  8, 14]),
 4: array([ 6, 10, 18]),
 5: array([ 7, 12, 22]),
 6: array([ 8, 14, 26]),
 7: array([ 9, 16, 30]),
 8: array([10, 18, 34]),
 9: array([11, 20, 38])}