In [1]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import scipy as scp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed
import pickle
from permutation_helpers import random_data_gen, post_hoc_permutation, simulate

## Setup

In [None]:
## IRIS dataset
# X_iris, y_iris = datasets.load_iris(return_X_y=True)
# mask = y_iris < 2
# X_iris, y_iris = X_iris[mask], y_iris[mask]

In [None]:
# Load the arrays saved as .npy files in the current working directory; they are
# used below as the feature matrix (X) and the target labels (y).
X = np.load("random_data_X.npy")
y = np.load("random_data_y.npy")

In [None]:
# Shared configuration for the permutation tests in the next two sections.
estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
# Shuffled 3-fold split with a fixed random_state, so fold assignment is repeatable.
cv = KFold(n_splits=3, shuffle=True, random_state=0)
# Number of label permutations requested from the permutation tests.
n_permutations = 5000

## Pre-training permutation

In [None]:
# Pre-training permutation test: cross-validated ROC AUC on the true labels (score),
# the scores obtained on permuted labels (null), and the resulting p-value (p).
score, null, p = permutation_test_score(
    estimator=estimator,
    X=X,
    y=y,
    cv=cv,
    scoring='roc_auc',
    n_permutations=n_permutations,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# Display the permutation (null) scores returned by permutation_test_score above.
null

## Post-training permutation

In [None]:
# Class-probability predictions from cross-validation, using an unfitted clone of the
# shared estimator and the same CV splitter as the pre-training test.
y_pred = cross_val_predict(clone(estimator), X, y, cv=cv, method='predict_proba')
# Disabled: post-hoc permutation on column 1 of the predicted probabilities.
# post_score, post_null, post_p = post_hoc_permutation(y, y_pred[:, 1], n_jobs=-1, verbose=True, n_permutations=n_permutations)

In [None]:
# Per-fold ROC AUC for a clone of the shared estimator on the same CV splitter.
cross_val_score(clone(estimator), X, y, cv=cv, scoring="roc_auc", verbose=1)

The approach above permutes labels across CV folds. To permute within each CV fold instead, we need something like the function imported below: run the permutation function separately on each fold, then aggregate the results.

In [None]:
from permutation_helpers import post_hoc_permutation_cv

## Running simulations

In [None]:
@simulate(parameter_range=np.linspace(0, 5), n_sim=10)
def scale(param=None, x=None, intercept=None, seed=None):
    """Toy simulation target: return ``param * x + intercept``.

    ``seed`` is accepted but not used by this function (presumably it is supplied
    by the ``simulate`` decorator -- confirm in ``permutation_helpers``).
    """
    scaled = param * x
    return scaled + intercept

In [None]:
# try:
#     client.shutdown()
#     client = Client()
# except:
#     client = Client()
# Run the toy simulation. Only x and intercept are passed here; param (and seed) are
# presumably filled in by the simulate decorator -- confirm in permutation_helpers.
result = scale(x=np.array([1, 2, 4]), intercept=2)
result

In [None]:
# Persist the toy simulation output as a pickle in the current working directory.
with open("simulation_results.pkl", "wb") as f:
    pickle.dump(result, f)

## Simulate permutations with different params

In [2]:
import cmldask.CMLDask as da
# Create the dask client used by the simulation cells below.
# NOTE(review): local_directory and log_directory are hard-coded, user-specific
# absolute paths; anyone else running this notebook must change them.
# adapt=True presumably enables adaptive scaling up to max_n_jobs -- confirm in cmldask.
rhino_client = da.new_dask_client(
    job_name="simulations",
    memory_per_job="1.5GB",
    max_n_jobs=400, threads_per_job=5, 
    adapt=True,
    local_directory="/home1/jrudoler/dask_worker_space",
    log_directory="/home1/jrudoler/logs/",
    )

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:128.91.77.24:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser


In [None]:
# NOTE(review): in a top-to-bottom run this shuts the client down before the
# simulation cells below execute; presumably meant to be run manually -- confirm.
rhino_client.shutdown()

In [None]:
# Ask the cluster to scale to 400 (the same number as max_n_jobs when the client was created).
rhino_client.cluster.scale(400)

In [None]:
@simulate(parameter_range=np.logspace(2, 5, 10).astype(int), n_sim=1000)
def simulate_samplesize(param=None, seed=None):
    """Post-hoc permutation test on held-out predictions for a given sample size.

    Parameters
    ----------
    param : int
        Number of samples passed to ``random_data_gen`` (presumably drawn from the
        decorator's ``parameter_range`` -- confirm in ``permutation_helpers``).
    seed
        Forwarded to ``random_data_gen`` only.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as returned by ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=param, n_feats=5, maha=0.1, ratio=0.5, seed=seed)
    # NOTE(review): the split receives no random_state, so `seed` controls only the
    # data generation, not the train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Column 1 of predict_proba is used as the score for the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

@simulate(parameter_range=np.linspace(0., 1.5, 25), n_sim=500)
def simulate_maha(param=None, seed=None):
    """Post-hoc permutation test on held-out predictions for a given ``maha`` value.

    Parameters
    ----------
    param : float
        Value passed as ``maha`` to ``random_data_gen`` (presumably a Mahalanobis-distance
        class separation drawn from ``parameter_range`` -- confirm in ``permutation_helpers``).
    seed
        Forwarded to ``random_data_gen`` only.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as returned by ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=5000, n_feats=5, maha=param, ratio=0.5, seed=seed)
    # NOTE(review): the split receives no random_state, so `seed` controls only the
    # data generation, not the train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Column 1 of predict_proba is used as the score for the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

@simulate(parameter_range=np.logspace(1, 10, 10, base=2).astype(int), n_sim=500)
def simulate_nfeats(param=None, seed=None):
    """Post-hoc permutation test on held-out predictions for a given feature count.

    Parameters
    ----------
    param : int
        Value passed as ``n_feats`` to ``random_data_gen`` (presumably drawn from the
        decorator's ``parameter_range`` -- confirm in ``permutation_helpers``).
    seed
        Forwarded to ``random_data_gen`` only.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as returned by ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=5000, n_feats=param, maha=.1, ratio=0.5, seed=seed)
    # NOTE(review): the split receives no random_state, so `seed` controls only the
    # data generation, not the train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Column 1 of predict_proba is used as the score for the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

@simulate(parameter_range=np.linspace(.1, .9, 8), n_sim=500)
def simulate_ratio(param=None, seed=None):
    """Post-hoc permutation test on held-out predictions for a given ``ratio`` value.

    Parameters
    ----------
    param : float
        Value passed as ``ratio`` to ``random_data_gen`` (presumably the class balance,
        drawn from ``parameter_range`` -- confirm in ``permutation_helpers``).
    seed
        Forwarded to ``random_data_gen`` only.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as returned by ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=5000, n_feats=5, maha=.1, ratio=param, seed=seed)
    # NOTE(review): the split receives no random_state, so `seed` controls only the
    # data generation, not the train/test partition.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Column 1 of predict_proba is used as the score for the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue
    
    

In [None]:
# The other simulation cells in this notebook unpack (results, futures) from a
# simulate-decorated call, so unpack here too instead of binding the whole tuple
# to `result`, which the next cell turns into a DataFrame.
result, samplesize_futures = simulate_samplesize()

In [None]:
# Reshape to long form: melt stores each original column label in "param" and each
# cell in "value" (columns are presumably the simulated parameter values -- confirm).
df_result = pd.DataFrame(result).melt(var_name="param")
# Each "value" cell is the (score, permutation_scores, pvalue) tuple returned by the
# simulation; expand it into three named columns.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_samplesize.pkl")
# loaded = pd.read_pickle("simulate_samplesize.pkl")

In [None]:
# Run the n_feats simulation; the futures are kept alongside the results.
nfeats_result, nfeats_futures = simulate_nfeats()
# Reshape to long form: melt stores each original column label in "param" and each
# cell in "value" (columns are presumably the simulated parameter values -- confirm).
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
# Expand each (score, permutation_scores, pvalue) tuple into three named columns.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_nfeats.pkl")

In [None]:
# Run the maha simulation; the futures are kept alongside the results.
maha_result, maha_futures = simulate_maha()
# Reshape to long form: melt stores each original column label in "param" and each
# cell in "value" (columns are presumably the simulated parameter values -- confirm).
df_result = pd.DataFrame(maha_result).melt(var_name="param")
# Expand each (score, permutation_scores, pvalue) tuple into three named columns.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_maha.pkl")
# loaded = pd.read_pickle("simulate_maha.pkl")

In [None]:
# Run the class-ratio simulation; the futures are kept alongside the results.
ratio_result, ratio_futures = simulate_ratio()
# Build the frame from ratio_result. The earlier code used nfeats_result here
# (copy-paste slip), which wrote the n_feats results into simulate_ratio.pkl.
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
# Expand each (score, permutation_scores, pvalue) tuple into three named columns.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_ratio.pkl")

In [None]:
# Shut the dask client down after the post-hoc simulations.
# NOTE(review): the pre-training simulation cells further down also report using a
# dask client, so a top-to-bottom run may need this moved to the end -- confirm.
rhino_client.shutdown()

## Compare with original permutation test

In [None]:
from sklearn.metrics import make_scorer

In [13]:
def _train_score(estimator, X_train, X_test, y_train, y_test,
                score_func, shuffle_labels=False):
    """Fit ``estimator`` on the training split and score it on the test split.

    Parameters
    ----------
    estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``; it is fitted in place.
    X_train, X_test
        Training and test inputs, passed through to the estimator unchanged.
    y_train, y_test
        Training and test labels.
    score_func : callable
        Called as ``score_func(y_true=y_test, y_score=...)`` with column 1 of
        ``predict_proba(X_test)``.
    shuffle_labels : bool, default False
        If True, the training labels are randomly permuted before fitting
        (unseeded generator, so each call draws a fresh permutation).

    Returns
    -------
    The value returned by ``score_func``.
    """
    if shuffle_labels:
        # Shuffle by position on a plain array: indexing a pandas Series that has an
        # integer index with an integer array is label-based (KeyError or wrong order
        # after a train/test split), and a Python list cannot be indexed by an array.
        labels = np.asarray(y_train)
        indices = np.random.default_rng().permutation(len(labels))
        y_train = labels[indices]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    return score_func(y_true=y_test, y_score=y_pred)



def pre_training_permutation(estimator, X_train, X_test, y_train, y_test,
                            n_permutations, score_func, verbose=False, n_jobs=None):
    """Permutation test in which the estimator is refit on shuffled training labels.

    A clone of ``estimator`` is fitted once on the true labels to obtain the observed
    score, then ``n_permutations`` further clones are each fitted on a fresh shuffle of
    ``y_train`` (run through joblib ``Parallel``) to build the null scores.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` where ``pvalue`` is
        ``(count(null >= score) + 1) / (n_permutations + 1)``.
    """
    split = (X_train, X_test, y_train, y_test)
    score = _train_score(clone(estimator), *split, score_func, shuffle_labels=False)
    shuffled_fits = (
        delayed(_train_score)(clone(estimator), *split, score_func, shuffle_labels=True)
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(Parallel(n_jobs=n_jobs, verbose=verbose)(shuffled_fits))
    n_at_least_observed = np.sum(permutation_scores >= score)
    pvalue = (n_at_least_observed + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue


In [14]:
@simulate(parameter_range=np.linspace(0., 1.5, 25), n_sim=100)
def simulate_maha_pre(param=None, seed=None):
    """Pre-training permutation test on a held-out split for a given ``maha`` value.

    ``param`` is passed as ``maha`` to ``random_data_gen`` and ``seed`` is forwarded to
    it; the train/test split itself receives no random_state. Returns the
    ``(score, null, p)`` tuple produced by ``pre_training_permutation``.
    """
    features, labels = random_data_gen(n_samples=1000, n_feats=5, maha=param, ratio=0.5, seed=seed)
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    model = LogisticRegressionCV(class_weight='balanced', Cs=4)
    score, null, p = pre_training_permutation(
        model,
        X_train, X_test, y_train, y_test,
        n_permutations=5000,
        score_func=roc_auc_score,
        verbose=True,
        n_jobs=-1,
    )
    return score, null, p

Running 100 simulations
using dask client at http://128.91.77.24:51360/status


In [15]:
# Run the pre-training maha simulation; the futures are kept so that failures can be
# inspected afterwards.
maha_result, maha_futures = simulate_maha_pre()
# Reshape to long form: melt stores each original column label in "param" and each
# cell in "value" (columns are presumably the simulated parameter values -- confirm).
df_result = pd.DataFrame(maha_result).melt(var_name="param")
# Expand each (score, null, p) tuple into three named columns.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_maha_pre.pkl")

Running 100 simulations
Using dask client at http://128.91.77.24:51360/status
2500 parallel jobs
[#######################                 ] | 57% Completed |  2hr 55min 35.9s

In [12]:
# Collect the exceptions raised by the maha futures, one entry per future index.
# NOTE(review): this cell's execution count (12) precedes the cells above it, so the
# table below was presumably produced by an earlier run -- confirm.
da.get_exceptions(maha_futures, range(len(maha_futures)))

Unnamed: 0_level_0,exception,traceback_obj
param,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2f05816a40>
1,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2efbf929c0>
2,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2f05e67500>
3,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2f016b25c0>
4,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2f2d86b280>
...,...,...
2495,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2ef1f74e00>
2496,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2eefe5f200>
2497,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2eea7a98c0>
2498,"TypeError(""predict_proba() missing 1 required ...",<traceback object at 0x7f2ee83e6a00>
