In [1]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import scipy as scp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed
import pickle
from permutation_helpers import random_data_gen, post_hoc_permutation, simulate
from dask.distributed import progress, Client

## Setup

In [None]:
## IRIS dataset
# X_iris, y_iris = datasets.load_iris(return_X_y=True)
# mask = y_iris < 2
# X_iris, y_iris = X_iris[mask], y_iris[mask]

In [None]:
# Load the pre-generated feature matrix and label vector saved as .npy files.
X, y = np.load("random_data_X.npy"), np.load("random_data_y.npy")

In [None]:
# Settings shared by the pre- and post-training permutation cells below.
n_permutations = 5000
# Fixed random_state so every cell that receives `cv` sees the same folds.
cv = KFold(n_splits=3, shuffle=True, random_state=0)
estimator = LogisticRegressionCV(Cs=4, class_weight='balanced')

## Pre-training permutation

In [None]:
# scikit-learn's built-in permutation test: the estimator is refit on
# label-permuted data `n_permutations` times and scored with ROC AUC.
score, null, p = permutation_test_score(
    estimator=estimator,
    X=X,
    y=y,
    cv=cv,
    scoring='roc_auc',
    n_permutations=n_permutations,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# Display the permutation scores (second value returned by permutation_test_score above).
null

## Post-training permutation

In [None]:
# Out-of-fold predicted class probabilities from a fresh clone of the estimator,
# using the same CV splitter as the pre-training permutation test.
y_pred = cross_val_predict(clone(estimator), X, y, cv=cv, method='predict_proba')
# Post-hoc permutation on the second probability column is currently disabled:
# post_score, post_null, post_p = post_hoc_permutation(y, y_pred[:, 1], n_jobs=-1, verbose=True, n_permutations=n_permutations)

In [None]:
# Per-fold ROC AUC on the unpermuted labels, for reference against the permutation scores.
cross_val_score(clone(estimator), X, y, cv=cv, scoring="roc_auc", verbose=1)

The approach above permutes labels across CV folds. To permute within CV folds instead, we need something like the helper imported below: run the permutation function for each fold, then aggregate the results.

In [None]:
from permutation_helpers import post_hoc_permutation_cv

## Running simulations

In [3]:
@simulate(parameter_range=np.linspace(0, 5), n_sim=10, client=Client())
def scale(param=None, x=None, intercept=None, seed=None):
    """Toy simulation used to exercise the ``simulate`` decorator.

    Computes ``param * x + intercept``. ``param`` is supplied by the
    decorator from ``parameter_range``; ``seed`` is accepted for interface
    compatibility and is not used.
    """
    scaled = param * x
    return scaled + intercept

Running 10 simulations
using dask client at http://127.0.0.1:8787/status


In [4]:
# try:
#     client.shutdown()
#     client = Client()
# except:
#     client = Client()
# Calling the decorated function launches the simulations. `param` and `seed`
# are not passed here; presumably the `simulate` wrapper supplies them.
# A later cell uses `result` as a pair: result[1] is called on result[0].
result = scale(x=np.array([1, 2, 4]), intercept=2)
result

Running 10 simulations
Using dask client at http://127.0.0.1:8787/status
500 parallel jobs


([<Future: pending, key: scale-5161944533a84565037563cef54d136e>,
  <Future: pending, key: scale-41e5df86842a69c850466cf2d272c8b1>,
  <Future: pending, key: scale-35defe9616b5941a8b2f79f1f963ecbe>,
  <Future: pending, key: scale-b89a8ab1bf32d2057a460ea187d36eea>,
  <Future: pending, key: scale-68729f502bddf463e135e312eeebfce1>,
  <Future: pending, key: scale-5b655d7839af9f601342441efe6980be>,
  <Future: pending, key: scale-be3e2127192820188cb0f780689d7171>,
  <Future: pending, key: scale-57da071cff37646d151f621b2e47b451>,
  <Future: pending, key: scale-eef3efac0adc9bb1f0524a369c678fb3>,
  <Future: pending, key: scale-bafd0731cf79175ca3863c0766b9acfe>,
  <Future: pending, key: scale-b6d01cee9f491ad0e46b97a8d3951809>,
  <Future: pending, key: scale-85ffaa4655c22402321c0eab112802d3>,
  <Future: pending, key: scale-ae9be0ea083d3f291921a07c3e784dd7>,
  <Future: pending, key: scale-dd540a8d8ce9fc42ecfdc79b5f8b5bb7>,
  <Future: pending, key: scale-cdcf82b7bcc4619609bc403e72ee34a3>,
  <Future:

In [6]:
# Call the second element of `result` on the first; presumably this is the
# wrapper's gather function applied to the submitted futures — TODO confirm.
result[1](result[0])

{0.0: {0: array([2., 2., 2.]),
  1: array([2., 2., 2.]),
  2: array([2., 2., 2.]),
  3: array([2., 2., 2.]),
  4: array([2., 2., 2.]),
  5: array([2., 2., 2.]),
  6: array([2., 2., 2.]),
  7: array([2., 2., 2.]),
  8: array([2., 2., 2.]),
  9: array([2., 2., 2.])},
 0.10204081632653061: {0: array([2.10204082, 2.20408163, 2.40816327]),
  1: array([2.10204082, 2.20408163, 2.40816327]),
  2: array([2.10204082, 2.20408163, 2.40816327]),
  3: array([2.10204082, 2.20408163, 2.40816327]),
  4: array([2.10204082, 2.20408163, 2.40816327]),
  5: array([2.10204082, 2.20408163, 2.40816327]),
  6: array([2.10204082, 2.20408163, 2.40816327]),
  7: array([2.10204082, 2.20408163, 2.40816327]),
  8: array([2.10204082, 2.20408163, 2.40816327]),
  9: array([2.10204082, 2.20408163, 2.40816327])},
 0.20408163265306123: {0: array([2.20408163, 2.40816327, 2.81632653]),
  1: array([2.20408163, 2.40816327, 2.81632653]),
  2: array([2.20408163, 2.40816327, 2.81632653]),
  3: array([2.20408163, 2.40816327, 2.816

In [None]:
# `result` is the pair produced by the decorated call: result[1] is callable
# on result[0] (see the gather cell above). Persist the gathered values, not
# the pair itself: live handles are not usable once the client is gone.
simulation_results = result[1](result[0])
with open("simulation_results.pkl", "wb") as f:
    pickle.dump(simulation_results, f)

## Simulate permutations with different params

In [2]:
import cmldask.CMLDask as da
# Start a dask client through the cmldask helper. Resource limits and the
# worker scratch/log directories are fixed here.
# NOTE(review): the directories are absolute, user-specific paths.
rhino_client = da.new_dask_client(
    job_name="simulations",
    memory_per_job="1.5GB",
    max_n_jobs=400,
    threads_per_job=10,
    adapt=True,
    local_directory="/home1/jrudoler/dask_worker_space",
    log_directory="/home1/jrudoler/logs/",
)

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:192.168.86.143:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser


In [None]:
# NOTE(review): when the notebook is run top to bottom, this shuts the client
# down right after it is created, while later cells still use `rhino_client`.
# Run it manually only when the cluster should be torn down.
rhino_client.shutdown()

In [None]:
# Ask the cluster for 400 workers (the same number as max_n_jobs when the client was created).
rhino_client.cluster.scale(400)

In [3]:
@simulate(parameter_range=np.logspace(2, 5, 5).astype(int), n_sim=500)
def simulate_samplesize(param=None, seed=None):
    """Post-hoc permutation test on generated data, sweeping the sample size.

    Parameters
    ----------
    param : int
        Number of samples passed to ``random_data_gen``. The decorator sweeps
        five log-spaced values between 1e2 and 1e5.
    seed : optional
        Forwarded to ``random_data_gen`` and used as ``random_state`` for the
        train/test split. Assumed to be an int or ``None`` — TODO confirm
        against what ``simulate`` passes.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as unpacked from
        ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=param, n_feats=10, maha=0., ratio=0.5, seed=seed)
    # Seed the split too, so a given seed reproduces the whole simulation
    # and not only the generated data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, shuffle=True, random_state=seed
    )
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=6)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Second predict_proba column is the score handed to the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
    )
    return score, permutation_scores, pvalue

@simulate(parameter_range=np.linspace(0., 1.5, 5), n_sim=500)
def simulate_maha(param=None, seed=None):
    """Post-hoc permutation test on generated data, sweeping ``maha``.

    Parameters
    ----------
    param : float
        Value passed as ``maha`` to ``random_data_gen`` (presumably the
        Mahalanobis distance between classes — TODO confirm). The decorator
        sweeps five values from 0 to 1.5.
    seed : optional
        Forwarded to ``random_data_gen`` and used as ``random_state`` for the
        train/test split. Assumed to be an int or ``None`` — TODO confirm
        against what ``simulate`` passes.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as unpacked from
        ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=1000, n_feats=10, maha=param, ratio=0.5, seed=seed)
    # Seed the split too, so a given seed reproduces the whole simulation
    # and not only the generated data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, shuffle=True, random_state=seed
    )
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=6)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Second predict_proba column is the score handed to the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
    )
    return score, permutation_scores, pvalue

@simulate(parameter_range=np.logspace(1, 10, 5, base=2).astype(int), n_sim=500)
def simulate_nfeats(param=None, seed=None):
    """Post-hoc permutation test on generated data, sweeping the feature count.

    Parameters
    ----------
    param : int
        Number of features passed to ``random_data_gen``. The decorator sweeps
        five log-spaced values between 2 and 1024.
    seed : optional
        Forwarded to ``random_data_gen`` and used as ``random_state`` for the
        train/test split. Assumed to be an int or ``None`` — TODO confirm
        against what ``simulate`` passes.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as unpacked from
        ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=1000, n_feats=param, maha=0., ratio=0.5, seed=seed)
    # Seed the split too, so a given seed reproduces the whole simulation
    # and not only the generated data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, shuffle=True, random_state=seed
    )
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=6)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Second predict_proba column is the score handed to the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
    )
    return score, permutation_scores, pvalue

@simulate(parameter_range=np.linspace(.1, .9, 8), n_sim=500)
def simulate_ratio(param=None, seed=None):
    """Post-hoc permutation test on generated data, sweeping ``ratio``.

    Parameters
    ----------
    param : float
        Value passed as ``ratio`` to ``random_data_gen`` (presumably the class
        balance — TODO confirm). The decorator sweeps eight values from 0.1
        to 0.9.
    seed : optional
        Forwarded to ``random_data_gen`` and used as ``random_state`` for the
        train/test split. Assumed to be an int or ``None`` — TODO confirm
        against what ``simulate`` passes.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as unpacked from
        ``post_hoc_permutation``.
    """
    X, y = random_data_gen(n_samples=1000, n_feats=10, maha=0., ratio=param, seed=seed)
    # Seed the split too, so a given seed reproduces the whole simulation
    # and not only the generated data.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, shuffle=True, random_state=seed
    )
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=6)
    n_permutations = 5000
    estimator.fit(X=X_train, y=y_train)
    # Second predict_proba column is the score handed to the permutation test.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
    )
    return score, permutation_scores, pvalue
    
    

Running 500 simulations
using dask client at http://192.168.86.143:51360/status
Running 500 simulations
using dask client at http://192.168.86.143:51360/status
Running 500 simulations
using dask client at http://192.168.86.143:51360/status
Running 500 simulations
using dask client at http://192.168.86.143:51360/status


In [4]:
# Launch all four parameter sweeps. Each decorated function returns a pair that
# later cells use as (futures, gather): gather(futures) collects the results.
samplesize_futures, samplesize_gather = simulate_samplesize()
nfeats_futures, nfeats_gather = simulate_nfeats()
maha_futures, maha_gather = simulate_maha()
ratio_futures, ratio_gather = simulate_ratio()

Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
2500 parallel jobs
Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
2500 parallel jobs
Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
2500 parallel jobs
Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
4000 parallel jobs


In [32]:
# Collect the sample-size sweep and reshape it to one row per (param, simulation).
samplesize_result = samplesize_gather(samplesize_futures)
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
# Each `value` entry holds the three values returned by the simulation;
# spread them into named columns and remove the packed column.
df_result[["score", "perm_scores", "pval"]] = df_result.pop("value").apply(pd.Series)
df_result.to_pickle("simulate_samplesize_post.pkl")
# Reload later with: pd.read_pickle("simulate_samplesize_post.pkl")

Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
5000 parallel jobs
[########################################] | 100% Completed |  8min 37.3s

In [None]:
# Collect the feature-count sweep and reshape it to one row per (param, simulation).
nfeats_result = nfeats_gather(nfeats_futures)
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
# Each `value` entry holds the three values returned by the simulation;
# spread them into named columns and remove the packed column.
df_result[["score", "perm_scores", "pval"]] = df_result.pop("value").apply(pd.Series)
df_result.to_pickle("simulate_nfeats_post.pkl")

In [5]:
# Show a dask progress display for the `maha` sweep futures.
progress(maha_futures)

VBox()

In [8]:
# Collect the `maha` sweep and reshape it to one row per (param, simulation).
maha_result = maha_gather(maha_futures)
df_result = pd.DataFrame(maha_result).melt(var_name="param")
# Each `value` entry holds the three values returned by the simulation;
# spread them into named columns and remove the packed column.
df_result[["score", "perm_scores", "pval"]] = df_result.pop("value").apply(pd.Series)
df_result.to_pickle("simulate_maha_post.pkl")

Running 500 simulations
Using dask client at http://127.0.0.1:8787/status
2500 parallel jobs


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
# NOTE(review): leftover interactive help lookup (IPython `?` syntax); not part of the analysis.
pd.DataFrame.melt?

In [34]:

# Collect the `ratio` sweep first: `ratio_result` was never assigned anywhere,
# so this cell raised NameError on a fresh kernel.
ratio_result = ratio_gather(ratio_futures)
# Reshape to one row per (param, simulation); the three values returned by
# each simulation are spread into named columns.
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_ratio_post.pkl")

Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
4000 parallel jobs
[########################################] | 100% Completed |  0.0s

In [None]:
# Shut the dask client down.
# NOTE(review): later cells still reference `rhino_client` (e.g. rhino_client.cancel),
# so a new client is needed before running them.
rhino_client.shutdown()

## Compare with original permutation test

In [35]:
def _train_score(estimator, X_train, X_test, y_train, y_test, 
                score_func, shuffle_labels=False):
    if shuffle_labels:
        indices = np.random.default_rng().permutation(len(y_train))
        y_train = y_train[indices]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict_proba(X_test)[:,1]
    score = score_func(y_true=y_test, y_score=y_pred)
    return score



def pre_training_permutation(estimator, X_train, X_test, y_train, y_test,
                            n_permutations, score_func, verbose=False, n_jobs=None):
    """Permutation test that refits the model on shuffled training labels.

    A clone of ``estimator`` is first fit on the true labels to get the
    observed score. Then ``n_permutations`` further clones are each fit on
    independently shuffled training labels (via joblib) to build the null
    distribution.

    Returns ``(score, permutation_scores, pvalue)`` where ``pvalue`` is
    ``(#{null >= score} + 1) / (n_permutations + 1)``.
    """
    split = (X_train, X_test, y_train, y_test)
    score = _train_score(clone(estimator), *split, score_func, shuffle_labels=False)

    runner = Parallel(n_jobs=n_jobs, verbose=verbose)
    null_jobs = (
        delayed(_train_score)(clone(estimator), *split, score_func, shuffle_labels=True)
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(runner(null_jobs))

    # The +1 terms count the observed score as one member of the null set.
    n_at_least = np.sum(permutation_scores >= score)
    pvalue = (n_at_least + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue


In [36]:
@simulate(parameter_range=np.linspace(0., 1.5, 10), n_sim=500)
def simulate_maha_pre(param=None, seed=None):
    """Pre-training permutation test on generated data, sweeping ``maha``.

    Parameters
    ----------
    param : float
        Value passed as ``maha`` to ``random_data_gen`` (presumably the
        Mahalanobis distance between classes — TODO confirm). The decorator
        sweeps ten values from 0 to 1.5.
    seed : optional
        Forwarded to ``random_data_gen`` and used as ``random_state`` for the
        train/test split. Assumed to be an int or ``None`` — TODO confirm
        against what ``simulate`` passes.

    Returns
    -------
    tuple
        ``(score, null, p)`` as returned by ``pre_training_permutation``.

    Notes
    -----
    NOTE(review): this uses ``Cs=4`` and a 10-point sweep, while the post-hoc
    ``simulate_maha`` uses ``Cs=6`` and 5 points; confirm that is intended
    for the comparison.
    """
    X, y = random_data_gen(n_samples=1000, n_feats=10, maha=param, ratio=0.5, seed=seed)
    # Seed the split too, so a given seed reproduces the data and the split
    # (the label permutations themselves remain unseeded).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.2, shuffle=True, random_state=seed
    )
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
    n_permutations = 5000
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        # joblib progress messages from inside a remotely executed task are
        # not useful in the notebook, so keep the inner loop quiet.
        verbose=False, n_jobs=-1
    )
    return score, null, p

Running 500 simulations
using dask client at http://192.168.86.143:51360/status


In [None]:
# Decorated simulations return (futures, gather), as in the post-hoc cells;
# the results come from calling gather on the futures. The names
# `maha_futures`/`maha_gather` are reused on purpose so the inspection and
# cancel cells below act on this (pre-training) run.
maha_futures, maha_gather = simulate_maha_pre()
maha_result = maha_gather(maha_futures)
# Reshape to one row per (param, simulation); the three values returned by
# each simulation are spread into named columns.
df_result = pd.DataFrame(maha_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result.to_pickle("simulate_maha_pre.pkl")

Running 500 simulations
Using dask client at http://192.168.86.143:51360/status
5000 parallel jobs
[                                        ] | 0% Completed | 27min 39.7s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed | 41min 34.0s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed | 45min 31.1s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed | 59min 35.4s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  1hr  3min 33.7s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  1hr 19min 33.0s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  1hr 36min  9.0s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  1hr 54min  6.4s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  2hr 12min  8.3s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  2hr 28min 33.9s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  2hr 46min  4.7s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  3hr  3min 49.7s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[                                        ] | 0% Completed |  3hr 21min 35.3s

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



[#                                       ] | 2% Completed |  4hr 46min 11.7s

In [7]:
# Ask the cmldask helper about exceptions for every future in `maha_futures`
# (all indices are passed) — presumably it reports those that failed.
da.get_exceptions(maha_futures, range(len(maha_futures)))

Exception: None of the given futures resulted in exceptions

In [8]:
# Cancel the futures in `maha_futures`.
rhino_client.cancel(maha_futures)

## Next steps
* Re-run post-hoc simulations with null maha distance
* Check out over/under-fitting with more or fewer regularization values. Compare to the theoretical AUC based on Mahalanobis distance
* Check bias of null model (proportion of false positives - at different $\alpha$ thresholds [.1, .05, .01, .005])
* Check power of the test for a non-null model (positive mahalanobis distance) (proportion of predicted positives)
* Compare bias and power/sensitivity to the pre-trained model

## Farther along
* See how class balance, number of samples, etc. affect the above

In [28]:
# Inspect a previously pickled results table; where test_cli.pkl comes from
# is not shown in this notebook.
pd.read_pickle("test_cli.pkl")

Unnamed: 0,param,score,perm_scores,pval
0,0.0,0.552584,"[0.5261418269230769, 0.5464743589743589, 0.493...",0.117764
1,0.0,0.521905,"[0.6139348370927319, 0.4930325814536341, 0.591...",0.289421
2,0.0,0.554046,"[0.4213969404186795, 0.4878220611916264, 0.528...",0.075848
3,0.0,0.554622,"[0.4754901960784314, 0.5567226890756303, 0.533...",0.087824
4,0.0,0.495398,"[0.511904761904762, 0.5066026410564226, 0.5304...",0.544910
...,...,...,...,...
95,1.5,0.668067,"[0.48839535814325735, 0.5104041616646658, 0.44...",0.001996
96,1.5,0.640456,"[0.5083033213285313, 0.521108443377351, 0.4257...",0.001996
97,1.5,1.000000,"[0.5496794871794873, 0.5052083333333334, 0.505...",0.001996
98,1.5,0.684195,"[0.4795673076923077, 0.5384615384615384, 0.494...",0.001996
