# Setup and imports

In [1]:
import warnings; warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import scipy as scp
import time
from copy import deepcopy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed
import pickle
from permutation_helpers import random_data_gen, post_hoc_permutation
from simulate import simulate
from dask.distributed import progress, Client, wait

## Set up client

In [5]:
# import cmldask.CMLDask as da
# rhino_client = da.new_dask_client_slurm(
#     job_name="simulations",
#     memory_per_job="2GB",
#     max_n_jobs=400, threads_per_job=1, 
#     adapt=False,
#     local_directory="/home1/jrudoler/dask-worker-space",
#     log_directory="/home1/jrudoler/logs/",
# )
# client = Client()

In [2]:
from dask_jobqueue import SGECluster

## Describe the SGE jobs backing the cluster: 8 cores and 2GB of memory
cluster = SGECluster(cores=8, memory="2GB")

## Attach a Dask client to that cluster (it is scaled in a later cell)
client = Client(cluster)

In [6]:
## Ask the cluster behind `client` to scale to 5
client.cluster.scale(5)

In [8]:
## Smoke test: submit 1000 trivial squaring tasks through the client
futures = client.map(lambda x: x**2, range(1000))

In [12]:
## Collect the results of the smoke-test tasks
results = client.gather(futures)

In [14]:
## NOTE(review): this shuts `client` down; cells further below still wait on and gather
## futures, so they presumably need a live client -- confirm the intended run order
client.shutdown()

# Post-hoc simulations

#### Post-hoc simulation parameters

In [4]:
### shared parameters
## classifier settings: grid of 8 log-spaced penalty (C) values plus class weighting
class_params = dict(
    C=np.logspace(np.log10(1e-4), np.log10(1e5), 8),
    class_weight="balanced",
)
## number of label permutations used by each permutation test
permutation_params = dict(n_permutations=5000)
## number of simulations handed to the `simulate` decorator
sim_params = dict(n_sim=500)
## data-generation settings shared by every simulation configuration
data_gen_params = dict(
    maha=np.linspace(0., 1.5, 5)[4],
    psi_diag=1.0,
    psi_offdiag=0.,
    ddof=150,
)
## Settings for the sample-size sweep (post-hoc test). "n_samples" is not listed here:
## the simulation function passes the swept value itself.
samplesize_params_post = {
    "sim":{
        ## swept values: 5 log-spaced sizes between 1e2 and 1e5, cast to int
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        ## "maha" comes from the shared data_gen_params; the former literal "maha":0.
        ## was always overwritten by the spread below, so it has been dropped
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    "classif": class_params,
    "perm": permutation_params
}

## Settings for the feature-count sweep (post-hoc test). "n_feats" is not listed here:
## the simulation function passes the swept value itself.
nfeats_params_post = {
    "sim":{
        ## swept values: 5 log-spaced (base 2) feature counts, cast to int
        "parameter_range":np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params
    },
    "data_gen": {
        ## "maha" comes from the shared data_gen_params; the former literal "maha":0.
        ## was always overwritten by the spread below, so it has been dropped
        "n_samples":1000,
        "class_ratio":0.5,
        **data_gen_params
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    "classif": class_params,
    "perm": permutation_params
}

## Settings for the class-ratio sweep (post-hoc test). "class_ratio" is not listed here:
## the simulation function passes the swept value itself.
ratio_params_post = {
    "sim":{
        ## swept values: 5 log-spaced ratios between .01 and .5
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5), #np.linspace(.1, .9, 5),
        **sim_params
    },
    "data_gen": {
        ## "maha" comes from the shared data_gen_params; the former literal "maha":0.
        ## was always overwritten by the spread below, so it has been dropped
        "n_samples":1000,
        "n_feats":10,
        **data_gen_params
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    "classif": class_params,
    "perm": permutation_params
}

## Settings for the test-set-size sweep (post-hoc test). The swept value is used as the
## `test_size` of the train/test split, so every data-generation argument is fixed here.
testsize_params_post = {
    "sim":{
        ## swept values: 5 log-spaced test fractions between .01 and .5
        "parameter_range":np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        ## made explicit so this configuration matches testsize_params_pre and the other
        ## fixed-ratio configurations instead of depending on random_data_gen's default
        "class_ratio":0.5,
        **data_gen_params
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    "classif": class_params,
    "perm": permutation_params
}

#### Simulation functions

In [5]:
@simulate(**samplesize_params_post["sim"])
def simulate_samplesize_post(param=None, seed=None, settings=samplesize_params_post):
    """Post-hoc permutation test on one simulated dataset of ``param`` samples.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``n_samples`` for the main dataset.
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` from ``post_hoc_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    cfg = deepcopy(settings)
    gen_kwargs = cfg["data_gen"]
    ## main dataset, split 80/20 into train and test
    X, y = random_data_gen(n_samples=param, seed=seed, **gen_kwargs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## separate 1000-sample validation set, drawn with seed=None
    X_val, y_val = random_data_gen(n_samples=1000, seed=None, **gen_kwargs)
    ## keep the fitted model with the best validation AUC (ties go to the later C)
    penalties = cfg["classif"].pop("C")
    best_auc, best_model = 0, None
    for penalty in penalties:
        model = LogisticRegression(**cfg["classif"], C=penalty)
        model.fit(X=X_train, y=y_train)
        val_auc = roc_auc_score(y_score=model.predict_proba(X_val)[:, 1], y_true=y_val)
        if val_auc >= best_auc:
            best_auc, best_model = val_auc, model
    ## score the test set once with the selected model, then permute post hoc
    test_scores = best_model.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test,
        y_score=test_scores,
        n_permutations=cfg["perm"]["n_permutations"],
        n_jobs=-1,
    )
    return score, permutation_scores, pvalue

@simulate(**testsize_params_post["sim"])
def simulate_testsize_post(param=None, seed=None, settings=testsize_params_post):
    """Post-hoc permutation test with ``param`` as the train/test split's ``test_size``.

    Parameters
    ----------
    param
        Passed to ``train_test_split`` as ``test_size``.
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` from ``post_hoc_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    cfg = deepcopy(settings)
    gen_kwargs = cfg["data_gen"]
    ## main dataset, split according to the swept test size
    X, y = random_data_gen(seed=seed, **gen_kwargs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
    ## validation set generated with the same settings but seed=None
    X_val, y_val = random_data_gen(seed=None, **gen_kwargs)
    ## keep the fitted model with the best validation AUC (ties go to the later C)
    penalties = cfg["classif"].pop("C")
    best_auc, best_model = 0, None
    for penalty in penalties:
        model = LogisticRegression(**cfg["classif"], C=penalty)
        model.fit(X=X_train, y=y_train)
        val_auc = roc_auc_score(y_score=model.predict_proba(X_val)[:, 1], y_true=y_val)
        if val_auc >= best_auc:
            best_auc, best_model = val_auc, model
    ## score the test set once with the selected model, then permute post hoc
    test_scores = best_model.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test,
        y_score=test_scores,
        n_permutations=cfg["perm"]["n_permutations"],
        n_jobs=-1,
    )
    return score, permutation_scores, pvalue

@simulate(**nfeats_params_post["sim"])
def simulate_nfeats_post(param=None, seed=None, settings=nfeats_params_post):
    """Post-hoc permutation test on one simulated dataset with ``param`` features.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``n_feats`` (main and validation sets).
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` from ``post_hoc_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    cfg = deepcopy(settings)
    gen_kwargs = cfg["data_gen"]
    ## main dataset, split 80/20 into train and test
    X, y = random_data_gen(n_feats=param, seed=seed, **gen_kwargs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## validation set with the same number of features, drawn with seed=None
    X_val, y_val = random_data_gen(n_feats=param, seed=None, **gen_kwargs)
    ## keep the fitted model with the best validation AUC (ties go to the later C)
    penalties = cfg["classif"].pop("C")
    best_auc, best_model = 0, None
    for penalty in penalties:
        model = LogisticRegression(**cfg["classif"], C=penalty)
        model.fit(X=X_train, y=y_train)
        val_auc = roc_auc_score(y_score=model.predict_proba(X_val)[:, 1], y_true=y_val)
        if val_auc >= best_auc:
            best_auc, best_model = val_auc, model
    ## score the test set once with the selected model, then permute post hoc
    test_scores = best_model.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test,
        y_score=test_scores,
        n_permutations=cfg["perm"]["n_permutations"],
        n_jobs=-1,
    )
    return score, permutation_scores, pvalue

@simulate(**ratio_params_post["sim"])
def simulate_ratio_post(param=None, seed=None, settings=ratio_params_post):
    """Post-hoc permutation test on one simulated dataset with class ratio ``param``.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``class_ratio`` (main and validation sets).
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` from ``post_hoc_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    cfg = deepcopy(settings)
    gen_kwargs = cfg["data_gen"]
    ## main dataset, split 80/20 into train and test
    X, y = random_data_gen(class_ratio=param, seed=seed, **gen_kwargs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## validation set with the same class ratio, drawn with seed=None
    X_val, y_val = random_data_gen(class_ratio=param, seed=None, **gen_kwargs)
    ## keep the fitted model with the best validation AUC (ties go to the later C)
    penalties = cfg["classif"].pop("C")
    best_auc, best_model = 0, None
    for penalty in penalties:
        model = LogisticRegression(**cfg["classif"], C=penalty)
        model.fit(X=X_train, y=y_train)
        val_auc = roc_auc_score(y_score=model.predict_proba(X_val)[:, 1], y_true=y_val)
        if val_auc >= best_auc:
            best_auc, best_model = val_auc, model
    ## score the test set once with the selected model, then permute post hoc
    test_scores = best_model.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test,
        y_score=test_scores,
        n_permutations=cfg["perm"]["n_permutations"],
        n_jobs=-1,
    )
    return score, permutation_scores, pvalue

Running 500 simulations
Using dask client at http://127.0.0.1:8787/status
Running 500 simulations
Using dask client at http://127.0.0.1:8787/status
Running 500 simulations
Using dask client at http://127.0.0.1:8787/status
Running 500 simulations
Using dask client at http://127.0.0.1:8787/status


#### Run functions

In [None]:
## Launch the four post-hoc sweeps. Each decorated function is called without arguments
## and its return value is unpacked into a futures object and a gather callable.
testsize_futures_post, testsize_gather = simulate_testsize_post()
samplesize_futures_post, samplesize_gather = simulate_samplesize_post()
nfeats_futures_post, nfeats_gather = simulate_nfeats_post()
ratio_futures_post, ratio_gather = simulate_ratio_post()

Key:       simulate_testsize_post-6b4451ba466ae72e4f33d5779c88d9b9
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.07071067811865475, 'seed': 0}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-bc6e1173499f0cbd9437c9e36c714868
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 0}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-09e5274a68166043fbacc8c07900d945
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 1}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-30dc2ad67aef644c4783804f0a0ca176
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param'

2500 parallel jobs


Key:       simulate_testsize_post-ce05db0a062050e61af9c9dae95ea71e
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.01, 'seed': 81}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-7a7447dda156f28c89ccda3d3b6bcfea
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.01, 'seed': 82}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-8a78598a083b65cc13f5baf63954bd06
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.07071067811865475, 'seed': 80}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-a93143527c171b2a4b56bdfbca66bcea
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.18803015465431966, 'seed'

2500 parallel jobs


Key:       simulate_testsize_post-b89083b596b7a6fc0be70e8af9cc242c
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.18803015465431966, 'seed': 206}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-82cfd377d2d36e36a666c5b00cf4ad93
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.5, 'seed': 207}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-50ebc17bc0e44e2d918a92eb4ae59dba
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.18803015465431966, 'seed': 205}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-714b96ffb7f1d4aa3c09bd494805cc03
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.02659147

2500 parallel jobs


Key:       simulate_testsize_post-a71d71ae42a8ee80d3cf805da5498f43
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.01, 'seed': 311}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-d4e3d22305b22279a82f1d69f73a9992
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 312}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-a6c90f133298a0bb7ea8a4ce003f9d54
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.5, 'seed': 311}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-39cba9ce4077b6966923a9fbb569098f
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 's

2500 parallel jobs


Key:       simulate_testsize_post-f7f3906da6d417e637a61202f0dc5325
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.07071067811865475, 'seed': 381}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-d2e0d999140291de9843209295a4e7f3
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 381}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-47540b32ed62bab872bab5b3b002e496
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 382}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-1559853e5c527c5f68ff9f78ddab5f48
Function:  simulate_testsize_post
args:      ()
kwargs:    {'

Key:       simulate_testsize_post-e773786a4f01e88a948d3a500d24121a
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 393}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-47d51e844b6a8c71328d137467540abd
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.026591479484724942, 'seed': 394}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-6b10e402f9e3d55e500b8049d3a40958
Function:  simulate_testsize_post
args:      ()
kwargs:    {'param': 0.18803015465431966, 'seed': 392}
Exception: 'TypeError("permutation_helpers.random_data_gen() got multiple values for keyword argument \'n_samples\'")'

Key:       simulate_testsize_post-978acca937e036354b1ed39d568687ad
Function:  simulate_testsize_post
args:      ()
kwargs:    {'

#### Gather results

In [7]:
import os
## Create the output folder for the current "maha" value (one decimal in the name);
## exist_ok=True makes re-running this cell harmless
os.makedirs(f"sim_results/maha_{data_gen_params['maha']:.1f}", exist_ok=True)

In [8]:
## Block until the test-size sweep has finished, then collect its results
wait(testsize_futures_post)
testsize_result = testsize_gather(testsize_futures_post)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = testsize_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_testsize_post.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(testsize_futures_post)

In [9]:
## Block until the sample-size sweep has finished, then collect its results
wait(samplesize_futures_post)
samplesize_result = samplesize_gather(samplesize_futures_post)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = samplesize_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_samplesize_post.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(samplesize_futures_post)

In [10]:
## Block until the feature-count sweep has finished, then collect its results
wait(nfeats_futures_post)
nfeats_result = nfeats_gather(nfeats_futures_post)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = nfeats_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_nfeats_post.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(nfeats_futures_post)

In [11]:
## Block until the class-ratio sweep has finished, then collect its results
wait(ratio_futures_post)
ratio_result = ratio_gather(ratio_futures_post)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = ratio_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_post.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(ratio_futures_post)

In [23]:
## Cancel every post-hoc future in one call. Uses `client`, the Dask client this notebook
## creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(samplesize_futures_post + ratio_futures_post + nfeats_futures_post + testsize_futures_post)

In [24]:
## Shut down the Dask client this notebook creates (`rhino_client` is only defined in a
## commented-out cell, so calling it raises NameError on a fresh kernel)
client.shutdown()

# Pre-training permutations (original)

In [10]:
def _train_score(estimator, X_train, X_test, y_train, y_test, 
                score_func, shuffle_labels=False):
    if shuffle_labels:
        indices = np.random.default_rng().permutation(len(y_train))
        y_train = y_train[indices]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict_proba(X_test)[:,1]
    score = score_func(y_true=y_test, y_score=y_pred)
    return score



def pre_training_permutation(estimator, X_train, X_test, y_train, y_test,
                            n_permutations, score_func, verbose=False, n_jobs=None):
    """Permutation test that refits the model on shuffled training labels.

    A clone of ``estimator`` is fitted once on the true labels to get the observed
    score, then ``n_permutations`` further clones are fitted on shuffled training
    labels (through ``joblib.Parallel``) to build the null distribution.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` where ``permutation_scores`` is an
        array and ``pvalue`` is ``(#{null >= score} + 1) / (n_permutations + 1)``.
    """
    fit_args = (X_train, X_test, y_train, y_test, score_func)
    ## observed score with the true training labels
    score = _train_score(clone(estimator), *fit_args, shuffle_labels=False)
    ## one refit on shuffled labels per permutation, each on a fresh clone
    shuffled_fits = (
        delayed(_train_score)(clone(estimator), *fit_args, shuffle_labels=True)
        for _ in range(n_permutations)
    )
    runner = Parallel(n_jobs=n_jobs, verbose=verbose)
    permutation_scores = np.array(runner(shuffled_fits))
    ## the +1 terms count the observed score as part of the null distribution
    n_at_least = np.sum(permutation_scores >= score)
    pvalue = (n_at_least + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue

#### Simulation parameters

In [13]:
### shared parameters
# class_params = {
#     "C":1e-3,
#     "class_weight":"balanced"
# }
# permutation_params = {
#     "n_permutations": 5000
# }
# sim_params = {"n_sim": 150}
# data_gen_params = {
#     "maha":np.linspace(0., 1.5, 5)[0],
#     "psi_diag": 1.0,
#     "psi_offdiag": 0.,
#     "ddof": 150
# }
## Sample-size sweep (pre-training test); "n_samples" is passed by the simulation function
samplesize_params_pre = dict(
    sim={
        "parameter_range": np.logspace(2, 5, 5).astype(int),
        **sim_params,
    },
    data_gen={
        "n_feats": 10,
        "class_ratio": 0.5,
        **data_gen_params,
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    classif=class_params,
    perm=permutation_params,
)

## Feature-count sweep (pre-training test); "n_feats" is passed by the simulation function
nfeats_params_pre = dict(
    sim={
        "parameter_range": np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params,
    },
    data_gen={
        "n_samples": 1000,
        "class_ratio": 0.5,
        **data_gen_params,
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    classif=class_params,
    perm=permutation_params,
)

## Class-ratio sweep (pre-training test); "class_ratio" is passed by the simulation function
ratio_params_pre = dict(
    sim={
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5),  # np.linspace(.1, .9, 5),
        **sim_params,
    },
    data_gen={
        "n_samples": 1000,
        "n_feats": 10,
        **data_gen_params,
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    classif=class_params,
    perm=permutation_params,
)

## Test-set-size sweep (pre-training test); the swept value is the split's test_size
testsize_params_pre = dict(
    sim={
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params,
    },
    data_gen={
        "n_samples": 1000,
        "n_feats": 10,
        "class_ratio": 0.5,
        **data_gen_params,
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    classif=class_params,
    perm=permutation_params,
)

#### Simulation functions

In [14]:
@simulate(**samplesize_params_pre["sim"])
def simulate_samplesize_pre(param=None, seed=None, settings=samplesize_params_pre):
    """Pre-training permutation test on one simulated dataset of ``param`` samples.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``n_samples`` for the main dataset.
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, null, p)`` from ``pre_training_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    cfg = deepcopy(settings)
    gen_kwargs = cfg["data_gen"]
    ## main dataset, split 80/20 into train and test
    X, y = random_data_gen(n_samples=param, seed=seed, **gen_kwargs)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## separate 1000-sample validation set, drawn with seed=None
    X_val, y_val = random_data_gen(n_samples=1000, seed=None, **gen_kwargs)
    ## remember the penalty with the best validation AUC (ties go to the later C)
    penalties = cfg["classif"].pop("C")
    best_auc, best_C = 0, None
    for penalty in penalties:
        candidate = LogisticRegression(**cfg["classif"], C=penalty)
        candidate.fit(X=X_train, y=y_train)
        val_auc = roc_auc_score(y_score=candidate.predict_proba(X_val)[:, 1], y_true=y_val)
        if val_auc >= best_auc:
            best_auc, best_C = val_auc, penalty
    ## unfitted model with the tuned penalty; the permutation routine does the fitting
    estimator = LogisticRegression(**cfg["classif"], C=best_C)
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=cfg["perm"]["n_permutations"],
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

@simulate(**testsize_params_pre["sim"])
def simulate_testsize_pre(param=None, seed=None, settings=testsize_params_pre):
    """Pre-training permutation test with ``param`` as the split's ``test_size``.

    Parameters
    ----------
    param
        Passed to ``train_test_split`` as ``test_size``.
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, null, p)`` from ``pre_training_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
    ## Simulate a 1000-sample validation set. "n_samples" is overridden inside the kwargs
    ## dict: passing it explicitly next to **settings["data_gen"], which already holds
    ## "n_samples", raises TypeError for a duplicated keyword argument.
    val_kwargs = {**settings["data_gen"], "n_samples": 1000}
    X_val, y_val = random_data_gen(seed=None, **val_kwargs)
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up (unfitted) model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

@simulate(**nfeats_params_pre["sim"])
def simulate_nfeats_pre(param=None, seed=None, settings=nfeats_params_pre):
    """Pre-training permutation test on one simulated dataset with ``param`` features.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``n_feats`` (main and validation sets).
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, null, p)`` from ``pre_training_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate a 1000-sample validation set. "n_samples" is overridden inside the kwargs
    ## dict: passing it explicitly next to **settings["data_gen"], which already holds
    ## "n_samples", raises TypeError for a duplicated keyword argument.
    val_kwargs = {**settings["data_gen"], "n_samples": 1000}
    X_val, y_val = random_data_gen(n_feats=param, seed=None, **val_kwargs)
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up (unfitted) model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

@simulate(**ratio_params_pre["sim"])
def simulate_ratio_pre(param=None, seed=None, settings=ratio_params_pre):
    """Pre-training permutation test on one simulated dataset with class ratio ``param``.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``class_ratio`` (main and validation sets).
    seed
        Forwarded to ``random_data_gen`` for the main dataset only.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    tuple
        ``(score, null, p)`` from ``pre_training_permutation``.
    """
    ## private copy: "C" is popped from the classifier settings below
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate a 1000-sample validation set. "n_samples" is overridden inside the kwargs
    ## dict: passing it explicitly next to **settings["data_gen"], which already holds
    ## "n_samples", raises TypeError for a duplicated keyword argument.
    val_kwargs = {**settings["data_gen"], "n_samples": 1000}
    X_val, y_val = random_data_gen(class_ratio=param, seed=None, **val_kwargs)
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up (unfitted) model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status


#### Run functions

In [15]:
## Launch the four pre-training sweeps. Each decorated function is called without
## arguments and its return value is unpacked into a futures object and a gather callable.
testsize_futures_pre, testsize_gather = simulate_testsize_pre()
samplesize_futures_pre, samplesize_gather = simulate_samplesize_pre()
nfeats_futures_pre, nfeats_gather = simulate_nfeats_pre()
ratio_futures_pre, ratio_gather = simulate_ratio_pre()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [16]:
## Block until the test-size sweep has finished, then collect its results
wait(testsize_futures_pre)
testsize_result = testsize_gather(testsize_futures_pre)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = testsize_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_testsize_pre.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(testsize_futures_pre)

In [None]:
## Block until the sample-size sweep has finished, then collect its results
wait(samplesize_futures_pre)
samplesize_result = samplesize_gather(samplesize_futures_pre)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = samplesize_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_samplesize_pre.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(samplesize_futures_pre)

In [None]:
## Block until the feature-count sweep has finished, then collect its results
wait(nfeats_futures_pre)
nfeats_result = nfeats_gather(nfeats_futures_pre)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = nfeats_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_nfeats_pre.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(nfeats_futures_pre)

In [None]:
## Block until the class-ratio sweep has finished, then collect its results
wait(ratio_futures_pre)
ratio_result = ratio_gather(ratio_futures_pre)
## Melt to long format (former column labels go to "param"), then expand each stored
## result into score / permutation scores / p-value columns
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
## NOTE(review): `_metadata` is pandas-internal; `DataFrame.attrs` is the documented
## slot for user metadata -- confirm what downstream readers expect before changing
df_result._metadata = ratio_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_pre.pkl")
## Cancel the futures once the results are on disk. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(ratio_futures_pre)

In [35]:
## Cancel the class-ratio futures. Uses `client`, the Dask client this notebook creates;
## `rhino_client` is only defined in a commented-out cell.
client.cancel(ratio_futures_pre)

In [39]:
## Cancel every pre-training future in one call. Uses `client`, the Dask client this
## notebook creates; `rhino_client` is only defined in a commented-out cell.
client.cancel(ratio_futures_pre+nfeats_futures_pre+samplesize_futures_pre+testsize_futures_pre)

In [35]:
## Inspect the exceptions raised by the sample-size futures.
## NOTE(review): `da` is only imported in a commented-out line (`import cmldask.CMLDask as da`),
## so this cell raises NameError on a fresh kernel -- restore the import or drop the cell
da.get_exceptions(samplesize_futures_pre, range(len(samplesize_futures_pre)))

Unnamed: 0_level_0,exception,traceback_obj
param,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b42fe391ec0>
1,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4320207f40>
2,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b432015bc00>
3,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b43099f7640>
4,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b43250bb100>
...,...,...
2495,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4320f934c0>
2496,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b434257e440>
2497,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4342099880>
2498,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4313809600>


# Comparing runtime

In [38]:
## Runtime comparison: sweep the sample size; "n_samples" is passed by the simulation functions
runtime_params = dict(
    sim={
        "parameter_range": np.logspace(2, 5, 5).astype(int),
        **sim_params,
    },
    data_gen={
        "n_feats": 10,
        "class_ratio": 0.5,
        **data_gen_params,
    },
    ## shared (not copied) dicts; simulation functions deep-copy settings before editing
    classif=class_params,
    perm=permutation_params,
)

@simulate(**runtime_params["sim"])
def simulate_runtime_pre(param=None, seed=None, settings=runtime_params):
    """Time one full pre-training permutation test on a dataset of ``param`` samples.

    The timed span covers data generation, penalty tuning and the permutation test.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``n_samples``.
    seed
        Forwarded to ``random_data_gen`` for both the main and validation sets.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    float
        Elapsed wall-clock time in seconds (difference of two ``time.time()`` calls).
    """
    start = time.time()
    ## private copy: "C" is popped from the classifier settings below
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    ## NOTE(review): same seed and size as X, so this presumably reproduces the training
    ## data; harmless for timing, but confirm before using the tuned C elsewhere
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        ## track the running maximum: without updating max_AUC every C with AUC > 0
        ## replaced best_C, so the last C in the grid always won
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up (unfitted) model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    stop = time.time()
    return stop - start

@simulate(**runtime_params["sim"])
def simulate_runtime_post(param=None, seed=None, settings=runtime_params):
    """Time one full post-hoc permutation test on a dataset of ``param`` samples.

    The timed span covers data generation, penalty tuning, the final fit and the
    permutation test.

    Parameters
    ----------
    param
        Forwarded to ``random_data_gen`` as ``n_samples``.
    seed
        Forwarded to ``random_data_gen`` for both the main and validation sets.
    settings : dict
        Configuration with ``"data_gen"``, ``"classif"`` and ``"perm"`` entries.

    Returns
    -------
    float
        Elapsed wall-clock time in seconds (difference of two ``time.time()`` calls).
    """
    start = time.time()
    ## private copy: "C" is popped from the classifier settings below
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    ## NOTE(review): same seed and size as X, so this presumably reproduces the training
    ## data; harmless for timing, but confirm before using the tuned C elsewhere
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        ## track the running maximum: without updating max_AUC every C with AUC > 0
        ## replaced best_C, so the last C in the grid always won
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    stop = time.time()
    return stop - start

Running 500 simulations
Using dask client at http://192.168.86.120:51360/status
Running 500 simulations
Using dask client at http://192.168.86.120:51360/status


In [39]:
## Launch both timing sweeps. `runtime_gather` is bound twice, so the callable returned
## by the post-hoc sweep is the one that remains and is used for both sets of futures.
runtime_futures_pre, runtime_gather = simulate_runtime_pre()
runtime_futures_post, runtime_gather = simulate_runtime_post()

2500 parallel jobs
2500 parallel jobs


In [47]:
## Collect the timings of both procedures
runtime_result_pre = runtime_gather(runtime_futures_pre)
runtime_result_post = runtime_gather(runtime_futures_post)
## Long format (former column labels go to "param"), tagged with the procedure name
df_result_pre = pd.DataFrame(runtime_result_pre).melt(var_name="param").assign(test='pre')
df_result_post = pd.DataFrame(runtime_result_post).melt(var_name="param").assign(test='post')
## Stack both procedures into one frame and store it with its settings attached
df_result = pd.concat([df_result_pre, df_result_post])
df_result._metadata = runtime_params
df_result.to_pickle(f"sim_results/simulate_runtime.pkl")