# Setup and imports

In [1]:
import warnings; warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import time
import os
from copy import deepcopy
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
import pickle
from joblib.parallel import Parallel, delayed
from permutation_helpers import *
from simulate import simulate

# Dask imports
from dask.distributed import progress, Client, wait, Future
from dask_jobqueue import SGECluster
# from dask_jobqueue import SLURMCluster

# helpful functions for debugging and monitoring Dask jobs
def get_exceptions(futures: "Iterable[Future]", params: "Iterable" = None) -> pd.DataFrame:
    """
    Collect the exceptions of failed Dask futures into a DataFrame.

    Parameters
    ----------
    futures : iterable of Future
        Futures to inspect. Any iterable is accepted (including generators);
        it is materialised into a list once.
    params : iterable, optional
        Labels paired positionally with ``futures`` (via ``zip``) and used as
        the index of the result. Defaults to ``0 .. len(futures) - 1``.

    Returns
    -------
    pd.DataFrame
        One row per future whose ``status`` is ``"error"``, indexed by
        ``param``, with columns ``exception`` (``repr`` of the exception) and
        ``traceback_obj`` (the object returned by ``future.traceback()``).

    Raises
    ------
    Exception
        If none of the given futures has status ``"error"``.
    """
    # Annotations are strings so that defining this function does not depend on
    # ``Iterable`` happening to be re-exported by a star import.
    # Materialise once: generators have no len() and can only be walked once.
    futures = list(futures)
    if params is None:
        params = range(len(futures))
    records = [
        {
            "param": param,
            "exception": repr(future.exception()),
            "traceback_obj": future.traceback(),
        }
        for param, future in zip(params, futures)
        if future.status == "error"
    ]
    if not records:
        raise Exception("None of the given futures resulted in exceptions")
    columns = ["param", "exception", "traceback_obj"]
    return pd.DataFrame(records, columns=columns).set_index("param")

import traceback
def print_traceback(error_df, index):
    """
    Print the stack trace stored for one row of an exceptions table.

    ``error_df`` must have a ``traceback_obj`` column (as produced by
    ``get_exceptions``); ``index`` is the row label to look up.
    """
    stored_tb = error_df.loc[index, "traceback_obj"]
    traceback.print_tb(stored_tb)

## Set up client

In [2]:
## Describe a single SGE job; swap in SLURMCluster (or another
## dask_jobqueue cluster class) for a different scheduler.
cluster = SGECluster(
    name="permutations",
    processes=1,
    cores=4,  # threads per job
    memory="2GB",
    walltime="06:00:00",
    log_directory=os.path.join(os.environ["HOME"], "logs/"),
    local_directory=os.path.join(os.environ["HOME"], "dask-worker-space/"),
)

## Connect a client, then let the cluster scale itself up to 100 jobs.
client = Client(cluster)
cluster.adapt(maximum_jobs=100)

<distributed.deploy.adaptive.Adaptive at 0x148e3b964950>

In [11]:
## five evenly spaced `maha` settings between 0 and 1.5 (inclusive)
maha_values = np.linspace(start=0.0, stop=1.5, num=5)

def set_params(maha, save=True): 
    """
    Build the settings dictionaries for the four simulation families.

    Every family (sample size, number of features, class ratio, test size)
    starts from an independent deep copy of the same defaults and receives its
    own ``parameter_range`` under ``"sim"``. For the first three families the
    swept quantity is removed from ``"data_gen"`` so that it can be passed
    explicitly without a duplicate keyword.

    Parameters
    ----------
    maha : float
        Stored as ``data_gen["maha"]``; also used, formatted to one decimal,
        in the results directory name and the settings file names.
    save : bool, default True
        If True, pickle each settings dictionary to
        ``settings/<family>_params_maha_<maha>.pkl`` relative to the current
        working directory. The ``settings`` directory is created if missing.

    Returns
    -------
    tuple of dict
        ``(samplesize_params, nfeats_params, ratio_params, testsize_params)``,
        each with the keys ``sim``, ``data_gen``, ``classif``, ``perm`` and
        ``file``.
    """
    data_dir = os.path.join(os.environ["HOME"], "data")
    results_dir = os.path.join(data_dir, "sim_results", f"maha_{maha:.1f}")

    ## default parameters shared by all simulations
    default_params = {
        "sim": {
            "n_sim": 500,
        },
        "data_gen": {
            "maha": maha,
            "psi_diag": 1.0,
            "psi_offdiag": 0.,
            "ddof": 150,
            "n_samples": 1000,
            "n_feats": 10,
            "class_ratio": 0.5,
        },
        "classif": {
            "C": np.logspace(np.log10(1e-4), np.log10(1e5), 8),
            "class_weight": "balanced",
        },
        "perm": {
            "n_permutations": 5000,
        },
        "file": {
            ## NOTE(review): always True, independent of the `save` argument
            ## (which only controls pickling of the settings below) -- confirm
            ## that this is intended.
            "save": True,
            "results_dir": results_dir,
        },
    }

    ## set up parameters for specific simulations:
    ## vary one parameter at a time and remove the corresponding key from
    ## the data_gen dictionary to avoid conflicts
    samplesize_params = deepcopy(default_params)
    samplesize_params["sim"]["parameter_range"] = np.logspace(2, 5, 5).astype(int)
    samplesize_params["data_gen"].pop("n_samples")

    nfeats_params = deepcopy(default_params)
    nfeats_params["sim"]["parameter_range"] = np.logspace(1, 10, 5, base=2).astype(int)
    nfeats_params["data_gen"].pop("n_feats")

    ratio_params = deepcopy(default_params)
    ratio_params["sim"]["parameter_range"] = np.logspace(np.log10(.01), np.log10(.5), 5)
    ratio_params["data_gen"].pop("class_ratio")

    ## the test size is not a data_gen key, so nothing has to be removed
    testsize_params = deepcopy(default_params)
    testsize_params["sim"]["parameter_range"] = np.logspace(np.log10(.01), np.log10(.5), 5)

    if save:
        ## make sure the target directory exists and close every file handle
        os.makedirs("settings", exist_ok=True)
        to_save = {
            "samplesize": samplesize_params,
            "nfeats": nfeats_params,
            "ratio": ratio_params,
            "testsize": testsize_params,
        }
        for family, family_params in to_save.items():
            with open(f"settings/{family}_params_maha_{maha:.1f}.pkl", "wb") as fh:
                pickle.dump(family_params, fh)
    return samplesize_params, nfeats_params, ratio_params, testsize_params


## set up directories for saving results
## results are separated by the underlying probability distributions (and their mahalanobis distance)
for maha in maha_values:
    data_dir = os.path.join(os.environ["HOME"], "data")
    results_dir = os.path.join(data_dir, "sim_results", "maha_{:.1f}".format(maha))
    os.makedirs(results_dir, exist_ok=True)

# Post-hoc simulations

In [7]:
def _run_post_hoc_trial(settings, seed, simno, param, label,
                        test_size=0.2, data_overrides=None, val_overrides=None):
    """
    Run one post-hoc permutation simulation; shared by all `simulate_*_post` functions.

    Parameters
    ----------
    settings : dict
        Settings dictionary as returned by ``set_params``; it is deep-copied,
        so the caller's object is never modified.
    seed
        Forwarded to ``random_data_gen`` for the main dataset.
    simno : int
        Simulation number, used in the result file name.
    param
        Value of the swept parameter, used in the result file name.
    label : str
        Name of the swept parameter, used in the result file name
        (``post_<label>_<param>_simno_<simno>.pkl``).
    test_size : float
        Passed to ``train_test_split``.
    data_overrides, val_overrides : dict, optional
        Extra keyword arguments for ``random_data_gen`` when drawing the main
        dataset and the validation dataset respectively.

    Returns
    -------
    tuple
        ``(score, permutation_scores)`` as returned by ``post_hoc_permutation``.
    """
    settings = deepcopy(settings)
    data_overrides = data_overrides or {}
    val_overrides = val_overrides or {}
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **data_overrides, **settings["data_gen"])
    ## Split into train-test set
    ## NOTE(review): the split has no random_state and the validation set is
    ## drawn with seed=None, so a run is not fully determined by `seed`.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=True
    )
    ## Simulate validation set
    X_val, y_val = random_data_gen(seed=None, **val_overrides, **settings["data_gen"])
    ## iterate over possible penalty params, keeping the model with the best validation AUC
    max_AUC = 0
    best_estimator = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_estimator = estimator
    ## use model with tuned penalty
    y_pred = best_estimator.predict_proba(X_test)[:, 1]
    ## permutations
    score, permutation_scores = post_hoc_permutation(
        y_true=y_test,
        y_score=y_pred,
        n_permutations=settings["perm"]["n_permutations"],
        score_function=score_model,
        n_jobs=-1,
    )
    ## save with pickle; the context manager closes the file handle
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"],
            f"post_{label}_{param:.4f}_simno_{simno:05}.pkl",
        )
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores


for maha in maha_values:
    samplesize_params, nfeats_params, ratio_params, testsize_params = set_params(maha, save=True)

    @simulate(**samplesize_params["sim"])
    def simulate_samplesize_post(param=None, seed=None, simno=None, settings=samplesize_params):
        """Sweep the dataset size; the validation set always has 1000 samples."""
        return _run_post_hoc_trial(
            settings, seed, simno, param, "samplesize",
            data_overrides={"n_samples": param},
            val_overrides={"n_samples": 1000},
        )

    @simulate(**testsize_params["sim"])
    def simulate_testsize_post(param=None, seed=None, simno=None, settings=testsize_params):
        """Sweep the fraction of the dataset held out as test set."""
        return _run_post_hoc_trial(
            settings, seed, simno, param, "testsize",
            test_size=param,
        )

    @simulate(**nfeats_params["sim"])
    def simulate_nfeats_post(param=None, seed=None, simno=None, settings=nfeats_params):
        """Sweep the number of features (same value for the validation set)."""
        return _run_post_hoc_trial(
            settings, seed, simno, param, "nfeats",
            data_overrides={"n_feats": param},
            val_overrides={"n_feats": param},
        )

    @simulate(**ratio_params["sim"])
    def simulate_ratio_post(param=None, seed=None, simno=None, settings=ratio_params):
        """Sweep the class ratio (same value for the validation set)."""
        return _run_post_hoc_trial(
            settings, seed, simno, param, "ratio",
            data_overrides={"class_ratio": param},
            val_overrides={"class_ratio": param},
        )

    testsize_futures_post, testsize_gather = simulate_testsize_post()
    samplesize_futures_post, samplesize_gather = simulate_samplesize_post()
    nfeats_futures_post, nfeats_gather = simulate_nfeats_post()
    ratio_futures_post, ratio_gather = simulate_ratio_post()
    # wait for all futures to complete
    wait(testsize_futures_post + samplesize_futures_post + nfeats_futures_post + ratio_futures_post)
    # give cluster time to clean up before next iteration
    time.sleep(60)
    

Running 500 simulations
Using dask client at http://10.152.32.35:8787/status
Running 500 simulations
Using dask client at http://10.152.32.35:8787/status
Running 500 simulations
Using dask client at http://10.152.32.35:8787/status
Running 500 simulations
Using dask client at http://10.152.32.35:8787/status


In [8]:
# testsize_futures_post, testsize_gather = simulate_testsize_post()
# samplesize_futures_post, samplesize_gather = simulate_samplesize_post()
# nfeats_futures_post, nfeats_gather = simulate_nfeats_post()
# ratio_futures_post, ratio_gather = simulate_ratio_post()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


In [12]:
## Cancel the post-hoc simulation futures collected by the simulation loop above.
client.cancel(samplesize_futures_post + ratio_futures_post + nfeats_futures_post + testsize_futures_post)

In [13]:
## NOTE(review): cells further down in this notebook still submit work through
## `@simulate` (their recorded output reports a dask client). Run in document
## order, this call would come before them -- confirm where it belongs.
client.shutdown()

# Pre-training permutations (original)

#### Simulation functions

In [10]:
def _run_pre_training_trial(settings, seed, simno, param, label,
                            test_size=.2, data_overrides=None, val_overrides=None):
    """
    Run one pre-training permutation simulation; shared by all `simulate_*_pre` functions.

    Parameters
    ----------
    settings : dict
        Settings dictionary as returned by ``set_params``; it is deep-copied,
        so the caller's object is never modified.
    seed
        Forwarded to ``random_data_gen`` for the main dataset.
    simno : int
        Simulation number, used in the result file name.
    param
        Value of the swept parameter, used in the result file name.
    label : str
        Name of the swept parameter, used in the result file name
        (``pre_<label>_<param>_simno_<simno>.pkl``).
    test_size : float
        Passed to ``train_test_split``.
    data_overrides, val_overrides : dict, optional
        Extra keyword arguments for ``random_data_gen`` when drawing the main
        dataset and the validation dataset respectively.

    Returns
    -------
    tuple
        ``(score, permutation_scores)`` as returned by ``pre_training_permutation``.
    """
    settings = deepcopy(settings)
    data_overrides = data_overrides or {}
    val_overrides = val_overrides or {}
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **data_overrides, **settings["data_gen"])
    ## Split into train-test set
    ## NOTE(review): the split has no random_state and the validation set is
    ## drawn with seed=None, so a run is not fully determined by `seed`.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True)
    ## Simulate validation set
    X_val, y_val = random_data_gen(seed=None, **val_overrides, **settings["data_gen"])
    ## iterate over possible penalty params, keeping the C with the best validation AUC
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up (unfitted) model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    score, permutation_scores = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=settings["perm"]["n_permutations"],
        score_func=score_model,
        verbose=True, n_jobs=-1
    )
    ## save with pickle; the context manager closes the file handle
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"],
            f"pre_{label}_{param:.4f}_simno_{simno:05}.pkl",
        )
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores


for maha in maha_values:
    samplesize_params, nfeats_params, ratio_params, testsize_params = set_params(maha, save=False)

    @simulate(**samplesize_params["sim"])
    def simulate_samplesize_pre(param=None, seed=None, simno=None, settings=samplesize_params):
        """Sweep the dataset size; the validation set always has 1000 samples."""
        return _run_pre_training_trial(
            settings, seed, simno, param, "samplesize",
            data_overrides={"n_samples": param},
            val_overrides={"n_samples": 1000},
        )

    @simulate(**testsize_params["sim"])
    def simulate_testsize_pre(param=None, seed=None, simno=None, settings=testsize_params):
        """Sweep the fraction of the dataset held out as test set."""
        return _run_pre_training_trial(
            settings, seed, simno, param, "testsize",
            test_size=param,
        )

    @simulate(**nfeats_params["sim"])
    def simulate_nfeats_pre(param=None, seed=None, simno=None, settings=nfeats_params):
        """Sweep the number of features (same value for the validation set)."""
        return _run_pre_training_trial(
            settings, seed, simno, param, "nfeats",
            data_overrides={"n_feats": param},
            val_overrides={"n_feats": param},
        )

    @simulate(**ratio_params["sim"])
    def simulate_ratio_pre(param=None, seed=None, simno=None, settings=ratio_params):
        """Sweep the class ratio (same value for the validation set)."""
        return _run_pre_training_trial(
            settings, seed, simno, param, "ratio",
            data_overrides={"class_ratio": param},
            val_overrides={"class_ratio": param},
        )

    testsize_futures_pre, testsize_gather = simulate_testsize_pre()
    samplesize_futures_pre, samplesize_gather = simulate_samplesize_pre()
    nfeats_futures_pre, nfeats_gather = simulate_nfeats_pre()
    ratio_futures_pre, ratio_gather = simulate_ratio_pre()
    # wait for all futures to complete
    wait(testsize_futures_pre + samplesize_futures_pre + nfeats_futures_pre + ratio_futures_pre)
    # give cluster time to clean up before next iteration
    time.sleep(60)


Running 500 simulations
Using dask client at http://10.152.32.35:8787/status
Running 500 simulations
Using dask client at http://10.152.32.35:8787/status
Running 500 simulations
Using dask client at http://10.152.32.35:8787/status
Running 500 simulations
Using dask client at http://10.152.32.35:8787/status


#### Run functions

In [5]:
## Submit the four pre-training simulations; each call is unpacked into the
## submitted futures and a second object bound to a `*_gather` name.
## NOTE(review): the loop in the "Simulation functions" cell already makes these
## same four calls on every iteration, and the `simulate_*_pre` names are whatever
## that loop last defined -- confirm this cell is still needed.
testsize_futures_pre, testsize_gather = simulate_testsize_pre()
samplesize_futures_pre, samplesize_gather = simulate_samplesize_pre()
nfeats_futures_pre, nfeats_gather = simulate_nfeats_pre()
ratio_futures_pre, ratio_gather = simulate_ratio_pre()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [69]:
## Tabulate the exceptions of the failed test-size futures
## (get_exceptions raises if none of them failed).
get_exceptions(testsize_futures_pre)

Unnamed: 0_level_0,exception,traceback_obj
param,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e130890540>
1,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e1241c5780>
2,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e10558c180>
3,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e104dff500>
4,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e1a59d9380>
...,...,...
2495,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e104e1f740>
2496,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e104e4ed00>
2497,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e104e1c700>
2498,"TypeError(""score_model() got an unexpected key...",<traceback object at 0x14e104e12e40>


In [9]:
## Cancel the pre-training simulation futures submitted above.
client.cancel(ratio_futures_pre+nfeats_futures_pre+samplesize_futures_pre+testsize_futures_pre)

# Comparing runtime

In [38]:
## Settings for the runtime comparison: the dataset size is swept over
## `parameter_range`; everything else comes from the shared parameter dicts.
## NOTE(review): `sim_params`, `data_gen_params`, `class_params` and
## `permutation_params` are only visible in this notebook as local variables of
## `set_params` -- confirm they exist at module scope before running this cell.
runtime_params = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_feats":10,
        "class_ratio":0.5,
        # entries unpacked here override the two literals above on key clashes
        # NOTE(review): as built in `set_params`, this dict also carries
        # "n_samples", which the simulate_runtime_* functions pass explicitly --
        # confirm there is no duplicate keyword.
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

@simulate(**runtime_params["sim"])
def simulate_runtime_pre(param=None, seed=None, settings=runtime_params):
    """
    Time one complete pre-training permutation test.

    Parameters
    ----------
    param : int
        Number of samples to simulate (``n_samples`` of ``random_data_gen``).
    seed
        Forwarded to ``random_data_gen``.
    settings : dict
        Runtime settings with the keys ``data_gen``, ``classif`` and ``perm``;
        deep-copied before use.

    Returns
    -------
    float
        Elapsed seconds for data simulation, penalty tuning and the
        permutation test.
    """
    ## NOTE(review): unlike the other simulate_* functions in this notebook there
    ## is no `simno` parameter -- confirm the decorator does not pass one.
    ## perf_counter is a monotonic clock meant for measuring intervals
    start = time.perf_counter()
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set
    ## NOTE(review): same size and same seed as the dataset above; if
    ## random_data_gen is deterministic in `seed` this reproduces X, y -- confirm.
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params, tracking the best validation AUC
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        ## max_AUC has to be updated, otherwise the last C with a non-zero AUC wins
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations; only the elapsed time matters here, so the result is not
    ## unpacked (other cells unpack a different number of return values)
    pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=settings["perm"]["n_permutations"],
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return time.perf_counter() - start

@simulate(**runtime_params["sim"])
def simulate_runtime_post(param=None, seed=None, settings=runtime_params):
    """
    Time one complete post-hoc permutation test.

    Parameters
    ----------
    param : int
        Number of samples to simulate (``n_samples`` of ``random_data_gen``).
    seed
        Forwarded to ``random_data_gen``.
    settings : dict
        Runtime settings with the keys ``data_gen``, ``classif`` and ``perm``;
        deep-copied before use.

    Returns
    -------
    float
        Elapsed seconds for data simulation, penalty tuning, the final fit and
        the permutation test.
    """
    ## NOTE(review): unlike the other simulate_* functions in this notebook there
    ## is no `simno` parameter -- confirm the decorator does not pass one.
    ## perf_counter is a monotonic clock meant for measuring intervals
    start = time.perf_counter()
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set
    ## NOTE(review): same size and same seed as the dataset above; if
    ## random_data_gen is deterministic in `seed` this reproduces X, y -- confirm.
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params, tracking the best validation AUC
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        ## max_AUC has to be updated, otherwise the last C with a non-zero AUC wins
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations; only the elapsed time matters here, so the result is not
    ## unpacked (other cells unpack a different number of return values)
    post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=settings["perm"]["n_permutations"], n_jobs=-1,
    )
    return time.perf_counter() - start

Running 500 simulations
Using dask client at http://192.168.86.120:51360/status
Running 500 simulations
Using dask client at http://192.168.86.120:51360/status


In [39]:
## Submit both runtime simulations.
## NOTE(review): both calls bind the name `runtime_gather`, so after this cell it
## refers to the object returned by `simulate_runtime_post()` -- confirm that the
## same object can also be used for the pre-training futures.
runtime_futures_pre, runtime_gather = simulate_runtime_pre()
runtime_futures_post, runtime_gather = simulate_runtime_post()

2500 parallel jobs
2500 parallel jobs


In [47]:
## collect the measured runtimes for both tests
runtime_result_pre = runtime_gather(runtime_futures_pre)
runtime_result_post = runtime_gather(runtime_futures_post)
## reshape to long format and tag every row with the test it came from
df_result_pre = pd.DataFrame(runtime_result_pre).melt(var_name="param").assign(test="pre")
df_result_post = pd.DataFrame(runtime_result_post).melt(var_name="param").assign(test="post")
df_result = pd.concat([df_result_pre, df_result_post])
## keep the settings with the data: `attrs` is the pandas mechanism for
## user metadata, whereas `_metadata` is an internal list of attribute names
df_result.attrs["runtime_params"] = runtime_params
## make sure the output directory exists before writing
os.makedirs("sim_results", exist_ok=True)
df_result.to_pickle("sim_results/simulate_runtime.pkl")