# Setup and imports

In [1]:
import warnings; warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import time
import os
from copy import deepcopy
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
import pickle
from joblib.parallel import Parallel, delayed
from permutation_helpers import *
from simulate import simulate
from dask.distributed import progress, Client, wait

## Set up client

In [2]:
# import cmldask.CMLDask as da
# client = da.new_dask_client_slurm(
#     job_name="simulations",
#     memory_per_job="2GB",
#     max_n_jobs=400, threads_per_job=1, 
#     adapt=False,
#     local_directory="/home1/jrudoler/dask-worker-space",
#     log_directory="/home1/jrudoler/logs/",
# )
# client = Client()

In [2]:
# from dask_jobqueue import SGECluster
from dask_jobqueue import SLURMCluster

# Dask cluster backed by SLURM jobs. Each job is requested with 4 cores, 2GB of
# memory and a single worker process; worker logs and scratch space go under $HOME.
# NOTE(review): the simulation functions below call their permutation helpers with
# n_jobs=-1 — confirm that this plays well with the 4 cores requested per job.
cluster = SLURMCluster(
    cores = 4, # threads per job
    memory = "2GB",
    processes = 1,
    log_directory = os.path.join(os.environ["HOME"], "logs/"),
    local_directory = os.path.join(os.environ["HOME"], "dask-worker-space/"),
    walltime = "7-00:00:00",
    name = "permutations"
)

# Alternative SGE configuration, kept for reference only (not executed).
# cluster = SGECluster(
#         cores=threads_per_job,
#         processes=processes_per_job,
#         memory=memory_per_job,
#         queue=queue,
#         walltime=walltime,
#         job_name=job_name,
#         local_directory=local_directory or os.environ["HOME"] + "/dask-worker-space/",
#         log_directory=log_directory or os.environ["HOME"],
#         scheduler_options=scheduler_options,
#         **kwargs,
# )

# Connect a client to the cluster and enable adaptive scaling, capped at 200 jobs.
client = Client(cluster)
cluster.adapt(maximum_jobs=200)

<distributed.deploy.adaptive.Adaptive at 0x7fb1fb6d0ad0>

In [7]:
# NOTE(review): on a top-to-bottom run this shuts the client down before any
# simulation has been submitted. The execution count (In [7]) suggests it was run
# manually after the fact — skip this cell when running the notebook in order.
client.shutdown()

2024-07-26 14:09:41,409 - distributed.deploy.adaptive_core - INFO - Adaptive stop


# Post-hoc simulations

#### Post-hoc simulation parameters

In [11]:
### shared parameters (reused by every simulation family below)

# Logistic-regression settings: 8 log-spaced candidate penalties `C` between
# 1e-4 and 1e5, with balanced class weights.
class_params = dict(
    C=np.logspace(np.log10(1e-4), np.log10(1e5), 8),
    class_weight="balanced",
)
# Number of label permutations per simulation.
permutation_params = dict(n_permutations=5000)
# Number of simulations per swept parameter value.
sim_params = dict(n_sim=500)
# Whether each simulation pickles its own result.
file_params = dict(save=True)
# Keyword arguments forwarded to `random_data_gen`; `maha` is the first point
# of a 5-point grid over [0, 1.5].
data_gen_params = dict(
    maha=np.linspace(0., 1.5, 5)[0],
    psi_diag=1.0,
    psi_offdiag=0.,
    ddof=150,
)

## set up directories for saving results

## results are separated by the underlying probability distributions (and their Mahalanobis distance)

import os
# Base data directory lives under the user's home directory.
data_dir = os.path.join(os.environ["HOME"], "data")
# One sub-directory per `maha` value, formatted with one decimal (e.g. "maha_0.0").
results_dir = os.path.join(data_dir, "sim_results", f"maha_{data_gen_params['maha']:.1f}")
os.makedirs(results_dir, exist_ok=True) 
# `file_params` is referenced (not copied) by every *_params dict below, so they all
# see this entry.
file_params["results_dir"] = results_dir

## set up parameters for specific simulations

# Each dict has the same layout:
#   "sim"      -> keyword arguments for the `@simulate` decorator (swept values + n_sim)
#   "data_gen" -> keyword arguments for `random_data_gen`
#   "classif"  -> logistic-regression settings (including the candidate `C` grid)
#   "perm"     -> permutation settings
#   "file"     -> saving options
# The quantity being swept is deliberately absent from "data_gen"; the simulation
# function supplies it through `param`.
#
# NOTE(review): where a literal `"maha":0.` is followed by `**data_gen_params`, the
# unpacked dict also contains "maha" and comes later, so the effective value is
# `data_gen_params["maha"]`, not the literal. Confirm which one is intended.

# Sweep over the total number of samples (5 log-spaced values from 1e2 to 1e5).
samplesize_params_post = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "maha":0.,
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
        
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

# Sweep over the number of features (5 log-spaced values from 2**1 to 2**10).
nfeats_params_post = {
    "sim":{
        "parameter_range":np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "maha":0.,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

# Sweep over the class ratio (5 log-spaced values from 0.01 to 0.5).
ratio_params_post = {
    "sim":{
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5), #np.linspace(.1, .9, 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        "maha":0.,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

# Sweep over the test-set fraction (5 log-spaced values from 0.01 to 0.5).
# NOTE(review): unlike `testsize_params_pre`, no "class_ratio" is given here, so
# `random_data_gen` presumably falls back to its own default — confirm it matches 0.5.
testsize_params_post = {
    "sim":{
        "parameter_range":np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

#### Simulation functions

In [12]:
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss, brier_score_loss
def score_model(y_true, y_pred):
    """
    Score positive-class probabilities against the true labels.

    `y_pred` is converted to a numpy array and treated as the probability of the
    positive class (e.g. one column of `predict_proba` output). Hard labels for
    the accuracy are obtained by thresholding strictly above 0.5.

    Returns a dictionary of plain Python floats with the keys:
    - roc_auc
    - accuracy
    - log_loss
    - brier_score
    """
    proba = np.array(y_pred)
    # class 1 whenever the positive-class probability exceeds 0.5
    hard_labels = (proba > 0.5).astype(int)
    metrics = {
        "roc_auc": roc_auc_score(y_true, proba),
        "accuracy": accuracy_score(y_true, hard_labels),
        "log_loss": log_loss(y_true, proba),
        "brier_score": brier_score_loss(y_true, proba, pos_label=1),
    }
    # cast to built-in floats so the result does not carry numpy scalar types
    return {name: float(value) for name, value in metrics.items()}

In [13]:
@simulate(**samplesize_params_post["sim"])
def simulate_samplesize_post(param=None, seed=None, simno=None, settings=samplesize_params_post):
    """
    One post-hoc permutation simulation for a given total sample size.

    Generates a dataset with `param` samples, holds out 20% of it as a test set,
    tunes the logistic-regression penalty `C` against a separately generated
    validation set of 1000 samples (highest validation ROC AUC wins, ties go to
    the later candidate), and runs `post_hoc_permutation` on the tuned model's
    test-set probabilities.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `n_samples`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `post_hoc_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True
    )
    ## Simulate validation set
    X_val, y_val = random_data_gen(n_samples=1000, seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_estimator = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_estimator = estimator
    ## use model with tuned penalty
    y_pred = best_estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = post_hoc_permutation(
        y_true=y_test,
        y_score=y_pred,
        n_permutations=n_permutations,
        score_function=score_model,
        n_jobs=-1,
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"samplesize_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores


@simulate(**testsize_params_post["sim"])
def simulate_testsize_post(param=None, seed=None, simno=None, settings=testsize_params_post):
    """
    One post-hoc permutation simulation for a given test-set fraction.

    Generates a dataset from `settings["data_gen"]`, holds out a fraction `param`
    of it as a test set, tunes the logistic-regression penalty `C` against a
    separately generated validation set (highest validation ROC AUC wins, ties go
    to the later candidate), and runs `post_hoc_permutation` on the tuned model's
    test-set probabilities.

    Parameters
    ----------
    param : value passed to `train_test_split` as `test_size`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `post_hoc_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=param, shuffle=True
    )
    ## Simulate validation set
    X_val, y_val = random_data_gen(seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_estimator = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_estimator = estimator
    ## use model with tuned penalty
    y_pred = best_estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = post_hoc_permutation(
        y_true=y_test,
        y_score=y_pred,
        n_permutations=n_permutations,
        score_function=score_model,
        n_jobs=-1,
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"testsize_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores


@simulate(**nfeats_params_post["sim"])
def simulate_nfeats_post(param=None, seed=None, simno=None, settings=nfeats_params_post):
    """
    One post-hoc permutation simulation for a given number of features.

    Generates a dataset with `param` features, holds out 20% of it as a test set,
    tunes the logistic-regression penalty `C` against a separately generated
    validation set with the same number of features (highest validation ROC AUC
    wins, ties go to the later candidate), and runs `post_hoc_permutation` on the
    tuned model's test-set probabilities.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `n_feats`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `post_hoc_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True
    )
    ## Simulate validation set
    X_val, y_val = random_data_gen(n_feats=param, seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_estimator = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_estimator = estimator
    ## use model with tuned penalty
    y_pred = best_estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = post_hoc_permutation(
        y_true=y_test,
        y_score=y_pred,
        n_permutations=n_permutations,
        score_function=score_model,
        n_jobs=-1,
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"nfeats_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores


@simulate(**ratio_params_post["sim"])
def simulate_ratio_post(param=None, seed=None, simno=None, settings=ratio_params_post):
    """
    One post-hoc permutation simulation for a given class ratio.

    Generates a dataset with class ratio `param`, holds out 20% of it as a test
    set, tunes the logistic-regression penalty `C` against a separately generated
    validation set with the same class ratio (highest validation ROC AUC wins,
    ties go to the later candidate), and runs `post_hoc_permutation` on the tuned
    model's test-set probabilities.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `class_ratio`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `post_hoc_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, shuffle=True
    )
    ## Simulate validation set with the same generator settings as the main dataset
    X_val, y_val = random_data_gen(class_ratio=param, seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_estimator = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_estimator = estimator
    ## use model with tuned penalty
    y_pred = best_estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = post_hoc_permutation(
        y_true=y_test,
        y_score=y_pred,
        n_permutations=n_permutations,
        score_function=score_model,
        n_jobs=-1,
    )
    # save with pickle
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"ratio_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores

Running 500 simulations
Using dask client at http://192.168.86.107:8787/status
Running 500 simulations
Using dask client at http://192.168.86.107:8787/status
Running 500 simulations
Using dask client at http://192.168.86.107:8787/status
Running 500 simulations
Using dask client at http://192.168.86.107:8787/status


#### Run functions

In [14]:
# Launch the four post-hoc simulation families. Each decorated function is called
# with no arguments and returns a pair: the futures for its jobs and a gather
# callable that the "Gather results" cells use to collect them.
testsize_futures_post, testsize_gather = simulate_testsize_post()
samplesize_futures_post, samplesize_gather = simulate_samplesize_post()
nfeats_futures_post, nfeats_gather = simulate_nfeats_post()
ratio_futures_post, ratio_gather = simulate_ratio_post()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [8]:
# Block until every test-size job has finished, then collect the results locally.
wait(testsize_futures_post)
testsize_result = testsize_gather(testsize_futures_post) 
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
del testsize_result
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed metrics are repeated next to each
# null metric (prefixed "null_"). `param` is carried along explicitly; merging only
# the expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = testsize_params_post
df_result.to_pickle(os.path.join(results_dir, "simulate_testsize_post.pkl"))
client.cancel(testsize_futures_post)

: 

In [24]:
# Peek at the first rows of the current result frame.
# NOTE(review): the output stored with this cell shows `value`, `score` and
# `perm_scores` columns, which the gather cell above no longer leaves in `df_result`
# (it drops `value` and replaces the frame with the merged table). The output
# appears to come from an earlier version of that cell — re-run before relying on it.
sample = df_result.head()
sample

Unnamed: 0,simno,param,value,score,perm_scores
0,0,0.01,"({'roc_auc': 0.5238095238095238, 'accuracy': 0...","{'roc_auc': 0.5238095238095238, 'accuracy': 0....","[{'roc_auc': 0.8571428571428571, 'accuracy': 0..."
1,1,0.01,"({'roc_auc': 0.6190476190476191, 'accuracy': 0...","{'roc_auc': 0.6190476190476191, 'accuracy': 0....","[{'roc_auc': 0.7142857142857143, 'accuracy': 0..."
2,2,0.01,"({'roc_auc': 0.625, 'accuracy': 0.6, 'log_loss...","{'roc_auc': 0.625, 'accuracy': 0.6, 'log_loss'...","[{'roc_auc': 0.625, 'accuracy': 0.6, 'log_loss..."
3,3,0.01,"({'roc_auc': 0.39999999999999997, 'accuracy': ...","{'roc_auc': 0.39999999999999997, 'accuracy': 0...","[{'roc_auc': 0.68, 'accuracy': 0.7, 'log_loss'..."
4,4,0.01,"({'roc_auc': 0.5, 'accuracy': 0.6, 'log_loss':...","{'roc_auc': 0.5, 'accuracy': 0.6, 'log_loss': ...","[{'roc_auc': 0.41666666666666663, 'accuracy': ..."


In [33]:
# Expand each row's collection of permutation scores into one row per permutation
# and prefix the resulting metric columns with "null_".
# NOTE(review): requires a `perm_scores` column, i.e. `sample` taken before the merge
# step of the gather cell.
sample["perm_scores"].explode().apply(pd.Series).add_prefix("null_")

Unnamed: 0,null_roc_auc,null_accuracy,null_log_loss,null_brier_score
0,0.857143,0.6,0.649343,0.228244
0,0.809524,0.6,0.653256,0.230198
0,0.666667,0.6,0.675855,0.241419
0,0.809524,0.8,0.654911,0.230961
0,0.142857,0.4,0.741383,0.273931
...,...,...,...,...
4,0.250000,0.2,0.718917,0.262813
4,0.208333,0.2,0.710482,0.258700
4,0.458333,0.4,0.694452,0.250695
4,0.500000,0.4,0.684797,0.245874


In [36]:
# Same index-on-index merge as in the gather cells, applied to the small sample:
# observed metrics on the left, per-permutation null metrics on the right.
# NOTE(review): requires `score` and `perm_scores` columns, i.e. `sample` taken
# before the merge step of the gather cell.
pd.merge(
    sample["score"].apply(pd.Series),
    sample["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")

Unnamed: 0,simno,roc_auc,accuracy,log_loss,brier_score,null_roc_auc,null_accuracy,null_log_loss,null_brier_score
0,0,0.52381,0.4,0.684997,0.246022,0.857143,0.6,0.649343,0.228244
1,0,0.52381,0.4,0.684997,0.246022,0.809524,0.6,0.653256,0.230198
2,0,0.52381,0.4,0.684997,0.246022,0.666667,0.6,0.675855,0.241419
3,0,0.52381,0.4,0.684997,0.246022,0.809524,0.8,0.654911,0.230961
4,0,0.52381,0.4,0.684997,0.246022,0.142857,0.4,0.741383,0.273931
...,...,...,...,...,...,...,...,...,...
24995,4,0.50000,0.6,0.681027,0.243995,0.250000,0.2,0.718917,0.262813
24996,4,0.50000,0.6,0.681027,0.243995,0.208333,0.2,0.710482,0.258700
24997,4,0.50000,0.6,0.681027,0.243995,0.458333,0.4,0.694452,0.250695
24998,4,0.50000,0.6,0.681027,0.243995,0.500000,0.4,0.684797,0.245874


In [None]:
# Block until every sample-size job has finished, then collect the results locally.
wait(samplesize_futures_post)
samplesize_result = samplesize_gather(samplesize_futures_post) 
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
del samplesize_result
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed metrics are repeated next to each
# null metric (prefixed "null_"). `param` is carried along explicitly; merging only
# the expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = samplesize_params_post
df_result.to_pickle(os.path.join(results_dir,"simulate_samplesize_post.pkl"))
client.cancel(samplesize_futures_post)

In [None]:
# Block until every feature-count job has finished, then collect the results locally.
wait(nfeats_futures_post)
nfeats_result = nfeats_gather(nfeats_futures_post)
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
del nfeats_result
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed metrics are repeated next to each
# null metric (prefixed "null_"). `param` is carried along explicitly; merging only
# the expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = nfeats_params_post
df_result.to_pickle(os.path.join(results_dir, "simulate_nfeats_post.pkl"))
client.cancel(nfeats_futures_post)

In [None]:
# Block until every class-ratio job has finished, then collect the results locally.
wait(ratio_futures_post)
ratio_result = ratio_gather(ratio_futures_post)
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
del ratio_result
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed metrics are repeated next to each
# null metric (prefixed "null_"). `param` is carried along explicitly; merging only
# the expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = ratio_params_post
df_result.to_pickle(os.path.join(results_dir, "simulate_ratio_post.pkl"))
client.cancel(ratio_futures_post)

In [19]:
# Cancel every post-hoc future in a single call (the four collections are
# concatenated with `+`).
client.cancel(samplesize_futures_post + ratio_futures_post + nfeats_futures_post + testsize_futures_post)

In [24]:
# NOTE(review): the pre-training cells below still call `client.cancel(...)` and
# submit work through `@simulate`; on a top-to-bottom run, shutting the client down
# here leaves them without a live client. Skip this cell unless the notebook stops here.
client.shutdown()

# Pre-training permutations (original)

#### Simulation parameters

In [16]:
### shared parameters
# The shared dicts (`class_params`, `permutation_params`, `sim_params`,
# `data_gen_params`, `file_params`) are reused from the post-hoc section; the
# commented-out block below is an older configuration and is not executed.
# class_params = {
#     "C":1e-3,
#     "class_weight":"balanced"
# }
# permutation_params = {
#     "n_permutations": 5000
# }
# sim_params = {"n_sim": 150}
# data_gen_params = {
#     "maha":np.linspace(0., 1.5, 5)[0],
#     "psi_diag": 1.0,
#     "psi_offdiag": 0.,
#     "ddof": 150
# }

# Each dict below has the same layout as its post-hoc counterpart:
# "sim" (decorator arguments), "data_gen" (`random_data_gen` arguments),
# "classif", "perm" and "file". The swept quantity is left out of "data_gen"
# because the simulation function supplies it through `param`.

# Sweep over the total number of samples (5 log-spaced values from 1e2 to 1e5).
samplesize_params_pre = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
        
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

# Sweep over the number of features (5 log-spaced values from 2**1 to 2**10).
nfeats_params_pre = {
    "sim":{
        "parameter_range":np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

# Sweep over the class ratio (5 log-spaced values from 0.01 to 0.5).
ratio_params_pre = {
    "sim":{
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5), #np.linspace(.1, .9, 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

# Sweep over the test-set fraction (5 log-spaced values from 0.01 to 0.5).
testsize_params_pre = {
    "sim":{
        "parameter_range":np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params,
    "file": file_params
}

#### Simulation functions

In [17]:
@simulate(**samplesize_params_pre["sim"])
def simulate_samplesize_pre(param=None, seed=None, simno=None, settings=samplesize_params_pre):
    """
    One pre-training permutation simulation for a given total sample size.

    Generates a dataset with `param` samples, holds out 20% of it as a test set and
    tunes the logistic-regression penalty `C` against a separately generated
    validation set of 1000 samples (highest validation ROC AUC wins, ties go to the
    later candidate). A fresh, unfitted estimator with the selected `C` is then
    handed to `pre_training_permutation` together with the train/test split.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `n_samples`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `pre_training_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set
    X_val, y_val = random_data_gen(n_samples=1000, seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"pre_samplesize_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores

@simulate(**testsize_params_pre["sim"])
def simulate_testsize_pre(param=None, seed=None, simno=None, settings=testsize_params_pre):
    """
    One pre-training permutation simulation for a given test-set fraction.

    Generates a dataset from `settings["data_gen"]`, holds out a fraction `param`
    of it as a test set and tunes the logistic-regression penalty `C` against a
    separately generated validation set (highest validation ROC AUC wins, ties go
    to the later candidate). A fresh, unfitted estimator with the selected `C` is
    then handed to `pre_training_permutation` together with the train/test split.

    Parameters
    ----------
    param : value passed to `train_test_split` as `test_size`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `pre_training_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
    ## Simulate validation set with the same generator settings as the main dataset
    X_val, y_val = random_data_gen(seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"pre_testsize_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores

@simulate(**nfeats_params_pre["sim"])
def simulate_nfeats_pre(param=None, seed=None, simno=None, settings=nfeats_params_pre):
    """
    One pre-training permutation simulation for a given number of features.

    Generates a dataset with `param` features, holds out 20% of it as a test set
    and tunes the logistic-regression penalty `C` against a separately generated
    validation set with the same number of features (highest validation ROC AUC
    wins, ties go to the later candidate). A fresh, unfitted estimator with the
    selected `C` is then handed to `pre_training_permutation` together with the
    train/test split.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `n_feats`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `pre_training_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set
    X_val, y_val = random_data_gen(n_feats=param, seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"pre_nfeats_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores

@simulate(**ratio_params_pre["sim"])
def simulate_ratio_pre(param=None, seed=None, simno=None, settings=ratio_params_pre):
    """
    One pre-training permutation simulation for a given class ratio.

    Generates a dataset with class ratio `param`, holds out 20% of it as a test set
    and tunes the logistic-regression penalty `C` against a separately generated
    validation set with the same class ratio (highest validation ROC AUC wins,
    ties go to the later candidate). A fresh, unfitted estimator with the selected
    `C` is then handed to `pre_training_permutation` together with the train/test
    split.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `class_ratio`.
    seed : seed forwarded to `random_data_gen` for the main dataset.
    simno : simulation index; only used in the name of the saved pickle.
    settings : nested dict with "data_gen", "classif", "perm" and "file" sections.
        It is deep-copied first, so popping the `C` grid never mutates the shared
        module-level dicts.

    Returns
    -------
    The `(score, permutation_scores)` pair produced by `pre_training_permutation`.

    Notes
    -----
    `train_test_split` is called without `random_state` and the validation set is
    generated with `seed=None`, so a run is not fully determined by `seed`.
    """
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set
    X_val, y_val = random_data_gen(class_ratio=param, seed=None, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    if settings["file"]["save"]:
        out_path = os.path.join(
            settings["file"]["results_dir"], f"pre_ratio_{param:.4f}_simno_{simno}.pkl"
        )
        # context manager so the file handle is flushed and closed deterministically
        with open(out_path, "wb") as fh:
            pickle.dump((score, permutation_scores), fh)
    return score, permutation_scores

Running 500 simulations
Using dask client at http://192.168.86.107:8787/status
Running 500 simulations
Using dask client at http://192.168.86.107:8787/status
Running 500 simulations
Using dask client at http://192.168.86.107:8787/status
Running 500 simulations
Using dask client at http://192.168.86.107:8787/status


#### Run functions

In [18]:
# Launch the four pre-training simulation families. Each call returns the futures
# for its jobs plus a gather callable.
# NOTE: the `*_gather` names are re-bound here, replacing the ones assigned in the
# post-hoc run cell.
testsize_futures_pre, testsize_gather = simulate_testsize_pre()
samplesize_futures_pre, samplesize_gather = simulate_samplesize_pre()
nfeats_futures_pre, nfeats_gather = simulate_nfeats_pre()
ratio_futures_pre, ratio_gather = simulate_ratio_pre()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [None]:
# Block until every test-size job has finished, then collect the results locally.
wait(testsize_futures_pre)
testsize_result = testsize_gather(testsize_futures_pre) 
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed score is repeated next to each null
# score (prefixed "null_"). `param` is carried along explicitly; merging only the
# expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = testsize_params_pre

df_result.to_pickle(os.path.join(results_dir, "simulate_testsize_pre.pkl"))
client.cancel(testsize_futures_pre)

In [None]:
# Block until every sample-size job has finished, then collect the results locally.
wait(samplesize_futures_pre)
samplesize_result = samplesize_gather(samplesize_futures_pre) 
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed score is repeated next to each null
# score (prefixed "null_"). `param` is carried along explicitly; merging only the
# expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = samplesize_params_pre
# Save next to the other results: `results_dir` is the directory created with
# os.makedirs, whereas a path relative to the working directory may not exist.
df_result.to_pickle(os.path.join(results_dir, "simulate_samplesize_pre.pkl"))
client.cancel(samplesize_futures_pre)

In [None]:
# Block until every feature-count job has finished, then collect the results locally.
wait(nfeats_futures_pre)
nfeats_result = nfeats_gather(nfeats_futures_pre)
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed score is repeated next to each null
# score (prefixed "null_"). `param` is carried along explicitly; merging only the
# expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = nfeats_params_pre
# Save next to the other results: `results_dir` is the directory created with
# os.makedirs, whereas a path relative to the working directory may not exist.
df_result.to_pickle(os.path.join(results_dir, "simulate_nfeats_pre.pkl"))
client.cancel(nfeats_futures_pre)

In [None]:
# Block until every class-ratio job has finished, then collect the results locally.
wait(ratio_futures_pre)
ratio_result = ratio_gather(ratio_futures_pre)
# Long format: the columns of the gathered table become the `param` column and
# `value` holds each simulation's (score, permutation_scores) pair.
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
df_result[["score", "perm_scores"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# One output row per permutation: the observed score is repeated next to each null
# score (prefixed "null_"). `param` is carried along explicitly; merging only the
# expanded score columns would drop the swept parameter from the saved frame.
df_result = pd.merge(
    df_result[["param"]].join(df_result["score"].apply(pd.Series)),
    df_result["perm_scores"].explode().apply(pd.Series).add_prefix("null_"),
    left_index=True,
    right_index=True,
).reset_index(names="simno")
# NOTE: "simno" is the row index of the melted frame (unique per param/simulation
# pair), not the `simno` argument the simulation function received.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = ratio_params_pre
# Save next to the other results: `results_dir` is the directory created with
# os.makedirs, whereas a path relative to the working directory may not exist.
df_result.to_pickle(os.path.join(results_dir, "simulate_ratio_pre.pkl"))
client.cancel(ratio_futures_pre)

In [33]:
# Cancel every pre-training future in a single call (the four collections are
# concatenated with `+`).
client.cancel(ratio_futures_pre+nfeats_futures_pre+samplesize_futures_pre+testsize_futures_pre)

# Comparing runtime

In [38]:
# Settings for the runtime comparison: sweep over the total number of samples
# (5 log-spaced values from 1e2 to 1e5). Same layout as the other *_params dicts,
# but without a "file" section — the runtime functions do not save per-run files.
runtime_params = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

@simulate(**runtime_params["sim"])
def simulate_runtime_pre(param=None, seed=None, settings=runtime_params, simno=None):
    """
    Time one full pre-training permutation run for a given total sample size.

    The timed region covers data generation, the train/test split, tuning of the
    logistic-regression penalty `C` on a validation set, and the call to
    `pre_training_permutation`.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `n_samples`.
    seed : seed forwarded to `random_data_gen`.
    settings : nested dict with "data_gen", "classif" and "perm" sections; it is
        deep-copied first, so the shared module-level dicts are never mutated.
    simno : accepted for signature parity with the other simulation functions;
        not used.

    Returns
    -------
    Elapsed time in seconds as a float.
    """
    start = time.perf_counter()
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    # NOTE(review): same `n_samples` and `seed` as the main dataset, unlike the other
    # simulations (1000 samples, seed=None) — presumably this reproduces the training
    # data; kept because only the runtime is measured here. Confirm it is intended.
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        # track the running maximum so the best candidate is kept, not merely the
        # last one whose AUC exceeds the initial 0
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    # Only the elapsed time matters, so the return value is discarded instead of
    # being unpacked; the other simulations unpack two values from this helper, so a
    # three-name unpack here would not agree with them.
    pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return time.perf_counter() - start

@simulate(**runtime_params["sim"])
def simulate_runtime_post(param=None, seed=None, settings=runtime_params, simno=None):
    """
    Time one full post-hoc permutation run for a given total sample size.

    The timed region covers data generation, the train/test split, tuning of the
    logistic-regression penalty `C` on a validation set, refitting the tuned model,
    and the call to `post_hoc_permutation` on its test-set probabilities.

    Parameters
    ----------
    param : value passed to `random_data_gen` as `n_samples`.
    seed : seed forwarded to `random_data_gen`.
    settings : nested dict with "data_gen", "classif" and "perm" sections; it is
        deep-copied first, so the shared module-level dicts are never mutated.
    simno : accepted for signature parity with the other simulation functions;
        not used.

    Returns
    -------
    Elapsed time in seconds as a float.
    """
    start = time.perf_counter()
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    # NOTE(review): same `n_samples` and `seed` as the main dataset, unlike the other
    # simulations (1000 samples, seed=None) — presumably this reproduces the training
    # data; kept because only the runtime is measured here. Confirm it is intended.
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        # track the running maximum so the best candidate is kept, not merely the
        # last one whose AUC exceeds the initial 0
        if AUC >= max_AUC:
            max_AUC = AUC
            best_C = C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    # Only the elapsed time matters, so the return value is discarded instead of
    # being unpacked; the other simulations unpack two values from this helper, so a
    # three-name unpack here would not agree with them.
    # NOTE(review): the other call sites also pass `score_function=score_model`;
    # confirm the helper's default scorer is what should be timed here.
    post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
    )
    return time.perf_counter() - start

Running 500 simulations
Using dask client at http://192.168.86.120:51360/status
Running 500 simulations
Using dask client at http://192.168.86.120:51360/status


In [39]:
# Launch both runtime comparisons. `runtime_gather` is assigned twice; the gather
# callable from the second call is the one used for both sets of futures below.
runtime_futures_pre, runtime_gather = simulate_runtime_pre()
runtime_futures_post, runtime_gather = simulate_runtime_post()

2500 parallel jobs
2500 parallel jobs


In [47]:
# Collect the elapsed times of both permutation strategies.
runtime_result_pre = runtime_gather(runtime_futures_pre)
runtime_result_post = runtime_gather(runtime_futures_post)
# Long format: the columns of each gathered table become the `param` column and
# `value` holds the elapsed seconds.
df_result_pre = pd.DataFrame(runtime_result_pre).melt(var_name="param")
df_result_post = pd.DataFrame(runtime_result_post).melt(var_name="param")
# Tag each row with the strategy it was measured for, then stack both tables.
df_result_pre['test'] = 'pre'
df_result_post['test'] = 'post'
df_result = pd.concat([df_result_pre, df_result_post])
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the
# documented place for user metadata. Left unchanged so saved files keep their layout.
df_result._metadata = runtime_params
# Save under the shared data directory (the parent of the per-`maha` result
# folders) and make sure it exists, instead of relying on a `sim_results/` folder
# in the current working directory.
runtime_dir = os.path.join(data_dir, "sim_results")
os.makedirs(runtime_dir, exist_ok=True)
df_result.to_pickle(os.path.join(runtime_dir, "simulate_runtime.pkl"))