# Setup and imports

In [3]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import scipy as scp
from copy import deepcopy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed
import pickle
from permutation_helpers import random_data_gen, post_hoc_permutation
from simulate import simulate
from dask.distributed import progress, Client, wait



## Set up client

In [6]:
import cmldask.CMLDask as da
rhino_client = da.new_dask_client_slurm(
    job_name="simulations",
    memory_per_job="2GB",
    max_n_jobs=400, threads_per_job=1, 
    adapt=False,
    local_directory="/home1/jrudoler/dask-worker-space",
    log_directory="/home1/jrudoler/logs/",
#     resource_spec="h_vmem=2.5G,s_vmem=2.5G"
)

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:192.168.86.120:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser
You've chosen to scale your cluster manually. This means workers will continue to run until you manually shut them down. Remember to run `client.shutdown` after you're done computing and no longer need to reserve resources.


In [4]:
#rhino_client.cluster.scale(400)

In [None]:
rhino_client.shutdown()

# Post-hoc simulations

#### Post-hoc simulation parameters

In [4]:
### shared parameters
class_params = {
        "C":np.logspace(np.log10(1e-4), np.log10(1e5), 8),
        "class_weight":"balanced"
    }
permutation_params = {
        "n_permutations": 5000
    }
sim_params = {"n_sim": 500}
data_gen_params = {
    "maha":np.linspace(0., 1.5, 5)[4],
    "psi_diag": 1.0,
    "psi_offdiag": 0.,
    "ddof": 150
}
samplesize_params_post = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "maha":0.,
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
        
    },
    "classif": class_params,
    "perm": permutation_params
}

nfeats_params_post = {
    "sim":{
        "parameter_range":np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "maha":0.,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

ratio_params_post = {
    "sim":{
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5), #np.linspace(.1, .9, 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        "maha":0.,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

testsize_params_post = {
    "sim":{
        "parameter_range":np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

#### Simulation functions

In [5]:
@simulate(**samplesize_params_post["sim"])
def simulate_samplesize_post(param=None, seed=None, settings=samplesize_params_post):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

@simulate(**testsize_params_post["sim"])
def simulate_testsize_post(param=None, seed=None, settings=testsize_params_post):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

@simulate(**nfeats_params_post["sim"])
def simulate_nfeats_post(param=None, seed=None, settings=nfeats_params_post):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

@simulate(**ratio_params_post["sim"])
def simulate_ratio_post(param=None, seed=None, settings=ratio_params_post):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

Running 500 simulations
No dask client available, running sequentially
Running 500 simulations
No dask client available, running sequentially
Running 500 simulations
No dask client available, running sequentially
Running 500 simulations
No dask client available, running sequentially


#### Run functions

In [7]:
testsize_futures_post, testsize_gather = simulate_testsize_post()
samplesize_futures_post, samplesize_gather = simulate_samplesize_post()
nfeats_futures_post, nfeats_gather = simulate_nfeats_post()
ratio_futures_post, ratio_gather = simulate_ratio_post()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [7]:
import os
os.makedirs(f"sim_results/maha_{data_gen_params['maha']:.1f}", exist_ok=True)

In [8]:
wait(testsize_futures_post)
testsize_result = testsize_gather(testsize_futures_post) 
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = testsize_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_testsize_post.pkl")
rhino_client.cancel(testsize_futures_post)

In [9]:
wait(samplesize_futures_post)
samplesize_result = samplesize_gather(samplesize_futures_post) 
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = samplesize_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_samplesize_post.pkl")
rhino_client.cancel(samplesize_futures_post)

In [10]:
wait(nfeats_futures_post)
nfeats_result = nfeats_gather(nfeats_futures_post)
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = nfeats_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_nfeats_post.pkl")
rhino_client.cancel(nfeats_futures_post)

In [11]:
wait(ratio_futures_post)
ratio_result = ratio_gather(ratio_futures_post)
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = ratio_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_post.pkl")
rhino_client.cancel(ratio_futures_post)

In [23]:
rhino_client.cancel(samplesize_futures_post + ratio_futures_post + nfeats_futures_post + testsize_futures_post)

In [24]:
rhino_client.shutdown()

In [21]:
da.get_exceptions(ratio_futures_pre, np.arange(len(samplesize_futures_post)))

Unnamed: 0_level_0,exception,traceback_obj
param,Unnamed: 1_level_1,Unnamed: 2_level_1
35,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9abcecd00>
40,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9b89c7440>
105,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9de046f80>
120,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9dfab0300>
255,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9abc862c0>
256,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9bbbfdec0>
260,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9aab40f80>
295,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9df2aeb40>
330,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9df71c4c0>
335,ValueError('Only one class present in y_true. ...,<traceback object at 0x2ab9cf67c380>


# Pre-training permutations (original)

In [10]:
def _train_score(estimator, X_train, X_test, y_train, y_test, 
                score_func, shuffle_labels=False):
    if shuffle_labels:
        indices = np.random.default_rng().permutation(len(y_train))
        y_train = y_train[indices]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict_proba(X_test)[:,1]
    score = score_func(y_true=y_test, y_score=y_pred)
    return score



def pre_training_permutation(estimator, X_train, X_test, y_train, y_test,
                            n_permutations, score_func, verbose=False, n_jobs=None):
    score = _train_score(
        clone(estimator), X_train, X_test, y_train, y_test, score_func, shuffle_labels=False
    )
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_train_score)(
            clone(estimator),
            X_train, X_test, y_train, y_test,
            score_func,
            shuffle_labels=True,
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue

#### Simulation parameters

In [13]:
### shared parameters
# class_params = {
#     "C":1e-3,
#     "class_weight":"balanced"
# }
# permutation_params = {
#     "n_permutations": 5000
# }
# sim_params = {"n_sim": 150}
# data_gen_params = {
#     "maha":np.linspace(0., 1.5, 5)[0],
#     "psi_diag": 1.0,
#     "psi_offdiag": 0.,
#     "ddof": 150
# }
samplesize_params_pre = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
        
    },
    "classif": class_params,
    "perm": permutation_params
}

nfeats_params_pre = {
    "sim":{
        "parameter_range":np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

ratio_params_pre = {
    "sim":{
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5), #np.linspace(.1, .9, 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

testsize_params_pre = {
    "sim":{
        "parameter_range":np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params
    },
    "data_gen": {
        "n_samples":1000,
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

#### Simulation functions

In [14]:
@simulate(**samplesize_params_pre["sim"])
def simulate_samplesize_pre(param=None, seed=None, settings=samplesize_params_pre):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

@simulate(**testsize_params_pre["sim"])
def simulate_testsize_pre(param=None, seed=None, settings=testsize_params_pre):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

@simulate(**nfeats_params_pre["sim"])
def simulate_nfeats_pre(param=None, seed=None, settings=nfeats_params_pre):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(n_feats=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

@simulate(**ratio_params_pre["sim"])
def simulate_ratio_pre(param=None, seed=None, settings=ratio_params_pre):
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(class_ratio=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status


#### Run functions

In [15]:
testsize_futures_pre, testsize_gather = simulate_testsize_pre()
samplesize_futures_pre, samplesize_gather = simulate_samplesize_pre()
nfeats_futures_pre, nfeats_gather = simulate_nfeats_pre()
ratio_futures_pre, ratio_gather = simulate_ratio_pre()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [16]:
wait(testsize_futures_pre)
testsize_result = testsize_gather(testsize_futures_pre) 
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = testsize_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_testsize_pre.pkl")
rhino_client.cancel(testsize_futures_pre)

In [None]:
wait(samplesize_futures_pre)
samplesize_result = samplesize_gather(samplesize_futures_pre) 
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = samplesize_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_samplesize_pre.pkl")
rhino_client.cancel(samplesize_futures_pre)

In [None]:
wait(nfeats_futures_pre)
nfeats_result = nfeats_gather(nfeats_futures_pre)
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = nfeats_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_nfeats_pre.pkl")
rhino_client.cancel(nfeats_futures_pre)

In [None]:
wait(ratio_futures_pre)
ratio_result = ratio_gather(ratio_futures_pre)
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
df_result._metadata = ratio_params_pre
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_pre.pkl")
rhino_client.cancel(ratio_futures_pre)

In [35]:
rhino_client.cancel(ratio_futures_pre)

In [39]:
rhino_client.cancel(ratio_futures_pre+nfeats_futures_pre+samplesize_futures_pre+testsize_futures_pre)

In [35]:
da.get_exceptions(samplesize_futures_pre, range(len(samplesize_futures_pre)))

Unnamed: 0_level_0,exception,traceback_obj
param,Unnamed: 1_level_1,Unnamed: 2_level_1
0,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b42fe391ec0>
1,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4320207f40>
2,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b432015bc00>
3,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b43099f7640>
4,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b43250bb100>
...,...,...
2495,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4320f934c0>
2496,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b434257e440>
2497,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4342099880>
2498,"NameError(""name 'pre_training_permutation' is ...",<traceback object at 0x2b4313809600>


# Comparing runtime

In [1]:
import time
def runtime(f, *args, **kwargs):
    start = time.time()
    f(*args, **kwargs)
    stop = time.time()
    return stop - start

In [38]:
runtime_params = {
    "sim":{
        "parameter_range":np.logspace(2, 5, 5).astype(int),
        **sim_params
    },
    "data_gen": {
        "n_feats":10,
        "class_ratio":0.5,
        **data_gen_params
    },
    "classif": class_params,
    "perm": permutation_params
}

@simulate(**runtime_params["sim"])
def simulate_runtime_pre(param=None, seed=None, settings=runtime_params):
    start = time.time()
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## set up model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    stop = time.time()
    return stop - start

@simulate(**runtime_params["sim"])
def simulate_runtime_post(param=None, seed=None, settings=runtime_params):
    start = time.time()
    settings = deepcopy(settings)
    ## Simulate dataset
    X, y = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## Split into train-test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    ## Simulate validation set, same as original dataset
    X_val, y_val = random_data_gen(n_samples=param, seed=seed, **settings["data_gen"])
    ## iterate over possible penalty params
    max_AUC = 0
    best_C = None
    for C in settings["classif"].pop("C"):
        estimator = LogisticRegression(**settings["classif"], C=C)
        estimator.fit(X=X_train, y=y_train)
        y_pred = estimator.predict_proba(X_val)[:, 1]
        AUC = roc_auc_score(y_score=y_pred, y_true=y_val)
        best_C = C if AUC>max_AUC else best_C
    ## train model with tuned penalty
    estimator = LogisticRegression(**settings["classif"], C=best_C)
    estimator.fit(X=X_train, y=y_train)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    ## permutations
    n_permutations = settings["perm"]["n_permutations"]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    stop = time.time()
    return stop - start

Running 500 simulations
Using dask client at http://192.168.86.120:51360/status
Running 500 simulations
Using dask client at http://192.168.86.120:51360/status


In [39]:
runtime_futures_pre, runtime_gather = simulate_runtime_pre()
runtime_futures_post, runtime_gather = simulate_runtime_post()

2500 parallel jobs
2500 parallel jobs


In [49]:
# wait(runtime_futures_pre)

# df_result._metadata = runtime_params
# df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_pre.pkl")
rhino_client.cancel(runtime_futures_pre+runtime_futures_post)

In [47]:
runtime_result_pre = runtime_gather(runtime_futures_pre)
runtime_result_post = runtime_gather(runtime_futures_post)
df_result_pre = pd.DataFrame(runtime_result_pre).melt(var_name="param")
df_result_post = pd.DataFrame(runtime_result_post).melt(var_name="param")
df_result_pre['test'] = 'pre'
df_result_post['test'] = 'post'
df_result = pd.concat([df_result_pre, df_result_post])
df_result._metadata = runtime_params
df_result.to_pickle(f"sim_results/simulate_runtime.pkl")

# Hyperparameter tuning

In [5]:
import cmldask.CMLDask as da
rhino_client = da.new_dask_client_slurm(
    job_name="C_tuning",
    memory_per_job="1.5GB",
    max_n_jobs=50,
    threads_per_job=1, 
    adapt=True,
    local_directory="/home1/jrudoler/",
    log_directory="/home1/jrudoler/logs/",
)

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:192.168.86.106:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser


In [16]:
# TODO: Return all AUC scores for all C values, not just the best one. 
@simulate(parameter_range=np.linspace(0., 1.5, 5), n_sim=200)
def test_best_C(param=None, seed=None):
    X, y = random_data_gen(n_samples=5000, n_feats=10, maha=param, class_ratio=0.5, seed=seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=np.logspace(np.log10(1e-3), np.log10(1e6), 15))
    estimator.fit(X_train, y_train)
    pred = estimator.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_true=y_test, y_score=pred), estimator.C_[0]

Running 200 simulations
Using dask client at http://192.168.86.101:51360/status


In [31]:
np.logspace(np.log10(1e-4), np.log10(1e6), 10)

array([1.00000000e-04, 1.29154967e-03, 1.66810054e-02, 2.15443469e-01,
       2.78255940e+00, 3.59381366e+01, 4.64158883e+02, 5.99484250e+03,
       7.74263683e+04, 1.00000000e+06])

In [19]:
C_futures, C_gather = test_best_C()

1000 parallel jobs


In [17]:
rhino_client.cancel(C_futures)

In [6]:
rhino_client.shutdown()

In [20]:
progress(C_futures)

VBox()

In [21]:
C_results = C_gather(C_futures)

In [22]:
df_result = pd.DataFrame(C_results).melt(var_name="param")
df_result[["score", "C"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')

In [23]:
pd.options.display.max_rows=None
df_result.groupby(["param", "C"]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,score,score,score,score,score,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
param,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0.0,0.001,51.0,0.500189,0.014283,0.472552,0.488991,0.50046,0.508796,0.528035
0.0,0.004394,40.0,0.500041,0.01912,0.462883,0.486804,0.498724,0.513394,0.538359
0.0,0.019307,27.0,0.501062,0.018689,0.476384,0.487279,0.494766,0.514531,0.546536
0.0,0.084834,25.0,0.502859,0.017194,0.468201,0.491137,0.500589,0.510608,0.540863
0.0,0.372759,8.0,0.4952,0.025852,0.453977,0.478291,0.498051,0.505587,0.540648
0.0,1.637894,18.0,0.502319,0.016872,0.476178,0.492074,0.500702,0.518141,0.529839
0.0,7.196857,7.0,0.50345,0.019665,0.480454,0.4899,0.502414,0.515212,0.531061
0.0,31.622777,6.0,0.488825,0.009346,0.476941,0.481053,0.490806,0.496646,0.498003
0.0,138.949549,5.0,0.483678,0.021437,0.455054,0.46884,0.491178,0.494738,0.50858
0.0,610.54023,6.0,0.498185,0.010658,0.48341,0.492416,0.496817,0.506549,0.511327
