# Setup and imports

In [1]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import scipy as scp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed
import pickle
from permutation_helpers import random_data_gen, post_hoc_permutation
from simulate import simulate
from dask.distributed import progress, Client, wait



## Set up client

In [2]:
# Create the Dask client used by every simulation cell below, via the project's
# cmldask helper (presumably one SLURM job per worker, judging by the function name).
# NOTE(review): this import sits outside the notebook's top import cell.
import cmldask.CMLDask as da
rhino_client = da.new_dask_client_slurm(
    job_name="simulations",
    memory_per_job="2GB",
    max_n_jobs=200, threads_per_job=2, 
    adapt=True,
    # NOTE(review): user-specific absolute paths; change these before running under another account.
    local_directory="/home1/jrudoler/dask-worker-space",
    log_directory="/home1/jrudoler/logs/",
#     resource_spec="h_vmem=2.5G,s_vmem=2.5G"
)

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:192.168.86.101:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser


In [6]:
# NOTE(review): read top to bottom, this shuts the client down before any simulation
# is submitted (the execution count shows it was run out of order). Skip this cell on a
# fresh "Run All", or the cells below will have no client to use.
rhino_client.shutdown()

# Post-hoc simulations

#### Post-hoc simulation parameters

In [4]:
### shared parameters
# Keyword arguments forwarded as LogisticRegression(**class_params).
class_params = {
    "C": 1e-3,
    "class_weight": "balanced",
}
# Number of label permutations used to build each null distribution.
permutation_params = {
    "n_permutations": 5000,
}
# Forwarded to the @simulate decorator together with each sweep's parameter_range.
sim_params = {"n_sim": 500}
# Keyword arguments shared by every random_data_gen call.
data_gen_params = {
    # Single source of truth for the Mahalanobis distance: index 0 of this grid is 0.0
    # (the null case). The same value also names the results directory further down.
    "maha": np.linspace(0., 1.5, 5)[0],
    "psi_diag": 1.0,
    "psi_offdiag": 0.,
    "ddof": 150,
}

# Each sweep config has four parts:
#   "sim"      -> arguments of @simulate (the swept values plus n_sim)
#   "data_gen" -> fixed random_data_gen arguments (the swept one is supplied per call)
#   "classif"  -> LogisticRegression arguments
#   "perm"     -> permutation-test arguments
# "maha" is deliberately NOT repeated in the per-sweep dicts: an explicit entry placed
# before **data_gen_params is silently overridden by it, so it could only mislead.

# Sweep over the number of samples.
samplesize_params_post = {
    "sim": {
        "parameter_range": np.logspace(2, 5, 5).astype(int),
        **sim_params,
    },
    "data_gen": {
        "n_feats": 10,
        "class_ratio": 0.5,
        **data_gen_params,
    },
    "classif": class_params,
    "perm": permutation_params,
}

# Sweep over the number of features.
nfeats_params_post = {
    "sim": {
        "parameter_range": np.logspace(1, 10, 5, base=2).astype(int),
        **sim_params,
    },
    "data_gen": {
        "n_samples": 1000,
        "class_ratio": 0.5,
        **data_gen_params,
    },
    "classif": class_params,
    "perm": permutation_params,
}

# Sweep over the class ratio.
ratio_params_post = {
    "sim": {
        "parameter_range": np.linspace(.1, .9, 5),
        **sim_params,
    },
    "data_gen": {
        "n_samples": 1000,
        "n_feats": 10,
        **data_gen_params,
    },
    "classif": class_params,
    "perm": permutation_params,
}

# Sweep over the held-out test fraction (log-spaced between 1% and 50%).
testsize_params_post = {
    "sim": {
        "parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5),
        **sim_params,
    },
    "data_gen": {
        "n_samples": 1000,
        "n_feats": 10,
        # Pinned explicitly, as in every other fixed-ratio sweep (and in testsize_params_pre),
        # rather than relying on random_data_gen's default.
        "class_ratio": 0.5,
        **data_gen_params,
    },
    "classif": class_params,
    "perm": permutation_params,
}

#### Simulation functions

In [4]:
@simulate(**samplesize_params_post["sim"])
def simulate_samplesize_post(param=None, seed=None):
    """Run one post-hoc permutation test on a dataset of ``param`` samples.

    Data come from ``random_data_gen``; 20% is held out, a logistic regression is
    fitted on the rest, and the held-out labels and predicted probabilities are
    passed to ``post_hoc_permutation``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple produced by
    ``post_hoc_permutation``.
    """
    config = samplesize_params_post
    features, labels = random_data_gen(n_samples=param, seed=seed, **config["data_gen"])
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    # Alternative kept for reference: LogisticRegressionCV(class_weight='balanced', Cs=6)
    clf = LogisticRegression(**config["classif"])
    n_perm = config["perm"]["n_permutations"]
    clf.fit(X=train_X, y=train_y)
    # probability of the positive class (second column)
    test_proba = clf.predict_proba(test_X)[:, 1]
    observed, null_scores, p_value = post_hoc_permutation(
        y_true=test_y,
        y_score=test_proba,
        n_permutations=n_perm,
        n_jobs=-1,
    )
    return observed, null_scores, p_value

@simulate(**nfeats_params_post["sim"])
def simulate_nfeats_post(param=None, seed=None):
    """Run one post-hoc permutation test on a dataset with ``param`` features.

    Data come from ``random_data_gen``; 20% is held out, a logistic regression is
    fitted on the rest, and the held-out labels and predicted probabilities are
    passed to ``post_hoc_permutation``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple produced by
    ``post_hoc_permutation``.
    """
    config = nfeats_params_post
    features, labels = random_data_gen(n_feats=param, seed=seed, **config["data_gen"])
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    # Alternative kept for reference: LogisticRegressionCV(class_weight='balanced', Cs=6)
    clf = LogisticRegression(**config["classif"])
    n_perm = config["perm"]["n_permutations"]
    clf.fit(X=train_X, y=train_y)
    # probability of the positive class (second column)
    test_proba = clf.predict_proba(test_X)[:, 1]
    observed, null_scores, p_value = post_hoc_permutation(
        y_true=test_y,
        y_score=test_proba,
        n_permutations=n_perm,
        n_jobs=-1,
    )
    return observed, null_scores, p_value

@simulate(**ratio_params_post["sim"])
def simulate_ratio_post(param=None, seed=None):
    """Run one post-hoc permutation test on a dataset with class ratio ``param``.

    Data come from ``random_data_gen``; 20% is held out, a logistic regression is
    fitted on the rest, and the held-out labels and predicted probabilities are
    passed to ``post_hoc_permutation``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple produced by
    ``post_hoc_permutation``.
    """
    config = ratio_params_post
    features, labels = random_data_gen(class_ratio=param, seed=seed, **config["data_gen"])
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    # Alternative kept for reference: LogisticRegressionCV(class_weight='balanced', Cs=6)
    clf = LogisticRegression(**config["classif"])
    n_perm = config["perm"]["n_permutations"]
    clf.fit(X=train_X, y=train_y)
    # probability of the positive class (second column)
    test_proba = clf.predict_proba(test_X)[:, 1]
    observed, null_scores, p_value = post_hoc_permutation(
        y_true=test_y,
        y_score=test_proba,
        n_permutations=n_perm,
        n_jobs=-1,
    )
    return observed, null_scores, p_value

@simulate(**testsize_params_post["sim"])
def simulate_testsize_post(param=None, seed=None):
    """Run one post-hoc permutation test with a held-out fraction of ``param``.

    Data come from ``random_data_gen``; a fraction ``param`` is held out, a logistic
    regression is fitted on the rest, and the held-out labels and predicted
    probabilities are passed to ``post_hoc_permutation``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple produced by
    ``post_hoc_permutation``.
    """
    X, y = random_data_gen(seed=seed, **testsize_params_post["data_gen"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
#     estimator = LogisticRegressionCV(class_weight='balanced', Cs=6)
    # Read classifier and permutation settings from this sweep's own config; the
    # sample-size config was used here before, which only worked because both
    # currently share the same class_params / permutation_params objects.
    estimator = LogisticRegression(**testsize_params_post["classif"])
    n_permutations = testsize_params_post["perm"]["n_permutations"]
    estimator.fit(X=X_train, y=y_train)
    # probability of the positive class (second column)
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permutation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue

Running 1000 simulations
Using dask client at http://192.168.86.101:51360/status
Running 1000 simulations
Using dask client at http://192.168.86.101:51360/status
Running 1000 simulations
Using dask client at http://192.168.86.101:51360/status
Running 1000 simulations
Using dask client at http://192.168.86.101:51360/status


#### Run functions

In [6]:
# Submit the four post-hoc sweeps. Each decorated function is called with no arguments
# and returns a pair: the submitted futures and a callable that collects them.
# NOTE(review): the *_gather names are reassigned by the pre-training "Run functions"
# cell further down, so gather the post-hoc results before running that cell.
testsize_futures_post, testsize_gather = simulate_testsize_post()
samplesize_futures_post, samplesize_gather = simulate_samplesize_post()
nfeats_futures_post, nfeats_gather = simulate_nfeats_post()
ratio_futures_post, ratio_gather = simulate_ratio_post()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [7]:
# Create the results directory for the current Mahalanobis setting (e.g. sim_results/maha_0.0).
# NOTE(review): this import sits outside the notebook's top import cell.
import os
os.makedirs(f"sim_results/maha_{data_gen_params['maha']:.1f}", exist_ok=True)

In [17]:
# Collect the post-hoc sample-size results from their futures.
samplesize_result = samplesize_gather(samplesize_futures_post) 
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = samplesize_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_samplesize_post.pkl")

In [16]:
# Collect the post-hoc test-size results from their futures.
testsize_result = testsize_gather(testsize_futures_post) 
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = testsize_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_testsize_post.pkl")

In [18]:
# Collect the post-hoc feature-count results from their futures.
nfeats_result = nfeats_gather(nfeats_futures_post)
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = nfeats_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_nfeats_post.pkl")

In [19]:
# Collect the post-hoc class-ratio results from their futures.
ratio_result = ratio_gather(ratio_futures_post)
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = ratio_params_post
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_post.pkl")

In [20]:
# Cancel all four sets of post-hoc futures in one call; the `+` concatenation assumes
# each *_futures_post object is a list (TODO confirm against simulate).
rhino_client.cancel(samplesize_futures_post + ratio_futures_post + nfeats_futures_post + testsize_futures_post)

In [24]:
# NOTE(review): the pre-training section below still uses `rhino_client`; after this
# shutdown the client has to be recreated before those cells can run.
rhino_client.shutdown()

# Pre-training permutations (original)

In [3]:
def _train_score(estimator, X_train, X_test, y_train, y_test, 
                score_func, shuffle_labels=False):
    if shuffle_labels:
        indices = np.random.default_rng().permutation(len(y_train))
        y_train = y_train[indices]
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict_proba(X_test)[:,1]
    score = score_func(y_true=y_test, y_score=y_pred)
    return score



def pre_training_permutation(estimator, X_train, X_test, y_train, y_test,
                            n_permutations, score_func, verbose=False, n_jobs=None):
    score = _train_score(
        clone(estimator), X_train, X_test, y_train, y_test, score_func, shuffle_labels=False
    )
    permutation_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_train_score)(
            clone(estimator),
            X_train, X_test, y_train, y_test,
            score_func,
            shuffle_labels=True,
        )
        for _ in range(n_permutations)
    )
    permutation_scores = np.array(permutation_scores)
    pvalue = (np.sum(permutation_scores >= score) + 1.0) / (n_permutations + 1)
    return score, permutation_scores, pvalue


#### Simulation parameters

In [5]:
### shared parameters
# class_params, permutation_params, sim_params and data_gen_params are reused from the
# post-hoc "simulation parameters" cell earlier in this notebook; run that cell first.
# (A commented-out copy that used to sit here set n_sim to 150, whereas the live
# sim_params uses 500.)

# Sweep over the number of samples.
samplesize_params_pre = dict(
    sim={"parameter_range": np.logspace(2, 5, 5).astype(int), **sim_params},
    data_gen={"n_feats": 10, "class_ratio": 0.5, **data_gen_params},
    classif=class_params,
    perm=permutation_params,
)

# Sweep over the number of features.
nfeats_params_pre = dict(
    sim={"parameter_range": np.logspace(1, 10, 5, base=2).astype(int), **sim_params},
    data_gen={"n_samples": 1000, "class_ratio": 0.5, **data_gen_params},
    classif=class_params,
    perm=permutation_params,
)

# Sweep over the class ratio.
ratio_params_pre = dict(
    sim={"parameter_range": np.linspace(.1, .9, 5), **sim_params},
    data_gen={"n_samples": 1000, "n_feats": 10, **data_gen_params},
    classif=class_params,
    perm=permutation_params,
)

# Sweep over the held-out test fraction (log-spaced between 1% and 50%).
testsize_params_pre = dict(
    sim={"parameter_range": np.logspace(np.log10(.01), np.log10(.5), 5), **sim_params},
    data_gen={"n_samples": 1000, "n_feats": 10, "class_ratio": 0.5, **data_gen_params},
    classif=class_params,
    perm=permutation_params,
)

#### Simulation functions

In [6]:
@simulate(**nfeats_params_pre["sim"])
def simulate_nfeats_pre(param=None, seed=None):
    """Run one pre-training permutation test on a dataset with ``param`` features.

    Data come from ``random_data_gen``; 20% is held out, and an unfitted logistic
    regression plus the split are handed to ``pre_training_permutation``, scored
    with ``roc_auc_score``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple from
    ``pre_training_permutation``.
    """
    config = nfeats_params_pre
    features, labels = random_data_gen(n_feats=param, seed=seed, **config["data_gen"])
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    clf = LogisticRegression(**config["classif"])
    observed, null_scores, p_value = pre_training_permutation(
        clf,
        train_X, test_X, train_y, test_y,
        n_permutations=config["perm"]["n_permutations"],
        score_func=roc_auc_score,
        verbose=True,
        n_jobs=-1,
    )
    return observed, null_scores, p_value

@simulate(**ratio_params_pre["sim"])
def simulate_ratio_pre(param=None, seed=None):
    """Run one pre-training permutation test on a dataset with class ratio ``param``.

    Data come from ``random_data_gen``; 20% is held out, and an unfitted logistic
    regression plus the split are handed to ``pre_training_permutation``, scored
    with ``roc_auc_score``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple from
    ``pre_training_permutation``.
    """
    config = ratio_params_pre
    features, labels = random_data_gen(class_ratio=param, seed=seed, **config["data_gen"])
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    clf = LogisticRegression(**config["classif"])
    observed, null_scores, p_value = pre_training_permutation(
        clf,
        train_X, test_X, train_y, test_y,
        n_permutations=config["perm"]["n_permutations"],
        score_func=roc_auc_score,
        verbose=True,
        n_jobs=-1,
    )
    return observed, null_scores, p_value

@simulate(**samplesize_params_pre["sim"])
def simulate_samplesize_pre(param=None, seed=None):
    """Run one pre-training permutation test on a dataset of ``param`` samples.

    Data come from ``random_data_gen``; 20% is held out, and an unfitted logistic
    regression plus the split are handed to ``pre_training_permutation``, scored
    with ``roc_auc_score``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple from
    ``pre_training_permutation``.
    """
    config = samplesize_params_pre
    features, labels = random_data_gen(n_samples=param, seed=seed, **config["data_gen"])
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    clf = LogisticRegression(**config["classif"])
    observed, null_scores, p_value = pre_training_permutation(
        clf,
        train_X, test_X, train_y, test_y,
        n_permutations=config["perm"]["n_permutations"],
        score_func=roc_auc_score,
        verbose=True,
        n_jobs=-1,
    )
    return observed, null_scores, p_value

@simulate(**testsize_params_pre["sim"])
def simulate_testsize_pre(param=None, seed=None):
    """Run one pre-training permutation test with a held-out fraction of ``param``.

    Data come from ``random_data_gen``; a fraction ``param`` is held out, and an
    unfitted logistic regression plus the split are handed to
    ``pre_training_permutation``, scored with ``roc_auc_score``.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns the ``(score, permutation_scores, pvalue)`` triple from
    ``pre_training_permutation``.
    """
    X, y = random_data_gen(seed=seed, **testsize_params_pre["data_gen"])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=param, shuffle=True)
    # Read classifier and permutation settings from this sweep's own config; the
    # sample-size config was used here before, which only worked because both
    # currently share the same class_params / permutation_params objects.
    estimator = LogisticRegression(**testsize_params_pre["classif"])
    n_permutations = testsize_params_pre["perm"]["n_permutations"]
    score, null, p = pre_training_permutation(
        estimator,
        X_train, X_test, y_train, y_test,
        n_permutations=n_permutations,
        score_func=roc_auc_score,
        verbose=True, n_jobs=-1
    )
    return score, null, p

Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status
Running 500 simulations
Using dask client at http://192.168.86.101:51360/status


#### Run functions

In [7]:
# Submit the four pre-training sweeps. Each decorated function is called with no
# arguments and returns a pair: the submitted futures and a callable that collects them.
# NOTE(review): this reassigns the *_gather names that the post-hoc section also uses.
testsize_futures_pre, testsize_gather = simulate_testsize_pre()
samplesize_futures_pre, samplesize_gather = simulate_samplesize_pre()
nfeats_futures_pre, nfeats_gather = simulate_nfeats_pre()
ratio_futures_pre, ratio_gather = simulate_ratio_pre()

2500 parallel jobs
2500 parallel jobs
2500 parallel jobs
2500 parallel jobs


#### Gather results

In [None]:
# Collect the pre-training sample-size results from their futures.
samplesize_result = samplesize_gather(samplesize_futures_pre) 
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(samplesize_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = samplesize_params_pre
# Writes into the sim_results/maha_* directory created by the os.makedirs cell earlier on.
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_samplesize_pre.pkl")

In [9]:
# Collect the pre-training test-size results from their futures.
testsize_result = testsize_gather(testsize_futures_pre) 
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(testsize_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = testsize_params_pre
# Writes into the sim_results/maha_* directory created by the os.makedirs cell earlier on.
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_testsize_pre.pkl")

In [None]:
# Collect the pre-training feature-count results from their futures.
nfeats_result = nfeats_gather(nfeats_futures_pre)
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(nfeats_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = nfeats_params_pre
# Writes into the sim_results/maha_* directory created by the os.makedirs cell earlier on.
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_nfeats_pre.pkl")

In [None]:
# Collect the pre-training class-ratio results from their futures.
ratio_result = ratio_gather(ratio_futures_pre)
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
df_result = pd.DataFrame(ratio_result).melt(var_name="param")
# Split each (score, permutation_scores, pvalue) result into its own column.
df_result[["score", "perm_scores", "pval"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')
# Attach the sweep configuration to the frame before pickling.
# NOTE(review): `_metadata` is a pandas-internal attribute; `DataFrame.attrs` is the public alternative.
df_result._metadata = ratio_params_pre
# Writes into the sim_results/maha_* directory created by the os.makedirs cell earlier on.
df_result.to_pickle(f"sim_results/maha_{data_gen_params['maha']:.1f}/simulate_ratio_pre.pkl")

In [10]:
# Cancel only the pre-training test-size futures.
rhino_client.cancel(testsize_futures_pre)

In [27]:
# Cancel all four sets of pre-training futures in one call; the `+` concatenation assumes
# each *_futures_pre object is a list (TODO confirm against simulate).
rhino_client.cancel(ratio_futures_pre+nfeats_futures_pre+samplesize_futures_pre+testsize_futures_pre)

In [18]:
# Inspect the first exception raised by a set of futures.
# NOTE(review): `maha_futures_pre` is not defined in any cell visible in this notebook,
# so this raises NameError on a fresh kernel unless the name is created elsewhere.
da.get_exceptions(maha_futures_pre, range(len(maha_futures_pre))).iloc[0]['exception']

Exception: None of the given futures resulted in exceptions

# Next steps
* Re-run post-hoc simulations with null Mahalanobis distance
* Check for over-/under-fitting with stronger or weaker regularization (different `C` values). Compare to the theoretical AUC based on the Mahalanobis distance
* Check bias of null model (proportion of false positives - at different $\alpha$ thresholds [.1, .05, .01, .005])
* Check power of the test for a non-null model (positive Mahalanobis distance), i.e. the proportion of predicted positives
* Compare bias and power/sensitivity to the pre-trained model

## Farther along
* See how class balance, number of samples, etc. affect the above

In [32]:
# Scratch check: attach a dummy dict to see whether `_metadata` survives the pickle
# round-trip done in the next two cells.
df_result._metadata = {"hi":"joey"}

In [36]:
# Scratch check: write the frame to a throwaway pickle in the working directory.
df_result.to_pickle("test_metadata.pkl")

In [35]:
# Scratch check: read the pickle back and display the `_metadata` it carries.
pd.read_pickle("test_metadata.pkl")._metadata

{'hi': 'joey'}

# Hyperparameter tuning

In [5]:
# Create a separate, smaller Dask client for the hyperparameter-tuning runs; this
# rebinds `rhino_client`.
# NOTE(review): this repeats the cmldask import from the first client cell and sits
# outside the notebook's top import cell.
import cmldask.CMLDask as da
rhino_client = da.new_dask_client_slurm(
    job_name="C_tuning",
    memory_per_job="1.5GB",
    max_n_jobs=50,
    threads_per_job=1, 
    adapt=True,
    # NOTE(review): user-specific absolute paths; change these before running under another account.
    local_directory="/home1/jrudoler/",
    log_directory="/home1/jrudoler/logs/",
)

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:192.168.86.106:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser


In [16]:
# TODO: Return all AUC scores for all C values, not just the best one. 
@simulate(parameter_range=np.linspace(0., 1.5, 5), n_sim=200)
def test_best_C(param=None, seed=None):
    """Tune ``C`` by cross-validation on one simulated dataset and report the result.

    Generates 5000 samples with 10 features at Mahalanobis setting ``param``, holds
    out 20%, fits ``LogisticRegressionCV`` over 15 log-spaced ``C`` candidates
    between 1e-3 and 1e6, and scores the held-out predictions.

    ``param`` and ``seed`` are supplied by the ``@simulate`` wrapper (presumably one
    call per sweep value and repetition; verify against ``simulate``).

    Returns ``(auc, C)``: the held-out ROC AUC and the first entry of the fitted
    estimator's ``C_`` attribute.
    """
    features, labels = random_data_gen(
        n_samples=5000, n_feats=10, maha=param, class_ratio=0.5, seed=seed
    )
    train_X, test_X, train_y, test_y = train_test_split(
        features, labels, test_size=.2, shuffle=True
    )
    candidate_Cs = np.logspace(np.log10(1e-3), np.log10(1e6), 15)
    tuned = LogisticRegressionCV(class_weight='balanced', Cs=candidate_Cs)
    tuned.fit(train_X, train_y)
    # probability of the positive class (second column)
    test_proba = tuned.predict_proba(test_X)[:, 1]
    held_out_auc = roc_auc_score(y_true=test_y, y_score=test_proba)
    return held_out_auc, tuned.C_[0]

Running 200 simulations
Using dask client at http://192.168.86.101:51360/status


In [19]:
# Submit the C-tuning sweep; returns the futures and the callable that collects them.
C_futures, C_gather = test_best_C()

1000 parallel jobs


In [17]:
# NOTE(review): read top to bottom, this cancels the futures before the progress and
# gather cells below (the execution counts show it was run out of order). Skip it on a
# fresh "Run All".
rhino_client.cancel(C_futures)

In [6]:
# NOTE(review): read top to bottom, this shuts the client down before the results are
# gathered below (the execution count shows it was run out of order). Run it last.
rhino_client.shutdown()

In [20]:
# Show progress for the submitted futures (dask.distributed.progress).
progress(C_futures)

VBox()

In [21]:
# Collect the C-tuning results from their futures.
C_results = C_gather(C_futures)

In [22]:
# Reshape to long format: the former column labels go to `param`, the raw results to `value`
# (presumably one column per swept value; verify against simulate).
# NOTE(review): this reuses the name `df_result` from the permutation sections above.
df_result = pd.DataFrame(C_results).melt(var_name="param")
# Split each (auc, C) result returned by test_best_C into its own column.
df_result[["score", "C"]] = df_result['value'].apply(pd.Series)
df_result = df_result.drop(columns='value')

In [23]:
# Lift the row limit so the whole summary is displayed. This changes the pandas display
# option for the rest of the session, not just for this cell.
pd.options.display.max_rows=None
# Summary statistics of the score for every (param, selected C) combination.
df_result.groupby(["param", "C"]).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,score,score,score,score,score,score,score,score
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
param,C,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0.0,0.001,51.0,0.500189,0.014283,0.472552,0.488991,0.50046,0.508796,0.528035
0.0,0.004394,40.0,0.500041,0.01912,0.462883,0.486804,0.498724,0.513394,0.538359
0.0,0.019307,27.0,0.501062,0.018689,0.476384,0.487279,0.494766,0.514531,0.546536
0.0,0.084834,25.0,0.502859,0.017194,0.468201,0.491137,0.500589,0.510608,0.540863
0.0,0.372759,8.0,0.4952,0.025852,0.453977,0.478291,0.498051,0.505587,0.540648
0.0,1.637894,18.0,0.502319,0.016872,0.476178,0.492074,0.500702,0.518141,0.529839
0.0,7.196857,7.0,0.50345,0.019665,0.480454,0.4899,0.502414,0.515212,0.531061
0.0,31.622777,6.0,0.488825,0.009346,0.476941,0.481053,0.490806,0.496646,0.498003
0.0,138.949549,5.0,0.483678,0.021437,0.455054,0.46884,0.491178,0.494738,0.50858
0.0,610.54023,6.0,0.498185,0.010658,0.48341,0.492416,0.496817,0.506549,0.511327
