In [23]:
import warnings
warnings.simplefilter('ignore', FutureWarning)
import numpy as np
import scipy as scp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import (permutation_test_score, learning_curve, LeaveOneGroupOut,
                                     KFold, cross_val_score, cross_val_predict, cross_validate,
                                     train_test_split)
from sklearn.utils import parallel_backend
from sklearn.base import clone
from sklearn import datasets
from joblib.parallel import Parallel, delayed
import pickle
from permutation_helpers import random_data_gen, post_hoc_permuation, simulate

## Setup

In [20]:
## IRIS dataset
# X_iris, y_iris = datasets.load_iris(return_X_y=True)
# mask = y_iris < 2
# X_iris, y_iris = X_iris[mask], y_iris[mask]

In [2]:
# Load the pre-generated feature matrix (X) and label vector (y) from disk.
X, y = np.load("random_data_X.npy"), np.load("random_data_y.npy")

In [3]:
# Settings for the cross-validated permutation analysis.
n_permutations = 5000  # number of label shuffles used to build the null
cv = KFold(n_splits=3, shuffle=True, random_state=0)  # seeded, shuffled 3-fold split
estimator = LogisticRegressionCV(Cs=4, class_weight='balanced')

## Pre-training permutation

In [4]:
# Pre-training permutation test: the cross-validated ROC AUC on the true labels
# is compared against the scores obtained after shuffling y and refitting.
score, null, p = permutation_test_score(
    estimator=estimator,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=cv,
    n_permutations=n_permutations,
    n_jobs=-1,
    verbose=True,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 720 tasks      | elapsed:   19.0s
[Parallel(n_jobs=-1)]: Done 1170 tasks      | elapsed:   29.7s
[Parallel(n_jobs=-1)]: Done 1720 tasks      | elapsed:   43.0s
[Parallel(n_jobs=-1)]: Done 2370 tasks      | elapsed:   58.3s
[Parallel(n_jobs=-1)]: Done 3120 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 3970 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 4920 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 5000 out of 5000 | elapsed:  2.0min finished


In [5]:
# sklearn reports (C + 1) / (n_permutations + 1), where C counts permutation
# scores >= the true score; with 5000 permutations the floor is 1/5001.
p

0.0001999600079984003

## Post-training permutation

In [3]:
from permutation_helpers import post_hoc_permuation

In [7]:
# Out-of-fold class probabilities for every sample, obtained with the same CV
# splitter as above; column 1 is what the post-hoc permutation would score.
y_pred = cross_val_predict(
    clone(estimator),
    X,
    y,
    method='predict_proba',
    cv=cv,
)
# post_score, post_null, post_p = post_hoc_permuation(y, y_pred[:, 1], n_jobs=-1, verbose=True, n_permutations=n_permutations)

In [8]:
# Per-fold ROC AUC of the unpermuted model, for reference.
cross_val_score(
    clone(estimator),
    X,
    y,
    scoring="roc_auc",
    cv=cv,
    verbose=1,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.1s finished


array([0.91790977, 0.92486726, 0.91869629])

The approach above permutes labels across CV folds. To permute within CV folds instead, we need something like the helper below: run the permutation function separately on each fold, then aggregate the results.

In [16]:
from permutation_helpers import post_hoc_permutation_cv

## Running simulations

In [25]:
from permutation_helpers import simulate

In [5]:
@simulate(parameter_range=np.linspace(0, 5), n_sim=10)
def scale(param=None, x=None, intercept=None):
    """Toy simulation target: return the affine map ``param * x + intercept``."""
    scaled = param * x
    return scaled + intercept

Running 10 simulations
No dask client avaialable, running sequentially


In [6]:
# Disabled snippet (kept for reference): recreate the dask client.
# try:
#     client.shutdown()
#     client = Client()
# except:
#     client = Client()

# Run the toy simulation; `param` is not passed here because the @simulate
# wrapper supplies it for each value in its parameter range.
result = scale(intercept=2, x=np.array([1, 2, 4]))
result

No dask client avaialable, running sequentially


{0.0: {0: array([2., 2., 2.]),
  1: array([2., 2., 2.]),
  2: array([2., 2., 2.]),
  3: array([2., 2., 2.]),
  4: array([2., 2., 2.]),
  5: array([2., 2., 2.]),
  6: array([2., 2., 2.]),
  7: array([2., 2., 2.]),
  8: array([2., 2., 2.]),
  9: array([2., 2., 2.])},
 0.10204081632653061: {0: array([2.10204082, 2.20408163, 2.40816327]),
  1: array([2.10204082, 2.20408163, 2.40816327]),
  2: array([2.10204082, 2.20408163, 2.40816327]),
  3: array([2.10204082, 2.20408163, 2.40816327]),
  4: array([2.10204082, 2.20408163, 2.40816327]),
  5: array([2.10204082, 2.20408163, 2.40816327]),
  6: array([2.10204082, 2.20408163, 2.40816327]),
  7: array([2.10204082, 2.20408163, 2.40816327]),
  8: array([2.10204082, 2.20408163, 2.40816327]),
  9: array([2.10204082, 2.20408163, 2.40816327])},
 0.20408163265306123: {0: array([2.20408163, 2.40816327, 2.81632653]),
  1: array([2.20408163, 2.40816327, 2.81632653]),
  2: array([2.20408163, 2.40816327, 2.81632653]),
  3: array([2.20408163, 2.40816327, 2.816

In [8]:
# Reshape to long format: one row per (param, simulation) pair, with the
# simulation output stored in the "value" column.
pd_result = (
    pd.DataFrame(result)
    .melt(var_name="param")
)
pd_result


Unnamed: 0,param,value
0,0.0,"[2.0, 2.0, 2.0]"
1,0.0,"[2.0, 2.0, 2.0]"
2,0.0,"[2.0, 2.0, 2.0]"
3,0.0,"[2.0, 2.0, 2.0]"
4,0.0,"[2.0, 2.0, 2.0]"
...,...,...
495,5.0,"[7.0, 12.0, 22.0]"
496,5.0,"[7.0, 12.0, 22.0]"
497,5.0,"[7.0, 12.0, 22.0]"
498,5.0,"[7.0, 12.0, 22.0]"


In [10]:
# Expand each array stored in "value" into its own columns (0, 1, 2).
# Building the frame once from the list of rows avoids `.apply(pd.Series)`,
# which constructs a separate Series object for every row.
pd.DataFrame(pd_result['value'].tolist(), index=pd_result.index)

Unnamed: 0,0,1,2
0,2.0,2.0,2.0
1,2.0,2.0,2.0
2,2.0,2.0,2.0
3,2.0,2.0,2.0
4,2.0,2.0,2.0
...,...,...,...
495,7.0,12.0,22.0
496,7.0,12.0,22.0
497,7.0,12.0,22.0
498,7.0,12.0,22.0


In [11]:
# Persist the raw nested result dict so it can be reloaded without re-running.
with open("simulation_results.pkl", "wb") as pkl_file:
    pickle.dump(result, pkl_file)

## Simulate permutations with different params

In [2]:
import cmldask.CMLDask as da

In [3]:
# Start a dask client for the simulations. adapt=False means the cluster is
# scaled manually and must be shut down explicitly when the work is done.
# NOTE(review): the worker-space and log directories are hard-coded,
# user-specific absolute paths; adjust them before running as another user.
rhino_client = da.new_dask_client(
    job_name="simulations",
    memory_per_job="1GB",
    max_n_jobs=50,
    threads_per_job=10,
    adapt=False,
    local_directory="/home1/jrudoler/dask_worker_space",
    log_directory="/home1/jrudoler/logs/",
)

Unique port for jrudoler is 51360
{'dashboard_address': ':51360'}
To view the dashboard, run: 
`ssh -fN jrudoler@rhino2.psych.upenn.edu -L 8000:128.91.77.24:51360` in your local computer's terminal (NOT rhino) 
and then navigate to localhost:8000 in your browser
You've chosen to scale your cluster manually. This means workers will continue to run until you manually shut them down. Remember to run `client.shutdown` after you're done computing and no longer need to reserve resources.


In [4]:
# NOTE(review): this asks for a scale target of 300 although the client was
# created with max_n_jobs=50 — confirm how cmldask reconciles the two.
rhino_client.cluster.scale(300)

In [22]:
# Preview 15 log-spaced values from 10^2 to 10^6, truncated to integers
# (presumably a candidate sample-size grid — the simulation below uses its own).
np.logspace(2, 6, 15).astype(int)

array([    100,     193,     372,     719,    1389,    2682,    5179,
         10000,   19306,   37275,   71968,  138949,  268269,  517947,
       1000000])

In [52]:
def _holdout_permutation_test(n_samples, maha, seed, n_permutations=5000):
    """Run one simulated hold-out experiment and permutation-test its ROC score.

    Shared body of ``simulate_samplesize`` and ``simulate_maha``: generate a
    synthetic dataset, fit a classifier on 80% of it, predict class-1
    probabilities for the remaining 20%, and run the post-hoc permutation
    test on those held-out predictions.

    Parameters
    ----------
    n_samples : int
        Forwarded to ``random_data_gen`` as ``n_samples``.
    maha : float
        Forwarded to ``random_data_gen`` as ``maha`` (presumably the class
        separation — confirm against ``permutation_helpers``).
    seed :
        Forwarded to ``random_data_gen`` as ``seed``.
    n_permutations : int, default 5000
        Number of permutations requested from ``post_hoc_permuation``.

    Returns
    -------
    tuple
        ``(score, permutation_scores, pvalue)`` as returned by
        ``post_hoc_permuation``.
    """
    X, y = random_data_gen(n_samples=n_samples, n_feats=5, maha=maha, ratio=0.5, seed=seed)
    # NOTE(review): this split is not seeded, so a run is not fully reproducible
    # from `seed` alone. Consider random_state=seed once the type of `seed`
    # passed by @simulate is confirmed to be accepted by scikit-learn.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, shuffle=True)
    estimator = LogisticRegressionCV(class_weight='balanced', Cs=4)
    estimator.fit(X=X_train, y=y_train)
    # Probability of the positive class (column 1) on the held-out samples.
    y_pred = estimator.predict_proba(X_test)[:, 1]
    score, permutation_scores, pvalue = post_hoc_permuation(
        y_true=y_test, y_score=y_pred,
        n_permutations=n_permutations, n_jobs=-1,
        )
    return score, permutation_scores, pvalue


@simulate(parameter_range=np.logspace(2, 5, 10).astype(int), n_sim=1000)
def simulate_samplesize(param=None, seed=None):
    """Vary the sample size (``param``) at a fixed ``maha`` of 0.1.

    Returns the ``(score, permutation_scores, pvalue)`` tuple of one
    simulated hold-out permutation test.
    """
    return _holdout_permutation_test(n_samples=param, maha=0.1, seed=seed)


@simulate(parameter_range=np.linspace(0., 1.5), n_sim=1000)
def simulate_maha(param=None, seed=None):
    """Vary ``maha`` (``param``) at a fixed sample size of 10000.

    Returns the ``(score, permutation_scores, pvalue)`` tuple of one
    simulated hold-out permutation test.
    """
    return _holdout_permutation_test(n_samples=10000, maha=param, seed=seed)
    
    

Running 1000 simulations
using dask client at http://128.91.77.24:51360/status
Running 1000 simulations
using dask client at http://128.91.77.24:51360/status


In [6]:
# 10 sample sizes x 1000 simulations -> 10000 jobs on the dask cluster.
# NOTE: this rebinds `result`, which earlier held the toy `scale` output.
result = simulate_samplesize()

Running 1000 simulations
Using dask client at http://128.91.77.24:51360/status
10000 parallel jobs


In [53]:
# 50 `maha` values (np.linspace default) x 1000 simulations -> 50000 jobs.
# NOTE(review): `maha_result` is not saved in the cells visible here — persist
# it before shutting the client down if it is needed later.
maha_result = simulate_maha()

Running 1000 simulations
Using dask client at http://128.91.77.24:51360/status
50000 parallel jobs


In [27]:
# Tidy the sample-size simulation output: one row per (param, simulation),
# with the returned (score, permutation_scores, pvalue) tuple split into
# three named columns, then cache the frame on disk.
df_result = pd.DataFrame(result).melt(var_name="param")
df_result = df_result.drop(columns='value').join(
    df_result['value']
    .apply(pd.Series)
    .rename(columns={0: "score", 1: "perm_scores", 2: "pval"})
)
df_result.to_pickle("simulate_samplesize.pkl")
# To reload later: loaded = pd.read_pickle("simulate_samplesize.pkl")

In [53]:
# The cluster was scaled manually, so its workers keep running until the
# client is shut down explicitly.
rhino_client.shutdown()

2022-12-28 17:50:08,819 - distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
