In [1]:
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1
%env OPENBLAS_NUM_THREADS=1
%env VECLIB_MAXIMUM_THREADS=1

env: MKL_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1
env: OPENBLAS_NUM_THREADS=1
env: VECLIB_MAXIMUM_THREADS=1


In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from imblearn.pipeline import Pipeline

from tqdm import tqdm

In [3]:
# Evaluation functions
from sklearn.metrics import roc_auc_score, make_scorer
from lift.perc_lift_score import perc_lift_score

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Samplers
from imblearn.over_sampling import SMOTE, ADASYN
from wgan.imblearn import GANbalancer

In [4]:
from sklearn.datasets import make_classification
X,y = make_classification(n_samples=4000, n_features=20, weights=[0.95,0.05], 
                          n_informative=20, n_redundant=0, n_clusters_per_class=5)

# from wgan.data import load_DMC10
# X, _, y, _, idx_cont, idx_cat, cat_dict = load_DMC10("/Users/hauptjoh/Data/DMC10")

In [5]:
from experiment.experiment_config import experiment_config
scorers, models, samplers = experiment_config(X, idx_cont=None, idx_cat=None)

In [6]:
idx_cont=None
idx_cat = None

### Samplers
scorers = {'auc':make_scorer(roc_auc_score, needs_proba=True),
          'TDLift':make_scorer(perc_lift_score, needs_proba=True, percentile=0.1)}

### Models
models = []
models.append(('LR', LogisticRegression(solver='liblinear'), {
    "C": [10]
}))
# models.append(('RF', RandomForestClassifier(), {
#     "n_estimators":[100],
#     "max_features":["sqrt"],
#     "min_samples_leaf":[20]
# }))

### Samplers
from imblearn.over_sampling import SMOTE, ADASYN
from wgan.imblearn import GANbalancer

samplers = []

# GAN
if idx_cont is None:
    idx_cont = list(range(X.shape[1]))

categorical = None
if idx_cat is not None:
    categorical = [(i,
                    np.max(X[:,i])+1,
                    int(min(15., np.ceil(np.max((X[:,i])+1)/2)))
                   )
                    for i in idx_cat]

samplers.append(('cGAN', GANbalancer(
        idx_cont=idx_cont, categorical=categorical, batch_size = 128, critic_iterations=1
), {
    'generator_input'  : [40],
    'generator_layers' : [[100]],
    'critic_layers'    : [[100]],
    'n_iter'           : [40000,100000,1000000]
}))

# baseline
samplers.append(('unbalanced', None, {}))

# SMOTE
samplers.append(('SMOTE', SMOTE(), {
    'k_neighbors':[5,10]
}))

# ADASYN
samplers.append(('ADASYN', ADASYN(), {
    'n_neighbors':[5,10]
}))

In [7]:
seed = 123

score_outer = {}

for sampler_name, sampler, sampler_grid in tqdm(samplers):
    
    sampler_grid = {'sampler__'+key:item for key, item in sampler_grid.items()}
    
    score_inner = {}

    for model_name, model, model_grid in tqdm(models):

        pipeline = Pipeline(memory='./.cachedir', steps=[
            ('scaler', MinMaxScaler()),
            ('sampler', sampler),
            ('classifier', model)
          ])
        model_grid = {'classifier__'+key:item for key, item in model_grid.items()}
        p_grid = {**sampler_grid, **model_grid}
        
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        outer_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
            
        clf = GridSearchCV(pipeline, param_grid= p_grid, cv=inner_cv, scoring=scorers, refit='auc', 
                           return_train_score=True, iid=False, n_jobs=20, pre_dispatch=40, verbose=1)

        score_inner[model_name] = cross_validate(clf, X=X,y=y,cv=outer_cv , scoring=scorers, return_train_score=True,
                                    return_estimator=True, verbose=1)
    score_outer[sampler_name] = score_inner

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   8 out of  15 | elapsed: 13.7min remaining: 12.0min
[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed: 149.5min finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   8 out of  15 | elapsed: 13.9min remaining: 12.1min
[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed: 122.1min finished


Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   8 out of  15 | elapsed:  9.2min remaining:  8.0min
[Parallel(n_jobs=20)]: Done  15 out of  15 | elapsed: 113.1min finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 501.6min finished

 25%|██▌       | 1/4 [8:21:37<25:04:52, 30097.45s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=20)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=20)]: Done   2 out of   5 | elapsed:    0.2s remaining:    0.3s
[Parallel(n_jobs=20)]: Done   5 out of   5 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.5s finished

 50%|█████     | 2/4 [8:21:38<11:42:17, 21068.67s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:    0.9s remaining:    1.3s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    2.3s finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:    1.0s remaining:    1.5s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    2.4s finished


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:    1.1s remaining:    1.7s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    3.5s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    9.2s finished

 75%|███████▌  | 3/4 [8:21:48<4:05:50, 14750.83s/it] 
  0%|          | 0/1 [00:00<?, ?it/s][A[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:    1.3s remaining:    2.0s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    9.5s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:    5.2s remaining:    7.8s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    5.3s finished
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.


Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=20)]: Done   4 out of  10 | elapsed:    6.1s remaining:    9.2s
[Parallel(n_jobs=20)]: Done  10 out of  10 | elapsed:    6.1s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   21.4s finished

100%|██████████| 4/4 [8:22:09<00:00, 10332.00s/it]  


In [8]:
scores = pd.DataFrame([{
  'sampler':sampler_name, 'model':model_name, 
    'auc':np.mean(model["test_auc"]),  'auc_sd':np.std(model["test_auc"]),
    'lift0.1':np.mean(model["test_TDLift"]),  'lift0.1_sd':np.std(model["test_TDLift"]),
} for sampler_name, sampler in score_outer.items()
    for model_name, model in sampler.items()]
)

print(scores)

        auc    auc_sd   lift0.1  lift0.1_sd model     sampler
0  0.776272  0.040642  1.080369    0.024811    LR        cGAN
1  0.814863  0.038656  1.090651    0.007022    LR  unbalanced
2  0.821822  0.035713  1.095655    0.000054    LR       SMOTE
3  0.821116  0.035804  1.090651    0.007022    LR      ADASYN


In [9]:
tuning_results = {sampler_name:
    {model_name:
    # vstack result DataFrame for each outer fold
        pd.concat([ 
            # Inner CV tuning results as DataFrame
            pd.concat([pd.DataFrame(inner_cv.cv_results_['params']).astype(str), 
                       pd.DataFrame({
                           'mean_test_auc':inner_cv.cv_results_['mean_test_auc'],
                           'std_test_auc':inner_cv.cv_results_['std_test_auc'],
                           'mean_test_TDLift':inner_cv.cv_results_['mean_test_TDLift'],
                           'std_test_TDLift':inner_cv.cv_results_['std_test_TDLift']
                       })
                      ], sort=False, ignore_index=False, axis=1)
            for inner_cv in model['estimator']]).groupby(list(model['estimator'][0].cv_results_['params'][0].keys())).mean().reset_index()
            for model_name, model in sampler.items()}
          for sampler_name, sampler in score_outer.items()}


In [10]:
tuning_results["cGAN"]["LR"].sort_values(["sampler__n_iter"])

Unnamed: 0,classifier__C,sampler__critic_layers,sampler__generator_input,sampler__generator_layers,sampler__n_iter,mean_test_auc,std_test_auc,mean_test_TDLift,std_test_TDLift
0,10,[100],40,[100],100000,0.780123,0.040656,1.087537,0.029349
1,10,[100],40,[100],1000000,0.779761,0.037585,1.077583,0.028426
2,10,[100],40,[100],40000,0.777282,0.047214,1.090175,0.023121


In [11]:
parameter_scores =  pd.DataFrame([{
  'sampler':sampler_name, 'model':model_name,
    'parameter':param_name,
    'parameter_value':str(param_value),
    'auc':cv.cv_results_['mean_test_auc'][i],  'auc_sd':cv.cv_results_["std_test_auc"][i],
    'lift0.1':cv.cv_results_["mean_test_TDLift"][i],  'lift0.1_sd':cv.cv_results_["std_test_TDLift"][i]
}   for sampler_name, sampler in score_outer.items()
    for model_name, model in sampler.items()
    for cv in model['estimator']
    for i, (param_name, param_value) in enumerate(cv.cv_results_['params'][0].items())
]
)

IndexError: index 3 is out of bounds for axis 0 with size 3

In [None]:
parameter_scores

In [None]:
parameter_scores.groupby(['sampler','model','parameter','parameter_value']).mean()