In [33]:
# Cap the thread pools of the numeric back-ends (MKL, numexpr, OpenMP, OpenBLAS,
# vecLib) at one thread each. Presumably this avoids CPU oversubscription when the
# grid search further down fans out over 16 joblib workers (n_jobs=16) -- TODO confirm.
# NOTE(review): such variables are normally read when the libraries are first loaded,
# so this cell should run before numpy & co. are imported in a fresh kernel.
%env MKL_NUM_THREADS=1
%env NUMEXPR_NUM_THREADS=1
%env OMP_NUM_THREADS=1
%env OPENBLAS_NUM_THREADS=1
%env VECLIB_MAXIMUM_THREADS=1

env: MKL_NUM_THREADS=1
env: NUMEXPR_NUM_THREADS=1
env: OMP_NUM_THREADS=1
env: OPENBLAS_NUM_THREADS=1
env: VECLIB_MAXIMUM_THREADS=1


In [34]:
import sys
# Extend the module search path with a project utilities directory; the non-standard
# imports below (data_loader, lift, wgan) are presumably resolved from here -- verify.
# NOTE(review): absolute, user-specific path; the notebook only runs on a machine
# that has this exact directory layout.
sys.path.append("/home/RDC/hauptjoh.hub/utils")

In [35]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from tqdm import tqdm

In [36]:
# Evaluation functions
from sklearn.metrics import roc_auc_score, make_scorer
from lift.perc_lift_score import perc_lift_score

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Samplers
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
from wgan.imblearn import GANbalancer

In [37]:
# from sklearn.datasets import make_classification
# X,y = make_classification(n_samples=4000, n_features=20, weights=[0.99,0.01], 
#                           n_informative=20, n_redundant=0, n_clusters_per_class=5)

# X = pd.DataFrame(X)

In [38]:
import data_loader
# Directory passed to the loader functions (absolute, machine-specific path).
path = "/home/RDC/hauptjoh.hub/data"
# Alternative data set, kept so experiments can be switched by hand.
#X,y = data_loader.load_coil00(path)
# X is used as a pandas DataFrame below (X.dtypes, X.iloc, X.to_numpy); y is used via
# .to_numpy(), so it is presumably a pandas Series -- confirm against data_loader.
X,y = data_loader.load_dmc10(path)

# Disabled: one-hot encoding of the target.
#y = np.eye(y.nunique())[y]

In [39]:
# Positional column indices of the continuous and the categorical features.
# A list left as None is derived from the DataFrame below.
idx_cont = None
idx_cat = None

# Categorical features: every column whose dtype compares equal to 'category'.
if idx_cat is None:
    idx_cat = np.where(X.dtypes == 'category')[0].tolist()

# Continuous features: all remaining column positions.
if idx_cont is None:
    idx_cont = [int(pos) for pos in range(X.shape[1]) if pos not in idx_cat]

In [40]:
# One tuple per categorical column:
#   (column position, number of category levels, embedding size)
# The third entry is half the number of levels, rounded up and capped at 15.
# The list is handed to GANbalancer(categorical=...) further down.
categorical = None
if idx_cat is not None:
    categorical = [
        (col, n_levels, int(min(15., np.ceil(0.5 * n_levels))))
        for col, n_levels in (
            (col, len(X.iloc[:, col].cat.categories)) for col in idx_cat
        )
    ]

In [41]:
# Columns are addressed by position throughout, and the ColumnTransformers used in the
# pipeline emit the continuous block first and the categorical block second. The input
# must therefore already be laid out as [continuous..., categorical...]; otherwise the
# positional indices would point at the wrong columns after the first transformer.
#
# The check only applies when both groups exist: calling min() on an empty idx_cat
# would raise an unrelated "min() arg is an empty sequence" ValueError instead.
if idx_cat and idx_cont and max(idx_cont) > min(idx_cat):
    raise ValueError("Variables need to be ordered [cont, cat]")

In [42]:
# Switch from pandas objects to plain arrays; every feature column, including the
# category-typed ones, is cast to float32 and the target to int32.
# NOTE(review): X and y are rebound in place, so this cell cannot be re-run and the
# dtype-based cells above stop working until the data is reloaded.
X=X.to_numpy(dtype=np.float32)
y=y.to_numpy(dtype=np.int32)

In [43]:
#from experiment.experiment_config import experiment_config
#scorers, models, samplers = experiment_config(X, idx_cont=None, idx_cat=None)

In [44]:
### Scorers
# Both metrics are computed on predicted probabilities (needs_proba=True);
# 'TDLift' calls perc_lift_score with percentile=0.1.
scorers = {'auc':make_scorer(roc_auc_score, needs_proba=True),
          'TDLift':make_scorer(perc_lift_score, needs_proba=True, percentile=0.1)}

### Models
# Each entry is (name, estimator, hyper-parameter grid).
models = []
models.append(('LR', LogisticRegression(solver='liblinear'), {
    "C": [10]
}))
# models.append(('RF', RandomForestClassifier(), {
#     "n_estimators":[200],
#     "max_features":["sqrt"],
#     "min_samples_leaf":[20]
# }))

### Samplers
# Each entry is (name, sampler or None, hyper-parameter grid).
# SMOTE, ADASYN, SMOTENC and GANbalancer are imported in the import cell near the top.
samplers = []

# GAN
samplers.append(('cGAN', GANbalancer(
        idx_cont=idx_cont, categorical=categorical, batch_size = 128, auxiliary=True
), {
    'generator_input'  : [40,100],
    'generator_layers' : [[40,40],[100,100]],
    'critic_layers'    : [[40],[100]],
    'n_iter'           : [100000],
    'critic_iterations': [2]
}))

# baseline: no resampling step
samplers.append(('unbalanced', None, {}))

# SMOTE (the SMOTENC variant, which is given the categorical column positions)
samplers.append(('SMOTE', SMOTENC(categorical_features=idx_cat), {
    'k_neighbors':[5,10,15,20,25]
}))

# # ADASYN
# samplers.append(('ADASYN', ADASYN(), {
#     'n_neighbors':[5,10]
# }))

In [45]:
# Runs ahead of the sampler step: continuous columns are min-max scaled,
# categorical columns are forwarded unchanged.
preproc_sampler = ColumnTransformer(
    transformers=[
        ('scaler', MinMaxScaler(), idx_cont),
        ('pass', 'passthrough', idx_cat),
    ]
)

# Runs ahead of the classifier step: continuous columns are forwarded unchanged,
# categorical columns are one-hot encoded (categories unseen during fit are ignored).
preproc_clf = ColumnTransformer(
    transformers=[
        ('pass', 'passthrough', idx_cont),
        ('ohe', OneHotEncoder(categories='auto', handle_unknown='ignore'), idx_cat),
    ]
)

In [46]:
seed = 123

# score_outer[sampler_name][model_name] holds the cross_validate() result dict of the
# nested cross-validation run for that sampler/model combination.
score_outer = {}

for sampler_name, sampler, sampler_grid in tqdm(samplers):

    # Address the sampler's hyper-parameters to the 'sampler' pipeline step.
    sampler_grid = {'sampler__' + name: values for name, values in sampler_grid.items()}

    score_inner = {}

    for model_name, model, model_grid in tqdm(models):

        # Both splitters share one seed; the outer one splits the full data set,
        # the inner one is applied by GridSearchCV to each outer training part.
        outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

        # scale -> (re)sample -> one-hot encode -> classify.
        # memory= enables the pipeline's on-disk caching in ./.cachedir.
        pipeline = Pipeline(
            steps=[
                ('preproc_sampler', preproc_sampler),
                ('sampler', sampler),
                ('preproc_clf', preproc_clf),
                ('classifier', model),
            ],
            memory='./.cachedir',
        )

        # Joint search grid over the sampler and the classifier settings.
        model_grid = {'classifier__' + name: values for name, values in model_grid.items()}
        p_grid = dict(sampler_grid)
        p_grid.update(model_grid)

        # Inner loop: grid search, refit on the candidate with the best AUC.
        # NOTE(review): the `iid` argument only exists in scikit-learn < 0.24.
        clf = GridSearchCV(
            pipeline,
            param_grid=p_grid,
            cv=inner_cv,
            scoring=scorers,
            refit='auc',
            return_train_score=True,
            iid=False,
            n_jobs=16,
            pre_dispatch=32,
            verbose=1,
        )

        # Outer loop: score the tuned search object on held-out folds and keep the
        # fitted GridSearchCV of every fold (return_estimator=True) for later analysis.
        score_inner[model_name] = cross_validate(
            clf,
            X=X,
            y=y,
            cv=outer_cv,
            scoring=scorers,
            return_train_score=True,
            return_estimator=True,
            verbose=1,
            error_score='raise',
        )

    score_outer[sampler_name] = score_inner



  0%|          | 0/3 [00:00<?, ?it/s][A[A


  0%|          | 0/1 [00:00<?, ?it/s][A[A[A

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  40 out of  40 | elapsed: 391.7min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  40 out of  40 | elapsed: 384.0min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  40 out of  40 | elapsed: 388.0min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  40 out of  40 | elapsed: 381.8min finished


Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  40 out of  40 | elapsed: 394.5min finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 2465.8min finished



100%|██████████| 1/1 [41:05:46<00:00, 147946.38s/it][A[A[A

 33%|███▎      | 1/3 [41:05:46<82:11:32, 147946.38s/it][A[A


  0%|          | 0/1 [00:00<?, ?it/s][A[A[A

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:   17.2s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:    8.3s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:    8.7s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:    3.1s finished


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done   5 out of   5 | elapsed:    1.8s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   45.2s finished



100%|██████████| 1/1 [00:45<00:00, 45.17s/it][A[A[A

 67%|██████▋   | 2/3 [41:06:31<28:46:16, 103576.02s/it][A[A


  0%|          | 0/1 [00:00<?, ?it/s][A[A[A[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=16)]: Done  20 out of  25 | elapsed:   14.3s remaining:    3.6s
[Parallel(n_jobs=16)]: Done  25 out of  25 | elapsed:   14.7s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  20 out of  25 | elapsed:   12.1s remaining:    3.0s
[Parallel(n_jobs=16)]: Done  25 out of  25 | elapsed:   12.6s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  20 out of  25 | elapsed:   13.4s remaining:    3.4s
[Parallel(n_jobs=16)]: Done  25 out of  25 | elapsed:   13.9s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  20 out of  25 | elapsed:   13.4s remaining:    3.3s
[Parallel(n_jobs=16)]: Done  25 out of  25 | elapsed:   13.8s finished


Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=16)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  20 out of  25 | elapsed:   12.2s remaining:    3.1s
[Parallel(n_jobs=16)]: Done  25 out of  25 | elapsed:   13.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished



100%|██████████| 1/1 [01:20<00:00, 80.75s/it][A[A[A

100%|██████████| 3/3 [41:07:52<00:00, 72527.44s/it]    [A[A

In [47]:
#score_outer['SMOTE']["LR"]["estimator"][1].best_estimator_.named_steps["sampler"].sampling_strategy_

In [48]:
# One summary row per (sampler, model): mean and standard deviation of the
# outer-fold test scores for both metrics ('auc' and 'TDLift').
scores = pd.DataFrame([
    {
        'sampler': s_name,
        'model': m_name,
        'auc': np.mean(cv_out["test_auc"]),
        'auc_sd': np.std(cv_out["test_auc"]),
        'lift0.1': np.mean(cv_out["test_TDLift"]),
        'lift0.1_sd': np.std(cv_out["test_TDLift"]),
    }
    for s_name, per_model in score_outer.items()
    for m_name, cv_out in per_model.items()
])

print(scores)

        auc    auc_sd   lift0.1  lift0.1_sd model     sampler
0  0.598723  0.016157  1.044356    0.008771    LR        cGAN
1  0.623048  0.005692  1.058126    0.004475    LR  unbalanced
2  0.573626  0.005416  1.034074    0.006696    LR       SMOTE


In [None]:
# Persist the summary table in the current working directory (overwrites an existing
# file; the row index is written as the first, unnamed column).
scores.to_csv("results.csv")

In [49]:
# Average the inner-CV tuning results over the outer folds.
# tuning_results[sampler_name][model_name] is a DataFrame with one row per
# hyper-parameter combination; its score columns are the mean over the fitted
# GridSearchCV objects that cross_validate returned (one per outer fold). The std_*
# columns are therefore averages of the per-search standard deviations.
# Parameter values are cast to str before grouping -- presumably so that list-valued
# settings (e.g. layer sizes) can serve as groupby keys.
# Note: `inner_cv` below is a fitted GridSearchCV, not the StratifiedKFold splitter of
# the same name used in the training loop.
tuning_results = {sampler_name:
    {model_name:
    # vstack result DataFrame for each outer fold
        pd.concat([ 
            # Inner CV tuning results as DataFrame
            pd.concat([pd.DataFrame(inner_cv.cv_results_['params']).astype(str), 
                       pd.DataFrame({
                           'mean_test_auc':inner_cv.cv_results_['mean_test_auc'],
                           'std_test_auc':inner_cv.cv_results_['std_test_auc'],
                           'mean_test_TDLift':inner_cv.cv_results_['mean_test_TDLift'],
                           'std_test_TDLift':inner_cv.cv_results_['std_test_TDLift']
                       })
                      ], sort=False, ignore_index=False, axis=1)
            # Group on the parameter names of the first candidate of the first fold
            for inner_cv in model['estimator']]).groupby(list(model['estimator'][0].cv_results_['params'][0].keys())).mean().reset_index()
            for model_name, model in sampler.items()}
          for sampler_name, sampler in score_outer.items()}


In [50]:
# Show the averaged tuning table for the cGAN sampler with logistic regression.
# The grid contains a single n_iter value, so this sort does not meaningfully reorder rows.
tuning_results["cGAN"]["LR"].sort_values(["sampler__n_iter"])

Unnamed: 0,classifier__C,sampler__critic_iterations,sampler__critic_layers,sampler__generator_input,sampler__generator_layers,sampler__n_iter,mean_test_auc,std_test_auc,mean_test_TDLift,std_test_TDLift
0,10,2,[100],100,"[100, 100]",100000,0.572371,0.02011,1.036865,0.01134
1,10,2,[100],100,"[40, 40]",100000,0.568733,0.018025,1.032092,0.01006
2,10,2,[100],40,"[100, 100]",100000,0.57056,0.013995,1.036269,0.009695
3,10,2,[100],40,"[40, 40]",100000,0.559006,0.017895,1.029476,0.011564
4,10,2,[40],100,"[100, 100]",100000,0.586622,0.016635,1.041685,0.010522
5,10,2,[40],100,"[40, 40]",100000,0.592664,0.012136,1.041733,0.009956
6,10,2,[40],40,"[100, 100]",100000,0.576302,0.014907,1.038106,0.009637
7,10,2,[40],40,"[40, 40]",100000,0.586792,0.015623,1.041869,0.009653


In [None]:
# Persist the cGAN / LR tuning table in the current working directory (overwrites an
# existing file; the row index is written as the first, unnamed column).
tuning_results["cGAN"]["LR"].sort_values(["sampler__n_iter"]).to_csv("tuning_results.csv")

In [None]:
# Long-format table of the inner-CV results: one row per
# (sampler, model, outer fold, grid candidate, parameter), carrying that candidate's
# mean/std test scores. Grouping by parameter and value afterwards gives the
# marginal score of each individual setting.
#
# cv_results_['params'] is a list with one dict per candidate, and the score arrays
# are indexed by candidate. The index `i` must therefore enumerate the candidates;
# the parameters of candidate `i` are then expanded into separate rows.
parameter_scores = pd.DataFrame([
    {
        'sampler': sampler_name, 'model': model_name,
        'parameter': param_name,
        'parameter_value': str(param_value),  # str so list-valued settings are usable as group keys
        'auc': cv.cv_results_['mean_test_auc'][i], 'auc_sd': cv.cv_results_["std_test_auc"][i],
        'lift0.1': cv.cv_results_["mean_test_TDLift"][i], 'lift0.1_sd': cv.cv_results_["std_test_TDLift"][i]
    }
    for sampler_name, sampler in score_outer.items()
    for model_name, model in sampler.items()
    for cv in model['estimator']                              # fitted GridSearchCV per outer fold
    for i, params in enumerate(cv.cv_results_['params'])      # one dict per grid candidate
    for param_name, param_value in params.items()
])

In [None]:
# Display the long-format per-parameter table built in the previous cell.
parameter_scores

In [None]:
# Average the score columns over all rows (outer folds and grid candidates) that share
# the same sampler, model, parameter name and parameter value.
parameter_scores.groupby(['sampler','model','parameter','parameter_value']).mean()

# Only sampler test