In [1]:
import sys
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from imblearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from tqdm import tqdm

# Evaluation functions
from sklearn.metrics import roc_auc_score, make_scorer
from lift.perc_lift_score import perc_lift_score

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

sys.path.append("/Users/hauptjoh/Projects/utils/imbalanced-learn/")
#sys.path.append("/home/RDC/hauptjoh.hub/GANbalanced")
# Samplers
from imblearn.over_sampling import SMOTENC
from imblearnNC.under_sampling import TomekLinksNC

from wgan.imblearn import GANbalancer
import data_loader

Using TensorFlow backend.


In [2]:
# Number of parallel workers passed to GridSearchCV further down.
n_jobs=3

# pathlib never expands "~" on its own. Resolve the home directory here so the
# loader receives a real filesystem path instead of relying on whatever
# load_coil00 (not visible in this notebook) does with a literal "~".
data_path = Path("~/Data/COIL00").expanduser()
X, y = data_loader.load_coil00(data_path)

In [3]:
# Column positions of the continuous and the categorical features.
# Replace either None with an explicit list of positions to bypass the
# dtype-based detection below.
idx_cont = None
idx_cat = None

# Categorical features: every column whose pandas dtype is 'category'
if idx_cat is None:
    idx_cat = np.where(X.dtypes == 'category')[0].tolist()

# Continuous features: all remaining column positions
if idx_cont is None:
    idx_cont = [pos for pos in range(X.shape[1]) if pos not in idx_cat]

In [4]:
# Initialize embedding tuples, one per categorical column:
#   (column position, number of categories, size)
# where size is half the number of categories, rounded up and capped at 15
# (presumably the embedding dimension used by GANbalancer -- confirm there).
categorical = None
if idx_cat is not None:
    categorical = []
    for i in idx_cat:
        n_levels = len(X.iloc[:, i].cat.categories)
        categorical.append((i, n_levels, int(min(15., np.ceil(0.5 * n_levels)))))

# The columns must be laid out as [continuous..., categorical...]: no
# continuous column may sit to the right of the first categorical column.
# The check is skipped when there are no categorical columns at all, because
# min() of an empty list would raise an unrelated ValueError.
if idx_cat:
    first_cat = min(idx_cat)
    if any(idx > first_cat for idx in idx_cont):
        raise ValueError("Variables need to be ordered [cont, cat]")

In [5]:
# Convert features and target to plain numpy arrays (float32 / int32).
# The hasattr guard keeps this cell re-runnable: after its first execution
# X and y are already ndarrays, which have no .to_numpy() method.
X = X.to_numpy(dtype=np.float32) if hasattr(X, "to_numpy") else np.asarray(X, dtype=np.float32)
y = y.to_numpy(dtype=np.int32) if hasattr(y, "to_numpy") else np.asarray(y, dtype=np.int32)

In [6]:
### Scorers
# Both metrics are evaluated on predicted probabilities (needs_proba=True).
# The extra keyword percentile=0.1 is given to make_scorer alongside
# perc_lift_score for the 'TDLift' metric.
scorers = dict(
    auc=make_scorer(roc_auc_score, needs_proba=True),
    TDLift=make_scorer(perc_lift_score, needs_proba=True, percentile=0.1),
)

# Experiment configuration: one hyper-parameter grid per model and per sampler.
# Every value is a list of candidate settings for the grid search; an empty
# dict means nothing is tuned for that entry.
config = {
    "models": {
        "LR": {"C": [10]},
    },
    "samplers": {
        "cGAN": {
            "generator_input": [20],
            "generator_layers": [[20]],
            "critic_layers": [[20]],
            "n_iter": [100],
            "critic_iterations": [2],
        },
        "unbalanced": {},
        "SMOTE": {"k_neighbors": [3]},
    },
}


### Models
# Registry of unfitted classifiers, keyed by the names used in config["models"]
model_fun = {
    "LR": LogisticRegression(solver='liblinear'),
    "RF": RandomForestClassifier(min_samples_leaf=20),
}

# One (name, estimator, parameter grid) triple per configured model
models = [
    (model_name, model_fun[model_name], model_params)
    for model_name, model_params in config["models"].items()
]

### Samplers
# Registry of resampling strategies, keyed by the names used in
# config["samplers"]. "unbalanced" maps to None, which is what ends up in the
# pipeline's sampler slot for that baseline.
sampler_fun = {
    "cGAN": GANbalancer(idx_cont=idx_cont, categorical=categorical,
                        batch_size=128, auxiliary=True),
    "unbalanced": None,
    "SMOTE": SMOTENC(categorical_features=idx_cat),
}

# Cleaner: placed after the sampler step in every pipeline built below
cleaner = TomekLinksNC(categorical_features=idx_cat, sampling_strategy='auto')

# One (name, sampler, parameter grid) triple per configured sampler
samplers = [
    (sampler_name, sampler_fun[sampler_name], sampler_params)
    for sampler_name, sampler_params in config["samplers"].items()
]


### Pipeline construction

# Applied before resampling: min-max scale the continuous columns and pass the
# categorical columns through unchanged.
preproc_sampler = ColumnTransformer(transformers=[
    ('scaler', MinMaxScaler(), idx_cont),
    ('pass', 'passthrough', idx_cat),
])

# Applied before the classifier: pass the continuous columns through and
# one-hot encode the categorical columns; categories unseen during fit are
# ignored instead of raising.
preproc_clf = ColumnTransformer(transformers=[
    ('pass', 'passthrough', idx_cont),
    ('ohe', OneHotEncoder(categories='auto', handle_unknown='ignore'), idx_cat),
])


seed = 123

# Nested cross-validation results: score_outer[sampler_name][model_name] is the
# dict returned by cross_validate for that sampler/model combination.
score_outer = {}

# The grid search splits with inner_cv, cross_validate with outer_cv.
# Both splitters are configured identically (2 shuffled folds, same seed).
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)
outer_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)

for sampler_name, sampler, sampler_grid in tqdm(samplers):

    # Prefix every grid key with the pipeline step name it belongs to
    sampler_grid = {'sampler__' + param: values for param, values in sampler_grid.items()}

    score_inner = {}

    for model_name, model, model_grid in tqdm(models):

        model_grid = {'classifier__' + param: values for param, values in model_grid.items()}

        # Joint search space over sampler and classifier parameters
        p_grid = dict(sampler_grid)
        p_grid.update(model_grid)

        pipeline = Pipeline(steps=[
            ('preproc_sampler', preproc_sampler),
            ('sampler', sampler),
            ('Cleaning', cleaner),
            ('preproc_clf', preproc_clf),
            ('classifier', model),
        ])

        # Inner loop: tune on the inner folds and refit the best candidate by AUC.
        # NOTE(review): the `iid` argument was removed from GridSearchCV in
        # scikit-learn 0.24 -- drop it when upgrading the environment.
        clf = GridSearchCV(
            estimator=pipeline,
            param_grid=p_grid,
            cv=inner_cv,
            scoring=scorers,
            refit='auc',
            return_train_score=True,
            iid=False,
            n_jobs=n_jobs,
            pre_dispatch=n_jobs*2,
            verbose=1,
        )

        # Outer loop: evaluate the tuned search on the outer folds and keep the
        # fitted search objects so the tuning results can be inspected afterwards.
        score_inner[model_name] = cross_validate(
            clf,
            X=X,
            y=y,
            cv=outer_cv,
            scoring=scorers,
            return_train_score=True,
            return_estimator=True,
            verbose=1,
            error_score='raise',
        )

    score_outer[sampler_name] = score_inner


### Collect results
# scores and tuning_results used to be built twice in a row with identical
# code; they are now computed once through the two helpers below.


def _outer_score_table(nested_scores):
    """Summarise the outer-fold test metrics of every sampler/model pair.

    Parameters
    ----------
    nested_scores : dict
        Mapping sampler name -> model name -> dict returned by
        cross_validate, which must hold the "test_auc" and "test_TDLift"
        arrays (one value per outer fold).

    Returns
    -------
    pandas.DataFrame
        One row per sampler/model pair with the columns 'sampler', 'model',
        'auc', 'auc_sd', 'lift0.1' and 'lift0.1_sd'. The *_sd columns are
        np.std over the outer folds.
    """
    records = []
    for sampler_name, per_model in nested_scores.items():
        for model_name, cv_result in per_model.items():
            records.append({
                'sampler': sampler_name,
                'model': model_name,
                'auc': np.mean(cv_result["test_auc"]),
                'auc_sd': np.std(cv_result["test_auc"]),
                'lift0.1': np.mean(cv_result["test_TDLift"]),
                'lift0.1_sd': np.std(cv_result["test_TDLift"]),
            })
    return pd.DataFrame(records)


def _inner_tuning_table(cv_result):
    """Average the inner-CV tuning results over all outer folds.

    Parameters
    ----------
    cv_result : dict
        Result of cross_validate(..., return_estimator=True) whose
        'estimator' entry holds one fitted GridSearchCV per outer fold.

    Returns
    -------
    pandas.DataFrame
        One row per parameter combination (parameter values rendered as
        strings) with the mean over outer folds of 'mean_test_auc',
        'std_test_auc', 'mean_test_TDLift' and 'std_test_TDLift'. When the
        grid tuned no parameter at all, a single row with the column means
        is returned.
    """
    metric_cols = ('mean_test_auc', 'std_test_auc',
                   'mean_test_TDLift', 'std_test_TDLift')

    # One frame per outer fold: parameter columns next to the metric columns
    per_fold = []
    for search in cv_result['estimator']:
        params = pd.DataFrame(search.cv_results_['params']).astype(str)
        metrics = pd.DataFrame({col: search.cv_results_[col] for col in metric_cols})
        per_fold.append(pd.concat([params, metrics], sort=False, ignore_index=False, axis=1))

    # vstack the per-fold frames
    stacked = pd.concat(per_fold)

    param_cols = list(cv_result['estimator'][0].cv_results_['params'][0].keys())
    if not param_cols:
        # An empty parameter grid leaves nothing to group by, and
        # groupby([]) would raise; fall back to the plain column means.
        return stacked.mean().to_frame().T
    return stacked.groupby(param_cols).mean().reset_index()


scores = _outer_score_table(score_outer)

tuning_results = {
    sampler_name: {
        model_name: _inner_tuning_table(cv_result)
        for model_name, cv_result in per_model.items()
    }
    for sampler_name, per_model in score_outer.items()
}


  0%|          | 0/3 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=3)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=3)]: Done   2 out of   2 | elapsed:   28.4s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=3)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 out of   2 | elapsed:   27.7s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.6min finished

100%|██████████| 1/1 [01:35<00:00, 95.82s/it][A
 33%|███▎      | 1/3 [01:35<03:11, 95.83s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=3)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=3)]: Done   2 out of   2 | elapsed:    1.4s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=3)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 out of   2 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.6s finished

100%|██████████| 1/1 [00:06<00:00,  6.62s/it][A
 67%|██████▋   | 2/3 [01:42<01:09, 69.07s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=3)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=3)]: Done   2 out of   2 | elapsed:   13.4s finished


Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=3)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 out of   2 | elapsed:   15.7s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.1min finished

100%|██████████| 1/1 [01:05<00:00, 65.57s/it][A
100%|██████████| 3/3 [02:48<00:00, 68.02s/it]
