In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from imblearn.pipeline import Pipeline

from experiment.experiment_config import experiment_config

from tqdm import tqdm

In [2]:
from sklearn.datasets import make_classification
X,y = make_classification(n_samples=5000, n_features=10, weights=[0.9,0.1], n_informative=10, n_redundant=0, n_clusters_per_class=10)

# from wgan.data import load_DMC10
# X, _, y, _, idx_cont, idx_cat, cat_dict = load_DMC10("/Users/hauptjoh/Data/DMC10")

In [3]:
scorers, models, samplers = experiment_config(X, idx_cont=None, idx_cat=None)

In [4]:
# Evaluation functions
from sklearn.metrics import roc_auc_score, make_scorer
from lift.perc_lift_score import perc_lift_score

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Samplers
from imblearn.over_sampling import SMOTE, ADASYN
from wgan.imblearn import GANbalancer

idx_cont=None
idx_cat = None

### Samplers
scorers = {'auc':make_scorer(roc_auc_score, needs_proba=True),
          'TDLift':make_scorer(perc_lift_score, needs_proba=True, percentile=0.1)}

### Models
models = []
models.append(('LR', LogisticRegression(solver='liblinear'), {
    "C": [1]
}))
# models.append(('RF', RandomForestClassifier(), {
#     "n_estimators":[100],
#     "max_features":["sqrt"],
#     "min_samples_leaf":[20]
# }))

### Samplers
from imblearn.over_sampling import SMOTE, ADASYN
from wgan.imblearn import GANbalancer


samplers = []

# baseline
samplers.append(('unbalanced', None, {}))

# SMOTE
samplers.append(('SMOTE', SMOTE(), {
    'k_neighbors':[5,10,20]
}))

# ADASYN
samplers.append(('ADASYN', ADASYN(), {
    'n_neighbors':[5,10,20]
}))

# GAN
if idx_cont is None:
    idx_cont = list(range(X.shape[1]))

categorical = None
if idx_cat is not None:
    categorical = [(i,
                    np.max(X[:,i])+1,
                    int(min(15., np.ceil(np.max((X[:,i])+1)/2)))
                   )
                    for i in idx_cat]

samplers.append(('cGAN', GANbalancer(
        idx_cont=idx_cont, categorical=categorical,
        generator_input=X.shape[1]
), {
    'generator_layers' : [[20,20]],
    'critic_layers'    : [[20,20]],
    'n_iter'           : [1000,2000,6000,10000,15000]
}))

In [5]:
seed = 123

score_outer = {}

for sampler_name, sampler, sampler_grid in tqdm(samplers):
    
    sampler_grid = {'sampler__'+key:item for key, item in sampler_grid.items()}
    
    score_inner = {}

    for model_name, model, model_grid in tqdm(models):

        pipeline = Pipeline(memory='./.cachedir', steps=[
            ('scaler', MinMaxScaler()),
            ('sampler', sampler),
            ('classifier', model)
          ])
        model_grid = {'classifier__'+key:item for key, item in model_grid.items()}
        p_grid = {**sampler_grid, **model_grid}
        
        inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        outer_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=seed)
            
        clf = GridSearchCV(pipeline, param_grid= p_grid, cv=inner_cv, scoring=scorers, refit='auc', 
                           return_train_score=True, iid=False, n_jobs=3)

        score_inner[model_name] = cross_validate(clf, X=X,y=y,cv=outer_cv , scoring=scorers, return_train_score=True,
                                    return_estimator=True)
    score_outer[sampler_name] = score_inner

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:00<00:00,  2.62it/s][A
 25%|██▌       | 1/4 [00:00<00:01,  2.55it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:02<00:00,  2.05s/it][A
 50%|█████     | 2/4 [00:02<00:01,  1.11it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [00:02<00:00,  2.79s/it][A
 75%|███████▌  | 3/4 [00:05<00:01,  1.50s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
100%|██████████| 1/1 [1:26:41<00:00, 5201.76s/it][A
100%|██████████| 4/4 [1:26:47<00:00, 1561.59s/it]


In [6]:
scores = pd.DataFrame([{
  'sampler':sampler_name, 'model':model_name, 
    'auc':np.mean(model["test_auc"]),  'auc_sd':np.std(model["test_auc"]),
    'lift0.1':np.mean(model["test_TDLift"]),  'lift0.1_sd':np.std(model["test_TDLift"]),
} for sampler_name, sampler in score_outer.items()
    for model_name, model in sampler.items()]
)

print(scores)

        auc    auc_sd   lift0.1  lift0.1_sd model     sampler
0  0.661829  0.016844  1.070364    0.006128    LR  unbalanced
1  0.670366  0.014031  1.072507    0.012529    LR       SMOTE
2  0.671946  0.014659  1.070371    0.010393    LR      ADASYN
3  0.597118  0.033858  1.025555    0.013139    LR        cGAN


In [45]:
(

In [47]:
temp

{'mean_fit_time': array([0.0542058 , 0.04649496, 0.05619779]),
 'std_fit_time': array([0.00973518, 0.00063056, 0.00939692]),
 'mean_score_time': array([0.00349998, 0.00264945, 0.00263748]),
 'std_score_time': array([0.00162883, 0.00036672, 0.00025601]),
 'param_classifier__C': masked_array(data=[1, 1, 1],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'param_sampler__k_neighbors': masked_array(data=[5, 10, 20],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'classifier__C': 1, 'sampler__k_neighbors': 5},
  {'classifier__C': 1, 'sampler__k_neighbors': 10},
  {'classifier__C': 1, 'sampler__k_neighbors': 20}],
 'split0_test_auc': array([0.6978022 , 0.70535714, 0.69767342]),
 'split1_test_auc': array([0.68187672, 0.67582418, 0.67955872]),
 'split2_test_auc': array([0.66728194, 0.66702438, 0.66848386]),
 'split3_test_auc': array([0.66084306, 0.66642342, 0.66848386]),
 'split4_test_auc

In [46]:
temp['mean_test_auc']

array([0.67911026, 0.68487213, 0.68077664])

In [41]:
[value for key,value in temp.items() if "test_auc" in key]

[array([0.6978022 , 0.70535714, 0.69767342]),
 array([0.68187672, 0.67582418, 0.67955872]),
 array([0.66728194, 0.66702438, 0.66848386]),
 array([0.66084306, 0.66642342, 0.66848386]),
 array([0.68774738, 0.70973154, 0.68968336]),
 array([0.67911026, 0.68487213, 0.68077664]),
 array([0.01345542, 0.01885942, 0.01156321]),
 array([3, 1, 2], dtype=int32)]

In [178]:
model = score_outer['unbalanced']["LR"]

In [144]:
score_outer['SMOTE']["LR"]['estimator'][0].cv_results_['params'][0].keys()

dict_keys(['classifier__C', 'sampler__k_neighbors'])

In [145]:
temp = pd.DataFrame({"Model":["LR"]})

In [148]:
temp.groupby(['Model'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x1a28d1b6a0>

In [133]:
 score_outer['SMOTE']

{'LR': {'fit_time': array([1.00116801, 1.01145983]),
  'score_time': array([0.00440621, 0.00404525]),
  'estimator': (GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
          error_score='raise-deprecating',
          estimator=Pipeline(memory='./.cachedir',
        steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('sampler', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
      out_step='deprecated', random_state=None, ratio=None,
      sampling_strategy='auto', svm_estimator='deprecated')), ('classifier', LogisticRegress...ty='l2', random_state=None, solver='liblinear',
             tol=0.0001, verbose=0, warm_start=False))]),
          fit_params=None, iid=False, n_jobs=3,
          param_grid={'sampler__k_neighbors': [5, 10, 20], 'classifier__C': [1]},
          pre_dispatch='2*n_jobs', refit='auc', return_train_score=True,
          scoring={'auc': make_scorer(roc_auc_score, needs_proba=True), 'TDLift': mak

In [187]:
temp  = pd.concat([ 
# Inner CV tuning results as DataFrame
pd.concat([pd.DataFrame(inner_cv.cv_results_['params'], dtype='str'), 
           pd.DataFrame({
               'mean_test_auc':inner_cv.cv_results_['mean_test_auc'],
               'std_test_auc':inner_cv.cv_results_['std_test_auc'],
               'mean_test_TDLift':inner_cv.cv_results_['mean_test_TDLift'],
               'std_test_TDLift':inner_cv.cv_results_['std_test_TDLift']
           })
          ], sort=False, ignore_index=False, axis=1)
for inner_cv in model['estimator']]).groupby(list(model['estimator'][0].cv_results_['params'][0].keys())).mean()

In [188]:
list(model['estimator'][0].cv_results_['params'][0].keys())

['classifier__C']

In [189]:
temp.so#.groupby(['classifier__C','sampler__k_neighbors'])

Unnamed: 0_level_0,mean_test_auc,std_test_auc,mean_test_TDLift,std_test_TDLift
classifier__C,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.630473,0.027994,1.057617,0.031181


In [195]:
str([10])

'[10]'

In [203]:
tuning_results = {sampler_name:
    {model_name:
    # vstack result DataFrame for each outer fold
        pd.concat([ 
            # Inner CV tuning results as DataFrame
            pd.concat([pd.DataFrame(inner_cv.cv_results_['params']).astype(str), 
                       pd.DataFrame({
                           'mean_test_auc':inner_cv.cv_results_['mean_test_auc'],
                           'std_test_auc':inner_cv.cv_results_['std_test_auc'],
                           'mean_test_TDLift':inner_cv.cv_results_['mean_test_TDLift'],
                           'std_test_TDLift':inner_cv.cv_results_['std_test_TDLift']
                       })
                      ], sort=False, ignore_index=False, axis=1)
            for inner_cv in model['estimator']]).groupby(list(model['estimator'][0].cv_results_['params'][0].keys())).mean().reset_index()
            for model_name, model in sampler.items()}
          for sampler_name, sampler in score_outer.items()}


In [210]:
tuning_results["cGAN"]["LR"].sort_values(["sampler__n_iter"])

Unnamed: 0,classifier__C,sampler__critic_layers,sampler__generator_layers,sampler__n_iter,mean_test_auc,std_test_auc,mean_test_TDLift,std_test_TDLift
0,1,[20],[20],1000,0.527235,0.082427,1.019196,0.037761
1,1,[20],[20],10000,0.578513,0.050765,1.030048,0.04352
2,1,[20],[20],15000,0.594508,0.064937,1.038586,0.043547
3,1,[20],[20],2000,0.511375,0.035972,0.991504,0.061659
4,1,[20],[20],6000,0.499183,0.084654,0.974537,0.052733


In [152]:
{sampler_name:
    {model_name:
    # vstack result DataFrame for each outer fold
        [ 
            # Inner CV tuning results as DataFrame
            print(list(inner_cv.cv_results_['params'][0].keys()))
            for inner_cv in model['estimator']]
            for model_name, model in sampler.items()}
          for sampler_name, sampler in score_outer.items()}

['classifier__C']
['classifier__C']
['classifier__C', 'sampler__k_neighbors']
['classifier__C', 'sampler__k_neighbors']
['classifier__C', 'sampler__n_neighbors']
['classifier__C', 'sampler__n_neighbors']
['classifier__C', 'sampler__critic_layers', 'sampler__generator_layers', 'sampler__n_iter']
['classifier__C', 'sampler__critic_layers', 'sampler__generator_layers', 'sampler__n_iter']


{'unbalanced': {'LR': [None, None]},
 'SMOTE': {'LR': [None, None]},
 'ADASYN': {'LR': [None, None]},
 'cGAN': {'LR': [None, None]}}

In [94]:
parameter_scores =  pd.DataFrame([{
  'sampler':sampler_name, 'model':model_name,
    'parameter':param_name,
    'parameter_value':str(param_value),
    'auc':cv.cv_results_['mean_test_auc'][i],  'auc_sd':cv.cv_results_["std_test_auc"][i],
    'lift0.1':cv.cv_results_["mean_test_TDLift"][i],  'lift0.1_sd':cv.cv_results_["std_test_TDLift"][i]
}   for sampler_name, sampler in score_outer.items()
    for model_name, model in sampler.items()
    for cv in model['estimator']
    for i, (param_name, param_value) in enumerate(cv.cv_results_['params'][0].items())
]
)

In [95]:
parameter_scores

Unnamed: 0,auc,auc_sd,lift0.1,lift0.1_sd,model,parameter,parameter_value,sampler
0,0.666012,0.014444,1.072222,0.02104,LR,classifier__C,1,unbalanced
1,0.594935,0.041543,1.043012,0.041322,LR,classifier__C,1,unbalanced
2,0.67911,0.013455,1.085034,0.015897,LR,classifier__C,1,SMOTE
3,0.684872,0.018859,1.085034,0.015897,LR,sampler__k_neighbors,5,SMOTE
4,0.627644,0.050939,1.043012,0.024734,LR,classifier__C,1,SMOTE
5,0.627536,0.054787,1.047286,0.026814,LR,sampler__k_neighbors,5,SMOTE
6,0.680049,0.01536,1.089308,0.019132,LR,classifier__C,1,ADASYN
7,0.682259,0.016651,1.080761,0.021637,LR,sampler__n_neighbors,5,ADASYN
8,0.62659,0.05393,1.047286,0.030027,LR,classifier__C,1,ADASYN
9,0.629165,0.050968,1.043012,0.031257,LR,sampler__n_neighbors,5,ADASYN


In [96]:
parameter_scores.groupby(['sampler','model','parameter','parameter_value']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,auc,auc_sd,lift0.1,lift0.1_sd
sampler,model,parameter,parameter_value,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ADASYN,LR,classifier__C,1,0.65332,0.034645,1.068297,0.024579
ADASYN,LR,sampler__n_neighbors,5,0.655712,0.033809,1.061886,0.026447
SMOTE,LR,classifier__C,1,0.653377,0.032197,1.064023,0.020316
SMOTE,LR,sampler__k_neighbors,5,0.656204,0.036823,1.06616,0.021355
cGAN,LR,classifier__C,1,0.527235,0.082427,1.019196,0.037761
cGAN,LR,sampler__critic_layers,[20],0.511375,0.035972,0.991504,0.061659
cGAN,LR,sampler__generator_layers,[20],0.499183,0.084654,0.974537,0.052733
cGAN,LR,sampler__n_iter,1000,0.578513,0.050765,1.030048,0.04352
unbalanced,LR,classifier__C,1,0.630473,0.027994,1.057617,0.031181


In [60]:
for i in score_outer['SMOTE']["LR"]['estimator']:
    print(i)

GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory='./.cachedir',
     steps=[('scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('sampler', SMOTE(k_neighbors=5, kind='deprecated', m_neighbors='deprecated', n_jobs=1,
   out_step='deprecated', random_state=None, ratio=None,
   sampling_strategy='auto', svm_estimator='deprecated')), ('classifier', LogisticRegress...ty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid=False, n_jobs=3,
       param_grid={'sampler__k_neighbors': [5, 10, 20], 'classifier__C': [1]},
       pre_dispatch='2*n_jobs', refit='auc', return_train_score=True,
       scoring={'auc': make_scorer(roc_auc_score, needs_proba=True), 'TDLift': make_scorer(perc_lift_score, needs_proba=True, percentile=0.1)},
       verbose=0)
GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=123, shuffl