Licensed under the MIT License.

Copyright (c) 2021-2031. All rights reserved.

# Stacking with Vecstack

* Stacking optimized ensembling models, train on synthetic data and evaluate on real data.
* Synthetic Data
  * CTGAN synthetic data got most similar feature distributions as real data
  * CastGAN synthetic data got best baseline model performance

In [1]:
import pandas as pd
import numpy as np
import timeit
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from category_encoders import TargetEncoder

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from vecstack import stacking

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

## Load Data

In [2]:
target = 'deposit'
num_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

raw_df = pd.read_pickle('../../crystal_ball/data_collector/structured_data/campaign_deposit.pkl')
X_train, X_test, y_train, y_test = train_test_split(raw_df.drop(target, axis=1), raw_df[target], 
                                                    test_size=0.25, random_state=10,
                                                    stratify=raw_df[target])
X_train.reset_index(drop=True, inplace=True) 
X_test.reset_index(drop=True, inplace=True) 
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

ctgan_syn_X_train = pd.read_csv('ctgan_syn_X_train_10.csv')
castgan_syn_X_train = pd.read_csv('CasTGAN/Generated_Data/wb_0/campaign_fake_20230901-2310.csv')

print(ctgan_syn_X_train.shape, castgan_syn_X_train.shape)

(8371, 16) (2791, 16) (8371,) (2791,)
(8371, 16) (8371, 16)


In [3]:
# encode categorical features since different ensembling models handle cat features in different way
def encode_data(cat_cols, X_train, y_train, X_test=None):
    enc = TargetEncoder(cols=cat_cols)
    encoded_X_train = enc.fit_transform(X_train, y_train)
    
    if X_test is not None:
        encoded_X_test = enc.transform(X_test)
        return encoded_X_train, encoded_X_test
    else:
        return encoded_X_train


encoded_X_train, encoded_X_test = encode_data(cat_cols, X_train, y_train, X_test)
encoded_ctgan_syn_X_train = encode_data(cat_cols, ctgan_syn_X_train, y_train)
encoded_castgan_syn_X_train = encode_data(cat_cols, castgan_syn_X_train, y_train)

print(encoded_X_train.shape, encoded_X_test.shape)
print(encoded_ctgan_syn_X_train.shape, encoded_castgan_syn_X_train.shape)

(8371, 16) (2791, 16)
(8371, 16) (8371, 16)


## Stacking for CTGAN Synthetic Data

* Stacking architecture: https://developer.ibm.com/articles/stack-machine-learning-models-get-better-results/
* About `vecstack`: https://github.com/vecxoz/vecstack
  * 2 levels stacking concept (for 'A' or 'B' mode): https://github.com/vecxoz/vecstack/blob/master/examples/00_stacking_concept_pictures_code.ipynb
    * decided by param `mode`, 'oof_pred' (alias 'B'), 'oof_pred_bag' (alias 'A')
* `stacking()` params: https://github.com/vecxoz/vecstack/blob/master/vecstack/core.py#L127-L132
  * NOTE: if set `needs_proba=True` for this classification will get error as its generated y has 2 dimensions rather than required 1 dimension, might be the bug in the package.

In [4]:
lgb_params = {'num_leaves': 4, 'learning_rate': 0.17937097905882862, 
              'bagging_fraction': 0.8506740422636537, 'feature_fraction': 0.6643670461510903,
              'bagging_freq': 18, 'min_data_in_leaf': 1133, 'num_iterations': 2956, 
              'objective': 'binary', 'metric': 'auc', 'random_state': 10, 
              'verbosity': -1, 'boosting': 'gbdt', 'num_threads': 4}
lgb_model = LGBMClassifier(**lgb_params)

xgb_params = {'booster': 'gbtree', 'nthread': 4, 'seed': 10, 'eval_metric': 'auc',
              'objective': 'binary:hinge', 'max_depth': 8, 
              'min_child_weight': 3,'subsample': 0.8, 
              'enable_categorical': True,'learning_rate': 0.17937097905882862}
xgb_model = XGBClassifier(**xgb_params)

catboost_params = {'max_depth': 7, 'learning_rate': 0.03767190865887794, 
                   'min_data_in_leaf': 1295, 'subsample': 0.9480606825100655, 
                   'grow_policy': 'SymmetricTree', 'loss_function': 'Logloss',
                   'random_seed': 10}
catboost_model = CatBoostClassifier(**catboost_params)


models = [lgb_model, xgb_model, catboost_model]

S_train, S_test = stacking(models, encoded_ctgan_syn_X_train, y_train.values, encoded_X_test, 
                           regression=False, transform_target=None, transform_pred=None,
                           mode='oof_pred', needs_proba=False, save_dir=None,
                           metric=average_precision_score, n_folds=3, stratified=True,
                           shuffle=True, random_state=10, verbose=1)

task:         [classification]
n_classes:    [2]
metric:       [average_precision_score]
mode:         [oof_pred]
n_models:     [3]

model  0:     [LGBMClassifier]
    ----
    MEAN:     [0.47295435] + [0.00135097]
    FULL:     [0.47294637]

    Fitting on full train set...

model  1:     [XGBClassifier]
    ----
    MEAN:     [0.47366760] + [0.00146399]
    FULL:     [0.47365826]

    Fitting on full train set...

model  2:     [CatBoostClassifier]
0:	learn: 0.6923890	total: 153ms	remaining: 2m 32s
1:	learn: 0.6918644	total: 173ms	remaining: 1m 26s
2:	learn: 0.6911267	total: 193ms	remaining: 1m 4s
3:	learn: 0.6905561	total: 212ms	remaining: 52.9s
4:	learn: 0.6899321	total: 232ms	remaining: 46.2s
5:	learn: 0.6894763	total: 252ms	remaining: 41.7s
6:	learn: 0.6888271	total: 272ms	remaining: 38.6s
7:	learn: 0.6882476	total: 292ms	remaining: 36.2s
8:	learn: 0.6875689	total: 312ms	remaining: 34.3s
9:	learn: 0.6870956	total: 332ms	remaining: 32.9s
10:	learn: 0.6862126	total: 352ms	remaining

In [5]:
# They're the features for next level's modeling
display(S_train[0:5])
print()
display(S_test[0:5])

array([[0, 1, 1],
       [0, 1, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 1, 0]])




array([[1, 1, 1],
       [0, 1, 1],
       [1, 1, 1],
       [1, 1, 0],
       [0, 1, 0]])

### 2nd Level Learning without Original Features

In [6]:
# initialize 2nd level model
catboost_params = {'max_depth': 7, 'learning_rate': 0.03767190865887794, 'min_data_in_leaf': 1295,
              'subsample': 0.9480606825100655, 'grow_policy': 'SymmetricTree',
              'loss_function': 'Logloss', 'random_seed': 10}
model = CatBoostClassifier(**catboost_params)
    
# fit 2nd level model
model = model.fit(S_train, y_train, verbose=False)

# predict
y_pred = model.predict(S_test)

# evaluation
auc = roc_auc_score(y_test, y_pred)
avp = average_precision_score(y_test, y_pred)
print(f'Testing AUC is {auc}, Testing Average Precision is {avp}')

Testing AUC is 0.5, Testing Average Precision is 0.4736653529201003


### 2nd Level Learning with Original Features

In [14]:
copy_encoded_ctgan_syn_X_train = encoded_ctgan_syn_X_train.copy()
copy_encoded_X_test = encoded_X_test.copy()

copy_encoded_ctgan_syn_X_train['stacking1'] = S_train[:, 0]
copy_encoded_ctgan_syn_X_train['stacking2'] = S_train[:, 1]
copy_encoded_ctgan_syn_X_train['stacking3'] = S_train[:, 2]
display(copy_encoded_ctgan_syn_X_train.head())

copy_encoded_X_test['stacking1'] = S_test[:, 0]
copy_encoded_X_test['stacking2'] = S_test[:, 1]
copy_encoded_X_test['stacking3'] = S_test[:, 2]
display(copy_encoded_X_test.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,stacking1,stacking2,stacking3
0,36,0.460613,0.480469,0.468323,0.472767,6203,0.472185,0.471817,0.473386,30,0.481633,187,1,-1,0,0.475026,1,1,1
1,33,0.483254,0.472555,0.468323,0.472767,1633,0.472185,0.471817,0.473386,13,0.440613,775,2,-1,0,0.475026,0,1,0
2,40,0.467153,0.480469,0.468323,0.472767,-11,0.47585,0.471817,0.473386,27,0.451264,295,4,-1,0,0.475026,0,0,1
3,35,0.467153,0.480469,0.474845,0.472767,102,0.472185,0.471817,0.473386,5,0.466373,43,2,-1,26,0.499179,1,1,1
4,44,0.49529,0.472555,0.47476,0.472767,0,0.472185,0.471817,0.473386,4,0.467857,1405,1,-1,0,0.47644,0,1,1


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,stacking1,stacking2,stacking3
0,33,0.465465,0.437133,0.447517,0.476878,7,0.368434,0.496279,0.542474,17,0.623066,612,1,148,1,0.497849,1,1,0
1,59,0.672474,0.437133,0.538296,0.476878,422,0.368434,0.496279,0.542474,14,0.329577,647,4,-1,0,0.408654,0,1,1
2,25,0.766423,0.538991,0.507895,0.476878,3,0.569444,0.496279,0.542474,26,0.329577,170,1,-1,0,0.408654,0,1,1
3,51,0.36587,0.437133,0.398931,0.476878,4497,0.368434,0.496279,0.229415,15,0.329577,276,3,-1,0,0.408654,0,0,0
4,49,0.465465,0.538991,0.447517,0.476878,293,0.569444,0.496279,0.542474,28,0.442731,452,1,-1,0,0.408654,1,1,1


In [15]:
# initialize 2nd level model
catboost_params = {'max_depth': 7, 'learning_rate': 0.03767190865887794, 'min_data_in_leaf': 1295,
              'subsample': 0.9480606825100655, 'grow_policy': 'SymmetricTree',
              'loss_function': 'Logloss', 'random_seed': 10}
model = CatBoostClassifier(**catboost_params)
    
# fit 2nd level model
model = model.fit(copy_encoded_ctgan_syn_X_train, y_train, verbose=False)

# predict
y_pred = model.predict(copy_encoded_X_test)

# evaluation
auc = roc_auc_score(y_test, y_pred)
avp = average_precision_score(y_test, y_pred)
print(f'Testing AUC is {auc}, Testing Average Precision is {avp}')

Testing AUC is 0.5514894815599032, Testing Average Precision is 0.5025697580538885


## Stacking for CastGAN Synthetic Data

In [16]:
lgb_params = {'num_leaves': 4, 'learning_rate': 0.17937097905882862, 
              'bagging_fraction': 0.8506740422636537, 'feature_fraction': 0.6643670461510903,
              'bagging_freq': 18, 'min_data_in_leaf': 1133, 'num_iterations': 2956, 
              'objective': 'binary', 'metric': 'auc', 'random_state': 10, 
              'verbosity': -1, 'boosting': 'gbdt', 'num_threads': 4}
lgb_model = LGBMClassifier(**lgb_params)

xgb_params = {'booster': 'gbtree', 'nthread': 4, 'seed': 10, 'eval_metric': 'auc',
              'objective': 'binary:hinge', 'max_depth': 8, 
              'min_child_weight': 3,'subsample': 0.8, 
              'enable_categorical': True,'learning_rate': 0.17937097905882862}
xgb_model = XGBClassifier(**xgb_params)

catboost_params = {'max_depth': 7, 'learning_rate': 0.03767190865887794, 
                   'min_data_in_leaf': 1295, 'subsample': 0.9480606825100655, 
                   'grow_policy': 'SymmetricTree', 'loss_function': 'Logloss',
                   'random_seed': 10}
catboost_model = CatBoostClassifier(**catboost_params)


models = [lgb_model, xgb_model, catboost_model]

S_train, S_test = stacking(models, encoded_castgan_syn_X_train, y_train.values, encoded_X_test, 
                           regression=False, transform_target=None, transform_pred=None,
                           mode='oof_pred', needs_proba=False, save_dir=None,
                           metric=average_precision_score, n_folds=3, stratified=True,
                           shuffle=True, random_state=10, verbose=1)

task:         [classification]
n_classes:    [2]
metric:       [average_precision_score]
mode:         [oof_pred]
n_models:     [3]

model  0:     [LGBMClassifier]
    ----
    MEAN:     [0.47512426] + [0.00708425]
    FULL:     [0.47486766]

    Fitting on full train set...

model  1:     [XGBClassifier]
    ----
    MEAN:     [0.47583376] + [0.00487737]
    FULL:     [0.47573053]

    Fitting on full train set...

model  2:     [CatBoostClassifier]
0:	learn: 0.6925542	total: 7.76ms	remaining: 7.75s
1:	learn: 0.6920159	total: 15ms	remaining: 7.49s
2:	learn: 0.6914636	total: 22.5ms	remaining: 7.48s
3:	learn: 0.6909081	total: 30.1ms	remaining: 7.5s
4:	learn: 0.6904045	total: 36.8ms	remaining: 7.32s
5:	learn: 0.6898007	total: 43.5ms	remaining: 7.21s
6:	learn: 0.6890965	total: 51.4ms	remaining: 7.29s
7:	learn: 0.6885187	total: 58.7ms	remaining: 7.28s
8:	learn: 0.6880463	total: 65.2ms	remaining: 7.18s
9:	learn: 0.6875188	total: 72.1ms	remaining: 7.14s
10:	learn: 0.6866585	total: 79.1ms	rem

### 2nd Level Learning without Original Features

In [10]:
# initialize 2nd level model
catboost_params = {'max_depth': 7, 'learning_rate': 0.03767190865887794, 'min_data_in_leaf': 1295,
              'subsample': 0.9480606825100655, 'grow_policy': 'SymmetricTree',
              'loss_function': 'Logloss', 'random_seed': 10}
model = CatBoostClassifier(**catboost_params)
    
# fit 2nd level model
model = model.fit(S_train, y_train, verbose=False)

# predict
y_pred = model.predict(S_test)

# evaluation
auc = roc_auc_score(y_test, y_pred)
avp = average_precision_score(y_test, y_pred)
print(f'Testing AUC is {auc}, Testing Average Precision is {avp}')

Testing AUC is 0.5, Testing Average Precision is 0.4736653529201003


### 2nd Level Learning with Original Features

In [17]:
copy_encoded_castgan_syn_X_train = encoded_castgan_syn_X_train.copy()
copy_encoded_X_test = encoded_X_test.copy()

copy_encoded_castgan_syn_X_train['stacking1'] = S_train[:, 0]
copy_encoded_castgan_syn_X_train['stacking2'] = S_train[:, 1]
copy_encoded_castgan_syn_X_train['stacking3'] = S_train[:, 2]
display(copy_encoded_castgan_syn_X_train.head())

copy_encoded_X_test['stacking1'] = S_test[:, 0]
copy_encoded_X_test['stacking2'] = S_test[:, 1]
copy_encoded_X_test['stacking3'] = S_test[:, 2]
display(copy_encoded_X_test.head())

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,stacking1,stacking2,stacking3
0,17,0.482547,0.472339,0.473747,0.473836,2005,0.490255,0.474817,0.474986,25,0.481901,258,3,-6,0,0.472608,1,1,1
1,28,0.482547,0.472339,0.473747,0.473836,848,0.472482,0.474817,0.449102,18,0.47782,288,2,140,1,0.472608,0,1,0
2,24,0.482547,0.472339,0.473747,0.473836,2680,0.472482,0.474817,0.474986,34,0.42487,261,1,-5,0,0.472608,0,0,1
3,43,0.482547,0.472339,0.473747,0.473836,2722,0.472482,0.474817,0.474986,10,0.42487,283,0,-4,0,0.472608,1,1,1
4,17,0.482547,0.472339,0.473747,0.473836,2588,0.472482,0.474817,0.474986,27,0.481901,277,0,-6,0,0.472608,0,1,1


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,stacking1,stacking2,stacking3
0,33,0.465465,0.437133,0.447517,0.476878,7,0.368434,0.496279,0.542474,17,0.623066,612,1,148,1,0.497849,1,1,0
1,59,0.672474,0.437133,0.538296,0.476878,422,0.368434,0.496279,0.542474,14,0.329577,647,4,-1,0,0.408654,0,1,1
2,25,0.766423,0.538991,0.507895,0.476878,3,0.569444,0.496279,0.542474,26,0.329577,170,1,-1,0,0.408654,0,1,1
3,51,0.36587,0.437133,0.398931,0.476878,4497,0.368434,0.496279,0.229415,15,0.329577,276,3,-1,0,0.408654,0,0,0
4,49,0.465465,0.538991,0.447517,0.476878,293,0.569444,0.496279,0.542474,28,0.442731,452,1,-1,0,0.408654,1,1,1


In [18]:
# initialize 2nd level model
catboost_params = {'max_depth': 7, 'learning_rate': 0.03767190865887794, 'min_data_in_leaf': 1295,
              'subsample': 0.9480606825100655, 'grow_policy': 'SymmetricTree',
              'loss_function': 'Logloss', 'random_seed': 10}
model = CatBoostClassifier(**catboost_params)
    
# fit 2nd level model
model = model.fit(copy_encoded_castgan_syn_X_train, y_train, verbose=False)

# predict
y_pred = model.predict(copy_encoded_X_test)

# evaluation
auc = roc_auc_score(y_test, y_pred)
avp = average_precision_score(y_test, y_pred)
print(f'Testing AUC is {auc}, Testing Average Precision is {avp}')

Testing AUC is 0.4941944410401963, Testing Average Precision is 0.4708063269814774


## Summary

* For CTGAN synthetic data, Stacking slightly improved the performance
* But for CastGAN synthetic data, Stacking didn't appear to be the best option.