# Stacking Using Baseline Models

In [2]:
# Processes
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
from sklearn.metrics import log_loss, roc_auc_score, roc_curve, auc

# Modelling
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from catboost import CatBoostClassifier



# Data prep, logging, and garbage collection
from load_data import load_train_data, load_test_data
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger
import gc


import warnings
warnings.filterwarnings('ignore')

# Data Loading and Prep

In [16]:
DIR = 'result_tmp/'
logger = getLogger(__name__)


log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

# data_prep
df = load_train_data()
x_train = df.drop(['target'], axis = 1)
y_train = df['target']

use_cols = x_train.columns.values

logger.debug('train columns: {} {}'.format(use_cols.shape, use_cols))

logger.info('Train data prep completed {}'.format(x_train.shape))

del df
gc.collect()

df = load_test_data()

x_test = df[use_cols].sort_values('id')
logger.info('Test data prep completed {}'.format(x_test.shape))


del df
gc.collect()

2018-06-09 07:07:26,844 __main__ 17 [INFO][<module>] start 
2018-06-09 07:07:36,382 __main__ 28 [INFO][<module>] Train data prep completed (595212, 58) 
2018-06-09 07:07:50,871 __main__ 36 [INFO][<module>] Test data prep completed (892816, 58) 


31

# Modelling
* Assumes each baseline model has been tuned using some kind of method
* The paramaters are the optimum ones already chosen

## Model 1: XGBoost

In [3]:
xgb_params = {
        'seed': 0,
        'colsample_bytree': 0.3085,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.01,
        'objective': 'binary:logistic',
        'max_depth': 5,
        'num_parallel_tree': 1,
        'min_child_weight': 4.2922,
        'eval_metric': 'auc',
        'eta':0.1,
        'gamma': 0.5290,
        'subsample':0.9930,
        'max_delta_step':0,
        'booster':'gbtree',
        'scale_pos_weight': 26,
        'n_estimators': 2000,
        'n_jobs': -1
}
    

model_xgb = xgb.XGBClassifier(**xgb_params)

In [4]:
model_xgb.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.3085,
 'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 0.529,
 'learning_rate': 0.01,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 4.2922,
 'missing': None,
 'n_estimators': 2000,
 'n_jobs': -1,
 'nthread': None,
 'num_parallel_tree': 1,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 26,
 'seed': 0,
 'silent': 1,
 'subsample': 0.993}

## Model 2: Logistic Regression

In [5]:
lr_params = {'C': 10, 
             'fit_intercept': True, 
             'penalty': 'l1', 
             'random_state': 0, 
             'verbose':2}

model_lr = LogisticRegression(**lr_params)

In [6]:
model_lr.get_params()

{'C': 10,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l1',
 'random_state': 0,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 2,
 'warm_start': False}

## Model 3: LightGBM

In [7]:
lgb_params = {
    'learning_rate': 0.024, 
    'max_depth': 5, 
    'boosting': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc',
    'feature_fraction': .7,
    'is_training_metric': False, 
    'seed': 99,
    'n_estimators': 2000
}

model_lgb = lgb.LGBMClassifier(**lgb_params)

In [8]:
model_lgb.get_params()

{'boosting': 'gbdt',
 'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'feature_fraction': 0.7,
 'is_training_metric': False,
 'learning_rate': 0.024,
 'max_depth': 5,
 'metric': 'auc',
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 2000,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'seed': 99,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 1}

## Model 4: Random Forest

In [45]:
model_rf = RandomForestClassifier(n_estimators=500, 
                                  max_depth=6, 
                                  criterion='entropy', 
                                  min_samples_split=10,  
                                  n_jobs=-1, 
                                  random_state=123, 
                                  verbose=1, 
                                  class_weight = "balanced")



In [46]:
model_rf.get_params()

{'bootstrap': True,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 6,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 500,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 123,
 'verbose': 1,
 'warm_start': False}

## Model 5: CatBoost

In [11]:
MAX_ROUNDS = 650
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.05

model_cat = CatBoostClassifier(
    learning_rate=LEARNING_RATE, 
    depth=5, 
    l2_leaf_reg = 14, 
    iterations = MAX_ROUNDS,
    verbose = False,
    loss_function='Logloss',eval_metric='AUC'
)

In [12]:
model_cat.get_params()

{'depth': 5,
 'eval_metric': 'AUC',
 'iterations': 650,
 'l2_leaf_reg': 14,
 'learning_rate': 0.05,
 'logging_level': 'Silent',
 'loss_function': 'Logloss'}

# Base Models Score

In [13]:
def auc_cv(model):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    auc= cross_val_score(model, x_train.values, y_train.values, scoring="accuracy", cv = skf)
    return(auc)

In [14]:
# # For script purposes only

models = ['Cat Boost', 'XGBoost', 'Light GBM', 'Random Forest']
for n, i in enumerate([model_cat, model_xgb, model_lgb, model_rf]):
    score = auc_cv(i)
    print('Score of' + models[n] + ': {:.4f} ({:.4f})'.format(score.mean(), score.std()))

Score ofCat Boost: 0.9635 (0.0000)
Score ofXGBoost: 0.6748 (0.0016)
Score ofLight GBM: 0.9636 (0.0000)


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.6s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.2s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jo

Score ofRandom Forest: 0.6566 (0.0054)


# Ensembling
Creating an sklearn helper class to reuse codes

In [28]:
class SklearnHelper(object):
    
    def __init__(self, clf, seed=0):
        self.clf = clf()
        
    def train(self, x_train, y_train):
        self.clf.fit(x_train.values, y_train.values)
        
    def predict(self, x):
        return self.predict(x)
    
    def predict_proba(self, x):
        return self.predict_proba(x)[:, 1]
    
    def fit(self, x, y):
        return self.clf.fit(x.values, y.values)
    
    def feature_importance(self, x, y):
        print(self.clf.fit(x, y).feature_importances_)


# Out of Fold (OOF) Predictions

In [20]:
# Some useful parameters which will come in handy later on
ntrain = x_train.shape[0]
ntest = x_test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(n_splits= NFOLDS, random_state=SEED, shuffle=True)

In [40]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain, ) )
    oof_test = np.zeros((ntest, ))
    oof_test_kf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_kf[i, :] = clf.predict(x_test)
        
    oof_test[:] = oof_test_kf.mean(axis =0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
    

In [41]:
xgb_oof_train, xgb_oof_test = get_oof(model_xgb, x_train.values, y_train.values, x_test.values)

In [43]:
lgb_oof_train, lgb_oof_test = get_oof(model_lgb, x_train.values, y_train.values, x_test.values)

In [44]:
cat_oof_train, cat_oof_test = get_oof(model_cat, x_train.values, y_train.values, x_test.values)

In [47]:
rf_oof_train, rf_oof_test = get_oof(model_rf, x_train.values, y_train.values, x_test.values)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   48.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    5.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   11.7s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:   13.4s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   50.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapse

In [48]:
lr_oof_train, lr_oof_test = get_oof(model_lr, x_train.values, y_train.values, x_test.values)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [58]:
x_train_con = np.concatenate(( xgb_oof_train, lr_oof_train, cat_oof_train, rf_oof_train), axis=1)
x_test_con = np.concatenate(( xgb_oof_test, lr_oof_test, cat_oof_test, rf_oof_test), axis=1)

# Second Level Learning

In [59]:
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=26).fit(x_train_con, y_train)
predictions = gbm.predict(x_test_con)

In [60]:
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [61]:
DIR = 'result_tmp/'
submission = pd.DataFrame(

    {'id': x_test['id'],
     'target': predictions
    }
)

submission.to_csv(DIR + 'submission_oof_stacked.csv', index = False)
