# Stacking Using Baseline Models

In [1]:
# Processes
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
from sklearn.metrics import log_loss, roc_auc_score, roc_curve, auc

# Modelling
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from catboost import CatBoostClassifier



# Data prep, logging, and garbage collection
from load_data import load_train_data, load_test_data
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger
import gc


import warnings
warnings.filterwarnings('ignore')


# Data Loading and Prep

In [2]:
DIR = 'result_tmp/'
logger = getLogger(__name__)


log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

# data_prep
df = load_train_data()
x_train = df.drop(['target'], axis = 1)
y_train = df['target']

use_cols = x_train.columns.values

logger.debug('train columns: {} {}'.format(use_cols.shape, use_cols))

logger.info('Train data prep completed {}'.format(x_train.shape))

del df
gc.collect()

df = load_test_data()

x_test = df[use_cols].sort_values('id')
logger.info('Test data prep completed {}'.format(x_test.shape))


del df
gc.collect()

2018-06-07 01:22:17,938 __main__ 17 [INFO][<module>] start 
2018-06-07 01:22:22,308 __main__ 28 [INFO][<module>] Train data prep completed (595212, 58) 
2018-06-07 01:22:28,408 __main__ 36 [INFO][<module>] Test data prep completed (892816, 58) 


31

# Modelling
* Assumes each baseline model has been tuned using some kind of method
* The paramaters are the optimum ones already chosen

## Model 1: XGBoost

In [3]:
xgb_params = {
        'seed': 0,
        'colsample_bytree': 0.3085,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.01,
        'objective': 'binary:logistic',
        'max_depth': 5,
        'num_parallel_tree': 1,
        'min_child_weight': 4.2922,
        'eval_metric': 'auc',
        'eta':0.1,
        'gamma': 0.5290,
        'subsample':0.9930,
        'max_delta_step':0,
        'booster':'gbtree',
        'scale_pos_weight': 26,
        'n_estimators': 2000,
        'n_jobs': -1
}
    

model_xgb = xgb.XGBClassifier(**xgb_params)

In [4]:
model_xgb.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.3085,
 'eta': 0.1,
 'eval_metric': 'auc',
 'gamma': 0.529,
 'learning_rate': 0.01,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 4.2922,
 'missing': None,
 'n_estimators': 2000,
 'n_jobs': -1,
 'nthread': None,
 'num_parallel_tree': 1,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 26,
 'seed': 0,
 'silent': 1,
 'subsample': 0.993}

## Model 2: Logistic Regression

In [5]:
lr_params = {'C': 10, 
             'fit_intercept': True, 
             'penalty': 'l1', 
             'random_state': 0, 
             'verbose':2}

model_lr = LogisticRegression(**lr_params)

In [6]:
model_lr.get_params()

{'C': 10,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': 1,
 'penalty': 'l1',
 'random_state': 0,
 'solver': 'liblinear',
 'tol': 0.0001,
 'verbose': 2,
 'warm_start': False}

## Model 3: LightGBM

In [7]:
lgb_params = {
    'learning_rate': 0.024, 
    'max_depth': 4, 
    'lambda_l1': 16.7,
    'boosting': 'gbdt', 
    'objective': 'binary', 
    'metric': 'auc',
    'feature_fraction': .7,
    'is_training_metric': False, 
    'seed': 99,
    'n_estimators': 1400
}

model_lgb = lgb.LGBMClassifier(**lgb_params)

In [8]:
model_lgb.get_params()

{'boosting': 'gbdt',
 'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'feature_fraction': 0.7,
 'is_training_metric': False,
 'lambda_l1': 16.7,
 'learning_rate': 0.024,
 'max_depth': 4,
 'metric': 'auc',
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 1400,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': 'binary',
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'seed': 99,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 1}

## Model 4: Random Forest

In [9]:
model_rf = RandomForestClassifier(n_estimators=100, 
                                  max_depth=8, 
                                  criterion='entropy', 
                                  min_samples_split=10,  
                                  n_jobs=-1, 
                                  random_state=123, 
                                  verbose=1, 
                                  class_weight = "balanced")



In [10]:
model_rf.get_params()

{'bootstrap': True,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 8,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 123,
 'verbose': 1,
 'warm_start': False}

## Model 5: CatBoost

In [11]:
MAX_ROUNDS = 650
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.05

model_cat = CatBoostClassifier(
    learning_rate=LEARNING_RATE, 
    depth=5, 
    l2_leaf_reg = 14, 
    iterations = MAX_ROUNDS,
    verbose = False,
    loss_function='Logloss',eval_metric='AUC'
)

In [12]:
model_cat.get_params()

{'depth': 5,
 'eval_metric': 'AUC',
 'iterations': 650,
 'l2_leaf_reg': 14,
 'learning_rate': 0.05,
 'logging_level': 'Silent',
 'loss_function': 'Logloss'}

# Base Models Score

In [13]:
def auc_cv(model):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    auc= cross_val_score(model, x_train.values, y_train.values, scoring="accuracy", cv = skf)
    return(auc)

In [14]:
# # For script purposes only

models = ['Cat Boost', 'XGBoost', 'Light GBM', 'Random Forest']
for n, i in enumerate([model_cat, model_xgb, model_lgb, model_rf]):
    score = auc_cv(i)
    print('Score of' + models[n] + ': {:.4f} ({:.4f})'.format(score.mean(), score.std()))

Score ofCat Boost: 0.9635 (0.0000)
Score ofXGBoost: 0.6748 (0.0016)
Score ofLight GBM: 0.9636 (0.0000)


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.6s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.2s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   29.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   28.8s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jo

Score ofRandom Forest: 0.6566 (0.0054)


In [15]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=True, random_state=0)
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in skf.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self

    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

In [16]:
stacked_averaged_models = StackingAveragedModels(base_models = (model_cat, model_lgb, model_xgb),
                                                 meta_model = model_rf)

In [17]:
score = auc_cv(stacked_averaged_models)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.2s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jo

In [18]:
score

array([0.68449216, 0.68114043, 0.68117403, 0.68596798, 0.67997581])

In [19]:
stacked_averaged_models.fit(x_train.values, y_train.values)


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.9s finished


StackingAveragedModels(base_models=(<catboost.core.CatBoostClassifier object at 0x7f723d226e80>, LGBMClassifier(boosting='gbdt', boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.7,
        is_training_metric=False, lambda_l1=16.7, learning_rate=0.024,
        max_depth=4, metric='a..._state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=26,
       seed=0, silent=1, subsample=0.993)),
            meta_model=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='entropy', max_depth=8, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False, random_state=123,
            verbose=1, warm_start=False),
            n_folds=5)

In [22]:
stacked_train_pred = stacked_averaged_models.predict(x_test.values)[:, 1]

AttributeError: 'StackingAveragedModels' object has no attribute 'predict_proba'

In [26]:
stacked_train_pred

array([0, 0, 0, ..., 0, 0, 0])

In [27]:
DIR = 'result_tmp/'
submission = pd.DataFrame(

    {'id': x_test['id'],
     'target': stacked_train_pred
    }
)

submission.to_csv(DIR + 'submission_stacked.csv', index = False)
