# How to implement stacking 

In this notebook I will demonstrate how to build a stacking structure. We will use the sample Porto Seguro data to train two base models (XGB and LGB), and then use their out-of-fold (OOF) predictions to train a logistic regression (our stacker).

In [1]:
#load required Libraries
#load important libraries
#author:sohaib
from __future__ import division
#import required modules
from datetime import datetime
import pandas as pd 
from IPython.display import display
import numpy as np
from numba import jit

import xgboost as xgb
import time

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import cross_val_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Display settings: suppress exponential notation for floats (fixed-point,
# thousands separator, two decimals) and show up to 100 columns.
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_columns', 100)

In [2]:
# Download the Porto Seguro data from: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
# The CSV paths below are relative paths.
TRAIN_PATH = 'porto_data/train.csv'
TEST_PATH = 'porto_data/test.csv'
# Thread count handed to the XGBoost and LightGBM base models.
THREADS=4

In [17]:
#helper functions

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    """Gini evaluation metric

    Scores the Gini coefficient for the given true targets and predicted values.
    Labels are ordered by ascending prediction and then scanned from the
    highest-scored sample downwards.

    Arguments:
        y_true {np.array} -- True target values (the arithmetic assumes 0/1 labels)
        y_prob {np.array} -- Predicted target values

    Returns:
        gini {float} -- calculated gini score. The denominator is zero when
        y_true holds no positives or only positives.
    """

    # Reorder the labels so that they follow ascending predicted score.
    labels = np.asarray(y_true)
    ranked = labels[np.argsort(y_prob)]
    total = len(ranked)

    positives = 0        # running sum of the labels
    negatives_seen = 0   # running sum of (1 - label) for samples already visited
    weighted = 0         # running sum of label * negatives_seen

    # Walk from the highest predicted score down to the lowest.
    for label in ranked[::-1]:
        positives += label
        weighted += label * negatives_seen
        negatives_seen += 1 - label

    return 1 - 2 * weighted / (positives * (total - positives))


def auc_to_gini(score):
    """Convert an AUC score to the equivalent Gini score (gini = 2 * AUC - 1).

    Arguments:
        score {float} -- AUC score

    Returns:
        [float] -- gini score
    """
    return 2 * score - 1


def bold(text_to_bold):
    """Wrap a string in ANSI escape sequences for bold text.

    Arguments:
        text_to_bold {string} -- string to bold

    Returns:
        [string]: the input prefixed with the bold sequence and followed by the reset sequence
    """
    ansi_bold_on = "\033[1m"
    ansi_reset = "\033[0;0m"
    return ansi_bold_on + text_to_bold + ansi_reset

#__author__ = harishasan
def timer(start_time=None):
    """Start a timer, or report the time elapsed since one was started.

    Called without a start time (or with a falsy one) it returns the current
    datetime. Called with a start time it prints the hours, minutes and
    seconds elapsed since then and returns nothing.

    Keyword Arguments:
        start_time {[object]} -- datetime previously returned by this function (default: {None})

    """
    if start_time:
        elapsed_seconds = (datetime.now() - start_time).total_seconds()
        hours, remainder = divmod(elapsed_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
        return None

    # No start time supplied: begin timing now.
    return datetime.now()
       

    
def calculate_avg(scores):
    """Calculate the arithmetic mean of the given array or list.

    Arguments:
        scores {[np.array]} -- array of scores

    Returns:
        [float] -- sum of the scores divided by their count
    """
    total = sum(scores)
    count = float(len(scores))
    return total / count

def stacker(stacker_model, s_train, target, s_test):
    """Fit the stacker model on the stacker train set and predict on the stacker test set.

    The time spent fitting and predicting is printed through timer().

    Arguments:
        stacker_model {[type]} -- model to train as the stacker; must provide fit and predict_proba
        s_train {[type]} -- stacker train set
        target {[type]} -- target (y) for the model
        s_test {[type]} -- stacker test set

    Returns:
        [pd.DataFrame] -- single-column ('target') dataframe holding column 1 of predict_proba
    """
    started_at = timer()

    stacker_model.fit(s_train, target)
    test_proba = stacker_model.predict_proba(s_test)[:, 1]

    # Report how long fit + predict took.
    timer(started_at)

    return pd.DataFrame(data=test_proba, columns=['target'])



def get_base_model_results(model, X, y, test, model_name='base_model', y_test=None, n_splits=5, random_state=0, save_fold_results=True, fold_results_path='base_models_results/'):
    """Produce out-of-fold (OOF) and test-set predictions for one base model.

    The train set is split into stratified, shuffled folds. For each fold the
    model is fit on the training part and used to predict the held-out part
    (these predictions form the OOF vector) and the full test set. The test
    predictions of all folds are averaged. Per-fold and average Gini scores
    are printed along the way.

    Arguments:
        model {Object} -- Model i.e XGB, LGB...; must provide fit and predict_proba
        X {nd.array} -- Train set
        y {np.array} -- target values of train set
        test {nd.array} -- test set

    Keyword Arguments:
        model_name {str} -- name used in the printed header, the output column names
            and the saved file names (default: {'base_model'})
        y_test {np.array} -- test set target values; when given, the Gini of the
            averaged test predictions is printed (default: {None})
        n_splits {int} -- Number of folds for Cross Validation split (default: {5})
        random_state {int} -- random seed to use in making folds for Cross Validation (default: {0})
        save_fold_results {bool} -- save the OOF and test predictions as CSV or not (default: True)
        fold_results_path {str} -- string prefix prepended to 'oof_<model_name>.csv' and
            'test_<model_name>.csv'; its directory part is created when missing
            (default: {'base_models_results/'})

    Returns:
        tuple -- (oof_preds, test_preds) DataFrames with a single column named
        '<model_name>train' and '<model_name>test' respectively
    """
    #TODO: refactor this function in to small reusable chunks.
    import os  # only needed to create the output directory before saving

    print(bold("STARTING ITERATION FOR") + " " + bold(model_name))

    train_preds_gini = []
    holdout_preds_gini = []

    start_time = time.time()

    # Work on plain arrays so the fold indices can be used for positional indexing.
    X = np.array(X)
    y = np.array(y)
    test = np.array(test)

    folds = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(X, y))

    # One OOF prediction per train row; one column of test predictions per fold.
    stacker_train = np.zeros(X.shape[0])
    stacker_foldtest = np.zeros((test.shape[0], n_splits))

    for i, (train_idx, test_idx) in enumerate(folds):
        print("Fold {}".format(str(i)))
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]
        y_holdout = y[test_idx]

        model.fit(X_train, y_train)

        preds_train = model.predict_proba(X_train)[:, 1]
        preds = model.predict_proba(X_holdout)[:, 1]

        train_gini = eval_gini(y_train, preds_train)
        holdout_gini = eval_gini(y_holdout, preds)

        print("Training gini = {}    Holdout gini = {}\n".format(str(train_gini), str(holdout_gini)))

        # Held-out rows receive their prediction from the model that never saw them.
        stacker_train[test_idx] = preds
        stacker_foldtest[:, i] = model.predict_proba(test)[:, 1]

        # append fold results to list
        train_preds_gini.append(train_gini)
        holdout_preds_gini.append(holdout_gini)

    # Final test prediction is the mean over the per-fold models.
    stacker_test = stacker_foldtest.mean(axis=1)
    print("\nAverage gini of training = {}    Average gini of holdouts = {} \n".format(str(calculate_avg(train_preds_gini)), str(calculate_avg(holdout_preds_gini))))

    # The reported spread is a standard deviation (np.std), so it is labelled as such.
    print("Folds std for train = {:.3f}    Folds std for holdouts = {:.3f}\n".format(np.std(train_preds_gini), np.std(holdout_preds_gini)))
    if y_test is not None:
        print(bold("Test GINI = ") + " {:.5f}".format(auc_to_gini(roc_auc_score(y_test, stacker_test))))
    print("\nTraining time in Minutes {} \n \n".format(str((time.time() - start_time)/60)))

    oof_preds = pd.DataFrame(data=stacker_train, columns=['{}train'.format(model_name)])
    test_preds = pd.DataFrame(data=stacker_test, columns=['{}test'.format(model_name)])

    if save_fold_results:
        # fold_results_path is used as a plain string prefix; make sure the
        # directory it points into exists so to_csv does not fail.
        out_dir = os.path.dirname(fold_results_path)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        oof_preds.to_csv(fold_results_path + 'oof_{}.csv'.format(model_name), index=False)
        test_preds.to_csv(fold_results_path + 'test_{}.csv'.format(model_name), index=False)

    return (oof_preds, test_preds)



In [13]:
# Load the train and test data.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

# Keep the ids and the target aside before they are removed from the frames.
id_train = train['id'].values
id_test = test['id'].values
y = train['target']

# Drop the 'ps_calc_' columns from both frames.
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(unwanted, axis=1)
test = test.drop(unwanted, axis=1)

# Drop the id (and, for train, the target) columns as well.
train = train.drop(['id', 'target'], axis=1)
test = test.drop(['id'], axis=1)

## Define base models and stacker 

In [14]:
# Two base models, XGB and LGB, each given THREADS threads;
# no other hyperparameters are set here.
xgb_base = XGBClassifier(nthread=THREADS)
lgb_base = LGBMClassifier(num_threads=THREADS)

# Stacker model: logistic regression constructed with no arguments.
lr_stacker = LogisticRegression()

In [15]:
# Get the out-of-fold and test predictions that will feed the stacker.
# First for XGB.
xgb_oof_pred, xgb_test_pred = get_base_model_results(
    model=xgb_base,
    X=train,
    y=y,
    test=test,
    model_name='xgb_base',
)

[1mSTARTING ITERATION FOR[0;0m [1mxgb_base[0;0m
Fold 0
Training gini = 0.340645764921    Holdout gini = 0.275406236435

Fold 1
Training gini = 0.33851851826    Holdout gini = 0.259669414449

Fold 2
Training gini = 0.340736587994    Holdout gini = 0.241749774065

Fold 3
Training gini = 0.340662367258    Holdout gini = 0.261198984291

Fold 4
Training gini = 0.346551848993    Holdout gini = 0.235377882626


Average gini of training = 0.341423017485    Average gini of holdouts = 0.254680458373 

Folds variance for train 0.003    Folds variance for holdouts = 0.014


Training time in Minutes 0.260008366903 
 



In [16]:
# Same procedure for LGB.
lgb_oof_pred, lgb_test_pred = get_base_model_results(
    model=lgb_base,
    X=train,
    y=y,
    test=test,
    model_name='lgb_base',
)

[1mSTARTING ITERATION FOR[0;0m [1mlgb_base[0;0m
Fold 0
Training gini = 0.260276217994    Holdout gini = 0.23153844655

Fold 1
Training gini = 0.255141807083    Holdout gini = 0.199481760163

Fold 2
Training gini = 0.269546061365    Holdout gini = 0.206483090163

Fold 3
Training gini = 0.259199540103    Holdout gini = 0.220051355943

Fold 4
Training gini = 0.268139743917    Holdout gini = 0.208116253322


Average gini of training = 0.262460674092    Average gini of holdouts = 0.213134181228 

Folds variance for train 0.006    Folds variance for holdouts = 0.011


Training time in Minutes 0.0246824502945 
 



In [35]:
# Build the stacker train and test frames: one column per base model.
oof_frames = [xgb_oof_pred, lgb_oof_pred]
test_frames = [xgb_test_pred, lgb_test_pred]
stacker_train = pd.concat(oof_frames, axis=1)
stacker_test = pd.concat(test_frames, axis=1)

In [36]:
# Check the cross-validated score of the stacker: 5-fold ROC AUC on the
# OOF predictions, converted to Gini for reporting.
results = cross_val_score(lr_stacker, stacker_train, y=y, cv=5, scoring='roc_auc')
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(bold('CV of stacker:') + ' {:.4f}'.format(auc_to_gini(results.mean())))

[1mCV of stacker:[0;0m 0.2539


In [37]:
# Train the stacker on all OOF predictions and predict on the stacked test set.
test_stacker_preds = stacker(
    stacker_model=lr_stacker,
    s_train=stacker_train,
    target=y,
    s_test=stacker_test,
)


 Time taken: 0 hours 0 minutes and 0.21 seconds.


In [None]:
#