# Target Encoded XGB and LGB

**Note:** the results in this Note book are made using sample porto seguero data, to produce best results download the data from competition <a herf='https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data'> page</a> on kaggle, and place in porto_data directory, and also bin the data using binning script.

In [7]:
from __future__ import division
#import required modules
from datetime import datetime
import pandas as pd 
from IPython.display import display
import numpy as np
from sklearn.metrics import roc_auc_score


import time

from sklearn.model_selection import StratifiedKFold

from numba import jit
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from rgf.sklearn import RGFClassifier


# to supress printing of exponential notation in pandas
pd.options.display.float_format = '{:20,.2f}'.format
pd.options.display.max_columns = 100

In [37]:
TRAIN_PATH = '../porto_data/train.csv'
TEST_PATH = '../porto_data/test.csv'
# SUBMIT_NAME = 'te_llb_18_LGB_1.csv'train.replace(np.nan, -1, inplace=True)
test.replace(np.nan, -1, inplace=True)

In [38]:
#helper functions

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    """Gini Evaluation metric

    Score Gini for give True target and predicted target values
    
    
    Arguments:
        y_true {np.array} -- True target values
        y_prob {np.array} -- Predicted target values

    Returns:
        gini {float} -- calculated gini sccore
    """
    
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini


def auc_to_gini(score):
    """Converts AUC to Gini
       
    Arguments:
        score {float} -- AUC score
    
    Returns:
        [float] -- gini score
    """
    
    gini = (2 * score) - 1
    return gini


def bold(text_to_bold):
    """Bolds the given string
       
    Arguments:
        text_to_bold {string} -- string to bold

    Returns:
        [string]: Bold string
    """
    
    bold = "\033[1m"
    reset = "\033[0;0m"
    bold_text = bold + text_to_bold + reset 
    
    return bold_text

#__author__ = harishasan
def timer(start_time=None):
    """Prints time
    
    Initiate a time object, and prints total time consumed when again initialized object is passed as argument
    
    Keyword Arguments:
        start_time {[object]} -- initialized time object (default: {None})
    
    """
    
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))
       

    
def calculate_avg(scores):
    """calculates average of given allay or list
    
    Arguments:
        scores {[np.array]} -- array of scores
    """
    avg = sum(scores)/ float(len(scores))
    return avg

def stacker(stacker_model, s_train, target, s_test):
    """trains stacker model on base models predictions and test on base model test predictions dataset
       
    Arguments:
        stacker_model {[type]} -- model to traina s stacker
        s_train {[type]} -- stacker train set
        target {[type]} -- target(y) for model
        s_test {[type]} -- stacker test set
    
    Returns:
        [pd.DataFrame] -- predictions dataframem
    """
    
    start_time = timer(None)
    stacker_model.fit(s_train, target)
    preds = stacker_model.predict_proba(s_test)[:, 1]
    timer(start_time)
    preds_df = pd.DataFrame(data=preds, columns=['target']) 
    
    return preds_df



def get_base_model_results(model, X, y, test, model_name='base_model', y_test=None, n_splits=5, random_state=0, save_fold_results=True, fold_results_path='base_models_results/'):
    """for given model produces OOF and test predictions
    
    For the given Number of Folds, trains the model on train set, predicts on Out OF FOld set and test set, and repeat for given number of Folds.
    
    Arguments:
        model {Object} -- Model i.e XGB, LGB...
        X {nd.array} -- Train set
        y {np.array} -- target values of train set
        test {nd.array} -- test set
    
    Keyword Arguments:
        y_test {np.array} -- test set target values (default: {None})
        n_splits {int} -- Number of folds for Cross Validation split (default: {5})
        random_state {int} -- random seed to use in making folds for Cross Validation (default: {0})
        save_fold_results {bool} -- save out of fold predections and test or Not (default: True)

    
    Returns:
        tuple -- tuple of Dataframe of Out Of Fold and test set Predictions
    """
    #TODO: refactor this function in to small reusable chunks.
    
    print bold("STARTING ITERATION FOR") + " " + bold(model_name)
    
    train_preds_auc = []
    train_preds_gini = []
    
    holdout_preds_auc = []
    holdout_preds_gini = []
    
    start_time = time.time()

    X = np.array(X)
    y = np.array(y)
    test = np.array(test)
    
    folds = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state).split(X, y))
    
    stacker_train = np.zeros(X.shape[0])
    stacker_foldtest = np.zeros((test.shape[0], n_splits))
    
    for i, (train_idx, test_idx) in enumerate(folds):
        print "Fold {}".format(str(i))
        X_train = X[train_idx]
        y_train = y[train_idx]
        X_holdout = X[test_idx]
        y_holdout = y[test_idx]
        
        model.fit(X_train, y_train)
        
        preds_train = model.predict_proba(X_train)[:, 1] 
        preds = model.predict_proba(X_holdout)[:, 1]
        
        train_gini = eval_gini(y_train, preds_train)
        
        holdout_gini = eval_gini(y_holdout, preds)
        
        
        print "Training gini = {}    Holdout gini = {}\n".format(str(train_gini), str(holdout_gini))
        

#         print "making stacker train"
        stacker_train[test_idx] = preds
        
        stacker_foldtest[:, i] = model.predict_proba(test)[:, 1]
        
        #append fold results to list
        train_preds_gini.append(train_gini)
        
        holdout_preds_gini.append(holdout_gini)
    
    
    foldtest_mean = stacker_foldtest.mean(axis=1)
    stacker_test = foldtest_mean
    print "\nAverage gini of training = {}    Average gini of holdouts = {} \n".format(str(calculate_avg(train_preds_gini)), str(calculate_avg(holdout_preds_gini)))
    
    print "Folds variance for train {:.3f}    Folds variance for holdouts = {:.3f}\n".format(np.std(train_preds_gini), np.std(holdout_preds_gini))
    if y_test is not None:
        print bold("Test GINI = ") + " {:.5f}".format(auc_to_gini(roc_auc_score(y_test, stacker_test)))
    print "\nTraining time in Minutes {} \n \n".format(str((time.time() - start_time)/60))   

    oof_preds = pd.DataFrame(data=stacker_train, columns=['{}train'.format(model_name)])
    test_preds = pd.DataFrame(data=stacker_test, columns=['{}test'.format(model_name)])
    
    if save_fold_results:
        oof_preds.to_csv(fold_results_path + 'oof_{}.csv'.format(model_name), index=False)
        test_preds.to_csv(fold_results_path + 'test_{}.csv'.format(model_name), index=False)
    
    return (oof_preds, test_preds)



def one_hot_encode(data, cols=None, drop_missing_cols=False):
    """
   creates new dataframe with dummy(OHE) columns appended
   Args:
       cols: list of column names to one_hot encode
       
    """
    
    cat_cols = cols if cols is not None else [col for col in data.columns.values if 'cat' in col]
    print cat_cols
    cat_df = data[cat_cols]
    ohe = pd.get_dummies(cat_df, columns=cat_cols)
    
    df = data.drop(cat_cols, axis=1)

    df = pd.concat([df, ohe], axis=1)
    if drop_missing_cols:
        neg_cols = [col for col in df.columns.values if '-1' in col]
    #     print neg_cols
        df.drop(neg_cols, axis=1, inplace=True)
    
    return df


def drop_all_excepet(data, cols_list=None, dropid=True):
       
    # That is because in python setting a variable actually sets a reference to the variable. 
    # Almost every person learning python encounters this at some point. The solution is simply to copy the list:
    cols_list = None if cols_list is None else cols_list[:]
    if dropid:
        cols_list.append('target')
    else:
        cols_list.append('target')
        cols_list.append('id')
    cols_to_drop = [col for col in data.columns.values if col not in cols_list]
#     print cols_to_drop
    data.drop(cols_to_drop, axis=1, inplace=True)

In [39]:
#importatn features
# from olivier, include ps_car_11 for better results
train_features = [
"ps_car_13",  #            : 1571.65 / shadow  609.23
"ps_reg_03",  #            : 1408.42 / shadow  511.15
"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
"ps_ind_03",  #            : 1219.47 / shadow  230.55
"ps_ind_15",  #            :  922.18 / shadow  242.00
"ps_reg_02",  #            :  920.65 / shadow  267.50
"ps_car_14",  #            :  798.48 / shadow  549.58
"ps_car_12",  #            :  731.93 / shadow  293.62
"ps_car_01_cat",  #        :  698.07 / shadow  178.72
"ps_car_07_cat",  #        :  694.53 / shadow   36.35
"ps_ind_17_bin",  #        :  620.77 / shadow   23.15
"ps_car_03_cat",  #        :  611.73 / shadow   50.67
"ps_reg_01",  #            :  598.60 / shadow  178.57
"ps_car_15",  #            :  593.35 / shadow  226.43
"ps_ind_01",  #            :  547.32 / shadow  154.58
"ps_ind_16_bin",  #        :  475.37 / shadow   34.17
"ps_ind_07_bin",  #        :  435.28 / shadow   28.92
"ps_car_06_cat",  #        :  398.02 / shadow  212.43
"ps_car_04_cat",  #        :  376.87 / shadow   76.98
"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
"ps_car_09_cat",  #        :  214.12 / shadow   81.38
"ps_car_02_cat",  #        :  203.03 / shadow   26.67
"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
"ps_car_11",  #            :  173.28 / shadow   76.45
"ps_car_05_cat",  #        :  172.75 / shadow   62.92
"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
"ps_car_08_cat",  #        :  120.87 / shadow   28.82
"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
"ps_ind_14",  #            :   37.37 / shadow   16.65
    'ps_car_11_cat'
]

In [40]:
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [41]:
train.replace(np.nan, -1, inplace=True)
test.replace(np.nan, -1, inplace=True)

In [42]:
id_test = test['id'].values
id_train = train['id'].values
y = train['target']

In [43]:
drop_all_excepet(train, train_features)
drop_all_excepet(test, train_features, dropid=False)

#seperate target and test id
target = train['target']
train.drop(['target'], axis=1, inplace=True)

test_id = test['id']
test.drop(['id'], axis=1, inplace=True)


In [44]:
train_oh = one_hot_encode(train, drop_missing_cols=True)
test_oh = one_hot_encode(test, drop_missing_cols=True)
print train_oh.shape
print test_oh.shape

['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_11_cat']
['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_11_cat']
(119043, 192)
(250, 155)


In [45]:
#define models
#xgb
xgb_model = XGBClassifier(    
                        max_depth=6,learning_rate=0.07, n_estimators=450,
                        objective="binary:logistic",
                        gamma=10,scale_pos_weight=1.6,                   
    nthread=4, min_child_weight=9, subsample=.8,colsample_bytree=.8, reg_lambda=1.4, 
    reg_alpha=10 
                     )

#lgb
lgb_params = {'max_depth':5,'num_leaves':30,'learning_rate':0.05,'colsample_bytree':0.8,'max_bin':10, 'subsample':0.8, \
              'n_estimators':450,'subsample_freq':6, 'objective':'binary', 'num_threads':3, 'min_child_samples':600}
lgb_model = LGBMClassifier(**lgb_params)

In [None]:
xgb_strain, xgb_stest = get_base_model_results(xgb_model, train_oh, y, test_oh, model_name='oh_xgb', save_fold_results=False )

In [None]:
lgb_strain, lgb_stest = get_base_model_results(lgb_model, train_oh, y, test_oh, model_name='oh_lgb', save_fold_results=False )

In [None]:
#save oof and test predictions