# Best Target Encoded LGB, XGB and RGF

**Note:** The results in this notebook were produced using a sample of the Porto Seguro data. To reproduce the best results, download the full data from the competition <a href='https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data'>page</a> on Kaggle, place it in the `porto_data` directory, and bin the data using the binning script.

In [32]:
from __future__ import division
#import required modules
from datetime import datetime
import pandas as pd 
from IPython.display import display
import numpy as np
from sklearn.metrics import roc_auc_score


import time

from sklearn.model_selection import StratifiedKFold

from numba import jit
from sklearn.preprocessing import LabelEncoder

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from rgf.sklearn import RGFClassifier


# Render floats in fixed-point form (thousands separators, two decimals)
# rather than exponential notation, and show up to 100 columns of a frame.
pd.set_option('display.float_format', '{:20,.2f}'.format)
pd.set_option('display.max_columns', 100)

In [11]:
# Relative locations of the binned train and test CSV files that are loaded
# with pd.read_csv further down in this notebook.
TRAIN_PATH = '../porto_data/bin_train.csv'
TEST_PATH = '../porto_data/bin_test.csv'
# SUBMIT_NAME = 'te_llb_18_LGB_1.csv'

In [5]:
#helper functions

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    """Normalized Gini coefficient of a set of predictions

    Labels are ordered by ascending predicted value and scanned from the
    highest prediction downwards; for every label the number of zero
    labels already seen (i.e. ranked above it) is accumulated.

    Arguments:
        y_true {np.array} -- True target values (assumed to be binary 0/1)
        y_prob {np.array} -- Predicted target values

    Returns:
        gini {float} -- calculated gini score
    """

    labels = np.asarray(y_true)
    labels = labels[np.argsort(y_prob)]
    n_samples = len(labels)
    positives = 0
    negatives_above = 0
    misordered = 0
    # Walk from the highest-scored sample down to the lowest one
    for idx in range(n_samples - 1, -1, -1):
        label = labels[idx]
        positives += label
        misordered += label * negatives_above
        negatives_above += 1 - label
    return 1 - 2 * misordered / (positives * (n_samples - positives))


def auc_to_gini(score):
    """Map an AUC score onto the Gini scale (gini = 2 * AUC - 1)

    Arguments:
        score {float} -- AUC score

    Returns:
        [float] -- gini score
    """

    return 2 * score - 1


def bold(text_to_bold):
    """Wrap a string in ANSI escape codes so a terminal renders it bold

    Arguments:
        text_to_bold {string} -- string to bold

    Returns:
        [string]: the string preceded by the "bold on" code and followed by
        the "reset" code
    """

    ansi_bold_on = "\033[1m"
    ansi_reset = "\033[0;0m"
    return "".join((ansi_bold_on, text_to_bold, ansi_reset))

#__author__ = harishasan
def timer(start_time=None):
    """Start a stopwatch or report the time elapsed since it was started

    Called without a (truthy) argument it returns the current datetime;
    called with that value later on it prints the elapsed time.

    Keyword Arguments:
        start_time {[object]} -- datetime returned by an earlier call (default: {None})

    Returns:
        the current datetime when starting, otherwise None
    """

    # First call: hand back the starting timestamp
    if not start_time:
        return datetime.now()

    # Later call: break the elapsed seconds into hours / minutes / seconds
    elapsed = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('\n Time taken: %i hours %i minutes and %s seconds.' % (hours, minutes, round(seconds, 2)))
       

    
def calculate_avg(scores):
    """Arithmetic mean of an array or list of scores

    Arguments:
        scores {[np.array]} -- array of scores

    Returns:
        [float] -- sum of the scores divided by their count
    """
    total = sum(scores)
    return total / float(len(scores))



In [6]:
#from olivier
#https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    """Multiply every value by (1 + noise_level * z) with z ~ N(0, 1).

    One standard-normal draw per element is taken from NumPy's global random
    state, even when noise_level is 0 (the values are then unchanged).
    """
    return series * (1 + noise_level * np.random.randn(len(series)))


def _apply_target_encoding(series, lookup, prior, encoded_name):
    """Replace each category of `series` by its encoded value from `lookup`.

    Categories absent from `lookup` (unseen in training, or NaN) get `prior`.
    """
    encoded = pd.merge(
        series.to_frame(series.name),
        lookup,
        on=series.name,
        how='left')['average'].rename(encoded_name).fillna(prior)
    # pd.merge does not keep the index so restore it
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Target-encode a categorical feature for train, validation and test data.

    The encoding of a category is a blend of the category's target mean and
    the global target mean (prior), both computed on the training series only.
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf

    trn_series : training categorical feature as a pd.Series
    val_series : validation categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series, aligned with trn_series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    noise_level (float) : scale of the multiplicative gaussian noise applied
        to the encoded values (0 leaves them unchanged)

    Returns a tuple (train, validation, test) of encoded pd.Series, each named
    "<feature name>_mean" and carrying the index of its input series.

    All three series must share the same name; this is checked with assert,
    like the length check between trn_series and target.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == val_series.name
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Per-category target mean and number of training rows
    stats = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Sigmoid weight: approaches 1 as the category count exceeds min_samples_leaf
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    # Global target mean, used for rare and unseen categories
    prior = target.mean()
    # The bigger the count the less the prior is taken into account
    blended = prior * (1 - weight) + stats["mean"] * weight
    # Two-column lookup table: <feature name> -> 'average'. Only the value
    # column is renamed, so the merge key survives whatever the feature is
    # called (including a feature literally named 'index').
    lookup = blended.rename('average').reset_index()

    encoded_name = trn_series.name + '_mean'
    ft_trn_series = _apply_target_encoding(trn_series, lookup, prior, encoded_name)
    ft_val_series = _apply_target_encoding(val_series, lookup, prior, encoded_name)
    ft_tst_series = _apply_target_encoding(tst_series, lookup, prior, encoded_name)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)


In [29]:
def get_te_base_results(model, X, y, test_df, model_name, f_cats, fillna=False, random_state=0):
    """Get target encoded out-of-fold (OOF) and test predictions for the stacker.

    Runs a 5-fold stratified CV. In every fold the categorical columns are
    target encoded from the training part of the fold only and applied to the
    holdout part and the test set, to reduce bias. The model is refit on each
    fold; holdout predictions are collected as OOF predictions and the test
    predictions are averaged over the folds. Per-fold and overall Gini scores
    and the elapsed time are printed.

    Arguments:
        model {Object} -- Model exposing fit() and predict_proba()
        X {pd.DataFrame} -- Train set (accessed with .iloc)
        y {pd.Series} -- target values (accessed with .iloc)
        test_df {pd.DataFrame} -- Test Set with the same columns as X
        model_name {string} -- name of the model, used in the output column names
        f_cats {list} -- list of category columns to encode

    Keyword Arguments:
        fillna {bool} -- fill NaN with column means before fitting, needed for
            models that cannot handle NaN (default: {False})
        random_state {int} -- seed for the fold split (default: {0})

    Returns:
        oof_preds {pd.DataFrame} -- one column "<model_name>_oof", indexed like y
        test_preds {pd.DataFrame} -- one column "<model_name>_test", fold average
    """

    # Single-argument print() calls behave the same with and without
    # print_function, matching the style already used in timer().
    print(bold("STARTING ITERATION FOR") + " " + bold(model_name))
    start_time = timer(None)
    # Float buffer for the OOF probabilities. It is deliberately unnamed and
    # float typed: no int-to-float cast on assignment, and no stale target
    # name leaking into the returned frame.
    y_valid_pred = pd.Series(np.zeros(len(y)), index=y.index)
    y_test_pred = 0

    # Set up folds
    K = 5
    kf = StratifiedKFold(n_splits=K, random_state=random_state, shuffle=True)
    np.random.seed(0)

    train_preds = []
    eval_preds = []

    for i, (train_index, test_index) in enumerate(kf.split(X, y)):

        # Create data for this fold
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
        X_test = test_df.copy()
        print("Fold {}".format(str(i)))

        # Encode data: statistics come from this fold's training rows only
        for f in f_cats:
            X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                trn_series=X_train[f],
                val_series=X_valid[f],
                tst_series=X_test[f],
                target=y_train,
                min_samples_leaf=200,
                smoothing=10,
                noise_level=0
            )

        # fill NaN for Rgf
        # NOTE(review): each split is imputed with its own column means, not
        # with the training means -- confirm this is intended.
        if fillna:
            X_train.fillna(X_train.mean(), inplace=True)
            X_valid.fillna(X_valid.mean(), inplace=True)
            X_test.fillna(X_test.mean(), inplace=True)

        fit_model = model.fit(X_train, y_train)

        preds_train = fit_model.predict_proba(X_train)[:, 1]
        # Generate validation predictions for this fold
        pred = fit_model.predict_proba(X_valid)[:, 1]

        # calculate train and valid gini (plain arrays keep pandas objects
        # away from the jit-decorated metric)
        train_gini = eval_gini(y_train.values, preds_train)
        val_gini = eval_gini(y_valid.values, pred)

        print("Train Gini = {}    holdout Gini = {}".format(str(train_gini), str(val_gini)))
        y_valid_pred.iloc[test_index] = pred

        # Accumulate test set predictions
        y_test_pred += fit_model.predict_proba(X_test)[:, 1]
        train_preds.append(train_gini)
        eval_preds.append(val_gini)

        del X_test, X_train, X_valid, y_train

    y_test_pred /= K  # Average test set predictions
    full_gini = eval_gini(y.values, y_valid_pred.values)
    print("\nGini for full holdout set: " + str(full_gini))

    print("Avg Train gini = {}    Avg holdout gini = {}".format(str(calculate_avg(train_preds)), str(calculate_avg(eval_preds))))
    # The printed "Fold Variance" figures are standard deviations (np.std)
    print("Fold Variance for train = {:.3f}    Fold Variance for holdouts = {:.3f}".format(np.std(train_preds), np.std(eval_preds)))

    timer(start_time)

    # to_frame() names the column directly. Passing a *named* Series to
    # pd.DataFrame together with a non-matching `columns` list would select a
    # missing column and yield nothing but NaN.
    oof_preds = y_valid_pred.to_frame('{}_oof'.format(model_name))
    test_preds = pd.DataFrame(data=y_test_pred, columns=['{}_test'.format(model_name)])

    return oof_preds, test_preds

In [13]:
# Important features: the subset of columns used to train the base models.
# from olivier: https://www.kaggle.com/ogrellier/noise-analysis-of-porto-seguro-s-features
# The trailing numbers were copied along with the names; presumably they are
# the feature score / shadow-feature score from that kernel -- TODO confirm.
train_features_bestxgb = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
"ps_reg_03",  #            : 1408.42 / shadow  511.15
"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
"ps_ind_03",  #            : 1219.47 / shadow  230.55
"ps_ind_15",  #            :  922.18 / shadow  242.00
"ps_reg_02",  #            :  920.65 / shadow  267.50
"ps_car_14",  #            :  798.48 / shadow  549.58
"ps_car_12",  #            :  731.93 / shadow  293.62
"ps_car_01_cat",  #        :  698.07 / shadow  178.72
"ps_car_07_cat",  #        :  694.53 / shadow   36.35
"ps_ind_17_bin",  #        :  620.77 / shadow   23.15
"ps_car_03_cat",  #        :  611.73 / shadow   50.67
"ps_reg_01",  #            :  598.60 / shadow  178.57
"ps_car_15",  #            :  593.35 / shadow  226.43
"ps_ind_01",  #            :  547.32 / shadow  154.58
"ps_ind_16_bin",  #        :  475.37 / shadow   34.17
"ps_ind_07_bin",  #        :  435.28 / shadow   28.92
"ps_car_06_cat",  #        :  398.02 / shadow  212.43
"ps_car_04_cat",  #        :  376.87 / shadow   76.98
"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
"ps_car_09_cat",  #        :  214.12 / shadow   81.38
"ps_car_02_cat",  #        :  203.03 / shadow   26.67
"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
"ps_car_11",  #            :  173.28 / shadow   76.45
"ps_car_05_cat",  #        :  172.75 / shadow   62.92
"ps_calc_09",  #           :  169.13 / shadow  129.72
"ps_calc_05",  #           :  148.83 / shadow  120.68
"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
"ps_car_08_cat",  #        :  120.87 / shadow   28.82
"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
"ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations: pairs of feature names. They are only defined here and
# are not referenced by the cells visible in this part of the notebook.
combs_bestxgb = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [27]:
# Define the three base models; each one is built from its own parameter dict.

# best_lgb
lgb_params = {
    'learning_rate': 0.05,
    'min_child_samples': 350,
    'n_estimators': 370,
    'subsample_freq': 10,
    'max_bin': 10,
    'objective': 'binary',
    'max_depth': 4,
    'num_leaves': 30,
    'num_threads': 4,
    'subsample': .8,
    'colsample_bytree': .8,
}
lgb_model = LGBMClassifier(**lgb_params)

# best_xgb
xgb_params = {
    'n_estimators': 400,
    'max_depth': 5,
    'objective': "binary:logistic",
    'learning_rate': 0.07,
    'subsample': .8,
    'min_child_weight': 7,
    'colsample_bytree': .8,
    'scale_pos_weight': 1.6,
    'gamma': 11,
    'reg_alpha': 5,
    'reg_lambda': 1.6,
    'nthread': 4,
}
xgb_model = XGBClassifier(**xgb_params)

# best_rgf
rgf_params = {
    'max_leaf': 1000,
    'algorithm': "RGF",
    'loss': "Log",
    'l2': 0.01,
    'sl2': 0.01,
    'normalize': False,
    'min_samples_leaf': 10,
    'n_iter': None,
    'opt_interval': 100,
    'learning_rate': .5,
    'calc_prob': "sigmoid",
    'n_jobs': -1,
    'memory_policy': "generous",
    'verbose': 0,
}
rgf_model = RGFClassifier(**rgf_params)

In [15]:
# Load the binned train and test sets; the string "-1" is parsed as NaN.
# (Append .iloc[0:200, :] to the train frame for a quick trial run.)
train_df, test_df = [
    pd.read_csv(csv_path, na_values="-1")
    for csv_path in (TRAIN_PATH, TEST_PATH)
]

## Start training 

In [16]:
# Process data: pull out the target and the row ids, then narrow both frames
# down to the selected feature columns.
y = train_df['target']
id_train = train_df['id'].values
id_test = test_df['id'].values

X = train_df[train_features_bestxgb]
test_df = test_df[train_features_bestxgb]

# Columns whose name contains "_cat" are the ones to target encode
f_cats = [col for col in X.columns if "_cat" in col]

In [20]:
# OOF and fold-averaged test predictions of the LightGBM base model
lgb_strain, lgb_stest = get_te_base_results(
    model=lgb_model, X=X, y=y, test_df=test_df, model_name='lgb1', f_cats=f_cats)

[1mSTARTING ITERATION FOR[0;0m [1mlgb1[0;0m
Fold 0
Train Gini = 0.453337512928    holdout Gini = 0.254025905363
Fold 1
Train Gini = 0.449149416397    holdout Gini = 0.24560588904
Fold 2
Train Gini = 0.452085700473    holdout Gini = 0.251745059506
Fold 3
Train Gini = 0.451292794451    holdout Gini = 0.254442722938
Fold 4
Train Gini = 0.448115282752    holdout Gini = 0.29270420102

Gini for full holdout set: 0.259472818537
Avg Train gini = 0.4507961414    Avg holdout gini = 0.259704755573
Fold Variance for train = 0.002    Fold Variance for holdouts = 0.017

 Time taken: 0 hours 0 minutes and 21.49 seconds.


In [25]:
# OOF and fold-averaged test predictions of the XGBoost base model
xgb_strain, xgb_stest = get_te_base_results(
    model=xgb_model, X=X, y=y, test_df=test_df, model_name='te_xgb1', f_cats=f_cats)

[1mSTARTING ITERATION FOR[0;0m [1mte_xgb1[0;0m
Fold 0
Train Gini = 0.375217614083    holdout Gini = 0.278800336748
Fold 1
Train Gini = 0.370816902145    holdout Gini = 0.263590814104
Fold 2
Train Gini = 0.371373877905    holdout Gini = 0.270695293899
Fold 3
Train Gini = 0.37503736193    holdout Gini = 0.263621748999
Fold 4
Train Gini = 0.378883948847    holdout Gini = 0.250950124139

Gini for full holdout set: 0.265400244922
Avg Train gini = 0.374265940982    Avg holdout gini = 0.265531663578
Fold Variance for train = 0.003    Fold Variance for holdouts = 0.009

 Time taken: 0 hours 2 minutes and 7.19 seconds.


In [30]:
# OOF and fold-averaged test predictions of the RGF base model; NaN values
# are mean-filled first (fillna=True)
rgf_strain, rgf_stest = get_te_base_results(
    model=rgf_model, X=X, y=y, test_df=test_df, model_name='te_rgf', f_cats=f_cats,
    fillna=True)

[1mSTARTING ITERATION FOR[0;0m [1mte_rgf[0;0m
Fold 0
Train Gini = 0.432728844686    holdout Gini = 0.253327861473
Fold 1
Train Gini = 0.429529108395    holdout Gini = 0.247963971695
Fold 2
Train Gini = 0.424535760836    holdout Gini = 0.263353880933
Fold 3
Train Gini = 0.434713897729    holdout Gini = 0.242758673472
Fold 4
Train Gini = 0.439071389702    holdout Gini = 0.242465224461

Gini for full holdout set: 0.249811207804
Avg Train gini = 0.43211580027    Avg holdout gini = 0.249973922407
Fold Variance for train = 0.005    Fold Variance for holdouts = 0.008

 Time taken: 0 hours 4 minutes and 38.68 seconds.


In [31]:
# save OOF and test predictions