# Porto Seguro’s Safe Driver Prediction
- Predict if a driver will file an insurance claim next year.
- https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
- install: http://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html
- install: https://github.com/Jupyter-contrib/jupyter_nbextensions_configurator


# Credit to:
* https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial
* https://www.kaggle.com/anokas/simple-xgboost-btb-0-27
* https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282
* https://www.kaggle.com/akashdeepjassal/simple-keras-mlp/code
* https://www.kaggle.com/pnagel/keras-starter/code
* https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

# Library import

In [3]:
# data processing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ML
# # Scikit-learn
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve, validation_curve, KFold
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelBinarizer, StandardScaler

# LightGBM
from lightgbm import LGBMClassifier, LGBMRegressor

# Keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adam
from keras.utils import np_utils

# System
import datetime as dtime

# Load data

In [4]:
pd.options.display.float_format = '{:,.4f}'.format
# Input data files are available in the DATA_DIR directory.
DATA_DIR = "data-temp"
# Load data. Download from:https://www.kaggle.com/c/nyc-taxi-trip-duration/data
train_data = pd.read_csv(DATA_DIR + "/train.csv")
eval_data =  pd.read_csv(DATA_DIR + "/test.csv")

In [4]:
print("train size:", train_data.shape, " test size:", eval_data.shape)

train size: (595212, 59)  test size: (892816, 58)


In [5]:
train_data.head(5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [6]:
eval_data.head(5)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1


In [8]:
diff_cols = np.setdiff1d(train_data.columns.values, eval_data.columns.values)
diff_cols

array(['target'], dtype=object)

## Combine train data and eval data

In [5]:
label = 'target'
features = eval_data.columns.values
target = train_data[label]
combine_data = pd.concat([train_data[features], eval_data], keys=['train','eval'])
print("combine data:", len(combine_data))
combine_data.head(5)

combine data: 1488028


Unnamed: 0,Unnamed: 1,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
train,0,7,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
train,1,9,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
train,2,13,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
train,3,16,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
train,4,17,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


# Pre-process data

## Check& Fill  NaN 

In [11]:
def check_null_data(data):
    #Get high percent of NaN data
    null_data = data.isnull()
    total = null_data.sum().sort_values(ascending=False)
    percent = (null_data.sum()/null_data.count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    high_percent_miss_data = missing_data[missing_data['Percent']>0]
    #print(missing_data)
    print(high_percent_miss_data)
    miss_data_cols = high_percent_miss_data.index.values
    return miss_data_cols

In [12]:
# combine data for null
check_null_data(combine_data)

Empty DataFrame
Columns: [Total, Percent]
Index: []


array([], dtype=object)

## Split train_set and eval_set

In [6]:
data = combine_data
train_set = data.loc['train']
eval_set = data.loc['eval']
data = train_set
data.loc[:,label] = target
data[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,target
0,7,2,2,5,1,0,0,1,0,0,...,1,5,8,0,1,1,0,0,1,0
1,9,1,1,7,0,0,0,0,1,0,...,1,1,9,0,1,1,0,1,0,0
2,13,5,4,9,1,0,0,0,1,0,...,2,7,7,0,1,1,0,1,0,0
3,16,0,1,2,0,0,1,0,0,0,...,2,4,9,0,0,0,0,0,0,0
4,17,0,2,0,1,0,1,0,0,0,...,1,1,3,0,0,0,1,1,0,0


# Train model

In [7]:
data = train_set.drop(['id', label], axis=1)
print(data.shape)
data[:5]

(595212, 57)


Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,2,2,5,1,0,0,1,0,0,0,...,9,1,5,8,0,1,1,0,0,1
1,1,1,7,0,0,0,0,1,0,0,...,3,1,1,9,0,1,1,0,1,0
2,5,4,9,1,0,0,0,1,0,0,...,4,2,7,7,0,1,1,0,1,0
3,0,1,2,0,0,1,0,0,0,0,...,2,2,4,9,0,0,0,0,0,0
4,0,2,0,1,0,1,0,0,0,0,...,3,1,1,3,0,0,0,1,1,0


## Compute class weigth for unbalanced labels
* credit to: https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

In [8]:
cw = class_weight.compute_class_weight('balanced', np.unique(target), target)
class_weight_dict = dict(enumerate(cw))
class_weight_dict

{0: 0.51891309427079879, 1: 13.718355305614455}

## Split train/test set

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(
    data, target, train_size=0.85, random_state=1234)
print("X_train:", X_train.shape, " Y_train:", Y_train.shape,
      " X_test:", X_test.shape, " Y_test:", Y_test.shape)
X_train[:5]



X_train: (505930, 57)  Y_train: (505930,)  X_test: (89282, 57)  Y_test: (89282,)


Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
265838,2,4,9,1,0,0,1,0,0,0,...,3,0,5,6,0,0,1,0,0,0
125350,5,2,8,1,0,0,1,0,0,0,...,3,3,2,4,0,1,0,0,0,0
37522,0,1,4,1,0,0,0,0,1,0,...,6,3,4,10,0,0,0,1,0,0
467358,0,1,6,0,4,1,0,0,0,0,...,3,1,1,10,0,0,1,0,0,0
171996,0,1,7,0,0,1,0,0,0,0,...,4,1,3,5,0,1,1,0,1,1


## Eval metrics

In [10]:
# custom objective function (similar to auc)

def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_normalized(y, pred):
    return gini(y, pred) / gini(y, y)

def gini_xgb(y, pred):
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(y, pred):
    score = gini(y, pred)/ gini(y,y)
    #score = gini(y, pred)
    return 'gini', score, True


## Model definition

In [12]:
params = {'metric': 'auc', 
          'learning_rate': 0.1, 
          'max_depth': 10, 
          'max_bin': 10,  
          'objective': 'binary',
          'feature_fraction': 0.8,
          'bagging_fraction': 0.9,
          'bagging_freq': 10,
          'min_data': 500}
N_ROUNDS = 4000
#LGBMClassifier
model = LGBMRegressor(objective='binary',
                       metric='auc',
                       n_estimators=N_ROUNDS,
                       learning_rate=0.01,
                       max_depth=10,
                       num_leaves=31,
                       random_state=12345,
                       max_bin=63,
                       n_jobs=-1, 
                       device='gpu',
                       silent=False)

## Training data

In [13]:
early_stopping_rounds=50
model.fit(
                X_train, Y_train, eval_set=[(X_test, Y_test)],
                eval_metric=gini_lgb,
                early_stopping_rounds=early_stopping_rounds,
                verbose=10,
            )

Training until validation scores don't improve for 50 rounds.
[10]	valid_0's auc: 0.622392	valid_0's gini: 0.244731
[20]	valid_0's auc: 0.624501	valid_0's gini: 0.248999
[30]	valid_0's auc: 0.62495	valid_0's gini: 0.249875
[40]	valid_0's auc: 0.625518	valid_0's gini: 0.251046
[50]	valid_0's auc: 0.626465	valid_0's gini: 0.252941
[60]	valid_0's auc: 0.626828	valid_0's gini: 0.253658
[70]	valid_0's auc: 0.62846	valid_0's gini: 0.256923
[80]	valid_0's auc: 0.629704	valid_0's gini: 0.25941
[90]	valid_0's auc: 0.630435	valid_0's gini: 0.26087
[100]	valid_0's auc: 0.630894	valid_0's gini: 0.261782
[110]	valid_0's auc: 0.631537	valid_0's gini: 0.263075
[120]	valid_0's auc: 0.632129	valid_0's gini: 0.264258
[130]	valid_0's auc: 0.632697	valid_0's gini: 0.265394
[140]	valid_0's auc: 0.633476	valid_0's gini: 0.266951
[150]	valid_0's auc: 0.634077	valid_0's gini: 0.268154
[160]	valid_0's auc: 0.634566	valid_0's gini: 0.269131
[170]	valid_0's auc: 0.634841	valid_0's gini: 0.269682
[180]	valid_0's 

LGBMRegressor(boosting_type='gbdt', colsample_bytree=1.0, device='gpu',
       learning_rate=0.01, max_bin=63, max_depth=10, metric='auc',
       min_child_samples=10, min_child_weight=5, min_split_gain=0.0,
       n_estimators=4000, n_jobs=-1, num_leaves=31, objective='binary',
       random_state=12345, reg_alpha=0.0, reg_lambda=0.0, silent=False,
       subsample=1.0, subsample_for_bin=50000, subsample_freq=1)

## Evaluate model

In [14]:
print("Best iteration:",model.best_iteration_)
print("Best score:",model.best_score_['valid_0']['gini'])
print("Best AUC:",model.best_score_['valid_0']['auc'])

Best iteration: 1023
Best score: 0.284188479204
Best AUC: 0.642094239602


In [18]:
y_pred = model.predict(X_test)
score = gini_normalized(Y_test, y_pred)
print("Gini:", score)

Gini: 0.284188479204


# Predict and save submission

In [19]:
data = eval_set.drop('id', axis=1)
Y_eval = model.predict(data, num_iteration=model.best_iteration_)

In [20]:
#(Y_eval<0).all() 
Y_eval[Y_eval<0]

array([-0.00170513])

In [21]:
Y_eval = np.absolute(Y_eval)

In [22]:
eval_output = pd.DataFrame({'id': eval_data['id'], label: Y_eval})
print(len(eval_output))
eval_output.head()

892816


Unnamed: 0,id,target
0,0,0.029
1,1,0.0277
2,2,0.0257
3,3,0.015
4,4,0.0362


In [23]:
today = str(dtime.date.today())
print(today)
#eval_output.to_csv(DATA_DIR +'/' +today+'-submission.csv',index=False)
eval_output.to_csv(
            DATA_DIR + '/' + today + '-submission.csv.gz', index=False, float_format='%.5f',
            compression='gzip')

2017-10-15
