# Kaggle Safe Driver

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from pandas import scatter_matrix
import xgboost as xgb

import tqdm

%matplotlib inline



Load data

In [2]:
data = pd.read_csv("../input/train.csv")

train test split

In [3]:
# Hold out 20% of the rows for testing; "id" and "target" are not used as features.
# The split is seeded (random_state=42) so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(["id", "target"], axis=1),
    data["target"],
    test_size=0.2,
    random_state=42,
)

In [4]:
# Renumber every split from 0 so that features and labels of the same split
# share one positional index after the shuffle.
for split in (X_train, X_test, y_train, y_test):
    split.index = range(len(split))

Check

In [5]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 476169 entries, 0 to 476168
Data columns (total 57 columns):
ps_ind_01         476169 non-null int64
ps_ind_02_cat     476169 non-null int64
ps_ind_03         476169 non-null int64
ps_ind_04_cat     476169 non-null int64
ps_ind_05_cat     476169 non-null int64
ps_ind_06_bin     476169 non-null int64
ps_ind_07_bin     476169 non-null int64
ps_ind_08_bin     476169 non-null int64
ps_ind_09_bin     476169 non-null int64
ps_ind_10_bin     476169 non-null int64
ps_ind_11_bin     476169 non-null int64
ps_ind_12_bin     476169 non-null int64
ps_ind_13_bin     476169 non-null int64
ps_ind_14         476169 non-null int64
ps_ind_15         476169 non-null int64
ps_ind_16_bin     476169 non-null int64
ps_ind_17_bin     476169 non-null int64
ps_ind_18_bin     476169 non-null int64
ps_reg_01         476169 non-null float64
ps_reg_02         476169 non-null float64
ps_reg_03         476169 non-null float64
ps_car_01_cat     476169 non-null int64
ps_

In [6]:
X_train.shape

(476169, 57)

In [7]:
X_train.head(3)

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,2,1,5,0,0,1,0,0,0,0,...,4,4,3,11,0,0,1,0,1,0
1,0,1,7,1,0,0,1,0,0,0,...,8,1,3,6,0,1,0,1,0,0
2,0,1,3,0,0,0,1,0,0,0,...,5,2,3,10,0,0,0,0,0,0


In [8]:
y_train.value_counts()

0    458860
1     17309
Name: target, dtype: int64

check 0/1 ratio after split

In [9]:
(y_train == 0).sum()/(y_train == 1).sum()

26.509908140273847

In [10]:
(y_test == 0).sum()/(y_test == 1).sum()

26.147776510832383

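The negative/positive ratios of the train and test splits are close to each other, so the random split behaves almost like a stratified sample.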

Only a small portion of people got positive results

Features that belong to similar groupings are tagged as such in the feature names (e.g., ind, reg, car, calc).<br>
--- groups<br>
    

In addition, feature names include the postfix bin to indicate binary features<br>
cat to indicate categorical features<br>
Features without these designations are either continuous or ordinal<br>
Values of -1 indicate that the feature was missing from the observation. <br>
The `target` column signifies whether or not a claim was filed for that policy holder.

Missing

In [11]:
(X_train == -1).sum()

ps_ind_01              0
ps_ind_02_cat        163
ps_ind_03              0
ps_ind_04_cat         66
ps_ind_05_cat       4637
ps_ind_06_bin          0
ps_ind_07_bin          0
ps_ind_08_bin          0
ps_ind_09_bin          0
ps_ind_10_bin          0
ps_ind_11_bin          0
ps_ind_12_bin          0
ps_ind_13_bin          0
ps_ind_14              0
ps_ind_15              0
ps_ind_16_bin          0
ps_ind_17_bin          0
ps_ind_18_bin          0
ps_reg_01              0
ps_reg_02              0
ps_reg_03          86083
ps_car_01_cat         83
ps_car_02_cat          3
ps_car_03_cat     329018
ps_car_04_cat          0
ps_car_05_cat     213533
ps_car_06_cat          0
ps_car_07_cat       9148
ps_car_08_cat          0
ps_car_09_cat        446
ps_car_10_cat          0
ps_car_11_cat          0
ps_car_11              2
ps_car_12              1
ps_car_13              0
ps_car_14          34038
ps_car_15              0
ps_calc_01             0
ps_calc_02             0
ps_calc_03             0


In [12]:
# Columns in which more than 30000 training rows carry the -1 missing-value marker.
many_miss_col = X_train.columns[X_train.eq(-1).sum().gt(30000)]

In [13]:
many_miss_col

Index(['ps_reg_03', 'ps_car_03_cat', 'ps_car_05_cat', 'ps_car_14'], dtype='object')

1. ps_reg_03

In [14]:
# ps_reg_03: share of each target class among rows where the feature is present (>= 0,
# first column) versus rows where it is missing (< 0, second column).
not_miss_y_1 = y_train.loc[X_train["ps_reg_03"] >= 0]
miss_y_1 = y_train.loc[X_train["ps_reg_03"] < 0]
pd.concat(
    [not_miss_y_1.value_counts(), miss_y_1.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.961929,0.971446
1,0.038071,0.028554


2.ps_car_03_cat

In [15]:
# ps_car_03_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_2 = y_train.loc[X_train["ps_car_03_cat"] >= 0]
miss_y_2 = y_train.loc[X_train["ps_car_03_cat"] < 0]
pd.concat(
    [not_miss_y_2.value_counts(), miss_y_2.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.955372,0.967351
1,0.044628,0.032649


3.ps_car_05_cat

In [16]:
# ps_car_05_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_3 = y_train.loc[X_train["ps_car_05_cat"] >= 0]
miss_y_3 = y_train.loc[X_train["ps_car_05_cat"] < 0]
pd.concat(
    [not_miss_y_3.value_counts(), miss_y_3.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.959823,0.968356
1,0.040177,0.031644


4.ps_car_14

In [17]:
# ps_car_14: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_4 = y_train.loc[X_train["ps_car_14"] >= 0]
miss_y_4 = y_train.loc[X_train["ps_car_14"] < 0]
pd.concat(
    [not_miss_y_4.value_counts(), miss_y_4.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.963959,0.959633
1,0.036041,0.040367


Columns with a moderate number of missing values (between 1,000 and 30,000 rows)

In [18]:
# Columns whose count of missing values (-1) lies strictly between 1000 and 30000.
# The two boolean masks are combined with `&`: multiplying boolean Series with `*`
# is not a logical operator and makes pandas emit an "unsupported operator" warning.
miss_counts = (X_train == -1).sum()  # computed once instead of once per condition
small_miss_col = X_train.columns[(miss_counts > 1000) & (miss_counts < 30000)]

  unsupported[op_str]))


In [19]:
small_miss_col

Index(['ps_ind_05_cat', 'ps_car_07_cat'], dtype='object')

5.ps_ind_05_cat

In [20]:
# ps_ind_05_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_5 = y_train.loc[X_train["ps_ind_05_cat"] >= 0]
miss_y_5 = y_train.loc[X_train["ps_ind_05_cat"] < 0]
pd.concat(
    [not_miss_y_5.value_counts(), miss_y_5.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.964128,0.915031
1,0.035872,0.084969


6.ps_car_07_cat

In [21]:
# ps_car_07_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_6 = y_train.loc[X_train["ps_car_07_cat"] >= 0]
miss_y_6 = y_train.loc[X_train["ps_car_07_cat"] < 0]
pd.concat(
    [not_miss_y_6.value_counts(), miss_y_6.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.964468,0.921841
1,0.035532,0.078159


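For both of these features, rows with a missing value have a noticeably higher positive rate (about 8% versus about 3.6% when the value is present), so the missingness itself should be a good indicator of a positive target.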

Columns with only a few missing values (fewer than 1,000 rows)

In [22]:
# Columns that have at least one but fewer than 1000 missing values (-1).
# The two boolean masks are combined with `&`: multiplying boolean Series with `*`
# is not a logical operator and makes pandas emit an "unsupported operator" warning.
miss_counts = (X_train == -1).sum()  # computed once instead of once per condition
little_miss_col = X_train.columns[(miss_counts > 0) & (miss_counts < 1000)]

  unsupported[op_str]))


In [23]:
(X_train == -1).sum()[little_miss_col]

ps_ind_02_cat    163
ps_ind_04_cat     66
ps_car_01_cat     83
ps_car_02_cat      3
ps_car_09_cat    446
ps_car_11          2
ps_car_12          1
dtype: int64

7.ps_ind_02_cat

In [24]:
# ps_ind_02_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_7 = y_train.loc[X_train["ps_ind_02_cat"] >= 0]
miss_y_7 = y_train.loc[X_train["ps_ind_02_cat"] < 0]
pd.concat(
    [not_miss_y_7.value_counts(), miss_y_7.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.963706,0.797546
1,0.036294,0.202454


8.ps_ind_04_cat

In [25]:
# ps_ind_04_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_8 = y_train.loc[X_train["ps_ind_04_cat"] >= 0]
miss_y_8 = y_train.loc[X_train["ps_ind_04_cat"] < 0]
pd.concat(
    [not_miss_y_8.value_counts(), miss_y_8.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.963701,0.590909
1,0.036299,0.409091


9.ps_car_01_cat

In [26]:
# ps_car_01_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_9 = y_train.loc[X_train["ps_car_01_cat"] >= 0]
miss_y_9 = y_train.loc[X_train["ps_car_01_cat"] < 0]
pd.concat(
    [not_miss_y_9.value_counts(), miss_y_9.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.963702,0.662651
1,0.036298,0.337349


10.ps_car_09_cat

In [27]:
# ps_car_09_cat: share of each target class among rows where the feature is present
# (>= 0, first column) versus rows where it is missing (< 0, second column).
not_miss_y_10 = y_train.loc[X_train["ps_car_09_cat"] >= 0]
miss_y_10 = y_train.loc[X_train["ps_car_09_cat"] < 0]
pd.concat(
    [not_miss_y_10.value_counts(), miss_y_10.value_counts()],
    axis=1,
).apply(lambda col: col / col.sum(), axis=0)

Unnamed: 0,target,target.1
0,0.963695,0.914798
1,0.036305,0.085202


Correlation

In [28]:
# Pairwise correlations between every feature and the target.
corr_mat = pd.concat([X_train, y_train], axis=1).corr()

fig, ax = plt.subplots(figsize=(10, 10))

# Diverging palette so that positive and negative correlations get distinct hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Square-celled heatmap centred on zero; the colour scale is capped at 0.3.
sns.heatmap(corr_mat, cmap=cmap, vmax=.3, center=0, square=True,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax);

ps\_calc\_\* may not be related to target

Gini Index

In [30]:
def gini(y_pred, d_train):
    """Gini-style evaluation metric in the (name, value) form of an xgboost `feval`.

    The labels returned by ``d_train.get_label()`` are ordered by ascending
    prediction, cumulatively summed, and the result is scaled by the number of
    rows and the label total before being subtracted from 0.5.

    Note: a later cell defines another function named ``gini`` with a different
    signature, which replaces this one once that cell has run.

    Parameters
    ----------
    y_pred : numpy array of predictions (must support ``.argsort()``).
    d_train : object exposing ``get_label()`` that returns a numpy array of labels.

    Returns
    -------
    tuple of ('gini', score)
    """
    labels = d_train.get_label()
    ascending = y_pred.argsort()
    running_total = labels[ascending].cumsum()
    score = 0.5 - (1 / (len(labels))) * (running_total.sum() / labels.sum())
    return 'gini', score

test gini

Simple Model 

class weights

In [34]:
from sklearn.utils import class_weight

In [35]:
# "balanced" weights are n_samples / (n_classes * count(class)), so the rare positive
# class receives a proportionally larger weight.
# `classes` and `y` are passed by keyword because recent scikit-learn releases no
# longer accept them positionally; the keyword form also works on older releases.
c_weight = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

In [36]:
c_weight

array([  0.51886087,  13.75495407])

Data preparation

We could use -1 as a special value in the data set and take care of it after the first try

Model

In [37]:
# Re-read the raw training file so the modelling section starts from the full data set.
# NOTE(review): `data` was loaded from the same path at the top of the notebook and the
# cells in between only derive new objects from it, so this second read looks redundant.
data = pd.read_csv("../input/train.csv")

Feature engineering

In [147]:
# Every column whose name starts with "ps_calc" is scheduled for removal.
is_calc_col = data.columns.str.startswith("ps_calc")
col_drop = data.columns[is_calc_col]

In [148]:
# Remove the ps_calc_* columns. Rebinding the name instead of using inplace=True
# leaves the originally loaded frame object unmodified.
data = data.drop(col_drop, axis=1)

In [149]:
# Feature matrix and labels as plain NumPy arrays. `.values` is used because
# `DataFrame.as_matrix()` was deprecated and later removed from pandas.
X = data.drop(["id", "target"], axis=1).values
y = data["target"].values

# The test set receives the same column treatment as the training set.
data_test = pd.read_csv("../input/test.csv").drop(col_drop, axis=1)
test_id = data_test["id"]  # kept aside for the submission file
data_test = data_test.drop("id", axis=1)

# The DMatrix is built from a bare array (no column names), so the test columns
# must line up exactly with the training feature columns.
assert list(data_test.columns) == list(data.columns.drop(["id", "target"])), \
    "train and test feature columns differ"
dtest = xgb.DMatrix(data_test.values)

In [151]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    """Unnormalised Gini score of `pred` as a ranking of `actual`.

    Rows are ordered by descending prediction (ties keep their original order),
    the actual values are cumulatively summed in that order, and the result is
    scaled by the total of `actual` and by the number of rows.

    Parameters
    ----------
    actual : sequence of true values; must have the same length as `pred`.
    pred : sequence of predicted scores.
    cmpcol, sortcol : accepted for interface compatibility; the body does not
        read them.

    Returns
    -------
    float
        The Gini score. If `actual` sums to zero the division yields nan/inf.

    Raises
    ------
    AssertionError
        If `actual` and `pred` differ in length.
    """
    assert( len(actual) == len(pred) )
    # Columns: 0 = actual value, 1 = prediction, 2 = original row position.
    # np.float64 replaces the `np.float` alias, which newer NumPy no longer provides.
    table = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float64)
    # lexsort uses the LAST key as primary: descending prediction, then row position.
    table = table[ np.lexsort((table[:,2], -1*table[:,1])) ]
    total_losses = table[:,0].sum()
    gini_sum = table[:,0].cumsum().sum() / total_losses
    gini_sum -= (len(actual) + 1) / 2.
    return gini_sum / len(actual)
 
def gini_normalized(a, p):
    """Return the Gini of predictions `p` divided by the Gini of a perfect ranking.

    The denominator is ``gini(a, a)``, i.e. the score obtained when the actual
    values `a` are used as their own predictions.
    """
    model_score = gini(a, p)
    perfect_score = gini(a, a)
    return model_score / perfect_score

def gini_xgb(preds, dtrain):
    """Custom xgboost evaluation function reporting the normalised Gini.

    Parameters
    ----------
    preds : predictions for the rows of `dtrain`.
    dtrain : object exposing ``get_label()`` with the true labels.

    Returns
    -------
    tuple of ('gini', normalised Gini score)
    """
    return 'gini', gini_normalized(dtrain.get_label(), preds)

Add more regularization

In [156]:
# first
# params = {
#     'min_child_weight': 10.0,
#     'objective': 'binary:logistic',
#     'max_depth': 7,
#     'max_delta_step': 1.8,
#     'colsample_bytree': 0.4,
#     'subsample': 0.8,
#     'eta': 0.025,
#     'gamma': 0.65,
#     'num_boost_round' : 700
#     }

# second
# params = {
#     'min_child_weight': 10.0,
#     'objective': 'binary:logistic',
#     'max_depth': 8,
#     'colsample_bytree': 0.5,
#     'subsample': 0.8,
#     'eta': 0.01,
#     'gamma': 0.5,
#     'num_boost_round' : 1000,
#     "scale_pos_weight":3,
#     "reg_lambda": 10,
#     "random_state":42
#     }

# third
# Active configuration: shallow trees with a small learning rate and light
# row/column subsampling.
params = {
    'eta': 0.02,
    'max_depth': 4,
    'subsample': 0.9,
    'colsample_bytree': 0.9,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'silent': True,
}

In [157]:
kfolds = 5
# Running average of the per-fold predictions on the test set.
pred = np.zeros(data_test.shape[0])
# n_splits is tied to kfolds so the fold count and the averaging divisor cannot drift apart.
skf = StratifiedKFold(n_splits=kfolds)
# The loop target was previously misspelled `vaalid_index`, while the body reads
# `valid_index`; on a fresh kernel that raises NameError.
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    print("The {} fold".format(i))
    print("###################################################")

    # train/validation split for this fold
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    dtrain = xgb.DMatrix(X_train, label=y_train.reshape(-1,1))
    dvalid = xgb.DMatrix(X_valid, label=y_valid.reshape(-1,1))
    evallist = [(dtrain, 'train'), (dvalid, 'valid')]

    # Up to 2000 boosting rounds; training stops once the validation gini has not
    # improved for 100 rounds (gini is maximised).
    mdl = xgb.train(params, dtrain,
                2000, evallist, early_stopping_rounds=100,
                feval=gini_xgb, maximize=True, verbose_eval=100)

    # NOTE(review): predicts with 50 trees beyond the best iteration -- presumably a
    # deliberate heuristic; confirm.
    pred += mdl.predict(dtest, ntree_limit=mdl.best_ntree_limit+50) / kfolds

The 0 fold
###################################################
[0]	train-gini:0.19047	valid-gini:0.197622
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.250015	valid-gini:0.243101
[200]	train-gini:0.27618	valid-gini:0.2585
[300]	train-gini:0.296144	valid-gini:0.270284
[400]	train-gini:0.309669	valid-gini:0.276664
[500]	train-gini:0.319834	valid-gini:0.279849
[600]	train-gini:0.32752	valid-gini:0.281409
[700]	train-gini:0.334553	valid-gini:0.282087
[800]	train-gini:0.340891	valid-gini:0.281927
Stopping. Best iteration:
[707]	train-gini:0.334978	valid-gini:0.282232

The 1 fold
###################################################
[0]	train-gini:0.192166	valid-gini:0.181254
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.249937	valid-gini:0.232574
[200]	train-

In [158]:
# Submission table: one row per test id with the fold-averaged prediction as "target".
sample_submit = pd.DataFrame({"id":test_id,"target":pred})

In [159]:
sample_submit.tail()

Unnamed: 0,id,target
892811,1488022,0.096524
892812,1488023,0.037369
892813,1488024,0.040034
892814,1488025,0.023925
892815,1488026,0.030653


In [160]:
# Write the submission without the DataFrame index column.
# NOTE(review): assumes the ../output directory already exists -- confirm.
sample_submit.to_csv("../output/fe_drop_ps_calc_third.csv", index=False)