# Porto Seguro’s Safe Driver Prediction
- Predict if a driver will file an insurance claim next year.
- https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
- install: http://jupyter-contrib-nbextensions.readthedocs.io/en/latest/install.html
- install: https://github.com/Jupyter-contrib/jupyter_nbextensions_configurator


# Credit to:
* https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial
* https://www.kaggle.com/anokas/simple-xgboost-btb-0-27
* https://www.kaggle.com/rshally/porto-xgb-lgb-kfold-lb-0-282
* https://www.kaggle.com/akashdeepjassal/simple-keras-mlp/code
* https://www.kaggle.com/pnagel/keras-starter/code
* https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

# Library import

In [1]:
# data processing
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# ML
# # Scikit-learn
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve, validation_curve, KFold
from sklearn.utils import class_weight
from sklearn.preprocessing import LabelBinarizer, StandardScaler

# LightGBM
from lightgbm import LGBMClassifier, LGBMRegressor

# Keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor
from keras.optimizers import SGD, Adam
from keras.utils import np_utils

# System
import datetime as dtime

Using TensorFlow backend.


# Load data

In [2]:
pd.options.display.float_format = '{:,.4f}'.format
# Input data files are available in the DATA_DIR directory.
DATA_DIR = "data-temp"
# Load data. Download from:https://www.kaggle.com/c/nyc-taxi-trip-duration/data
train_data = pd.read_csv(DATA_DIR + "/train.csv")
eval_data =  pd.read_csv(DATA_DIR + "/test.csv")

In [4]:
print("train size:", train_data.shape, " test size:", eval_data.shape)

train size: (595212, 59)  test size: (892816, 58)


In [5]:
train_data.head(5)

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [6]:
eval_data.head(5)

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1


In [8]:
diff_cols = np.setdiff1d(train_data.columns.values, eval_data.columns.values)
diff_cols

array(['target'], dtype=object)

## Combine train data and eval data

In [3]:
label = 'target'
features = eval_data.columns.values
target = train_data[label]
combine_data = pd.concat([train_data[features], eval_data], keys=['train','eval'])
print("combine data:", len(combine_data))
combine_data.head(5)

combine data: 1488028


Unnamed: 0,Unnamed: 1,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
train,0,7,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
train,1,9,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
train,2,13,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
train,3,16,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
train,4,17,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


# Pre-process data

## Check& Fill  NaN 

In [11]:
def check_null_data(data):
    #Get high percent of NaN data
    null_data = data.isnull()
    total = null_data.sum().sort_values(ascending=False)
    percent = (null_data.sum()/null_data.count()).sort_values(ascending=False)
    missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
    high_percent_miss_data = missing_data[missing_data['Percent']>0]
    #print(missing_data)
    print(high_percent_miss_data)
    miss_data_cols = high_percent_miss_data.index.values
    return miss_data_cols

In [12]:
# combine data for null
check_null_data(combine_data)

Empty DataFrame
Columns: [Total, Percent]
Index: []


array([], dtype=object)

## Split train_set and eval_set

In [4]:
data = combine_data
train_set = data.loc['train']
eval_set = data.loc['eval']
data = train_set
data.loc[:,label] = target
data[:5]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,target
0,7,2,2,5,1,0,0,1,0,0,...,1,5,8,0,1,1,0,0,1,0
1,9,1,1,7,0,0,0,0,1,0,...,1,1,9,0,1,1,0,1,0,0
2,13,5,4,9,1,0,0,0,1,0,...,2,7,7,0,1,1,0,1,0,0
3,16,0,1,2,0,0,1,0,0,0,...,2,4,9,0,0,0,0,0,0,0
4,17,0,2,0,1,0,1,0,0,0,...,1,1,3,0,0,0,1,1,0,0


# Train model

In [86]:
real_vars = ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14']
#data = train_set.drop(['id', label], axis=1)
data = train_set[real_vars]
print(data.shape)
data[:5]

(595212, 26)


Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,...,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14
0,2,5,0,11,0.7,0.2,0.7181,2,0.4,0.8837,...,1,10,1,10,1,5,9,1,5,8
1,1,7,0,3,0.8,0.4,0.7661,3,0.3162,0.6188,...,1,9,5,8,1,7,3,1,1,9
2,5,9,0,12,0.0,0.0,-1.0,1,0.3162,0.6416,...,2,9,1,8,2,7,4,2,7,7
3,0,2,0,8,0.9,0.2,0.5809,1,0.3742,0.5429,...,4,7,1,8,4,2,2,2,4,9
4,0,0,0,9,0.7,0.6,0.8408,3,0.3161,0.5658,...,2,6,3,10,2,12,3,1,1,3


## Compute class weigth for unbalanced labels
* credit to: https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras

In [87]:
cw = class_weight.compute_class_weight('balanced', np.unique(target), target)
class_weight_dict = dict(enumerate(cw))
class_weight_dict

{0: 0.51891309427079879, 1: 13.718355305614455}

## Prepare train data

### Split train/test set

In [88]:
X_train, X_test, Y_train, Y_test = train_test_split(
    data, target, train_size=0.85, random_state=1234)
print("X_train:", X_train.shape, " Y_train:", Y_train.shape,
      " X_test:", X_test.shape, " Y_test:", Y_test.shape)
X_train[:5]

X_train: (505930, 26)  Y_train: (505930,)  X_test: (89282, 26)  Y_test: (89282,)




Unnamed: 0,ps_ind_01,ps_ind_03,ps_ind_14,ps_ind_15,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_11,ps_car_12,ps_car_13,...,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14
265838,2,9,0,7,0.9,0.3,0.5766,2,0.4,0.7274,...,5,9,3,10,1,11,3,0,5,6
125350,5,8,0,1,0.8,1.0,0.7826,1,0.3161,0.6329,...,1,9,1,8,3,6,3,3,2,4
37522,0,4,0,9,0.8,0.5,1.0461,2,0.4472,0.7917,...,3,9,1,9,2,6,6,3,4,10
467358,0,6,0,10,0.9,0.5,0.926,3,0.4243,0.6674,...,3,6,4,8,2,6,3,1,1,10
171996,0,7,0,5,0.5,0.2,0.6393,2,0.3742,0.6638,...,1,8,6,10,3,5,4,1,3,5


### Scaling features

In [89]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [90]:
# Transform train data
X_train =  scaler.transform(X_train)
X_test =  scaler.transform(X_test)

### Transform label to categorial

In [91]:
Y_train = np_utils.to_categorical(Y_train.values)
Y_test_pre = Y_test
Y_test = np_utils.to_categorical(Y_test.values)

In [92]:
Y_train[:5]

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.]])

## Eval metrics

In [11]:
# custom objective function (similar to auc)

def gini(y, pred):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def gini_normalized(y, pred):
    return gini(y, pred) / gini(y, y)

def gini_xgb(y, pred):
    return 'gini', gini(y, pred) / gini(y, y)

def gini_lgb(y, pred):
    score = gini(y, pred)/ gini(y,y)
    #score = gini(y, pred)
    return 'gini', score, True


In [12]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini2(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini2_normalized(a, p):
    return gini2(a, p) / gini2(a, a)

In [59]:
class keras_gini(keras.callbacks.Callback):
    def __init__(self, validation_data, classifier=False):
#         print("init validation len:", len(validation_data))
        self.validation_data = validation_data
        self.classifier = classifier
        self.maps = []

    def eval_metric(self):
#         print("")
#         print("validation len:", len(self.validation_data))
        #print(self.validation_data)
        x_val, y_true, _ = self.validation_data
        y_pred = self.model.predict(x_val)
        if self.classifier:
            score = gini2_normalized(y_true[:,1], y_pred[:,1])
        else:
            score = gini2_normalized(y_true, y_pred)
        return score

    def on_epoch_end(self, epoch, logs={}):
        score = self.eval_metric()
        print(". Eval for epoch %d is %f"%(epoch+1, score))
        self.maps.append(score)

## Model definition

In [114]:
KERAS_LEARNING_RATE = 0.1
KERAS_N_ROUNDS = 2
KERAS_BATCH_SIZE = 32
KERAS_NODES = 64
KERAS_LAYERS = 2
KERAS_DROPOUT_RATE = 0.2
random_state=12343
#n_features = len(data.columns) - 2
n_features = len(data.columns)
decay = KERAS_LEARNING_RATE / KERAS_N_ROUNDS
# create model
model = Sequential()
model.add(Dense(KERAS_NODES, input_shape=(n_features, ),
                activation='relu'))
model.add(Dropout(KERAS_DROPOUT_RATE, seed=random_state))
for i in range(KERAS_LAYERS):
    model.add(Dense(KERAS_NODES,
                    activation='relu'))
    model.add(Dropout(KERAS_DROPOUT_RATE, seed=random_state))
# model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(2, activation='softmax'))
#model.add(Dense(1, activation='sigmoid'))

# Compile model
#optimizer = Adam(lr=KERAS_LEARNING_RATE, decay=decay)
optimizer = Adam(lr=KERAS_LEARNING_RATE)
# Use Early-Stopping
callback_early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')
callback_tensorboard = keras.callbacks.TensorBoard(log_dir=DATA_DIR + '/tensorboard', histogram_freq=1, batch_size=32, write_graph=True, write_grads=True, write_images=True)
callback_gini_metric = keras_gini(validation_data=(X_test, Y_test, True))
# model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_53 (Dense)             (None, 64)                1728      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_54 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_55 (Dense)             (None, 64)                4160      
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_56 (Dense)             (None, 2)                 130       
Total para

## Training model

In [115]:
model.fit(X_train, Y_train,
          validation_data=(X_test, Y_test),
          batch_size=KERAS_BATCH_SIZE,
          epochs=KERAS_N_ROUNDS,
          callbacks=[callback_early_stopping,
                     #callback_tensorboard,
                     #callback_gini_metric
                    ],
          class_weight = class_weight_dict,
          verbose=True
          )

Train on 505930 samples, validate on 89282 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fd22060cf60>

## Evaluate model

In [116]:
score = model.evaluate(X_test, Y_test, verbose=1)
print("")
print("Test score:", score[0])
print('Test accuracy:', score[1])

Test score: 0.604711477759
Test accuracy: 0.962276830716


In [117]:
y_pred = model.predict(X_test)

In [118]:
y_pred[:100]

array([[ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,  0.],
       [ 1.,

In [119]:
score = gini2_normalized(Y_test[:,1], y_pred[:,1])
#score = gini_normalized(Y_test, y_pred)
print('Score:', score)

Score: -0.000273142279992


# Predict and save submission

In [101]:
#data = eval_set.drop('id', axis=1)
data = eval_set[real_vars]
X_eval = scaler.transform(data)
Y_eval = model.predict(X_eval)

In [102]:
#(Y_eval<0).all() 
Y_eval[Y_eval<0]

array([], dtype=float32)

In [103]:
Y_eval = np.absolute(Y_eval)

In [105]:
eval_output = pd.DataFrame({'id': eval_data['id'], label: Y_eval[:,1]})
print(len(eval_output))
eval_output.head(100)

892816


Unnamed: 0,id,target
0,0,0.0000
1,1,0.0000
2,2,0.0000
3,3,0.0000
4,4,0.0000
5,5,0.0000
6,6,0.0000
7,8,0.7672
8,10,0.0000
9,11,0.0000


In [106]:
today = str(dtime.date.today())
print(today)
#eval_output.to_csv(DATA_DIR +'/' +today+'-submission.csv',index=False)
eval_output.to_csv(
            DATA_DIR + '/' + today + '-submission.csv.gz', index=False, float_format='%.5f',
            compression='gzip')

2017-10-15
