In [8]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
import xgboost as xgb
import matplotlib.pyplot as plt

In [9]:
os.chdir("e:/datascience/kaggle/Porto_Seguro")

In [10]:
# Read training data in files:
training = pd.read_csv("train.csv", sep = ',')
training[training==-1] = np.nan
training.shape
# sampling = np.random.choice(595212, replace=False, size=10000)
# training = training.iloc[sampling]

(595212, 59)

In [11]:
# Read testing data in files:
testing = pd.read_csv("test.csv", sep = ',')
testing[testing==-1] = np.nan
testing.shape

(892816, 58)

In [12]:
# split training features:
X = training.drop(['id','target'], axis=1).values
y = training.target.values

In [13]:
# define gini metric (from kaggle):
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
 
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_coefficient(preds,dtrain):
    y = dtrain.get_label()
    return 'gini', -gini_normalized(y,preds)

In [14]:
# xgboosting model:
xgb = XGBClassifier(max_depth=6, 
                    learning_rate=0.1, 
                    n_estimators=1000, 
                    min_child_weight=1,
                    objective='binary:logistic', 
                    nthread=-1, 
                    gamma=0, 
                    reg_lambda=1,
                    reg_alpha=0,
                    subsample=0.8,
                    colsample_bytree=1, 
                    colsample_bylevel=1, 
                    scale_pos_weight=1,
                    seed=11111,
                    missing=None)

skf = StratifiedKFold(n_splits=5, random_state=1111)

for train_index, test_index in skf.split(X, y):
    train_X, test_X = X[train_index], X[test_index]
    train_y, test_y = y[train_index], y[test_index]
    xgb.fit(train_X, train_y, 
            eval_set=[(train_X,train_y),(test_X,test_y)], 
            eval_metric=gini_coefficient,
            early_stopping_rounds=10)

[0]	validation_0-gini:-0.22953	validation_1-gini:-0.216779
Multiple eval metrics have been passed: 'validation_1-gini' will be used for early stopping.

Will train until validation_1-gini hasn't improved in 10 rounds.
[1]	validation_0-gini:-0.240011	validation_1-gini:-0.224676
[2]	validation_0-gini:-0.249564	validation_1-gini:-0.230763
[3]	validation_0-gini:-0.253355	validation_1-gini:-0.233261
[4]	validation_0-gini:-0.258543	validation_1-gini:-0.235853
[5]	validation_0-gini:-0.261083	validation_1-gini:-0.238603
[6]	validation_0-gini:-0.263838	validation_1-gini:-0.240488
[7]	validation_0-gini:-0.26567	validation_1-gini:-0.242739
[8]	validation_0-gini:-0.267246	validation_1-gini:-0.243123
[9]	validation_0-gini:-0.268318	validation_1-gini:-0.242907
[10]	validation_0-gini:-0.271709	validation_1-gini:-0.244548
[11]	validation_0-gini:-0.273061	validation_1-gini:-0.244168
[12]	validation_0-gini:-0.274415	validation_1-gini:-0.245145
[13]	validation_0-gini:-0.275451	validation_1-gini:-0.246739

[50]	validation_0-gini:-0.349812	validation_1-gini:-0.270077
[51]	validation_0-gini:-0.351704	validation_1-gini:-0.270594
[52]	validation_0-gini:-0.354229	validation_1-gini:-0.271032
[53]	validation_0-gini:-0.356263	validation_1-gini:-0.271373
[54]	validation_0-gini:-0.358769	validation_1-gini:-0.271576
[55]	validation_0-gini:-0.36042	validation_1-gini:-0.271369
[56]	validation_0-gini:-0.362725	validation_1-gini:-0.271912
[57]	validation_0-gini:-0.36522	validation_1-gini:-0.272265
[58]	validation_0-gini:-0.366878	validation_1-gini:-0.272941
[59]	validation_0-gini:-0.368787	validation_1-gini:-0.272779
[60]	validation_0-gini:-0.371368	validation_1-gini:-0.272693
[61]	validation_0-gini:-0.373517	validation_1-gini:-0.273505
[62]	validation_0-gini:-0.374834	validation_1-gini:-0.27423
[63]	validation_0-gini:-0.376652	validation_1-gini:-0.274422
[64]	validation_0-gini:-0.379345	validation_1-gini:-0.274375
[65]	validation_0-gini:-0.381867	validation_1-gini:-0.274926
[66]	validation_0-gini:-0.3

[80]	validation_0-gini:-0.40224	validation_1-gini:-0.27592
[81]	validation_0-gini:-0.403316	validation_1-gini:-0.275957
[82]	validation_0-gini:-0.404943	validation_1-gini:-0.275763
[83]	validation_0-gini:-0.405847	validation_1-gini:-0.275972
[84]	validation_0-gini:-0.406313	validation_1-gini:-0.276301
[85]	validation_0-gini:-0.408119	validation_1-gini:-0.276525
Stopping. Best iteration:
[75]	validation_0-gini:-0.394525	validation_1-gini:-0.276597

[0]	validation_0-gini:-0.2233	validation_1-gini:-0.213802
Multiple eval metrics have been passed: 'validation_1-gini' will be used for early stopping.

Will train until validation_1-gini hasn't improved in 10 rounds.
[1]	validation_0-gini:-0.239643	validation_1-gini:-0.227778
[2]	validation_0-gini:-0.247178	validation_1-gini:-0.233131
[3]	validation_0-gini:-0.250659	validation_1-gini:-0.23594
[4]	validation_0-gini:-0.254246	validation_1-gini:-0.237067
[5]	validation_0-gini:-0.256607	validation_1-gini:-0.239907
[6]	validation_0-gini:-0.259075	

[14]	validation_0-gini:-0.27096	validation_1-gini:-0.242142
[15]	validation_0-gini:-0.272171	validation_1-gini:-0.242565
[16]	validation_0-gini:-0.274716	validation_1-gini:-0.245248
[17]	validation_0-gini:-0.277699	validation_1-gini:-0.24702
[18]	validation_0-gini:-0.279757	validation_1-gini:-0.247561
[19]	validation_0-gini:-0.282337	validation_1-gini:-0.247443
[20]	validation_0-gini:-0.283252	validation_1-gini:-0.247738
[21]	validation_0-gini:-0.283725	validation_1-gini:-0.247938
[22]	validation_0-gini:-0.285232	validation_1-gini:-0.247745
[23]	validation_0-gini:-0.287759	validation_1-gini:-0.250065
[24]	validation_0-gini:-0.289863	validation_1-gini:-0.251079
[25]	validation_0-gini:-0.29246	validation_1-gini:-0.252983
[26]	validation_0-gini:-0.293907	validation_1-gini:-0.254115
[27]	validation_0-gini:-0.296257	validation_1-gini:-0.254688
[28]	validation_0-gini:-0.298181	validation_1-gini:-0.25499
[29]	validation_0-gini:-0.30119	validation_1-gini:-0.254751
[30]	validation_0-gini:-0.302