## XGBoost with GridSearchCV
* Private Score: 0.28381
* Public Score:  0.27892

reference: https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

In [11]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

import gc
from numba import jit

In [12]:
@jit
def eval_gini(y_true, y_prob):
    """
    Original author CPMP : https://www.kaggle.com/cpmpml
    In kernel : https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
    """
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = eval_gini(labels, preds)
    return [('gini', gini_score)]

In [13]:
gc.enable()

train_df = pd.read_csv("../input/train.csv", index_col=0)
test_df = pd.read_csv("../input/test.csv", index_col=0)

In [14]:
train_df.head()

Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,1,0,0,...,9,1,5,8,0,1,1,0,0,1
9,0,1,1,7,0,0,0,0,1,0,...,3,1,1,9,0,1,1,0,1,0
13,0,5,4,9,1,0,0,0,1,0,...,4,2,7,7,0,1,1,0,1,0
16,0,0,1,2,0,0,1,0,0,0,...,2,2,4,9,0,0,0,0,0,0
17,0,0,2,0,1,0,1,0,0,0,...,3,1,1,3,0,0,0,1,1,0


In [15]:
test_df.head()

Unnamed: 0_level_0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,1,8,1,0,0,1,0,0,0,...,1,1,1,12,0,1,1,0,0,1
1,4,2,5,1,0,0,0,0,1,0,...,2,0,3,10,0,0,1,1,0,1
2,5,1,3,0,0,0,0,0,1,0,...,4,0,2,4,0,0,0,0,0,0
3,0,1,6,0,0,1,0,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,5,1,7,0,0,0,0,0,1,0,...,4,0,0,4,0,1,1,0,0,1


In [16]:
y_train = train_df["target"]
X_train = train_df.drop(['target'], axis=1)

In [17]:
X_train.describe()

Unnamed: 0,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,...,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,1.900378,1.358943,4.423318,0.416794,0.405188,0.393742,0.257033,0.163921,0.185304,0.000373,...,5.441382,1.441918,2.872288,7.539026,0.122427,0.62784,0.554182,0.287182,0.349024,0.153318
std,1.983789,0.664594,2.699902,0.493311,1.350642,0.488579,0.436998,0.370205,0.388544,0.019309,...,2.332871,1.202963,1.694887,2.746652,0.327779,0.483381,0.497056,0.452447,0.476662,0.360295
min,0.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,1.0,3.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,3.0,2.0,6.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,7.0,2.0,4.0,9.0,0.0,1.0,1.0,1.0,1.0,0.0
max,7.0,4.0,11.0,1.0,6.0,1.0,1.0,1.0,1.0,1.0,...,19.0,10.0,13.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
y_train[:5]

id
7     0
9     0
13    0
16    0
17    0
Name: target, dtype: int64

In [19]:
n_splits = 5
n_estimators = 200

# ラベルの比率を揃えて分割
folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15)
# 各結果を格納するオブジェクトを用意
# 特徴量重要度
imp_df = np.zeros((len(X_train.columns), n_splits))
# ベストなgini係数
xgb_evals = np.zeros((n_estimators, n_splits))
# 検証データでの予測値
oof = np.empty(len(X_train))
# テストデータでの予測値
test_preds = np.zeros(len(test_df))

np.random.seed(0)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(y_train, y_train)):
    trn_dat, trn_tgt = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    val_dat, val_tgt = X_train.iloc[val_idx], y_train.iloc[val_idx]

    clf = XGBClassifier(n_estimators=n_estimators,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=.1, 
                        subsample=.8, 
                        colsample_bytree=.8,
                        gamma=1,
                        reg_alpha=0,
                        reg_lambda=1,
                        nthread=2)

    clf.fit(trn_dat, trn_tgt, 
            eval_set=[(trn_dat, trn_tgt), (val_dat, val_tgt)],
            eval_metric=gini_xgb,
            early_stopping_rounds=None,
            verbose=False)
            
    # Keep feature importances
    imp_df[:, fold_] = clf.feature_importances_

    # Find best round for validation set
    xgb_evals[:, fold_] = clf.evals_result_["validation_1"]["gini"]
    # Xgboost provides best round starting from 0 so it has to be incremented
    best_round = np.argsort(xgb_evals[:, fold_])[::-1][0]

    # Predict OOF and submission probas with the best round
    oof[val_idx] = clf.predict_proba(val_dat, ntree_limit=best_round)[:, 1]
    # Update submission
    test_preds += clf.predict_proba(test_df, ntree_limit=best_round)[:, 1] / n_splits

    # Display results
    print("Fold %2d : %.6f @%4d / best score is %.6f @%4d"
          % (fold_ + 1,
             eval_gini(val_tgt, oof[val_idx]),
             n_estimators,
             xgb_evals[best_round, fold_],
             best_round))
          
print("Full OOF score : %.6f" % eval_gini(y_train, oof))

Fold  1 : 0.280946 @ 200 / best score is 0.281134 @ 168
Fold  2 : 0.275179 @ 200 / best score is 0.275478 @ 144
Fold  3 : 0.279432 @ 200 / best score is 0.279587 @ 143
Fold  4 : 0.281970 @ 200 / best score is 0.282437 @ 199
Fold  5 : 0.276721 @ 200 / best score is 0.276802 @ 116
Full OOF score : 0.278816


In [21]:
# Compute mean score and std
mean_eval = np.mean(xgb_evals, axis=1)
std_eval = np.std(xgb_evals, axis=1)
best_round = np.argsort(mean_eval)[::-1][0]

print("Best mean score : %.6f + %.6f @%4d"
      % (mean_eval[best_round], std_eval[best_round], best_round))
    
importances = sorted([(X_train.columns[i], imp) for i, imp in enumerate(imp_df.mean(axis=1))],
                     key=lambda x: x[1])

for f, imp in importances[::-1]:
    print("%-34s : %10.4f" % (f, imp))

Best mean score : 0.278422 + 0.002736 @ 181
ps_car_13                          :     0.1040
ps_reg_03                          :     0.0783
ps_ind_03                          :     0.0587
ps_ind_05_cat                      :     0.0435
ps_car_14                          :     0.0431
ps_ind_15                          :     0.0403
ps_ind_01                          :     0.0340
ps_reg_02                          :     0.0322
ps_reg_01                          :     0.0317
ps_car_01_cat                      :     0.0302
ps_car_11_cat                      :     0.0291
ps_car_12                          :     0.0229
ps_car_15                          :     0.0224
ps_calc_10                         :     0.0214
ps_calc_14                         :     0.0207
ps_car_06_cat                      :     0.0203
ps_ind_17_bin                      :     0.0193
ps_calc_11                         :     0.0187
ps_car_09_cat                      :     0.0184
ps_ind_02_cat                      :     0.0

In [22]:
SAMPLE_SUBMIT_FILE = '../input/sample_submission.csv'
df_submit = pd.read_csv(SAMPLE_SUBMIT_FILE)
df_submit['target'] = test_preds
df_submit.describe().iloc[:,1]

count    892816.000000
mean          0.036463
std           0.019311
min           0.006906
25%           0.024169
50%           0.032132
75%           0.043098
max           0.685749
Name: target, dtype: float64

In [34]:
DIR = '../result_tmp/'
df_submit.to_csv(DIR + 'submit.csv', index=False)