In [16]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn.grid_search import GridSearchCV
import xgboost as xgb

In [13]:
# 関数：ジニ係数
# from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

In [3]:
# データ読み込み
train = pd.read_csv('01.data/train.csv')
test = pd.read_csv('01.data/test.csv') 

In [4]:
# 特徴量と目的変数を分離
features = train.drop(['id','target'], axis=1).values
targets = train.target.values

In [5]:
# Drop unnessesary column
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
unwanted

Index(['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10',
       'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],
      dtype='object')

In [6]:
# 利用しない変数を除外
train = train.drop(unwanted, axis=1)  
test = test.drop(unwanted, axis=1)  

In [9]:
# Make X , y as features and targets
X = train.drop(['id', 'target'], axis=1).values # trainから
y = train.target.values
test_id = test.id.values
test = test.drop('id', axis=1)

sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

In [7]:
# KFoldの設定
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=42)

In [8]:
# XGBoost 
# More parameters has to be tuned. Good luck :)
params = {
    # ブースター変数
    'min_child_weight': 10.0, # 子ノードにおける最小の重み　葉ノードの重みの合計がmin_child_weight未満で分割しない
    'max_depth': 7,
    'max_delta_step': 1.8, # 
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 700,
    # タスク変数
    'objective': 'binary:logistic'
    }

In [44]:
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(test.values)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, 
                    d_train, 
                    1600, 
                    watchlist, 
                    early_stopping_rounds=70,
                    feval=gini_xgb,
                    maximize=True,
                    verbose_eval=100)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_test)
    sub['target'] += p_test/kfold

[Fold 1/5]
[0]	train-error:0.036447	valid-error:0.036449	train-gini:0.031021	valid-gini:0.036149
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036447	valid-error:0.036449	train-gini:0.320707	valid-gini:0.266656
[200]	train-error:0.036447	valid-error:0.036449	train-gini:0.358862	valid-gini:0.27362
[300]	train-error:0.036435	valid-error:0.036449	train-gini:0.393836	valid-gini:0.277752
[400]	train-error:0.036426	valid-error:0.036457	train-gini:0.422973	valid-gini:0.280028
Stopping. Best iteration:
[411]	train-error:0.036424	valid-error:0.036457	train-gini:0.425636	valid-gini:0.280271

[Fold 1/5 Prediciton:]
[Fold 2/5]
[0]	train-error:0.036447	valid-error:0.036449	train-gini:0.031254	valid-gini:0.042578
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036447	

In [46]:
sub.to_csv('02.output/StratifiedKFold.csv', index=False)