In [34]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import xgboost as xgb

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
pd.set_option("display.max_rows", 11)
pd.set_option("display.max_columns", 80)

## 関数の定義

In [17]:
# 関数：ジニ係数
# from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

## データ読み込み・編集

In [4]:
# データ読み込み
train = pd.read_csv('01.data/train.csv')
test = pd.read_csv('01.data/test.csv') 

In [5]:
# 特徴量と目的変数を分離
features = train.drop(['id','target'], axis=1).values
targets = train.target.values

In [7]:
#除外する変数を設定
# 目的変数との相関がほぼゼロのため
unwanted = train.columns[train.columns.str.startswith('ps_calc_')]
unwanted

Index(['ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05',
       'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10',
       'ps_calc_11', 'ps_calc_12', 'ps_calc_13', 'ps_calc_14',
       'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin',
       'ps_calc_19_bin', 'ps_calc_20_bin'],
      dtype='object')

In [8]:
# 利用しない変数を除外
train = train.drop(unwanted, axis=1)  
test = test.drop(unwanted, axis=1)  

In [9]:
X = train.drop(['id', 'target'], axis=1).values # 説明変数(idとかtargetを除外)
y = train.target.values # 正解データ
test_id = test.id.values
test = test.drop('id', axis=1)

In [10]:
# submission用データフレーム
sub = pd.DataFrame()
sub['id'] = test_id
sub['target'] = np.zeros_like(test_id)

## k-Foldを実装してみる

In [11]:
# KFoldの設定
kfold = 2
skf = StratifiedKFold(n_splits=kfold, random_state=42)

In [12]:
# XGBoost 
# More parameters has to be tuned
# http://puyokw.hatenablog.com/entry/2015/04/11/040941
params = {
    # ブースター変数
    'min_child_weight' : 10.0, # 子ノードにおける最小の重み　葉ノードの重みの合計がmin_child_weight未満で分割しない
    'max_depth' : 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 700,
    # タスク変数
    'objective': 'binary:logistic'
    }


In [13]:
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    
    # Convert our data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(test.values)
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, 
                    d_train, 
                    1600, 
                    watchlist, 
                    early_stopping_rounds=70,
                    feval=gini_xgb,
                    maximize=True,
                    verbose_eval=100)

    print('[Fold %d/%d Prediciton:]' % (i + 1, kfold))
    # Predict on our test data
    p_test = mdl.predict(d_test)
    sub['target'] += p_test/kfold
    print(' ')

[Fold 1/2]
[0]	train-error:0.036448	valid-error:0.036448	train-gini:0.000652	valid-gini:0.004843
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036448	valid-error:0.036448	train-gini:0.336714	valid-gini:0.264463
[200]	train-error:0.036448	valid-error:0.036448	train-gini:0.384257	valid-gini:0.274752
[300]	train-error:0.036448	valid-error:0.036448	train-gini:0.430691	valid-gini:0.279561
Stopping. Best iteration:
[314]	train-error:0.036448	valid-error:0.036448	train-gini:0.436123	valid-gini:0.279807

[Fold 1/2 Prediciton:]
[Fold 2/2]
[0]	train-error:0.036448	valid-error:0.036448	train-gini:0.004696	valid-gini:0.000795
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036448	valid-error:0.036448	train-gini:0.341451	valid-gini:0.262066
[200]	train-error:0.036448

In [46]:
sub.to_csv('02.output/StratifiedKFold.csv', index=False)

## Grid Searchを実装してみたいがよくわからない

In [74]:
# Convert our data into XGBoost format
d_train = xgb.DMatrix(X_train, y_train)
d_valid = xgb.DMatrix(X_valid, y_valid)
d_test = xgb.DMatrix(test.values)
#watchlist = [(d_train, 'train'), (d_valid, 'valid')]
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [46]:
# Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
# and the custom metric (maximize=True tells xgb that higher metric is better)
mdl = xgb.train(params, 
                d_train, 
                1600, # num_boost_round 
                watchlist, 
                
                early_stopping_rounds=70,
                feval=gini_xgb, # 評価関数：in case using custom evaluation function
                maximize=True, # Whether to maximize feval.
                verbose_eval=100 # Requires at least one item in evals.
               )

[0]	train-error:0.036448	valid-error:0.036448	train-gini:0.004696	valid-gini:0.000795
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036448	valid-error:0.036448	train-gini:0.341451	valid-gini:0.262066
[200]	train-error:0.036448	valid-error:0.036448	train-gini:0.389597	valid-gini:0.269117
[300]	train-error:0.036448	valid-error:0.036448	train-gini:0.437675	valid-gini:0.273231
[400]	train-error:0.036431	valid-error:0.036448	train-gini:0.472215	valid-gini:0.273515
Stopping. Best iteration:
[356]	train-error:0.036434	valid-error:0.036448	train-gini:0.45827	valid-gini:0.274122



In [73]:
# クロスバリデーションをしていると思っているが、よくわからない
cv2 = xgb.cv(params,
             d_train,
             num_boost_round=300,
             nfold=3,
             stratified=True
            )

In [72]:
print(cv2)

     test-error-mean  test-error-std  train-error-mean  train-error-std
0           0.036448        0.000005          0.036448         0.000002
1           0.036448        0.000005          0.036448         0.000002
2           0.036448        0.000005          0.036448         0.000002
3           0.036448        0.000005          0.036448         0.000002
4           0.036448        0.000005          0.036448         0.000002
..               ...             ...               ...              ...
295         0.036448        0.000005          0.036448         0.000002
296         0.036448        0.000005          0.036448         0.000002
297         0.036448        0.000005          0.036448         0.000002
298         0.036448        0.000005          0.036448         0.000002
299         0.036448        0.000005          0.036448         0.000002

[300 rows x 4 columns]


## Grid Search 研究

In [24]:
param_range = [1, 5, 10, 20, 50 ]

In [25]:
param_grid = [{'min_child_weight' : param_range}]

In [35]:
gs = GridSearchCV(estimator=mdl,
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  cv=10,
                  n_jobs=-1)

In [36]:
gs_fit = gs.fit(X_train, y_train)

TypeError: estimator should be an estimator implementing 'fit' method, <xgboost.core.Booster object at 0x1136b9d68> was passed