In [2]:
import xgboost as xgb

import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [3]:
# Load the breast cancer dataset
dataset = datasets.load_breast_cancer()
X = dataset.data
y = dataset.target
# Split into training and validation sets: 30% held out, shuffled with a
# fixed seed, and stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [17]:
# Wrap the arrays in DMatrix, the dataset container XGBoost trains on
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)
# Training parameters
xgb_params = dict(
    # Binary classification
    objective='binary:logistic',
    # Evaluation metric
    eval_metric='logloss',
    # 'device': 'cpu' stays commented out, so no device is passed explicitly
)

In [18]:
%%time
# Train the model
bst = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,  # number of boosting rounds; picked arbitrarily
)

CPU times: user 2min 18s, sys: 166 ms, total: 2min 18s
Wall time: 12.9 s


In [19]:
%%time
# Predict one probability per validation sample (with the binary:logistic
# objective this is the probability of the positive class, label 1)
y_pred_proba = bst.predict(dtest)
# Binarize at a 0.5 threshold: strictly above becomes 1, everything else 0
y_pred = (y_pred_proba > 0.5).astype(int)
# Measure accuracy against the true validation labels
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', acc)

Accuracy: 0.9649122807017544
CPU times: user 129 ms, sys: 22 µs, total: 129 ms
Wall time: 14.4 ms


In [20]:
# Display the raw predictions returned by bst.predict(dtest), before thresholding
y_pred_proba

array([1.1187010e-02, 9.9191225e-01, 9.8089212e-01, 7.1520073e-04,
       3.6118880e-01, 5.3184619e-04, 9.9880350e-01, 2.1883221e-03,
       9.9924409e-01, 4.6064783e-04, 9.9982470e-01, 9.9740440e-01,
       7.0444285e-04, 9.9057722e-01, 9.9866998e-01, 2.1687105e-01,
       6.1500096e-04, 8.9528394e-01, 9.9722147e-01, 9.9811792e-01,
       9.9958605e-01, 9.9601918e-01, 9.9775642e-01, 3.7365936e-04,
       2.6810478e-04, 9.8688912e-01, 9.9948299e-01, 9.9247462e-01,
       2.7092602e-03, 2.8294898e-04, 9.9650109e-01, 3.0782609e-04,
       9.9908686e-01, 9.9956685e-01, 5.3307129e-04, 9.9981552e-01,
       6.9987267e-01, 9.9973339e-01, 9.9924183e-01, 9.9811494e-01,
       9.9050587e-01, 1.4653592e-02, 9.9969780e-01, 9.9962664e-01,
       9.9958307e-01, 9.6911860e-01, 8.8935679e-01, 9.9972433e-01,
       9.9968171e-01, 8.0926931e-03, 9.9818987e-01, 9.9858838e-01,
       5.4042612e-04, 9.8444172e-04, 9.9890268e-01, 9.9945766e-01,
       5.6257611e-04, 9.9781859e-01, 9.9840409e-01, 9.7478116e

In [49]:
import xgboost as xgb

# Default training parameters used when Xgb is built without explicit params.
default_params = {
    # Binary classification
    'objective': 'binary:logistic',
    # Evaluation metric
    'eval_metric': 'logloss',
    # Run on the CPU
    'device': 'cpu'
}

num_boost_round = 100  # number of boosting rounds; chosen arbitrarily


class Xgb(object):
    """Small fit/predict wrapper around ``xgb.train`` for binary classification.

    Attributes:
        xgb: The xgboost module used to build DMatrix objects and train.
        params: Private copy of the training parameters passed to ``train``.
        num_boost_round: Number of boosting rounds passed to ``train``.
        model: The trained booster, or ``None`` until ``fit`` has been called.
    """

    def __init__(self, params=default_params,
                 num_boost_round=num_boost_round):
        """Store the training configuration.

        Args:
            params: Mapping of XGBoost training parameters. ``None`` falls back
                to ``default_params``.
            num_boost_round: Number of boosting rounds to train for.
        """
        self.xgb = xgb
        # Copy the mapping: storing the default (or the caller's dict) by
        # reference would let a later `instance.params[...] = ...` silently
        # change the defaults of every other instance.
        self.params = dict(default_params if params is None else params)
        self.num_boost_round = num_boost_round
        self.model = None

    def fit(self, X, y):
        """Train a booster on features ``X`` and labels ``y``.

        Returns:
            The instance itself, so calls can be chained.
        """
        dtrain = self.xgb.DMatrix(X, label=y)
        self.model = self.xgb.train(self.params, dtrain,
                                    num_boost_round=self.num_boost_round)
        return self

    def predict_proba(self, X):
        """Return the trained booster's raw predictions for ``X``.

        With the default 'binary:logistic' objective these are expected to be
        probabilities of the positive class; other objectives may differ.

        Raises:
            AttributeError: If ``fit`` has not been called yet.
        """
        # getattr keeps this safe even for instances created without __init__.
        model = getattr(self, 'model', None)
        if model is None:
            raise AttributeError(
                "Xgb model is not fitted yet; call fit(X, y) first")
        return model.predict(self.xgb.DMatrix(X))

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels for ``X``.

        Args:
            X: Feature matrix accepted by ``DMatrix``.
            threshold: Predictions strictly greater than this value map to 1,
                everything else to 0. Defaults to 0.5.

        Raises:
            AttributeError: If ``fit`` has not been called yet.
        """
        y_pred_proba = self.predict_proba(X)
        # Round to 0 / 1 at the threshold
        y_pred = np.where(y_pred_proba > threshold, 1, 0)
        return y_pred

In [50]:
# Build the wrapper with its default parameters and default number of boosting rounds
custom_xgb = Xgb()

In [51]:
%%time
# Train the wrapped model on the training split; %%time above reports how long it takes
custom_xgb.fit(X_train, y_train)

CPU times: user 46.1 s, sys: 44.8 ms, total: 46.1 s
Wall time: 4.32 s
