# XGBoost类库使用小结
https://www.cnblogs.com/pinard/p/11114748.html

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pylab as plt
%matplotlib inline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [3]:
# Import from the public `sklearn.datasets` package: the old
# `sklearn.datasets.samples_generator` module path was deprecated and later
# removed from scikit-learn, while this path works on old and new releases.
from sklearn.datasets import make_classification

# X holds the sample features and y the class labels: 10000 samples with
# 20 features each, 2 output classes, no redundant features, and one cluster
# per class. flip_y=0.1 assigns the class at random for that fraction of the
# samples (label noise).
# random_state pins the generator so a fresh-kernel re-run yields the same data.
X, y = make_classification(n_samples=10000, n_features=20, n_redundant=0,
                           n_clusters_per_class=1, n_classes=2, flip_y=0.1,
                           random_state=1)



In [6]:
# Partition features and labels into a training part and a held-out part;
# the fixed seed makes the partition repeatable across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

# 2.1 使用原生Python API接口

In [7]:
# Wrap each split (features + labels) in a DMatrix, the data container that
# the native XGBoost training API takes as input.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [9]:
# Define the XGBoost parameters (native-API key names) and train a booster
# for 20 boosting rounds on the training DMatrix.
param = {
    'max_depth': 5,
    'eta': 0.5,
    'verbosity': 1,
    'objective': 'binary:logistic',
}
raw_model = xgb.train(params=param, dtrain=dtrain, num_boost_round=20)

In [10]:
# Compute the accuracy on the training set.
from sklearn.metrics import accuracy_score

# The booster was trained with the 'binary:logistic' objective, so predict()
# yields one score per row that is thresholded at 0.5 to get a class label.
prob_train_raw = raw_model.predict(dtrain)

# Threshold every score in one vectorized step instead of a Python-level loop.
# The scores are kept in `prob_train_raw` rather than overwritten in place, and
# the cast preserves the float dtype so the labels are exactly 0.0 / 1.0.
pred_train_raw = (prob_train_raw > 0.5).astype(prob_train_raw.dtype)
print(accuracy_score(dtrain.get_label(), pred_train_raw))

0.9546666666666667


In [12]:
# Compute the accuracy on the validation (held-out) set.
# The booster was trained with the 'binary:logistic' objective, so predict()
# yields one score per row that is thresholded at 0.5 to get a class label.
prob_test_raw = raw_model.predict(dtest)

# Threshold every score in one vectorized step instead of a Python-level loop.
# The scores are kept in `prob_test_raw` rather than overwritten in place, and
# the cast preserves the float dtype so the labels are exactly 0.0 / 1.0.
pred_test_raw = (prob_test_raw > 0.5).astype(prob_test_raw.dtype)
print(accuracy_score(dtest.get_label(), pred_test_raw))

0.9424


# 2.2 使用sklearn风格接口，使用原生参数

In [13]:
# Build the sklearn-style classifier from the native-API parameter dict:
# the keys of `param` (e.g. 'eta') are forwarded as keyword arguments.
sklearn_model_raw = xgb.XGBClassifier(**param)

# Fit while tracking the classification error on (X_test, y_test); training
# stops early once that error has not improved for 10 consecutive rounds.
# NOTE(review): the held-out split doubles as the early-stopping set here, so
# the error reported on it is optimistically biased.
sklearn_model_raw.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="error",
    early_stopping_rounds=10,
)

[0]	validation_0-error:0.0596
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.0592
[2]	validation_0-error:0.0596
[3]	validation_0-error:0.0584
[4]	validation_0-error:0.0588
[5]	validation_0-error:0.0588
[6]	validation_0-error:0.0584
[7]	validation_0-error:0.0584
[8]	validation_0-error:0.0584
[9]	validation_0-error:0.058
[10]	validation_0-error:0.0584
[11]	validation_0-error:0.0584
[12]	validation_0-error:0.0588
[13]	validation_0-error:0.0584
[14]	validation_0-error:0.0584
[15]	validation_0-error:0.0592
[16]	validation_0-error:0.0588
[17]	validation_0-error:0.0588
[18]	validation_0-error:0.0588
[19]	validation_0-error:0.0588
Stopping. Best iteration:
[9]	validation_0-error:0.058



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

# 2.3 使用sklearn风格接口，使用sklearn风格参数

In [14]:
# Same tree depth, step size and objective as the native-API run, but spelled
# with the sklearn-wrapper parameter names (learning_rate rather than eta),
# plus a fixed random_state.
sklearn_model_new = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=5,
    learning_rate=0.5,
    random_state=1,
    verbosity=1,
)

In [15]:
# Fit while tracking the classification error on (X_test, y_test); training
# stops early once that error has not improved for 10 consecutive rounds.
sklearn_model_new.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="error",
    early_stopping_rounds=10,
)

[0]	validation_0-error:0.0596
Will train until validation_0-error hasn't improved in 10 rounds.
[1]	validation_0-error:0.0604
[2]	validation_0-error:0.0592
[3]	validation_0-error:0.0588
[4]	validation_0-error:0.0588
[5]	validation_0-error:0.058
[6]	validation_0-error:0.0576
[7]	validation_0-error:0.0568
[8]	validation_0-error:0.0552
[9]	validation_0-error:0.054
[10]	validation_0-error:0.0564
[11]	validation_0-error:0.056
[12]	validation_0-error:0.0556
[13]	validation_0-error:0.056
[14]	validation_0-error:0.056
[15]	validation_0-error:0.056
[16]	validation_0-error:0.0564
[17]	validation_0-error:0.0556
[18]	validation_0-error:0.0568
[19]	validation_0-error:0.0576
Stopping. Best iteration:
[9]	validation_0-error:0.054



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.5, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=1,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)