## 大杀器Xgboost入门
by 寒小阳(hanxiaoyang.ml@gmail.com)

### 1.数据读取与简单建模

In [6]:
#!/usr/bin/python
import numpy as np
import scipy.sparse
import pickle
import xgboost as xgb

### simple example
# Build the train/test DMatrix objects from text files under data/
# (xgb.DMatrix also accepts a binary buffer previously generated by xgboost).
dtrain = xgb.DMatrix('data/agaricus.txt.train')
dtest = xgb.DMatrix('data/agaricus.txt.test')

# Booster parameters are given as a plain dict; the definitions are the same
# as in the C++ version.
# NOTE(review): 'silent' is reported as deprecated by newer xgboost releases
# in favour of 'verbosity' -- confirm against the installed version.
param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' }

# Datasets to watch: their metrics are reported after each boosting round.
watchlist  = [(dtest,'eval'), (dtrain,'train')]
num_round = 2
bst = xgb.train(param, dtrain, num_round, watchlist)

# Predict on the test set and report the misclassification rate.
preds = bst.predict(dtest)
labels = dtest.get_label()
# Threshold the predictions at 0.5 and compare against the labels in one
# vectorised numpy expression instead of a Python-level loop over every
# element; the mean of the boolean mismatch mask is the error rate.
print ('error=%f' % np.mean((preds > 0.5) != labels))

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.021726	train-error:0.022263
error=0.021726


In [7]:
# Persist the test DMatrix as a binary buffer and the trained booster as a
# model file (both are written to the current working directory).
dtest.save_binary('dtest.buffer')
bst.save_model('xgb.model')

# Reload both artefacts from disk and predict again with the reloaded pair.
bst2 = xgb.Booster(model_file='xgb.model')
dtest2 = xgb.DMatrix('dtest.buffer')
preds2 = bst2.predict(dtest2)

# Round-trip check: the reloaded model/data must reproduce the original
# predictions exactly (total absolute difference of zero).
assert np.abs(preds2 - preds).sum() == 0

### 2.交叉验证

In [9]:
#!/usr/bin/python
import numpy as np
import xgboost as xgb

### load the training data and set up the run
dtrain = xgb.DMatrix('data/agaricus.txt.train')
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 2

print ('running cross validation')
# 5-fold cross validation on the 'error' metric. The print_evaluation
# callback logs one line per boosting round in the form
#   [iteration]  metric_name:mean_value+std_value
# where std_value is the standard deviation of the metric.
# The call is left as the last expression so the notebook displays its result.
xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={'error'},
    seed=0,
    callbacks=[xgb.callback.print_evaluation(show_stdv=True)],
)

running cross validation
[0]	train-error:0.0506912+0.00919397	test-error:0.0557604+0.0158272
[1]	train-error:0.0213132+0.00207524	test-error:0.0211982+0.00381269


Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.05576,0.015827,0.050691,0.009194
1,0.021198,0.003813,0.021313,0.002075


In [10]:
print ('running cross validation, disable standard deviation display')
# Same 5-fold cross validation, now for up to 10 boosting rounds.
# With show_stdv=False each logged line has the form
#   [iteration]  metric_name:mean_value
# The early_stop(3) callback is passed alongside the printing callback.
res = xgb.cv(
    param,
    dtrain,
    num_boost_round=10,
    nfold=5,
    metrics={'error'},
    seed=0,
    callbacks=[
        xgb.callback.print_evaluation(show_stdv=False),
        xgb.callback.early_stop(3),
    ],
)
print (res)

running cross validation, disable standard deviation display
[0]	train-error:0.0506912	test-error:0.0557604
Multiple eval metrics have been passed: 'test-error' will be used for early stopping.

Will train until test-error hasn't improved in 3 rounds.
[1]	train-error:0.0213132	test-error:0.0211982
[2]	train-error:0.0099458	test-error:0.0099844
[3]	train-error:0.0141322	test-error:0.0144392
[4]	train-error:0.0059904	test-error:0.0062978
[5]	train-error:0.0020352	test-error:0.0016896
[6]	train-error:0.0012288	test-error:0.0012288
[7]	train-error:0.0012288	test-error:0.0012288
[8]	train-error:0.0009216	test-error:0.0012288
[9]	train-error:0.0006144	test-error:0.0012288
Stopping. Best iteration:
[6]	train-error:0.0012288+0.000260441	test-error:0.0012288+0.00104177

   test-error-mean  test-error-std  train-error-mean  train-error-std
0         0.055760        0.015827          0.050691         0.009194
1         0.021198        0.003813          0.021313         0.002075
2         0.009984

In [11]:
print ('running cross validation, with preprocessing function')
# Define the preprocessing function handed to xgb.cv.
# It returns the preprocessed training data, test data and parameters,
# and can be used for things such as weight rescaling.
# As an example, it sets scale_pos_weight from the training labels.
def fpreproc(dtrain, dtest, param):
    """Set ``scale_pos_weight`` from the class balance of ``dtrain``.

    Args:
        dtrain: training data; only ``get_label()`` is called on it.
        dtest: test data; returned untouched.
        param: mapping of booster parameters. It is not modified; a copy
            is made and the copy is returned.

    Returns:
        A ``(dtrain, dtest, param)`` tuple where ``param`` is a copy of the
        input with ``'scale_pos_weight'`` set to
        ``count(label == 0) / count(label == 1)``. If there are no labels
        equal to 1 the ratio is undefined, so the key is left out instead of
        storing an infinite weight.
    """
    label = dtrain.get_label()
    n_neg = np.sum(label == 0)
    n_pos = np.sum(label == 1)
    # Work on a copy so the caller's parameter dict is never mutated.
    fold_param = dict(param)
    if n_pos > 0:
        fold_param['scale_pos_weight'] = float(n_neg) / n_pos
    return (dtrain, dtest, fold_param)

# Run 5-fold cross validation on the 'auc' metric with the preprocessing hook:
# for each fold, dtrain, dtest and param are passed into fpreproc, and the
# values fpreproc returns are used to generate the results of that fold.
# The call is left as the last expression so the notebook displays its result.
xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=5,
    metrics={'auc'},
    seed=0,
    fpreproc=fpreproc,
)

running cross validation, with preprocessing function


Unnamed: 0,test-auc-mean,test-auc-std,train-auc-mean,train-auc-std
0,0.958198,0.00552,0.958196,0.001377
1,0.981412,0.002618,0.981403,0.000655


In [12]:
###
# Cross validation can also be run with a customized loss function.
# See custom_objective.py
##
print ('running cross validation, with cutomsized loss function')
def logregobj(preds, dtrain):
    """Custom logistic-regression objective.

    Args:
        preds: array of predictions; passed through a sigmoid here, so they
            are treated as untransformed scores.
        dtrain: data object; only ``get_label()`` is called on it.

    Returns:
        ``(grad, hess)`` where, with ``p = sigmoid(preds)``,
        ``grad = p - labels`` and ``hess = p * (1 - p)``.
    """
    y = dtrain.get_label()
    # Map the scores to probabilities with the logistic function.
    prob = 1.0 / (1.0 + np.exp(-preds))
    return prob - y, prob * (1.0 - prob)

def evalerror(preds, dtrain):
    """Custom evaluation metric: fraction of misclassified examples.

    A prediction counts as the positive class when it is greater than 0.0,
    and is compared against the labels from ``dtrain.get_label()``.

    Args:
        preds: array of predictions, thresholded at 0.0.
        dtrain: data object; only ``get_label()`` is called on it.

    Returns:
        ``('error', rate)`` where ``rate`` is a Python float equal to the
        number of mismatches divided by the number of labels.

    Raises:
        ZeroDivisionError: if there are no labels.
    """
    labels = dtrain.get_label()
    # Count mismatches with a single vectorised numpy call rather than the
    # builtin sum(), which iterates over the array element by element.
    n_wrong = np.count_nonzero(labels != (preds > 0.0))
    return 'error', float(n_wrong) / len(labels)

# No 'objective' entry in the parameters this time: the objective is supplied
# through obj= and the metric through feval= in the call below.
param = {'max_depth': 2, 'eta': 1, 'silent': 1}
# 5-fold cross validation with the customized objective and metric.
# The call is left as the last expression so the notebook displays its result.
xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=5,
    seed=0,
    obj=logregobj,
    feval=evalerror,
)

running cross validation, with cutomsized loss function


Unnamed: 0,test-error-mean,test-error-std,test-rmse-mean,test-rmse-std,train-error-mean,train-error-std,train-rmse-mean,train-rmse-std
0,0.05576,0.015827,1.597543,0.012465,0.050691,0.009194,1.59479,0.003879
1,0.021198,0.003813,2.448759,0.080213,0.021313,0.002075,2.442099,0.076912


### 3.网格搜索与交叉验证

见参数调优的 IPython notebook。