# Basic

In [32]:
import xgboost as xgb
import numpy as np
import pickle
import scipy

In [2]:
# Load the train / test sets. xgb.DMatrix is given a path here; per the
# original note it can read a text file or a binary buffer generated by xgboost.
dtrain = xgb.DMatrix('./data/agaricus.txt.train')
dtest = xgb.DMatrix('./data/agaricus.txt.test')

# Booster parameters are passed as a plain dict.
param = {
    'max_depth': 2,
    'eta': 1,
    'silent': 1,
    'objective': 'binary:logistic',
}
num_round = 2  # number of boosting rounds

# Fit the booster, then score the test set.
bst = xgb.train(param, dtrain, num_boost_round=num_round)
preds = bst.predict(dtest)



# Watchlist

In [3]:
# Evaluation sets to monitor during training: each entry is (DMatrix, name),
# and the name prefixes the metric in the per-round log line.
watchlist = [
    (dtest, 'eval'),
    (dtrain, 'train'),
]
num_round = 2
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.021726	train-error:0.022263


In [4]:
# No round count is passed, so xgboost's default applies
# (the log below shows ten boosting rounds, [0]..[9]).
bst = xgb.train(param, dtrain, evals=watchlist)

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.021726	train-error:0.022263
[2]	eval-error:0.006207	train-error:0.007063
[3]	eval-error:0.018001	train-error:0.0152
[4]	eval-error:0.006207	train-error:0.007063
[5]	eval-error:0	train-error:0.001228
[6]	eval-error:0	train-error:0.001228
[7]	eval-error:0	train-error:0.001228
[8]	eval-error:0	train-error:0.001228
[9]	eval-error:0	train-error:0


# Save model

In [5]:
# Score the test set and report the misclassification rate.
preds = bst.predict(dtest)
labels = dtest.get_label()
# Threshold the predictions at 0.5 and compare with the labels in one
# vectorised pass; the mean of the boolean mismatch mask is the error rate.
error_rate = np.mean((preds > 0.5).astype(int) != labels)
print('error=%f' % error_rate)

# Persist the trained booster.
# NOTE(review): assumes the ./model directory already exists -- confirm.
bst.save_model('./model/0001.model')
# Dump the model to a text file.
bst.dump_model('./model/dump.raw.txt')
# Optional: dump the model together with a feature map.
# bst.dump_model('./model/dump.nice.txt', '../data/featmap.txt')

error=0.000000


## DMatrix `save_binary` and model reload

In [19]:
# Write the test DMatrix to a binary buffer file so it can be reloaded later.
dtest.save_binary(fname='./data/dtest.buffer')

In [22]:
# Round-trip check: restore the booster from its saved model file and the
# test set from the binary buffer, then predict again.
bst2 = xgb.Booster(model_file='./model/0001.model')
dtest2 = xgb.DMatrix('./data/dtest.buffer')
preds2 = bst2.predict(dtest2)

# The reloaded model must reproduce the earlier predictions exactly
# (total absolute difference of zero).
assert np.abs(preds2 - preds).sum() == 0

## model dump to `pickle`

In [26]:
# Serialise the reloaded booster to an in-memory pickle payload.
pks = pickle.dumps(bst2)

In [31]:
# Restore the booster from the in-memory pickle payload and confirm it
# predicts exactly what the original booster did.
# NOTE: unpickling is only safe for data you created yourself, as here.
bst3 = pickle.loads(pks)
preds3 = bst3.predict(dtest2)
assert np.abs(preds3 - preds).sum() == 0

## build dmatrix from `csr_matrix`

In [34]:
# Build a DMatrix from a scipy.sparse CSR matrix by parsing the training
# text file by hand. Each line is "<label> <col>:<value> <col>:<value> ...".
print('start running example of build DMatrix from scipy.sparse CSR Matrix')
labels = []
row = []; col = []; dat = []
# `with` guarantees the file handle is closed, even if a line fails to parse.
with open('./data/agaricus.txt.train') as train_file:
    # The line number doubles as the row index of the sparse matrix.
    for i, line in enumerate(train_file):
        fields = line.split()
        labels.append(int(fields[0]))
        for item in fields[1:]:
            k, v = item.split(':')
            row.append(i); col.append(int(k)); dat.append(float(v))
# Coordinate-style constructor: (values, (row_indices, col_indices)).
csr = scipy.sparse.csr_matrix((dat, (row, col)))
dtrain = xgb.DMatrix(csr, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(param, dtrain, num_round, watchlist)

start running example of build DMatrix from scipy.sparse CSR Matrix
[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.021726	train-error:0.022263


In [37]:
# Build a DMatrix from a dense numpy array with the same contents as the CSR matrix.
# toarray() returns a plain 2-D ndarray; todense() would return the
# discouraged np.matrix type holding the same values.
npymat = csr.toarray()
dtrain = xgb.DMatrix(npymat, label=labels)
watchlist = [(dtest, 'eval'), (dtrain, 'train')]

In [38]:
# Train on the dense-array DMatrix; the log should match the sparse run.
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=watchlist)

[0]	eval-error:0.042831	train-error:0.046522
[1]	eval-error:0.021726	train-error:0.022263
