In [43]:
import numpy as np
import xgboost as xgb
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.datasets import dump_svmlight_file
# `sklearn.externals.joblib` is deprecated and emits a warning on import,
# so the standalone `joblib` package is imported instead.
import joblib
from sklearn.metrics import precision_score


In [44]:
# Load the iris dataset bundled with scikit-learn.
# X holds the feature matrix, y holds the class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [45]:
# Hold out 20% of the samples for testing (80/20 split).
# random_state is pinned so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [46]:
# Wrap the numpy arrays in XGBoost's own DMatrix container.
# A DMatrix can be built straight from numpy arrays, or loaded from
# svmlight files and other formats.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [47]:
# Alternative input path: svmlight files (useful for lower memory consumption).
# Step 1 - write the numpy arrays to disk in svmlight format.
dump_svmlight_file(X=X_train, y=y_train, f='dtrain.svm', zero_based=True)
dump_svmlight_file(X=X_test, y=y_test, f='dtest.svm', zero_based=True)

# Step 2 - hand the file names to DMatrix, which loads them itself.
# NOTE(review): newer XGBoost releases may require an explicit
# '?format=libsvm' suffix on the path - confirm against the installed version.
dtrain_svm = xgb.DMatrix('dtrain.svm')
dtest_svm = xgb.DMatrix('dtest.svm')

[18:35:37] 120x4 matrix with 480 entries loaded from dtrain.svm
[18:35:37] 30x4 matrix with 120 entries loaded from dtest.svm


In [48]:
# XGBoost training configuration.
param = dict(
    max_depth=3,                 # maximum depth of each tree
    eta=0.3,                     # learning rate (step-size shrinkage per boosting round)
    silent=1,                    # quiet logging; NOTE: newer XGBoost replaces this with 'verbosity'
    objective='multi:softprob',  # multiclass objective that outputs per-class probabilities
    num_class=3,                 # number of classes in this dataset
)

# Number of boosting rounds (training iterations).
num_round = 20

In [49]:
# ------------- numpy array ------------------
# Fit a booster on the numpy-backed DMatrix, then score the held-out set.
bst = xgb.train(params=param, dtrain=dtrain, num_boost_round=num_round)
preds = bst.predict(dtest)

In [50]:
# Pick the most confident class for every sample: each row of `preds`
# is treated as a vector of per-class scores, and the index of the
# largest score is the predicted label.
best_preds = np.argmax(preds, axis=1)
numpy_precision = precision_score(y_test, best_preds, average='macro')
print("Numpy array precision:", numpy_precision)

Numpy array precision: 1.0


In [51]:
# ------------- svm file ---------------------
# Fit a second booster on the svmlight-backed DMatrix and score the
# svmlight test set with that same booster.
bst_svm = xgb.train(param, dtrain_svm, num_round)
# Predict with bst_svm (not bst): the model trained from the svmlight file
# is the one being evaluated here; using bst would just re-score the
# numpy-trained model and hide any problem with the svmlight path.
preds = bst_svm.predict(dtest_svm)

In [52]:
# Reduce each row of class scores in `preds` to the index of its largest
# entry, i.e. the most confident class, and collect them in a list.
best_preds_svm = list(np.argmax(preds, axis=1))
svm_precision = precision_score(y_test, best_preds_svm, average='macro')
print("Svm file precision:", svm_precision)
# --------------------------------------------

Svm file precision: 1.0


In [53]:
# Write a text dump of each booster to its own file.
bst.dump_model(fout='dump.raw.txt')
bst_svm.dump_model(fout='dump_svm.raw.txt')

In [54]:
# Persist both boosters to compressed joblib pickles so they can be
# reloaded later without retraining.
joblib.dump(value=bst, filename='bst_model.pkl', compress=True)
joblib.dump(value=bst_svm, filename='bst_svm_model.pkl', compress=True)

['bst_svm_model.pkl']