# Classification with XGBoost

In [3]:
# Load the breast cancer dataset that ships with scikit-learn.
from sklearn.datasets import load_breast_cancer

breast_cancer = load_breast_cancer()

# Pull the feature matrix (X) and the target vector (y) out of the loaded dataset.
X, y = breast_cancer.data, breast_cancer.target

In [7]:
# Import xgboost
import xgboost as xgb

# Import necessary modules
from sklearn.model_selection import train_test_split
import numpy as np

# Create the training and test sets: hold out 20% of the samples for testing.
# A fixed random_state makes the shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Instantiate the XGBClassifier: xg_cl
# Use `random_state` (the scikit-learn-API name for the RNG seed) rather than
# the deprecated `seed` keyword.
xg_cl = xgb.XGBClassifier(n_estimators=10, objective='binary:logistic', random_state=123)

# Fit the classifier to the training set
xg_cl.fit(X_train, y_train)

# Predict the labels of the test set: preds
preds = xg_cl.predict(X_test)

# Compute the accuracy: accuracy
# The mean of the element-wise match vector is the fraction of correct predictions.
accuracy = float(np.mean(preds == y_test))
print("accuracy: %f" % accuracy)

accuracy: 0.956140


## Cross-validation in XGBoost example

In [9]:
import xgboost as xgb

# Cross-validation in XGBoost needs the data wrapped in a DMatrix first,
# so bundle the features and labels into one.
breast_cancer_matrix = xgb.DMatrix(X, label=y)

# Leave the object as the last expression so the cell displays it.
breast_cancer_matrix

<xgboost.core.DMatrix at 0x7fae21efa3a0>

In [14]:
# Booster parameters handed to the cross-validation run.
params = dict(objective='binary:logistic', max_depth=4)

# Run cross-validation on the DMatrix that holds the full dataset.
cv_results = xgb.cv(
    params=params,
    dtrain=breast_cancer_matrix,
    num_boost_round=10,  # number of trees to build
    nfold=4,             # number of cross-validation folds
    metrics='error',     # metric to compute
    as_pandas=True,      # return the results as a pandas DataFrame
)

print("CV_results: \n",cv_results)

# Accuracy is the complement of the mean test error in the last row of the results.
final_test_error = cv_results['test-error-mean'].iloc[-1]
print("Accuracy: %f" % (1 - final_test_error))

CV_results: 
    train-error-mean  train-error-std  test-error-mean  test-error-std
0          0.023432         0.004962         0.072060        0.003127
1          0.016986         0.004489         0.059761        0.007919
2          0.011715         0.001648         0.059785        0.011790
3          0.013472         0.002543         0.059785        0.010687
4          0.009373         0.000010         0.058049        0.017602
5          0.007030         0.001656         0.054528        0.017592
6          0.007030         0.001656         0.052768        0.017675
7          0.005858         0.001168         0.052768        0.017675
8          0.005272         0.001013         0.052768        0.017675
9          0.004100         0.001941         0.047498        0.017573
Accuracy: 0.952502


## When to use XGBoost
- More than 1,000 training samples and fewer than 100 features
- The number of features < number of training samples
- A mixture of categorical and numerical features

## When to NOT use XGBoost
- Image recognition
- Computer vision
- Natural Language Processing and understanding problems