# XGBoost

In [3]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## Intro to XGBoost

## XGBoost Library

XGBoost is an industry-proven, open-source software library that provides a gradient boosting framework able to scale to billions of data points quickly and efficiently.

Docs: https://xgboost.readthedocs.io/en/stable/index.html

**XGBoost** is an optimized distributed gradient boosting library designed to be highly **efficient, flexible and portable**. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT, GBM) that solves many data science problems in a fast and accurate way. The same code runs on major distributed environments (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.

Installation: https://xgboost.readthedocs.io/en/stable/install.html#python
- `pip install xgboost`

In [4]:
import xgboost

# Show which XGBoost version this notebook was run with.
print(xgboost.__version__)

2.0.0


## XGBoost native API

In [5]:
# Load seaborn's bundled "diamonds" example dataset and preview its first rows.
diamonds = sns.load_dataset(name="diamonds")
diamonds.head(n=5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [6]:
# Dataset dimensions as (rows, columns).
diamonds.shape

(53940, 10)

In [7]:
# Summary statistics for the numeric columns.
diamonds.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [8]:
# Summary of the non-numeric columns: count, number of distinct values, most frequent value.
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [10]:
# Per-column dtype and non-null count, plus overall memory usage.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [11]:
from sklearn.model_selection import train_test_split

# Separate the features from the regression target; `y` is kept as a
# one-column DataFrame holding the price.
X = diamonds.drop(columns="price")
y = diamonds[["price"]]

# Names of the non-numeric feature columns.
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Make sure those columns use the pandas category dtype so XGBoost can treat
# them as categorical features (see `enable_categorical=True` where the
# DMatrix objects are built).
X = X.astype({col: "category" for col in cats})

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
X.info()

# Hold out a test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   x        53940 non-null  float64 
 7   y        53940 non-null  float64 
 8   z        53940 non-null  float64 
dtypes: category(3), float64(6)
memory usage: 2.6 MB
None


In [12]:
import xgboost as xgb

# Wrap the train/test splits in DMatrix objects for the native XGBoost API.
# enable_categorical=True lets the category-typed columns be passed as-is.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [13]:
# Hyperparameters: squared-error regression objective with the histogram tree method.
params = dict(objective="reg:squarederror", tree_method="hist")

In [14]:
# Fit the baseline booster on the training matrix for 100 boosting rounds.
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=100,
)

In [15]:
from sklearn.metrics import mean_squared_error

# Predict prices for the held-out test matrix with the trained booster.
preds = model.predict(data=dtest_reg)

In [16]:
# RMSE of the baseline model on the held-out test set.
# The `squared=False` flag of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6, so the square root of the MSE is taken
# explicitly; this form works on every scikit-learn version.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 552.861


### Using Validation Sets During Training

In [17]:
# Same hyperparameters as the baseline model.
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
}

In [18]:
# Datasets to score after each boosting round, each paired with the name used in the log.
evals = [
    (dtest_reg, "validation"),
    (dtrain_reg, "train"),
]

In [19]:
# Reuses the `evals` watch-list defined in the previous cell.
# verbose_eval=10 prints the metrics of every watched dataset every ten rounds.
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=100,
    evals=evals,
    verbose_eval=10,
)

[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[10]	validation-rmse:592.03160	train-rmse:548.36512
[20]	validation-rmse:558.53485	train-rmse:491.09887
[30]	validation-rmse:555.51015	train-rmse:469.58201
[40]	validation-rmse:554.45666	train-rmse:454.32953
[50]	validation-rmse:554.13365	train-rmse:438.68033
[60]	validation-rmse:551.57888	train-rmse:425.38361
[70]	validation-rmse:549.26109	train-rmse:414.71115
[80]	validation-rmse:549.03952	train-rmse:405.41008
[90]	validation-rmse:551.87206	train-rmse:391.04269
[99]	validation-rmse:552.86131	train-rmse:383.48826


### XGBoost Early Stopping

In [20]:
# Train for 5000 rounds with no early stopping, logging the metrics every 500 rounds.
model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=5000,
    evals=evals,
    verbose_eval=500,
)

[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[500]	validation-rmse:564.77532	train-rmse:201.44074
[1000]	validation-rmse:574.22590	train-rmse:127.04692
[1500]	validation-rmse:577.88702	train-rmse:87.17633
[2000]	validation-rmse:579.64879	train-rmse:64.86890
[2500]	validation-rmse:580.65263	train-rmse:50.04183
[3000]	validation-rmse:581.63495	train-rmse:39.38436
[3500]	validation-rmse:582.04309	train-rmse:31.85327
[4000]	validation-rmse:582.60757	train-rmse:26.57816
[4500]	validation-rmse:582.92145	train-rmse:22.47317
[4999]	validation-rmse:583.12697	train-rmse:19.43452


In [21]:
# Early stopping monitors the last entry of `evals`, so the validation set is
# listed last here. If there’s more than one metric in the eval_metric
# parameter given in params, the last metric will be used for early stopping.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

model = xgb.train(
    params,
    dtrain_reg,
    num_boost_round=10_000,
    evals=evals,
    verbose_eval=10,
    # Stop once the monitored metric has not improved for 50 consecutive rounds
    early_stopping_rounds=50,
)

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[10]	train-rmse:548.36512	validation-rmse:592.03160
[20]	train-rmse:491.09887	validation-rmse:558.53485
[30]	train-rmse:469.58201	validation-rmse:555.51015
[40]	train-rmse:454.32953	validation-rmse:554.45666
[50]	train-rmse:438.68033	validation-rmse:554.13365
[60]	train-rmse:425.38361	validation-rmse:551.57888
[70]	train-rmse:414.71115	validation-rmse:549.26109
[80]	train-rmse:405.41008	validation-rmse:549.03952
[90]	train-rmse:391.04269	validation-rmse:551.87206
[100]	train-rmse:381.96310	validation-rmse:553.73941
[110]	train-rmse:370.00003	validation-rmse:553.28780
[120]	train-rmse:364.36677	validation-rmse:553.45340
[129]	train-rmse:357.32858	validation-rmse:552.90303


### XGBoost Cross-Validation

In [22]:
# 5-fold cross-validation on the training matrix: at most `n` boosting rounds,
# with early stopping after 20 rounds without improvement.
params = dict(objective="reg:squarederror", tree_method="hist")
n = 1000

results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)

In [24]:
# Last rows of the cross-validation history (mean and std of train/test RMSE per round).
results.tail()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
52,418.350687,3.376886,551.238403,10.366408
53,416.487264,2.196313,550.834285,10.208928
54,415.267876,1.417937,550.730704,10.253714
55,414.061884,1.38734,550.419242,10.294152
56,412.701,1.472351,550.273554,10.203995


In [25]:
# Lowest mean test RMSE over all cross-validation rounds.
best_rmse = results.loc[:, "test-rmse-mean"].min()
best_rmse

550.2735543625861

### XGBoost Classification

* `binary:logistic` - binary classification
* `multi:softprob` - multi-class classification

In [26]:
from sklearn.preprocessing import OrdinalEncoder

# The classification target is the diamond's cut; every other column is a feature.
X = diamonds.drop(columns="cut")
y = diamonds[["cut"]]

# Turn the cut labels into numeric codes for XGBoost.
y_encoded = OrdinalEncoder().fit_transform(y)

# Names of the non-numeric feature columns.
cats = list(X.select_dtypes(exclude=np.number).columns)

# Ensure those columns use the pandas category dtype.
for col in cats:
    X[col] = X[col].astype("category")

# Stratified split so the train and test sets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=1,
    stratify=y_encoded,
)

In [27]:
# List the distinct labels of the target column.
y["cut"].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [28]:
# Build the DMatrix pair for classification; labels are the encoded cut classes.
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [29]:
# Multi-class objective; num_class matches the five distinct cut labels.
params = dict(objective="multi:softprob", tree_method="hist", num_class=5)

# 5-fold cross-validation tracking log-loss, AUC and classification error.
results = xgb.cv(
    params=params,
    dtrain=dtrain_clf,
    num_boost_round=100,
    nfold=5,
    metrics=["mlogloss", "auc", "merror"],
    early_stopping_rounds=20,
)

In [31]:
# Last rows of the cross-validation history for all three metrics.
results.tail()

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,train-auc-mean,train-auc-std,train-merror-mean,train-merror-std,test-mlogloss-mean,test-mlogloss-std,test-auc-mean,test-auc-std,test-merror-mean,test-merror-std
95,0.361481,0.001081,0.975623,0.000224,0.128779,0.000896,0.539447,0.003425,0.938675,0.000865,0.202249,0.003571
96,0.359926,0.001068,0.975863,0.000193,0.128136,0.001049,0.539425,0.003399,0.938685,0.000863,0.202324,0.004029
97,0.35842,0.000689,0.976101,0.000135,0.127197,0.00119,0.539284,0.003607,0.938694,0.000869,0.202175,0.00354
98,0.357136,0.000771,0.976307,0.000113,0.126443,0.001209,0.53929,0.003768,0.938686,0.000853,0.202027,0.003388
99,0.355459,0.000581,0.976555,9.5e-05,0.125677,0.001072,0.538866,0.003866,0.938777,0.000878,0.201804,0.003563


In [32]:
# Highest mean test AUC over all cross-validation rounds.
results.loc[:, "test-auc-mean"].max()

0.9387768205705852

## XGBoost Sklearn

In [33]:
from sklearn import datasets

# Diabetes regression dataset, returned directly as a (features, target) pair.
X, y = datasets.load_diabetes(return_X_y=True)

In [34]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

In [35]:
# Cross-validate an XGBoost regressor; each score is the negated MSE of one fold.
scores = cross_val_score(
    XGBRegressor(objective="reg:squarederror"),
    X,
    y,
    scoring="neg_mean_squared_error",
)

In [36]:
# Flip the sign of the negated MSE scores and take the square root to get the RMSE per fold.
(-scores)**0.5 

array([55.68547317, 58.18498778, 68.622534  , 64.15281814, 68.4826639 ])

In [37]:
from sklearn import datasets

# Breast-cancer classification dataset, returned directly as a (features, target) pair.
X, y = datasets.load_breast_cancer(return_X_y=True)

In [38]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Mean cross-validated score of a default XGBoost classifier.
cross_val_score(
    estimator=XGBClassifier(),
    X=X,
    y=y,
).mean()

0.9701288619779538