# XGBoost

In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## Intro to XGBoost

## XGBoost Library

XGBoost is an industry-proven, open-source software library that provides a gradient boosting framework which scales quickly and efficiently to billions of data points.

Docs: https://xgboost.readthedocs.io/en/stable/index.html

**XGBoost** is an optimized distributed gradient boosting library designed to be highly **efficient, flexible and portable**. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT or GBM) that solves many data science problems in a fast and accurate way. The same code runs on major distributed environments (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.

Installation: https://xgboost.readthedocs.io/en/stable/install.html#python
- `pip install xgboost`

In [2]:
import xgboost

# Print the installed XGBoost version so the recorded outputs can be tied to a release
print(xgboost.__version__)

2.0.0


## XGBoost native API

In [3]:
# Load the "diamonds" example dataset via seaborn and preview its first rows
diamonds = sns.load_dataset("diamonds")
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [8]:
# Column dtypes and non-null counts of the raw frame
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [4]:
# (n_rows, n_columns) of the raw frame
diamonds.shape

(53940, 10)

In [6]:
# Summary statistics for every column, numeric and categorical alike
diamonds.describe(include="all")

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
count,53940.0,53940,53940,53940,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
unique,,5,7,8,,,,,,
top,,Ideal,G,SI1,,,,,,
freq,,21551,11292,13065,,,,,,
mean,0.79794,,,,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,,,,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,,,,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,,,,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,,,,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,,,,62.5,59.0,5324.25,6.54,6.54,4.04


In [7]:
# Summary restricted to the non-numeric columns
diamonds.describe(exclude=np.number)

Unnamed: 0,cut,color,clarity
count,53940,53940,53940
unique,5,7,8
top,Ideal,G,SI1
freq,21551,11292,13065


In [9]:
from sklearn.model_selection import train_test_split

# Separate the features from the regression target (`price`).
# `drop` returns a new frame, so `diamonds` itself is not modified.
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

# Names of the non-numeric feature columns
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Ensure those columns use the pandas `category` dtype (single astype call
# with a column -> dtype mapping instead of a per-column loop)
X = X.astype({col: 'category' for col in cats})

# `DataFrame.info()` prints its report itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
X.info()

# Hold out a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   x        53940 non-null  float64 
 7   y        53940 non-null  float64 
 8   z        53940 non-null  float64 
dtypes: category(3), float64(6)
memory usage: 2.6 MB
None


In [10]:
import xgboost as xgb

# Wrap each split in an XGBoost DMatrix (features + label).
# enable_categorical=True turns on XGBoost's handling of the pandas
# `category` columns prepared earlier.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [11]:
# Booster settings: squared-error regression objective, histogram tree method
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
}

In [12]:
# Fit the baseline booster for a fixed 100 rounds, without any evaluation sets
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=100,
)

In [13]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out regression DMatrix
preds = model.predict(dtest_reg)

In [14]:
# Root-mean-squared error of the baseline model on the held-out set.
# The square root is taken explicitly: the `squared=False` shortcut of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it breaks the cell after an upgrade.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 552.861


### Using Validation Sets During Training

In [15]:
# Same booster settings as the baseline run
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
}

In [16]:
# Datasets to score after boosting rounds, as (DMatrix, label) pairs;
# each label becomes the prefix of its metric in the training log.
evals = [
    (dtest_reg, "validation"),
    (dtrain_reg, "train"),
]

In [17]:
# Train for 100 rounds again, this time reporting the metric for every
# entry in `evals` while training
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=100,
    evals=evals,
    verbose_eval=10,  # Print every ten rounds
)

[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[10]	validation-rmse:592.03160	train-rmse:548.36512
[20]	validation-rmse:558.53485	train-rmse:491.09887
[30]	validation-rmse:555.51015	train-rmse:469.58201
[40]	validation-rmse:554.45666	train-rmse:454.32953
[50]	validation-rmse:554.13365	train-rmse:438.68033
[60]	validation-rmse:551.57888	train-rmse:425.38361
[70]	validation-rmse:549.26109	train-rmse:414.71115
[80]	validation-rmse:549.03952	train-rmse:405.41008
[90]	validation-rmse:551.87206	train-rmse:391.04269
[99]	validation-rmse:552.86131	train-rmse:383.48826


### XGBoost Early Stopping

In [18]:
# Over-long run (5000 rounds, no early stopping). In the recorded log the train RMSE
# keeps falling while the validation RMSE rises from round 500 onwards, i.e. the
# extra rounds overfit. The recorded run was interrupted by hand (KeyboardInterrupt)
# before it finished, so a full re-run of this cell will take considerably longer.
model = xgb.train(params=params, dtrain=dtrain_reg, num_boost_round=5000, evals=evals,verbose_eval=500)

[0]	validation-rmse:2817.90814	train-rmse:2874.49146
[500]	validation-rmse:564.77532	train-rmse:201.44074
[1000]	validation-rmse:574.22590	train-rmse:127.04692
[1500]	validation-rmse:577.88702	train-rmse:87.17633
[2000]	validation-rmse:579.64879	train-rmse:64.86890


KeyboardInterrupt: 

In [19]:
# Early stopping monitors a single validation score. Per the xgboost.train
# documentation, when `evals` holds several entries the last one is used,
# and when several metrics are given in the eval_metric parameter the last
# metric is used -- hence the validation set is listed last here.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=1000,
    evals=evals,
    verbose_eval=50,
    # Stop once the monitored score has gone 50 rounds without improving
    early_stopping_rounds=50,
)

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[50]	train-rmse:438.68033	validation-rmse:554.13365
[100]	train-rmse:381.96310	validation-rmse:553.73941
[128]	train-rmse:358.11000	validation-rmse:553.05030


### XGBoost Cross-Validation

In [20]:
# 5-fold cross-validation on the training matrix: at most `n` boosting
# rounds, with early stopping after 20 rounds without improvement.
params = {
    "objective": "reg:squarederror",
    "tree_method": "hist",
}
n = 1000

results = xgb.cv(
    params,
    dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)

In [21]:
# One row per boosting round: mean and std of the RMSE across the folds
results.head()

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,2874.224552,9.424846,2876.318793,36.995997
1,2088.350837,7.595382,2093.063623,25.351925
2,1552.629638,4.97414,1560.552731,19.550836
3,1185.994963,4.133544,1198.669943,14.648669
4,943.402904,4.757288,962.349383,11.724038


In [22]:
# Lowest mean test-fold RMSE over the boosting rounds recorded by xgb.cv
best_rmse = results['test-rmse-mean'].min()
best_rmse

550.2735543625861

### XGBoost Classification

In [25]:
# Distinct values of `cut`, the column used as the target in the classification example
diamonds["cut"].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [26]:
from sklearn.preprocessing import OrdinalEncoder

# Every column except `cut` is a feature; `cut` is the class label
X, y = diamonds.drop("cut", axis=1), diamonds[['cut']]

# Turn the labels into ordinal codes following the explicit order below
# (first listed category -> 0, last -> 4)
encoder = OrdinalEncoder(
    categories=[['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']]
)
y_encoded = encoder.fit_transform(y)

# Non-numeric feature columns
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Cast them to the pandas `category` dtype in one go
X = X.astype({col: 'category' for col in cats})

# Stratified split so both parts keep the class proportions of y_encoded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=1,
    stratify=y_encoded,
)

In [27]:
# Side-by-side check of the original labels and their ordinal codes
y.head(), y_encoded[:5]

(       cut
 0    Ideal
 1  Premium
 2     Good
 3  Premium
 4     Good,
 array([[4.],
        [3.],
        [1.],
        [3.],
        [1.]]))

In [28]:
# Wrap the classification splits in DMatrix containers (features + encoded
# label), again with categorical-column handling switched on
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [29]:
# Multi-class settings: softprob objective over the 5 `cut` classes
params = {
    "objective": "multi:softprob",
    "tree_method": "hist",
    "num_class": 5,
}

# 5-fold CV, up to 100 rounds, tracking three metrics per round.
# NOTE(review): with several metrics, early stopping presumably follows the
# last one listed ("merror") -- confirm against the xgb.cv documentation.
results = xgb.cv(
    params,
    dtrain_clf,
    num_boost_round=100,
    nfold=5,
    metrics=["mlogloss", "auc", "merror"],
    early_stopping_rounds=20,
)

In [30]:
# One row per boosting round: fold mean and std for each tracked metric
results.head()

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,train-auc-mean,train-auc-std,train-merror-mean,train-merror-std,test-mlogloss-mean,test-mlogloss-std,test-auc-mean,test-auc-std,test-merror-mean,test-merror-std
0,1.257943,0.000997,0.891393,0.00063,0.254857,0.001343,1.261583,0.002479,0.886678,0.002057,0.259745,0.00282
1,1.07426,0.001194,0.89638,0.000756,0.253121,0.001328,1.081017,0.003486,0.890917,0.002089,0.257545,0.002881
2,0.956133,0.001643,0.90025,0.00099,0.251792,0.000893,0.965752,0.004386,0.894291,0.002224,0.256408,0.003637
3,0.875197,0.001545,0.902765,0.000853,0.249907,0.000781,0.88751,0.005268,0.896256,0.00231,0.255222,0.004134
4,0.816448,0.001812,0.905934,0.000791,0.24911,0.000991,0.831721,0.005485,0.898843,0.002048,0.254579,0.004024


In [31]:
# Highest mean test-fold AUC over the rounds recorded in `results`
results['test-auc-mean'].max()

0.9390686841532215

## XGBoost Sklearn

In [32]:
from sklearn import datasets

# Regression example data: feature matrix and target returned as a pair
X, y = datasets.load_diabetes(return_X_y=True)

In [33]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

In [34]:
# Cross-validate the scikit-learn style regressor. The scorer is the
# *negated* MSE, so every entry of `scores` is <= 0 and values closer to
# zero are better.
scores = cross_val_score(
    XGBRegressor(objective='reg:squarederror'),
    X,
    y,
    scoring='neg_mean_squared_error',
)

Classifier example

In [36]:
from sklearn import datasets

# Classification example data: feature matrix and labels returned as a pair
X, y = datasets.load_breast_cancer(return_X_y=True)

In [37]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Mean cross-validated score of a default XGBClassifier.
# NOTE(review): no `scoring` is passed, so this presumably reports the
# estimator's default score (accuracy for classifiers) -- confirm.
cross_val_score(XGBClassifier(), X, y).mean()

0.9701288619779538