# XGBoost

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

## Intro to XGBoost

## XGBoost Library

XGBoost is an industry-proven, open-source software library that provides a gradient boosting framework capable of scaling to billions of data points quickly and efficiently.

Docs: https://xgboost.readthedocs.io/en/stable/index.html

**XGBoost** is an optimized distributed gradient boosting library designed to be highly **efficient, flexible and portable**. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides parallel tree boosting (also known as GBDT or GBM) that solves many data science problems in a fast and accurate way. The same code runs on major distributed environments (Hadoop, SGE, MPI) and can solve problems beyond billions of examples.

Installation: https://xgboost.readthedocs.io/en/stable/install.html#python
- `pip install xgboost`

In [None]:
import xgboost

# Show which XGBoost release is installed in the kernel's environment.
print(xgboost.__version__)

## XGBoost native API

In [None]:
# Load seaborn's "diamonds" example dataset and preview the first rows.
diamonds = sns.load_dataset("diamonds")
diamonds.head()

In [None]:
# (rows, columns) of the full dataset
diamonds.shape

In [None]:
# Summary statistics; for a mixed-dtype frame describe() reports the numeric columns by default
diamonds.describe()

In [None]:
# Summary of the remaining (non-numeric) columns
diamonds.describe(exclude=np.number)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the regression target (price).
# y is kept as a one-column DataFrame.
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

# Names of the non-numeric feature columns
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Cast them to the pandas 'category' dtype; the DMatrix objects built from this
# data are created with enable_categorical=True and rely on that dtype.
for col in cats:
    X[col] = X[col].astype('category')

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line to the output).
X.info()

# Hold out a test set; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
import xgboost as xgb

# Wrap the train / test splits in XGBoost's DMatrix container.
# enable_categorical=True lets XGBoost consume the pandas 'category' columns directly.
dtrain_reg = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [None]:
# Booster hyperparameters: squared-error regression objective, histogram tree method
params = dict(objective="reg:squarederror", tree_method="hist")

In [None]:
# Train a booster for a fixed 100 rounds (no evaluation sets, no early stopping).
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=100,
)

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions of the trained booster for the held-out test matrix
preds = model.predict(dtest_reg)

In [None]:
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so take the
# square root of the MSE explicitly; this form works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, preds)))
print(f"RMSE of the base model: {rmse:.3f}")

### Using Validation Sets During Training

In [None]:
# Same hyperparameters as the base model, declared again for this section
params = dict(objective="reg:squarederror", tree_method="hist")

In [None]:
# Datasets to evaluate during training, as (DMatrix, display name) pairs.
# NOTE(review): the following cell assigns the identical list again, so this cell is redundant.
evals = [(dtest_reg, "validation"), (dtrain_reg, "train")]

In [None]:
# Evaluation sets as (DMatrix, display name) pairs; their metrics are reported during training.
evals = [
    (dtest_reg, "validation"),
    (dtrain_reg, "train"),
]

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=100,
    evals=evals,
    verbose_eval=10,  # Print every ten rounds
)

### XGBoost Early Stopping

In [None]:
# No early_stopping_rounds is passed here, so all 5000 boosting rounds are trained;
# the metrics of the evaluation sets are printed every 500 rounds.
model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=5000,
    evals=evals,
    verbose_eval=500,
)

In [None]:
# Early stopping watches a single dataset and a single metric:
#   * when `evals` has several entries, the LAST one is used, which is why the
#     validation matrix is listed last here;
#   * when params' eval_metric holds more than one metric, the last metric is used.
evals = [
    (dtrain_reg, "train"),
    (dtest_reg, "validation"),
]

model = xgb.train(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=1000,
    evals=evals,
    verbose_eval=50,
    # Stop once the watched metric has not improved for 50 consecutive rounds
    early_stopping_rounds=50,
)

### XGBoost Cross-Validation

In [None]:
params = dict(objective="reg:squarederror", tree_method="hist")
n = 1000  # upper bound on the number of boosting rounds

# 5-fold cross-validation on the training matrix, with early stopping after
# 20 rounds without improvement of the cross-validated metric.
results = xgb.cv(
    params=params,
    dtrain=dtrain_reg,
    num_boost_round=n,
    nfold=5,
    early_stopping_rounds=20,
)

In [None]:
# First rows of the cross-validation history returned by xgb.cv
results.head()

In [None]:
# Lowest mean test-fold RMSE among the boosting rounds kept in `results`
best_rmse = results['test-rmse-mean'].min()
best_rmse

### XGBoost Classification

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Classification task: predict `cut` from every other column.
X = diamonds.drop("cut", axis=1)
y = diamonds[['cut']]

# XGBoost needs numeric class labels, so map each `cut` label to a number.
y_encoded = OrdinalEncoder().fit_transform(y)

# Names of the non-numeric feature columns
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Make sure each of them uses the pandas 'category' dtype
for col in cats:
    X[col] = X[col].astype('category')

# Stratified split: the class proportions of y_encoded are kept in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    random_state=1,
    stratify=y_encoded,
)

In [None]:
# Distinct class labels of the target before encoding
y["cut"].unique()

In [None]:
# DMatrix containers for the classification task (numeric-encoded `cut` as label);
# enable_categorical=True lets XGBoost use the 'category' feature columns directly.
dtrain_clf = xgb.DMatrix(data=X_train, label=y_train, enable_categorical=True)
dtest_clf = xgb.DMatrix(data=X_test, label=y_test, enable_categorical=True)

In [None]:
# Multi-class objective returning per-class probabilities; num_class must match
# the number of distinct target labels.
params = dict(objective="multi:softprob", tree_method="hist", num_class=5)

# Three metrics are tracked. Per the XGBoost docs, when several metrics are given
# the last one (here "merror") is the one used for early stopping.
results = xgb.cv(
    params=params,
    dtrain=dtrain_clf,
    num_boost_round=100,
    nfold=5,
    metrics=["mlogloss", "auc", "merror"],
    early_stopping_rounds=20,
)

In [None]:
# First rows of the classification cross-validation history
results.head()

In [None]:
# Highest mean test-fold AUC among the recorded boosting rounds
results['test-auc-mean'].max()

## XGBoost Sklearn

In [None]:
from sklearn import datasets

# Load the diabetes dataset as a (features, target) pair.
# NOTE: this rebinds X and y, which held the diamonds data in the sections above.
X,y = datasets.load_diabetes(return_X_y=True)

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validate a squared-error XGBRegressor. scikit-learn scorers follow a
# "higher is better" convention, so the MSE comes back negated.
scores = cross_val_score(
    estimator=XGBRegressor(objective='reg:squarederror'),
    X=X,
    y=y,
    scoring='neg_mean_squared_error',
)

In [None]:
# scores hold negated MSE values: flip the sign and take the square root to get the RMSE per fold
(-scores)**0.5 

In [None]:
from sklearn import datasets

# Load the breast-cancer dataset as a (features, target) pair.
# NOTE: this rebinds X and y again, replacing the diabetes data loaded above.
X,y = datasets.load_breast_cancer(return_X_y=True)

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Cross-validate a default XGBClassifier (scikit-learn's default CV splitter and
# the estimator's default score) and report the mean over the folds.
clf_scores = cross_val_score(XGBClassifier(), X, y)
clf_scores.mean()