In [1]:
# Info from https://xgboost.readthedocs.io/en/stable/tutorials/

In [2]:
# For my Python 3.11 venv
!pip install xgboost



In [18]:
#1. GETTING STARTED

from xgboost import XGBClassifier
# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

data = load_iris()
X_train, X_test, y_train, y_test = train_test_split(data['data'], data['target'], test_size=.2)
# create model instance
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# fit model
bst.fit(X_train, y_train)
# make predictions
preds = bst.predict(X_test)

In [103]:
X_train

array([[7.9, 3.8, 6.4, 2. ],
       [4.5, 2.3, 1.3, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [6.7, 3.3, 5.7, 2.5],
       [5.7, 2.8, 4.1, 1.3],
       [5.8, 2.6, 4. , 1.2],
       [4.6, 3.2, 1.4, 0.2],
       [6.2, 2.9, 4.3, 1.3],
       [4.6, 3.4, 1.4, 0.3],
       [6.4, 3.2, 5.3, 2.3],
       [7.6, 3. , 6.6, 2.1],
       [5.6, 3. , 4.5, 1.5],
       [6.2, 3.4, 5.4, 2.3],
       [6.7, 3.1, 4.7, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.1, 3.7, 1.5, 0.4],
       [5.4, 3. , 4.5, 1.5],
       [7. , 3.2, 4.7, 1.4],
       [5.4, 3.9, 1.7, 0.4],
       [5.5, 2.4, 3.7, 1. ],
       [4.7, 3.2, 1.3, 0.2],
       [5.2, 4.1, 1.5, 0.1],
       [4.8, 3.4, 1.9, 0.2],
       [6.5, 3. , 5.8, 2.2],
       [4.6, 3.1, 1.5, 0.2],
       [6.1, 2.9, 4.7, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [5.6, 2.9, 3.6, 1.3],
       [6.8, 2.8, 4.8, 1.4],
       [7.2, 3. , 5.8, 1.6],
       [5.1, 3.8, 1.6, 0.2],
       [5.2, 3.5, 1.5, 0.2],
       [5.5, 2.4, 3.8, 1.1],
       [6.4, 2.9, 4.3, 1.3],
       [6. , 3

In [19]:
#2. Introduction to Model IO
#Shift+Alt+F for normal view in VSC
bst.save_model('model_file_name.json')

In [91]:
bst # it is a XGBClassifier https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier

In [65]:
bst.get_params()
# About these part in the next video with Python Package

{'objective': 'multi:softprob',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 1,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 2,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 2,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [76]:
bst.score(X_test, y_test)

0.9333333333333333

In [106]:
preds-y_test

array([ 0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0], dtype=int64)

In [21]:
model_acc = bst.score(X_train, y_train)
model_acc

0.9916666666666667

In [110]:
#3. DART BOOSTER
# XGBoost mostly combines a huge number of regression trees with a small learning rate. In this situation, trees added early are significant and trees added late are unimportant.
# It's a new method to add dropout techniques from the deep neural net community to boosted trees, and reported better results in some situations.
# Features:
# - Drop trees in order to solve the over-fitting.
# - Trivial trees (to correct trivial errors) may be prevented.

import xgboost as xgb
# read in data
#d_train = xgb.DMatrix('./agaricus.txt.train#dtrain.cache')
#d_test = xgb.DMatrix('./agaricus.txt.test#dtest.cache')

#d_train = xgb.DMatrix('./agaricus.txt.train')
#d_test = xgb.DMatrix('./agaricus.txt.test')


# DMatrix is the basic data storage for XGBoost used by all XGBoost algorithms including both training, prediction and explanation. There are a few
# variants of DMatrix including normal DMatrix, which is a CSR matrix, QuantileDMatrix, which is used by histogram-based tree methods for saving memory,
# and lastly the experimental external-memory-based DMatrix, which reads data in batches during training. 
# NB! XGBoost DMatrix will blindly use the default LIBSVM parser. For CSV files, users need to provide an URI in the form of train.csv?format=csv

d_train = xgb.DMatrix(X_train, y_train)
d_test = xgb.DMatrix(X_test, y_test)

# specify parameters via map
params = {'booster': 'dart',
         'max_depth': 5, 'learning_rate': 0.1,
         'num_class': 3, 'objective': 'multi:softmax', #for multicalss classification
         'sample_type': 'uniform',
         'normalize_type': 'tree',
         'rate_drop': 0.1,
         'skip_drop': 0.5}
num_round = 50
bst_dart = xgb.train(params, d_train, num_round)
preds_dart = bst_dart.predict(d_test)

In [111]:
# Works fine:)
preds_dart-y_test

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0., -1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0., -1.,  0.,  0.])

In [112]:
# It's a booster - https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.Booster
bst_dart

<xgboost.core.Booster at 0x2936ea5e850>

In [113]:
bst_dart.get_score() # feature importance of each feature

{'f0': 35.0, 'f1': 29.0, 'f2': 265.0, 'f3': 93.0}

In [None]:
#4. Standalone Random Forest  https://xgboost.readthedocs.io/en/stable/tutorials/rf.html
# We can use XGBoost to train a standalone random forest or use random forest as a base model for gradient boosting with the following params:

params = {
  'colsample_bynode': 0.8,
  'learning_rate': 1,
  'max_depth': 5,
  'num_parallel_tree': 100,
  'objective': 'binary:logistic',
  'subsample': 0.8,
  'tree_method': 'gpu_hist'
}

bst_SRF = xgb.train(params, d_train, num_boost_round=1)

#It will give a mistake, because data is not binary

In [27]:
#5. Feature interaction constraints.
# 
# It allows users to decide which variables are allowed to interact and which are not.
# Potential benefits include:
#   Better predictive performance from focusing on interactions that work – whether through domain specific knowledge or algorithms that rank interactions
#   Less noise in predictions; better generalization
#   More control to the user on what the model can fit. For example, the user may want to exclude some interactions even if they perform well due to regulatory constraints.

# For example, the constraint [0, 1] indicates that variables X0 and X1 are allowed to interact with each other but with no other variable. 

params_constrained = params.copy()
# Use nested list to define feature interaction constraints
params_constrained['interaction_constraints'] = '[[0, 1], [2, 3]]'

model_with_constraints = xgb.train(params_constrained, d_train,
                                   num_boost_round = 1000)
model_with_constraints.save_model('model_with_constraints.json')

In [115]:
preds_constr = model_with_constraints.predict(d_test)

In [116]:
preds_constr-y_test
#Difference appered - it works.

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0., -1.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0., -1.,  0.,  0.])

In [None]:
# NB! XGBoost’s Python package supports using feature names instead of feature index for specifying the constraints. 
# Given a data frame with columns ["f0", "f1", "f2"], the feature interaction constraint can be specified as [["f0", "f2"]].

In [96]:
#6. Survival analysis

# Survival analysis (regression) models time to an event of interest. Survival analysis is a special kind of regression and differs from the conventional regression task as follows:
# The label is always positive, since you cannot wait a negative amount of time until the event occurs.
# The label may not be fully known, or censored, because “it takes time to measure time.”
# For example, it helps to works with infinity in target values

import numpy as np
import xgboost as xgb

# 4-by-2 Data matrix
X = np.array([[1, -1], [-1, 1], [0, 1], [1, 0]])
SAtrain = xgb.DMatrix(X)

# Associate ranged labels with the data matrix.
# This example shows each kind of censored labels.
#                         uncensored    right     left  interval
y_lower_bound = np.array([      2.0,     3.0,     0.0,     4.0])
y_upper_bound = np.array([      2.0, +np.inf,     4.0,     5.0])
SAtrain.set_float_info('label_lower_bound', y_lower_bound)
SAtrain.set_float_info('label_upper_bound', y_upper_bound)

In [98]:
# invoke the training API:
# Note that it is not yet possible to set the ranged label using the scikit-learn interface (e.g. xgboost.XGBRegressor). For now, you should use xgboost.train with xgboost.DMatrix

params = {'objective': 'survival:aft', # for this task
          'eval_metric': 'aft-nloglik', # for this task
          'aft_loss_distribution': 'normal', # for this task
          'aft_loss_distribution_scale': 1.20, # for this task
          'tree_method': 'hist', 'learning_rate': 0.05, 'max_depth': 2}
bst_surv_SA = xgb.train(params, SAtrain, num_boost_round=5,
                evals=[(SAtrain, 'train')])

bst_surv_SA.save_model('bst_surv.json')

[0]	train-aft-nloglik:2.30142
[1]	train-aft-nloglik:2.24184
[2]	train-aft-nloglik:2.18633
[3]	train-aft-nloglik:2.13462
[4]	train-aft-nloglik:2.08645


In [32]:
#7. Customized Objective Function

from typing import Tuple

def gradient(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the gradient squared log error.'''
    y = dtrain.get_label()
    return (np.log1p(predt) - np.log1p(y)) / (predt + 1)

def hessian(predt: np.ndarray, dtrain: xgb.DMatrix) -> np.ndarray:
    '''Compute the hessian for squared log error.'''
    y = dtrain.get_label()
    return ((-np.log1p(predt) + np.log1p(y) + 1) /
            np.power(predt + 1, 2))

def squared_log(predt: np.ndarray,
                dtrain: xgb.DMatrix) -> Tuple[np.ndarray, np.ndarray]:
    '''Squared Log Error objective. A simplified version for RMSLE used as
    objective function.
    '''
    predt[predt < -1] = -1 + 1e-6
    grad = gradient(predt, dtrain)
    hess = hessian(predt, dtrain)
    return grad, hess

In [33]:
# Objective is then used as a callback function for XGBoost during training by passing it as an argument to xgb.train

xgb_def = xgb.train({'tree_method': 'hist', 'seed': 1994},  # any other tree method is fine.
           dtrain=d_train,
           num_boost_round=10,
           obj=squared_log)

xgb_def.save_model('xgb_def.json')

In [45]:
# Customized Metric Function

def rmsle(predt: np.ndarray, dtrain: xgb.DMatrix) -> Tuple[str, float]:
    ''' Root mean squared log error metric.'''
    y = dtrain.get_label()
    predt[predt < -1] = -1 + 1e-6
    elements = np.power(np.log1p(y) - np.log1p(predt), 2)
    return 'PyRMSLE', float(np.sqrt(np.sum(elements) / len(y)))


xgb_def_metric = xgb.train({'tree_method': 'hist', 'seed': 1994,
           'disable_default_eval_metric': 1},
                           
# Notice that the parameter disable_default_eval_metric is used to suppress the default metric in XGBoost.
          dtrain=d_train,
          num_boost_round=10,
          obj=squared_log,
          custom_metric=rmsle,
# In tutorial is used feval, but it gives an error with recommendation to put custom_metric
          evals=[(d_train, 'dtrain'), (d_test, 'dtest')],)
#          evals_result=results)


[0]	dtrain-PyRMSLE:0.37276	dtest-PyRMSLE:0.40974
[1]	dtrain-PyRMSLE:0.30139	dtest-PyRMSLE:0.34202
[2]	dtrain-PyRMSLE:0.24429	dtest-PyRMSLE:0.29062
[3]	dtrain-PyRMSLE:0.19758	dtest-PyRMSLE:0.24313
[4]	dtrain-PyRMSLE:0.15996	dtest-PyRMSLE:0.20576
[5]	dtrain-PyRMSLE:0.12990	dtest-PyRMSLE:0.17592
[6]	dtrain-PyRMSLE:0.10622	dtest-PyRMSLE:0.15912
[7]	dtrain-PyRMSLE:0.08804	dtest-PyRMSLE:0.14321
[8]	dtrain-PyRMSLE:0.07397	dtest-PyRMSLE:0.13277
[9]	dtrain-PyRMSLE:0.06376	dtest-PyRMSLE:0.12323


In [99]:
#8. Scikit-Learn Interface - to improve the integration with standard scikit-learn functions

from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_absolute_error
X, y = load_diabetes(return_X_y=True)
reg = xgb.XGBRegressor(
    tree_method="hist",
    eval_metric=mean_absolute_error,
)
reg.fit(X, y, eval_set=[(X, y)])

[0]	validation_0-rmse:125.60229	validation_0-mean_absolute_error:107.86327
[1]	validation_0-rmse:94.53059	validation_0-mean_absolute_error:78.02611
[2]	validation_0-rmse:72.70615	validation_0-mean_absolute_error:57.60754
[3]	validation_0-rmse:57.41636	validation_0-mean_absolute_error:44.09879
[4]	validation_0-rmse:46.72110	validation_0-mean_absolute_error:35.53532
[5]	validation_0-rmse:39.40697	validation_0-mean_absolute_error:30.12643
[6]	validation_0-rmse:33.75610	validation_0-mean_absolute_error:25.94312
[7]	validation_0-rmse:29.48226	validation_0-mean_absolute_error:22.60080
[8]	validation_0-rmse:26.30025	validation_0-mean_absolute_error:20.16968
[9]	validation_0-rmse:23.10979	validation_0-mean_absolute_error:17.79017
[10]	validation_0-rmse:21.35165	validation_0-mean_absolute_error:16.31033
[11]	validation_0-rmse:19.53509	validation_0-mean_absolute_error:14.95299
[12]	validation_0-rmse:18.42825	validation_0-mean_absolute_error:14.12309
[13]	validation_0-rmse:17.15199	validation_0-m

In [100]:
# For custom objective function, users can define the objective without having to access DMatrix

def softprob_obj(labels: np.ndarray, predt: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    rows = labels.shape[0]
    grad = np.zeros((rows, classes), dtype=float)
    hess = np.zeros((rows, classes), dtype=float)
    eps = 1e-6
    for r in range(predt.shape[0]):
        target = labels[r]
        p = softmax(predt[r, :])
        for c in range(predt.shape[1]):
            g = p[c] - 1.0 if c == target else p[c]
            h = max((2.0 * p[c] * (1.0 - p[c])).item(), eps)
            grad[r, c] = g
            hess[r, c] = h

    grad = grad.reshape((rows * classes, 1))
    hess = hess.reshape((rows * classes, 1))
    return grad, hess

clf = xgb.XGBClassifier(tree_method="hist", objective=softprob_obj)

In [None]:
#9. Categorical Data - in my previous video

In [118]:
#10. Multiple Outputs. For instance, a movie can be simultaneously classified as both sci-fi and comedy. 

from sklearn.datasets import make_multilabel_classification
import numpy as np

X, y = make_multilabel_classification(
    n_samples=32, n_classes=5, n_labels=3, random_state=0
)
clf = xgb.XGBClassifier(tree_method="hist")
clf.fit(X, y)
res = np.testing.assert_allclose(clf.predict(X), y)
# np.testing.assert_allclose: Raises an AssertionError if two objects are not equal up to desired tolerance. No error.

In [None]:
#11. XGBoost forum - https://discuss.xgboost.ai/

In [None]:
#12. Parameter Tuning (the list of parameteres is here - https://xgboost.readthedocs.io/en/stable/python/index.html#)

#12.1. Control overfitting parameters:
    
# - max_depth, min_child_weight and gamma.
# - subsample and colsample_bytree.
# - eta. Remember to increase num_round when you do so.

# 2. Faster training performance
# - tree_method, set it to hist or gpu_hist for faster computation.

# 2. Handle Imbalanced Dataset (Should use in my previois video, where data is unbalanced. I used it and got an effect).

# - Balance the positive and negative weights via scale_pos_weight. It is counted = sum(negative instances) / sum(positive instances).
# - Use AUC for evaluation
# - If you care about predicting the right probability Set parameter max_delta_step to a finite number (say 1)

#12.2. 

# General parameters relate to which booster we are using to do boosting, commonly tree or linear model

# Booster parameters depend on which booster you have chosen

# Learning task parameters decide on the learning scenario. For example, regression tasks may use different parameters with ranking tasks.

# Command line parameters relate to behavior of CLI version of XGBoost.

# Details here - https://xgboost.readthedocs.io/en/stable/parameter.html#


# About XGBoost Parameters, Prediction, Tree methods and Python Package will be in next video.