In [None]:
# Info from https://xgboost.readthedocs.io/en/stable/tutorials/

In [34]:
# For my Python 3.11 venv
!pip install xgboost



In [17]:
#1. GETTING STARTED

from xgboost import XGBClassifier
# read data
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

data = load_iris()
# Fix the seed so the split, and every result derived from it, is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data['data'], data['target'], test_size=.2, random_state=42)
# create model instance
# The iris target has three classes, so request a multiclass objective
# explicitly instead of 'binary:logistic'.
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='multi:softprob')
# fit model
bst.fit(X_train, y_train)
# make predictions (class labels for the held-out split)
preds = bst.predict(X_test)

In [2]:
#2. Introduction to Model IO
# Write the fitted classifier to a .json file.
# (Shift+Alt+F in VS Code gives a readable view of the saved file.)
bst.save_model(fname='model_file_name.json')

In [3]:
# Element-wise difference between predicted and true labels; 0 marks a match.
preds - y_test

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0], dtype=int64)

In [4]:
# Score the classifier on the training split.
# NOTE(review): this is not the held-out test split, so it says nothing about generalization.
model_acc = bst.score(X=X_train, y=y_train)
model_acc

0.975

In [35]:
# DART BOOSTER
# XGBoost mostly combines a huge number of regression trees with a small learning rate.
# In that situation, trees added early are significant and trees added late are unimportant.
# DART is a method that brings dropout techniques from the deep neural net community
# to boosted trees; it has reported better results in some situations.
# Features:
# - Drops trees in order to counter over-fitting.
# - Trivial trees (added only to correct trivial errors) may be prevented.

import xgboost as xgb

# DMatrix is the basic data storage for XGBoost, used by all XGBoost algorithms for
# training, prediction and explanation. There are a few variants:
# - the normal DMatrix, which is a CSR matrix;
# - QuantileDMatrix, which is used by histogram-based tree methods to save memory;
# - the experimental external-memory-based DMatrix, which reads data in batches during training.
# NB! When given a file path, DMatrix blindly uses the default LIBSVM parser.
# For CSV files, provide a URI of the form train.csv?format=csv
#
# To read the tutorial's LIBSVM files instead of the in-memory arrays:
#   d_train = xgb.DMatrix('./agaricus.txt.train')
#   d_test = xgb.DMatrix('./agaricus.txt.test')
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)

# Booster parameters, one setting per line.
params = {
    'booster': 'dart',
    'max_depth': 5,
    'learning_rate': 0.1,
    'num_class': 3,
    'objective': 'multi:softmax',  # for multiclass classification
    'sample_type': 'uniform',
    'normalize_type': 'tree',
    'rate_drop': 0.1,
    'skip_drop': 0.5,
}
num_round = 50
bst_dart = xgb.train(params, dtrain=d_train, num_boost_round=num_round)
preds_dart = bst_dart.predict(data=d_test)

In [29]:
# Works fine :) -- 0 marks a test row whose predicted class equals its label.
preds_dart - y_test

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
# Standalone Random Forest  https://xgboost.readthedocs.io/en/stable/tutorials/rf.html
# XGBoost can train a standalone random forest, or use a random forest as the base
# model for gradient boosting, with the following params.

# Stored under its own name so the DART `params` dict defined earlier is not
# overwritten; a later cell still copies `params`.
params_rf = {
  'colsample_bynode': 0.8,
  'learning_rate': 1,
  'max_depth': 5,
  'num_parallel_tree': 100,
  # d_train carries the three-class iris labels, so a binary objective does not fit.
  'objective': 'multi:softmax',
  'num_class': 3,
  'subsample': 0.8,
  # CPU histogram method. The former 'gpu_hist' value needs a GPU-enabled build and is
  # deprecated since XGBoost 2.0; on a GPU, add 'device': 'cuda' to this dict instead.
  'tree_method': 'hist'
}

# One boosting round only: the forest consists of the num_parallel_tree trees
# grown in that single round. Uses the `xgb` module and `d_train` from the DART cell.
bst_rf = xgb.train(params_rf, d_train, num_boost_round=1)

In [21]:
# Feature interaction constraints.
#
# They let the user decide which variables are allowed to interact and which are not.
# Potential benefits include:
#   Better predictive performance from focusing on interactions that work – whether through domain specific knowledge or algorithms that rank interactions
#   Less noise in predictions; better generalization
#   More control to the user on what the model can fit. For example, the user may want to exclude some interactions even if they perform well due to regulatory constraints.

# For example, the constraint [0, 1] indicates that variables X0 and X1 are allowed to interact with each other but with no other variable.

# NOTE(review): `params` is whatever dict the most recently executed cell bound to that
# name -- confirm it is the intended base configuration for d_train.
# The constraint is passed as a string holding a nested list of feature indices:
#   - features 0 and 1 are allowed to interact with each other but with no other feature
#   - features 2 and 3 are allowed to interact with each other but with no other feature
params_constrained = dict(params, interaction_constraints='[[0, 1], [2, 3]]')

model_with_constraints = xgb.train(params_constrained, dtrain=d_train,
                                   num_boost_round=1000)
model_with_constraints.save_model(fname='model_with_constraints.json')

In [6]:
# Predict on the test DMatrix with the constrained booster.
preds_constr = model_with_constraints.predict(data=d_test)

In [7]:
# A difference appeared - it works.
preds_constr - y_test

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0., -1.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.])

In [None]:
# NB! XGBoost’s Python package supports using feature names instead of feature index for specifying the constraints. 
# Given a data frame with columns ["f0", "f1", "f2"], the feature interaction constraint can be specified as [["f0", "f2"]].

In [22]:
# Survival analysis
#
# Survival analysis (regression) models the time to an event of interest. It is a special
# kind of regression and differs from the conventional regression task as follows:
# - The label is always positive, since you cannot wait a negative amount of time until the event occurs.
# - The label may not be fully known, i.e. censored, because "it takes time to measure time."
# For example, it makes it possible to work with infinity in target values.

import numpy as np
import xgboost as xgb

# Ranged labels, one entry per row of X.
# This example shows each kind of censored label.
#                         uncensored    right     left  interval
y_lower_bound = np.array([      2.0,     3.0,     0.0,     4.0])
y_upper_bound = np.array([      2.0,  np.inf,     4.0,     5.0])

# 4-by-2 data matrix
X = np.array([[1, -1], [-1, 1], [0, 1], [1, 0]])
dtrain = xgb.DMatrix(data=X)

# Associate the lower and upper label bounds with the data matrix.
dtrain.set_float_info(field='label_lower_bound', data=y_lower_bound)
dtrain.set_float_info(field='label_upper_bound', data=y_upper_bound)

In [38]:
# Invoke the training API.
# Note that it is not yet possible to set the ranged label using the scikit-learn interface
# (e.g. xgboost.XGBRegressor). For now, use xgboost.train with xgboost.DMatrix.

# NOTE: this rebinds `params`, replacing the dict assigned by earlier cells.
params = {
    'objective': 'survival:aft',          # for this task
    'eval_metric': 'aft-nloglik',         # for this task
    'aft_loss_distribution': 'normal',    # for this task
    'aft_loss_distribution_scale': 1.20,  # for this task
    'tree_method': 'hist',
    'learning_rate': 0.05,
    'max_depth': 2,
}
# Evaluate on the training matrix itself so the metric is logged every round.
evals_surv = [(dtrain, 'train')]
bst_surv = xgb.train(params, dtrain, num_boost_round=5, evals=evals_surv)

bst_surv.save_model('bst_surv.json')

[0]	train-aft-nloglik:2.30142
[1]	train-aft-nloglik:2.24184
[2]	train-aft-nloglik:2.18633
[3]	train-aft-nloglik:2.13462
[4]	train-aft-nloglik:2.08645


In [None]:
#Parameter Tuning

# 1. Control overfitting parameters:
    
# - max_depth, min_child_weight and gamma.
# - subsample and colsample_bytree.
# - Reduce the step size eta (learning_rate). Remember to increase num_round when you do so.

# 2. Faster training performance
# - tree_method: set it to hist for faster computation. On a GPU, combine hist with device='cuda' (the older gpu_hist value is deprecated since XGBoost 2.0).

# 3. Handle Imbalanced Dataset

# - Balance the positive and negative weights via scale_pos_weight. A typical value is sum(negative instances) / sum(positive instances).
# - Use AUC for evaluation.
# - If you care about predicting the right probability, set the parameter max_delta_step to a finite number (say 1).