# Introduction to the xgbsurv package - Accelerated Hazards

This notebook introduces `xgbsurv` using a specific dataset. It structured by the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
from xgbsurv.datasets import load_metabric, load_flchain, load_support
from xgbsurv.models.utils import sort_X_y, transform_back
from pycox.evaluation import EvalSurv
from xgbsurv import XGBSurv
from sklearn.model_selection import train_test_split
import numpy as np
%load_ext autoreload
%autoreload 2


## Load Data

In [2]:
data, target = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False, return_X_y=True)
target_sign = np.sign(target)
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target_sign)

## Load Model

In [3]:
model = XGBSurv(n_estimators=100, objective="ah_objective",
                                             eval_metric="ah_loss",
                                             learning_rate=0.01,
                                             random_state=7, 
                                             disable_default_eval_metric=True,
                                             base_score=0.0)

The options of loss and objective functions can be obtained like below:

In [4]:
print(model.get_loss_functions().keys())
print(model.get_objective_functions().keys())

dict_keys(['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss', 'ah_loss', 'eh_loss'])
dict_keys(['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective', 'ah_objective', 'eh_objective'])


## Fit Model

In [5]:
eval_set = [(X_train, y_train)]

In [6]:
model.fit(X_train, y_train, eval_set=eval_set)

[0]	validation_0-ah_likelihood:4433.84373
[1]	validation_0-ah_likelihood:4433.42728
[2]	validation_0-ah_likelihood:4432.97618
[3]	validation_0-ah_likelihood:4432.51962
[4]	validation_0-ah_likelihood:4432.05468
[5]	validation_0-ah_likelihood:4431.58214
[6]	validation_0-ah_likelihood:4431.10021
[7]	validation_0-ah_likelihood:4430.61116
[8]	validation_0-ah_likelihood:4430.11294
[9]	validation_0-ah_likelihood:4429.62807
[10]	validation_0-ah_likelihood:4429.13555
[11]	validation_0-ah_likelihood:4428.62078
[12]	validation_0-ah_likelihood:4428.09506
[13]	validation_0-ah_likelihood:4427.55095
[14]	validation_0-ah_likelihood:4427.01107
[15]	validation_0-ah_likelihood:4426.43391
[16]	validation_0-ah_likelihood:4425.85402
[17]	validation_0-ah_likelihood:4425.27310
[18]	validation_0-ah_likelihood:4424.67131
[19]	validation_0-ah_likelihood:4424.05783
[20]	validation_0-ah_likelihood:4423.43350
[21]	validation_0-ah_likelihood:4422.79726
[22]	validation_0-ah_likelihood:4422.15059
[23]	validation_0-ah_

The model can be saved like below. Note that objective and eval_metric are not saved.

## Predict

In [None]:
preds_train = model.predict(X_train, output_margin=True)
preds_test = model.predict(X_test, output_margin=True)

In [None]:
preds_test

array([-3.4017135e-05,  5.0393733e-06, -3.4017135e-05, -3.4017135e-05,
       -3.4017135e-05,  5.0393733e-06, -3.4017135e-05,  5.0393733e-06,
       -3.4017135e-05, -3.4017135e-05, -3.4017135e-05,  8.3081555e-05,
       -3.4017135e-05, -3.4017135e-05, -3.4017135e-05,  8.3081555e-05,
       -3.4017135e-05,  8.3081555e-05, -3.4017135e-05, -3.4017135e-05,
       -3.4017135e-05,  5.0393733e-06, -3.4017135e-05, -3.4017135e-05,
       -3.4017135e-05, -3.4017135e-05,  5.0393733e-06,  5.0393733e-06,
       -3.4017135e-05, -3.4017135e-05, -3.4017135e-05, -3.4017135e-05,
       -3.4017135e-05, -3.4017135e-05,  5.0393733e-06, -3.4017135e-05,
       -3.4017135e-05,  8.3081555e-05,  5.0393733e-06,  5.0393733e-06,
        8.3081555e-05,  8.3081555e-05,  8.3081555e-05, -3.4017135e-05,
       -3.4017135e-05,  8.3081555e-05, -3.4017135e-05,  5.0393733e-06,
        5.0393733e-06, -3.4017135e-05, -3.4017135e-05, -3.4017135e-05,
        5.0393733e-06, -3.4017135e-05,  5.0393733e-06,  8.3081555e-05,
      

## Evaluate

In [None]:
df_cum_hazards = model.predict_cumulative_hazard_function(X_train, X_test, y_train, y_test)
df_cum_hazards # = df_cum_hazards.T.sort_index(axis=0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,466,467,468,469,470,471,472,473,474,475
1.766667,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,...,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787,0.000787
2.300000,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,...,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998,0.000998
2.400000,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,...,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049,0.001049
3.500000,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,...,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789,0.001789
5.433333,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,...,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772,0.003772
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297.233337,1.375410,1.375356,1.375410,1.375410,1.375410,1.375356,1.375410,1.375356,1.375410,1.375410,...,1.375410,1.375410,1.375356,1.375416,1.375410,1.375410,1.375416,1.375416,1.375416,1.375410
300.866669,1.393679,1.393624,1.393679,1.393679,1.393679,1.393624,1.393679,1.393624,1.393679,1.393679,...,1.393679,1.393679,1.393624,1.393683,1.393679,1.393679,1.393683,1.393683,1.393683,1.393679
307.933319,1.429249,1.429193,1.429249,1.429249,1.429249,1.429193,1.429249,1.429193,1.429249,1.429249,...,1.429249,1.429249,1.429193,1.429249,1.429249,1.429249,1.429249,1.429249,1.429249,1.429249
330.366669,1.542450,1.542389,1.542450,1.542450,1.542450,1.542389,1.542450,1.542389,1.542450,1.542450,...,1.542450,1.542450,1.542389,1.542437,1.542450,1.542450,1.542437,1.542437,1.542437,1.542450


In [None]:
df_survival_function = np.exp(-df_cum_hazards)
durations_test, events_test = transform_back(y_test)
time_grid = np.linspace(durations_test.min(), durations_test.max(), 100)
ev = EvalSurv(df_survival_function, durations_test, events_test, censor_surv='km')
print('Concordance Index',ev.concordance_td('antolini'))
print('Brier Score',ev.integrated_brier_score(time_grid))

Concordance Index 0.24266752833565322
Brier Score 0.19069861253671785


In [16]:
from xgbsurv.evaluation import cindex_censored, ibs
print(cindex_censored(y_train, preds_train))
print(cindex_censored(y_test, preds_test))

0.5603801100476893
0.5493450487174388
