# Introduction to the xgbsurv package - Accelerated Hazards

This notebook introduces `xgbsurv` using a specific dataset. It is structured into the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
import numpy as np
from pycox.evaluation import EvalSurv
from sklearn.model_selection import train_test_split

from xgbsurv import XGBSurv
from xgbsurv.datasets import load_metabric, load_flchain
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y, transform_back

%load_ext autoreload
%autoreload 2


## Load Data

In [2]:
# NOTE(review): machine-specific absolute path; point DATA_DIR at your local
# copy of the xgbsurv dataset directory before running.
DATA_DIR = "/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/"
# Fixed seed so the train/test split (and every metric computed from it) is
# reproducible across Restart & Run All.
SPLIT_SEED = 7

data, target = load_metabric(path=DATA_DIR, as_frame=False, return_X_y=True)
# Stratify on the sign of the target so both splits keep the same proportion of
# positive and negative targets (presumably the sign encodes the event
# indicator; confirm against the xgbsurv target convention).
target_sign = np.sign(target)
X_train, X_test, y_train, y_test = train_test_split(
    data, target, stratify=target_sign, random_state=SPLIT_SEED
)

## Load Model

In [3]:
# Hyper-parameters for the accelerated-hazards ("ah") setup: the custom
# objective is paired with its matching loss as the evaluation metric.
ah_params = dict(
    n_estimators=100,
    objective="ah_objective",
    eval_metric="ah_loss",
    learning_rate=0.01,
    random_state=7,
    disable_default_eval_metric=True,
    base_score=0.0,
)
model = XGBSurv(**ah_params)

The available loss and objective functions can be listed as shown below:

In [4]:
# Show the registered names: loss functions first, then objective functions.
for getter in (model.get_loss_functions, model.get_objective_functions):
    print(getter().keys())

dict_keys(['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss', 'ah_loss', 'eh_loss'])
dict_keys(['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective', 'ah_objective', 'eh_objective'])


## Fit Model

In [5]:
# The evaluation set is the training data itself, so the metric logged during
# fitting is the training loss, not a held-out validation loss.
eval_set = [(X_train, y_train)]

In [6]:
# Fit on the training split; the evaluation metric is reported per boosting
# round for each entry of eval_set.
model.fit(X_train, y_train, eval_set=eval_set)

[0]	validation_0-ah_likelihood:3.11114
[1]	validation_0-ah_likelihood:3.11114
[2]	validation_0-ah_likelihood:3.11114
[3]	validation_0-ah_likelihood:3.11113
[4]	validation_0-ah_likelihood:3.11113
[5]	validation_0-ah_likelihood:3.11113
[6]	validation_0-ah_likelihood:3.11113
[7]	validation_0-ah_likelihood:3.11113
[8]	validation_0-ah_likelihood:3.11113
[9]	validation_0-ah_likelihood:3.11113
[10]	validation_0-ah_likelihood:3.11113
[11]	validation_0-ah_likelihood:3.11112
[12]	validation_0-ah_likelihood:3.11112
[13]	validation_0-ah_likelihood:3.11112
[14]	validation_0-ah_likelihood:3.11112
[15]	validation_0-ah_likelihood:3.11112
[16]	validation_0-ah_likelihood:3.11112
[17]	validation_0-ah_likelihood:3.11112
[18]	validation_0-ah_likelihood:3.11112
[19]	validation_0-ah_likelihood:3.11111
[20]	validation_0-ah_likelihood:3.11111
[21]	validation_0-ah_likelihood:3.11111
[22]	validation_0-ah_likelihood:3.11111
[23]	validation_0-ah_likelihood:3.11111
[24]	validation_0-ah_likelihood:3.11111
[25]	valid

The model can be saved as shown below. Note that `objective` and `eval_metric` are not saved.

## Predict

In [7]:
# Raw margin predictions (output_margin=True) for the train split, then the test split.
preds_train, preds_test = (
    model.predict(features, output_margin=True)
    for features in (X_train, X_test)
)

In [8]:
# Display the test-set margin predictions.
# NOTE(review): this shows the full array; consider a slice such as
# preds_test[:10] to keep the notebook output short.
preds_test

array([-5.0846679e-04, -1.3310128e-04,  5.2669726e-04,  4.2177667e-04,
        4.0728963e-04, -5.7755277e-04,  5.2669726e-04, -7.2762673e-04,
       -5.7755277e-04,  4.0728963e-04,  2.0074837e-03,  4.0728963e-04,
       -3.7794394e-04, -7.0622435e-04, -5.5071570e-05, -7.2762673e-04,
        2.6970375e-03, -7.2762673e-04, -7.2762673e-04, -7.0622435e-04,
       -5.5071570e-05,  5.2669726e-04, -3.7794394e-04,  4.0728963e-04,
        5.9974001e-04,  2.0074837e-03,  5.2669726e-04, -1.1233825e-03,
       -7.2762673e-04, -5.7755277e-04,  2.3529445e-05, -3.7794394e-04,
       -7.7500002e-04,  4.0728963e-04, -3.7794394e-04, -3.7794394e-04,
       -7.2762673e-04, -7.2762673e-04, -5.7755277e-04, -5.0846679e-04,
       -5.7755277e-04,  5.2669726e-04,  5.2669726e-04,  5.2669726e-04,
        4.2177667e-04, -1.3310128e-04,  4.0728963e-04,  5.2669726e-04,
       -5.0547312e-04,  2.3529445e-05, -7.2762673e-04, -5.7755277e-04,
        5.4763812e-05, -7.0622435e-04,  4.0728963e-04,  2.3529445e-05,
      

## Evaluate

In [9]:
# Cumulative hazard estimates. The displayed frame is indexed by time with one
# column per predicted sample (presumably the rows of X_test; confirm).
df_cum_hazards = model.predict_cumulative_hazard_function(
    X_train,
    X_test,
    y_train,
    y_test,
)
df_cum_hazards

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,466,467,468,469,470,471,472,473,474,475
2.400000,0.001443,0.001474,0.001473,0.001474,0.001474,0.001443,0.001473,0.001443,0.001443,0.001474,...,0.001443,0.001474,0.001471,0.001443,0.001474,0.001443,0.001443,0.001474,0.001443,0.001443
3.366667,0.002259,0.002281,0.002280,0.002280,0.002280,0.002259,0.002280,0.002260,0.002259,0.002280,...,0.002260,0.002280,0.002276,0.002260,0.002281,0.002259,0.002260,0.002280,0.002259,0.002259
3.766667,0.002517,0.002536,0.002535,0.002535,0.002535,0.002517,0.002535,0.002518,0.002517,0.002535,...,0.002518,0.002535,0.002531,0.002518,0.002536,0.002517,0.002518,0.002535,0.002517,0.002517
4.166667,0.002751,0.002769,0.002767,0.002767,0.002767,0.002751,0.002767,0.002751,0.002751,0.002767,...,0.002751,0.002767,0.002763,0.002751,0.002769,0.002750,0.002751,0.002767,0.002751,0.002751
4.433333,0.002904,0.002923,0.002921,0.002921,0.002921,0.002905,0.002921,0.002905,0.002905,0.002921,...,0.002905,0.002921,0.002917,0.002905,0.002923,0.002904,0.002905,0.002921,0.002905,0.002905
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282.566681,1.286850,1.286845,1.286791,1.286767,1.286786,1.286780,1.286791,1.286814,1.286780,1.286786,...,1.286787,1.286786,1.286951,1.286787,1.286780,1.286842,1.286814,1.286786,1.286780,1.286780
291.166656,1.327775,1.327914,1.327834,1.327973,1.327833,1.327867,1.327834,1.327907,1.327867,1.327833,...,1.327879,1.327833,1.327935,1.327879,1.327847,1.327921,1.327907,1.327833,1.327867,1.327867
307.933319,1.407942,1.408051,1.408078,1.408067,1.408087,1.408040,1.408078,1.407932,1.408040,1.408087,...,1.408061,1.408087,1.408221,1.408061,1.407981,1.408077,1.407932,1.408087,1.408040,1.408040
335.733337,1.540904,1.540963,1.541061,1.541064,1.540927,1.540851,1.541061,1.540923,1.540851,1.540927,...,1.540890,1.540927,1.541166,1.540890,1.540886,1.540863,1.540923,1.540927,1.540851,1.540851


In [10]:
# Split the encoded test target back into durations and event indicators.
durations_test, events_test = transform_back(y_test)
# 100 evenly spaced evaluation times spanning the observed test durations.
time_grid = np.linspace(start=durations_test.min(), stop=durations_test.max(), num=100)
# Survival function from cumulative hazard: S(t) = exp(-H(t)).
df_survival_function = np.exp(-df_cum_hazards)

ev = EvalSurv(
    surv=df_survival_function,
    durations=durations_test,
    events=events_test,
    censor_surv='km',
)
concordance = ev.concordance_td('antolini')
print('Concordance Index', concordance)
integrated_brier = ev.integrated_brier_score(time_grid)
print('Brier Score', integrated_brier)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


Concordance Index 0.4440193131092284
Brier Score 0.1798179818373157


In [11]:
# Concordance index of the raw margin predictions on the train split, then the
# test split. cindex_censored is imported in the notebook's top import cell so
# that all dependencies are declared in one place.
print(cindex_censored(y_train, preds_train))
print(cindex_censored(y_test, preds_test))

0.5112650375595262
0.48294485873059256
