# Introduction to the xgbsurv package

This notebook introduces `xgbsurv` using a specific dataset. It is structured into the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
from xgbsurv.datasets import load_metabric
from xgbsurv import XGBSurv
from xgbsurv.models.utils import sort_X_y, transform_back
#from xgbsurv.models import aft_baseline_hazard_estimator
from pycox.evaluation import EvalSurv
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
%load_ext autoreload
%autoreload 2


## Load Data

In [2]:
# Location of the METABRIC files on disk. This path is machine-specific: point it
# at your local copy of the xgbsurv datasets before running the notebook.
DATA_DIR = "/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/"
# Fixed seed so that the train/test split is identical on every fresh run.
SPLIT_SEED = 7

data = load_metabric(path=DATA_DIR, as_frame=False)
# Stratify by event, which is indicated by the sign of the target, so that both
# splits keep a similar proportion of each sign.
target_sign = np.sign(data.target)
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    stratify=target_sign,
    random_state=SPLIT_SEED,
)
# Sort each split separately with the package helper.
X_train, y_train = sort_X_y(X_train, y_train)
X_test, y_test = sort_X_y(X_test, y_test)

## Load Model

In [3]:
# Hyperparameters of the estimator, gathered in one mapping so they are easy to
# read and tweak; the objective and the evaluation metric are the "aft" pair.
model_params = dict(
    n_estimators=25,
    objective="aft_objective",
    eval_metric="aft_loss",
    learning_rate=0.3,
    random_state=7,
    disable_default_eval_metric=True,
)
model = XGBSurv(**model_params)

The available loss and objective functions can be listed as shown below:

In [4]:
# Print the registered names, first of the loss functions and then of the
# objective functions (same order as the two getters below).
for list_functions in (model.get_loss_functions, model.get_objective_functions):
    print(list_functions().keys())

dict_keys(['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss', 'ah_loss', 'eh_loss'])
dict_keys(['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective', 'ah_objective', 'eh_objective'])


## Fit Model

In [5]:
# The evaluation set is the training data itself, so the metric logged while
# fitting is a training metric, not a held-out validation score.
eval_set = [(X_train, y_train)]

In [6]:
# Fit on the training split; the metric on `eval_set` is logged per boosting round.
model.fit(X_train, y_train, eval_set=eval_set)

[0]	validation_0-aft_likelihood:3.12115
[1]	validation_0-aft_likelihood:3.12112
[2]	validation_0-aft_likelihood:3.12108
[3]	validation_0-aft_likelihood:3.12105
[4]	validation_0-aft_likelihood:3.12102
[5]	validation_0-aft_likelihood:3.12099
[6]	validation_0-aft_likelihood:3.12096
[7]	validation_0-aft_likelihood:3.12092
[8]	validation_0-aft_likelihood:3.12089
[9]	validation_0-aft_likelihood:3.12086
[10]	validation_0-aft_likelihood:3.12083
[11]	validation_0-aft_likelihood:3.12080
[12]	validation_0-aft_likelihood:3.12076
[13]	validation_0-aft_likelihood:3.12073
[14]	validation_0-aft_likelihood:3.12070
[15]	validation_0-aft_likelihood:3.12067
[16]	validation_0-aft_likelihood:3.12064
[17]	validation_0-aft_likelihood:3.12060
[18]	validation_0-aft_likelihood:3.12057
[19]	validation_0-aft_likelihood:3.12054
[20]	validation_0-aft_likelihood:3.12051
[21]	validation_0-aft_likelihood:3.12047
[22]	validation_0-aft_likelihood:3.12044
[23]	validation_0-aft_likelihood:3.12041
[24]	validation_0-aft_like

The model can be saved as shown below. Note that `objective` and `eval_metric` are not stored in the saved file, so they have to be specified again when the model is loaded.

In [7]:
# NOTE(review): the file name says "breslow", but the model in this notebook is
# configured with objective="aft_objective" — confirm the intended file name.
model.save_model("introduction_model_breslow.json")

## Predict

In [8]:
# Predictions for both splits, requested with output_margin=True
# (presumably the raw, untransformed model output — TODO confirm for XGBSurv).
preds_train = model.predict(X_train, output_margin=True)
preds_test = model.predict(X_test, output_margin=True)
# Split the training target back into two arrays, named here as time and event.
time_train, event_train = transform_back(y_train)
# Disabled concordance-index check on the training split, kept for reference:
#rlp = np.log(time_train * np.exp(preds_train))
#from xgbsurv.evaluation import cindex_censored, ibs
#cindex_censored(y_train, rlp)

0.0007307678716562423

### Predict Cumulative Hazard

In [9]:
# Call the cumulative hazard prediction with the first 100 rows of each split.
# NOTE(review): the recorded run of this cell raised
# "TypeError: only size-1 arrays can be converted to Python scalars" — confirm the
# arguments expected by predict_cumulative_hazard_function before relying on it.
df_cum_hazards = model.predict_cumulative_hazard_function(X_train[:100], X_test[:100], y_train[:100], y_test[:100])
# Show only the first three rows of the result.
df_cum_hazards.head(3)

TypeError: only size-1 arrays can be converted to Python scalars

In [None]:
# Sanity check: dimensions of the training feature array.
X_train.shape

(1427, 9)

In [None]:
# Display the cumulative hazard frame.
df_cum_hazards

Unnamed: 0,1.433333,2.400000,2.500000,4.866667,7.866667,9.833333,10.066667,10.633333,11.866667,12.933333,...,50.666668,50.900002,51.200001,52.299999,52.299999.1,52.466667,52.733334,53.633335,54.933334,55.466667
0,0.010038,0.010283,0.010338,0.014739,0.028451,0.040720,0.042307,0.046267,0.055393,0.063835,...,0.691458,0.696791,0.703662,0.728987,0.728987,0.732841,0.739018,0.759944,0.790383,0.802940
1,0.010036,0.010276,0.010330,0.014654,0.028206,0.040357,0.041929,0.045852,0.054894,0.063258,...,0.685115,0.690408,0.697227,0.722360,0.722360,0.726186,0.732316,0.753088,0.783304,0.795769
2,0.010038,0.010283,0.010338,0.014739,0.028451,0.040720,0.042307,0.046267,0.055393,0.063835,...,0.691458,0.696791,0.703662,0.728987,0.728987,0.732841,0.739018,0.759944,0.790383,0.802940
3,0.010037,0.010278,0.010332,0.014675,0.028266,0.040446,0.042022,0.045955,0.055017,0.063400,...,0.686681,0.691984,0.698816,0.723997,0.723997,0.727829,0.733971,0.754781,0.785052,0.797540
4,0.010039,0.010289,0.010346,0.014819,0.028681,0.041061,0.042662,0.046656,0.055862,0.064377,...,0.697403,0.702775,0.709695,0.735198,0.735198,0.739079,0.745299,0.766370,0.797018,0.809659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.010036,0.010275,0.010329,0.014642,0.028173,0.040309,0.041879,0.045798,0.054829,0.063182,...,0.684280,0.689568,0.696380,0.721488,0.721488,0.725310,0.731434,0.752185,0.782372,0.794825
96,0.010038,0.010283,0.010338,0.014739,0.028451,0.040720,0.042307,0.046267,0.055393,0.063835,...,0.691458,0.696791,0.703662,0.728987,0.728987,0.732841,0.739018,0.759944,0.790383,0.802940
97,0.010037,0.010281,0.010336,0.014719,0.028394,0.040637,0.042220,0.046172,0.055279,0.063703,...,0.690003,0.695327,0.702187,0.727467,0.727467,0.731315,0.737481,0.758372,0.788760,0.801296
98,0.010040,0.010295,0.010352,0.014891,0.028886,0.041365,0.042978,0.047003,0.056279,0.064861,...,0.702696,0.708101,0.715065,0.740726,0.740726,0.744632,0.750889,0.772089,0.802922,0.815638


### Predict Survival Function

In [None]:
# Predict the survival function, passing the full train and test splits.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# `df_survival_function` was never created and the cells that use it failed with
# NameError — rerun to completion (or pass a subset if the call is too slow).
df_survival_function = model.predict_survival_function(X_train, X_test, y_train, y_test)
# Show only the last three rows of the result.
df_survival_function.tail(3)

KeyboardInterrupt: 

### Visualize Predictions

In [None]:
# Plot the first five columns of the predicted survival frame as step functions.
# An explicit Axes is used instead of the implicit pyplot state, labels and title
# are set in one call, and the trailing semicolon keeps the Text repr out of the
# cell output (no dummy assignment to `_`, which IPython uses for the last output).
fig, ax = plt.subplots()
df_survival_function.iloc[:, :5].plot(drawstyle='steps-post', ax=ax)
ax.set(xlabel='Time', ylabel='S(t | x)', title="Survival Curve Test Set");

NameError: name 'df_survival_function' is not defined

## Evaluate

In [None]:
# Recover the two components of the test target, named here durations and events.
durations_test, events_test = transform_back(y_test)

# Evaluator built from the predicted survival frame and the test outcomes.
ev = EvalSurv(df_survival_function, durations_test, events_test, censor_surv='km')

# 100 evenly spaced time points between the smallest and largest test duration.
first_time = durations_test.min()
last_time = durations_test.max()
time_grid = np.linspace(first_time, last_time, 100)

concordance = ev.concordance_td('antolini')
print('Concordance Index', concordance)

NameError: name 'df_survival_function' is not defined

In [None]:
# Integrated Brier score of the evaluator over the time grid.
integrated_brier = ev.integrated_brier_score(time_grid)
print('Integrated Brier Score:', integrated_brier)

NameError: name 'ev' is not defined