# Introduction to the xgbsurv package

This notebook demonstrates how to use `xgbsurv` using cross validation from scikit-learn. It structured by the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
from xgbsurv.datasets import load_metabric
from xgbsurv.models.breslow_final import breslow_likelihood
from xgbsurv import XGBSurv
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
#from sksurv.metrics import concordance_index_censored
from xgbsurv.evaluation import cindex_censored, ibs
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
%load_ext autoreload
%autoreload 2


## Load Data

In [2]:
data, target = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)
# stratification on event type
target_sign = np.sign(target)
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target_sign)

In [3]:
# Set Hyperparameter Space

param_grid = {
'alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'n_estimators':  scrandint(1,100), # corresponds to num_rounds
'gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

## Load Model - Cross Validation without Early Stopping

In [4]:
# xgbsurv_pipe = Pipeline(steps=[
#                      ('xgbsurv',XGBSurv(objective="breslow_objective",
#                                              eval_metric="breslow_loss",
#                                              random_state=8, disable_default_eval_metric=True ))    
#                                              ])
model = XGBSurv(objective="breslow_objective",eval_metric="breslow_loss",
                                             random_state=8, disable_default_eval_metric=True )
#scoring function form of score_func(y, y_pred)
# -1 means using all processors.
grid_search = RandomizedSearchCV(model, param_grid, scoring = make_scorer(breslow_likelihood, greater_is_better=False), n_jobs=-1, cv=10, n_iter=40)



## Fit Model

In [5]:
grid_result = grid_search.fit(X_train, y_train)

  -3.99947518  -3.98173693  -4.36450862  -3.9951201   -3.97306117
  -3.99429822  -7.11103642  -3.99147166  -4.000707    -3.98214157
 -10.57911902  -4.01943258  -4.12748425  -4.0597818   -4.24551157
  -3.99980001  -3.99038441  -4.36608858  -4.00002085  -3.99596869
  -3.98278999  -3.98781478  -4.65198334  -3.99579746  -6.58575466
  -6.70978161  -4.0108124   -3.99908798  -4.25431323  -9.41157816
  -3.9995286   -3.982679    -3.9980151   -4.00141457  -3.99648284]


In [6]:
pd.DataFrame(grid_result.cv_results_).describe()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
count,40.0,40.0,40.0,40.0,39.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,40.0,39.0,39.0,40.0
mean,0.096664,0.021198,0.013112,0.004688,-4.651374,-4.920611,-4.875066,-4.95827,-4.753346,-4.808299,-5.119976,-4.726235,-4.845717,-4.775712,-4.572586,0.232955,20.5
std,0.218407,0.086385,0.065587,0.02576,1.240087,2.379924,2.889297,2.585013,2.197634,2.663744,3.054322,1.878811,2.481411,2.05501,1.48971,0.376254,11.690452
min,0.004257,0.000796,0.000484,1.3e-05,-9.289078,-15.513205,-18.881498,-18.404423,-13.742781,-18.811923,-20.34185,-13.368444,-15.79807,-14.562003,-10.579119,0.076338,1.0
25%,0.033198,0.004056,0.000657,9.4e-05,-4.309382,-4.217608,-4.006407,-4.266847,-4.199319,-4.087098,-4.269314,-4.312014,-4.222259,-4.223894,-4.186498,0.086509,10.75
50%,0.056159,0.006437,0.000845,0.000302,-4.168826,-4.055195,-3.874941,-4.031298,-3.886686,-3.919628,-4.102598,-3.996548,-3.948128,-4.020837,-3.999529,0.090762,20.5
75%,0.074365,0.008796,0.000971,0.000497,-4.163227,-4.03184,-3.863617,-4.026195,-3.882991,-3.915676,-4.09183,-3.98923,-3.940694,-4.010559,-3.994176,0.098491,30.25
max,1.40703,0.552603,0.40972,0.163115,-4.11641,-3.955963,-3.836388,-4.007899,-3.876605,-3.89648,-4.044449,-3.983516,-3.91495,-3.974248,-3.973061,1.577838,40.0


## Train Model with Best Parameters

In [7]:
best_params = grid_result.best_estimator_
p = best_params.get_params()
bmodel = XGBSurv(**p)
bmodel.fit(X_train, y_train)

In [8]:
bmodel.save_model("best_cv_model.json")



## Predict

In [9]:
preds_train = bmodel.predict(X_train, output_margin=True)
preds_test = bmodel.predict(X_test, output_margin=True)

## Evaluation

In [10]:
# train
cindex_censored(y_train, preds_train)

0.6253458330845325

In [11]:
# test
cindex_censored(y_test, preds_test)

0.6397093385164577

## Cross Validation with Early Stopping


In [12]:
# Set Hyperparameter Space

param_grid = {
'early_stopping_rounds': scrandint(1,20),
'alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'n_estimators':  scrandint(1,100), # corresponds to num_rounds
'gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

In [13]:
model = XGBSurv(objective="breslow_objective",eval_metric="breslow_loss",
                                             random_state=8, disable_default_eval_metric=True )
#scoring function form of score_func(y, y_pred)
# -1 means using all processors.
grid_search = RandomizedSearchCV(model, param_grid, scoring = make_scorer(breslow_likelihood, greater_is_better=False), n_jobs=-1, cv=10, n_iter=40)

grid_result = grid_search.fit(X_train, y_train, eval_test_size=0.1)

[0]	validation_0-breslow_likelihood:6.06703	validation_1-breslow_likelihood:3.84744
[1]	validation_0-breslow_likelihood:6.06682	validation_1-breslow_likelihood:3.84719
[2]	validation_0-breslow_likelihood:6.06680	validation_1-breslow_likelihood:3.84718
[3]	validation_0-breslow_likelihood:6.06663	validation_1-breslow_likelihood:3.84697
[0]	validation_0-breslow_likelihood:6.08303	validation_1-breslow_likelihood:3.87975
[0]	validation_0-breslow_likelihood:6.05657	validation_1-breslow_likelihood:3.88026
[4]	validation_0-breslow_likelihood:6.06657	validation_1-breslow_likelihood:3.84693
[1]	validation_0-breslow_likelihood:6.08285	validation_1-breslow_likelihood:3.87967
[5]	validation_0-breslow_likelihood:6.06654	validation_1-breslow_likelihood:3.84697
[1]	validation_0-breslow_likelihood:6.05637	validation_1-breslow_likelihood:3.87995
[2]	validation_0-breslow_likelihood:6.08281	validation_1-breslow_likelihood:3.87961
[6]	validation_0-breslow_likelihood:6.06653	validation_1-breslow_likelihood:

## Fit Best Model

In [14]:
best_params = grid_result.best_estimator_
p = best_params.get_params()
bmodel = XGBSurv(**p)
bmodel.fit(X_train, y_train, eval_test_size=0.1)

[0]	validation_0-breslow_likelihood:6.12884	validation_1-breslow_likelihood:3.90586
[1]	validation_0-breslow_likelihood:6.12503	validation_1-breslow_likelihood:3.90382
[2]	validation_0-breslow_likelihood:6.11808	validation_1-breslow_likelihood:3.90093
[3]	validation_0-breslow_likelihood:6.11417	validation_1-breslow_likelihood:3.90335
[4]	validation_0-breslow_likelihood:6.11439	validation_1-breslow_likelihood:3.90394
[5]	validation_0-breslow_likelihood:6.11100	validation_1-breslow_likelihood:3.90096
[6]	validation_0-breslow_likelihood:6.10847	validation_1-breslow_likelihood:3.89893
[7]	validation_0-breslow_likelihood:6.10049	validation_1-breslow_likelihood:3.89884
[8]	validation_0-breslow_likelihood:6.09913	validation_1-breslow_likelihood:3.89754
[9]	validation_0-breslow_likelihood:6.09387	validation_1-breslow_likelihood:3.89849
[10]	validation_0-breslow_likelihood:6.09291	validation_1-breslow_likelihood:3.89968
[11]	validation_0-breslow_likelihood:6.08990	validation_1-breslow_likelihoo

In [15]:
preds_train = bmodel.predict(X_train, output_margin=True)
preds_test = bmodel.predict(X_test, output_margin=True)

In [16]:
# train
cindex_censored(y_train, preds_train)

0.5955579819555457

In [17]:
# test
cindex_censored(y_test, preds_test)

0.6012932403419846