# Introduction to the xgbsurv package

This notebook introduces `xgbsurv` using a specific dataset. It structured by the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [44]:
from xgbsurv.datasets import load_metabric
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from xgbsurv.models.utils import sort_X_y, transform_back, transform
from xgbsurv import XGBSurv
from sklearn.model_selection import train_test_split
import numpy as np
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load Data

In [45]:
data, target = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
# add discretizer

time, event = transform_back(target.to_numpy())
data['time'] = time
data['event'] = event
df = discretizer_df(data, n_cuts=100, type = 'equidistant', min_time=0.0)
df.head()

Unnamed: 0,MKI67,EGFR,PGR,ERBB2,hormone_treatment,radiotherapy,chemotherapy,ER_positive,age,time,event
0,5.818934,6.470783,10.672935,5.630679,0.0,0.0,0.0,1.0,75.33,1,1
1,10.117913,5.335095,9.717084,5.893656,0.0,0.0,0.0,0.0,54.1,1,0
2,5.705204,8.450347,10.859011,5.667925,0.0,0.0,0.0,1.0,73.64,1,0
3,5.18406,8.427523,10.361415,5.575082,1.0,0.0,0.0,1.0,73.98,1,0
4,5.621474,5.456216,9.500981,5.753597,1.0,0.0,0.0,1.0,34.68,1,0


In [46]:
target = transform(df.time.to_numpy(), df.event.to_numpy())
print(target)
data = df.iloc[:,:-2].to_numpy()
#target = target[:10]
#data = data[:10,:]
target_sign = np.sign(target)
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target_sign)
X_train, y_train = sort_X_y(X_train, y_train)
X_test, y_test = sort_X_y(X_test, y_test)
n = len(np.unique(np.absolute(y_train)))
print(n)
y_train= np.tile(y_train, (n,1)).T
y_train.shape

[  1  -1  -1 ... -94  98 100]
90


(1427, 90)

In [47]:
y_test.shape

(476,)

In [48]:
event

array([1, 0, 0, ..., 0, 1, 1])

## Load Model

In [49]:
model = XGBSurv(n_estimators=2000, objective="deephit_objective",
                                             eval_metric="deephit_loss",
                                             learning_rate=0.10,
                                             random_state=7, disable_default_metric=True, base_score=0.3)
#model.get_params()

The options of loss and objective functions can be obtained like below:

In [50]:
print(model.get_loss_functions().keys())
print(model.get_objective_functions().keys())

dict_keys(['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss'])
dict_keys(['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective'])


## Fit Model

In [51]:
eval_set = [(X_train, y_train)]


In [52]:
model.fit(X_train, y_train, eval_set=eval_set)

Parameters: { "disable_default_metric" } are not used.

[0]	validation_0-rmse:40.94116	validation_0-deephit_loss1_pycox:4072.31716
[1]	validation_0-rmse:40.93432	validation_0-deephit_loss1_pycox:3949.27767
[2]	validation_0-rmse:40.92861	validation_0-deephit_loss1_pycox:3833.94574
[3]	validation_0-rmse:40.92256	validation_0-deephit_loss1_pycox:3726.46669
[4]	validation_0-rmse:40.91604	validation_0-deephit_loss1_pycox:3629.81873
[5]	validation_0-rmse:40.90998	validation_0-deephit_loss1_pycox:3536.02988
[6]	validation_0-rmse:40.90417	validation_0-deephit_loss1_pycox:3441.79081
[7]	validation_0-rmse:40.89821	validation_0-deephit_loss1_pycox:3354.60703
[8]	validation_0-rmse:40.89282	validation_0-deephit_loss1_pycox:3270.57953
[9]	validation_0-rmse:40.88763	validation_0-deephit_loss1_pycox:3187.60195
[10]	validation_0-rmse:40.88300	validation_0-deephit_loss1_pycox:3111.06013
[11]	validation_0-rmse:40.87798	validation_0-deephit_loss1_pycox:3035.26880
[12]	validation_0-rmse:40.87275	validation

In [53]:
target.shape

(1903,)

The model can be saved like below. Note that objective and eval_metric are not saved.

In [54]:
model.save_model("deephit_model.json")



## Predict & Evaluate

In [55]:
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge pycox
#from sksurv.metrics import concordance_index_censored
from xgbsurv.evaluation import cindex_censored, ibs
from pycox.evaluation import EvalSurv

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [56]:
import pandas as pd
preds_test = model.predict(X_test, output_margin=True, validate_features=False)
preds_train = model.predict(X_train, output_margin=True, validate_features=False)
preds_test.shape

(476, 90)

In [57]:
preds_test.T.shape[1]
cols = [str(i) for i in range(preds_test.T.shape[1])]
surv = pd.DataFrame(data=preds_test.T, columns=cols)
surv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,466,467,468,469,470,471,472,473,474,475
0,-6.890857,-6.890857,-6.796816,-7.929599,-6.756095,-8.410784,-5.927662,-4.743150,-3.418882,-6.970855,...,-6.670851,-6.286502,-6.499992,-8.502278,-6.536090,-7.105617,-6.062424,-6.286502,-6.199988,-7.886333
1,-9.895530,-5.321617,-4.261624,-6.743558,-8.661213,-8.760814,-7.653399,-5.949537,-5.176513,-10.898017,...,-9.545039,-9.908096,-10.066805,-10.563773,-9.475101,-10.375940,-7.424945,-8.119374,-6.769309,-11.337688
2,-7.455766,-7.514219,-6.992144,-5.912518,-6.457172,-6.607211,-10.417833,-3.309223,-4.802322,-10.334987,...,-7.551781,-8.474930,-8.762161,-9.120632,-7.478767,-8.521143,-8.420971,-8.639685,-7.519887,-9.255885
3,-8.312634,-7.200647,-6.059400,-1.116745,-6.596586,-7.956530,-9.803360,1.702691,-2.690145,-7.607093,...,-4.865688,-8.878933,-9.834019,-9.899033,-7.143303,-3.682276,-6.255867,-8.831538,-7.086382,-9.835267
4,-5.341692,-6.748853,-6.461722,-4.583495,-5.260877,-7.094065,-6.592380,-1.273125,-2.958032,-6.233868,...,-8.360245,-3.939768,-10.532650,-9.529613,-5.885591,-10.297590,-7.586340,-8.177292,-7.548578,-8.055818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,...,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473,-4.282473
86,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,...,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430,-4.281430
87,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,...,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212,-4.280212
88,-5.048205,-4.760234,-4.113268,-4.006267,-3.097589,-4.760234,-5.115020,-4.006267,-2.590357,-4.641217,...,-5.059464,-5.115020,-5.228909,-5.228909,-4.662936,-5.228909,-4.945575,-5.115020,-5.228909,-5.228909


In [58]:
time, event = transform_back(y_test)
ev = EvalSurv(surv, time, event, censor_surv='km')
ev.concordance_td('antolini')

  assert pd.Series(self.index_surv).is_monotonic


0.4134386538189397

In [59]:
cols = [str(i) for i in range(preds_train.T.shape[1])]
surv = pd.DataFrame(data=preds_train.T, columns=cols)
time, event = transform_back(y_train[:,0])
ev = EvalSurv(surv, time, event, censor_surv='km')
ev.concordance_td('antolini')

  assert pd.Series(self.index_surv).is_monotonic


0.42011173661716894

In [60]:
event

array([0, 0, 0, ..., 1, 0, 1])

In [61]:
surv.to_csv('surv_data.csv', index=False)

In [62]:
# # XGBsurv benchmark
# from xgbsurv.datasets import load_metabric
# from xgbsurv import XGBSurv
# from xgbsurv.evaluation import cindex_censored, ibs
# import numpy as np
# import pandas as pd
# from scipy.stats import uniform as scuniform
# from scipy.stats import randint as scrandint
# from scipy.stats import loguniform as scloguniform 
# from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import load_iris
# from sklearn.metrics import make_scorer
# # import models
# from xgbsurv.models.breslow_final import breslow_likelihood
# from xgbsurv.models.efron_final import efron_likelihood
# from xgbsurv.models.cind_final import cind_loss
# from xgbsurv.models.deephit_pycox_final import deephit_loss1_pycox

# # set parameters
# n_outer_splits = 5
# n_inner_splits = 5
# rand_state = 42
# n_iter = 40
# n_iter_cind = 200
# early_stopping_rounds=10

# # Load dataset
# X, y = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)

# # deephit data adaptation
# n = len(np.unique(np.absolute(y)))
# y_deephit = np.tile(y, (n,1)).T



# # Define parameter grid for random forest classifier
# param_grid = {
# 'alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
# 'reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
# 'learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
# 'n_estimators':  scrandint(1,100), # corresponds to num_rounds
# 'gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
# 'colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
# 'colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
# 'colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
# 'max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
# 'max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
# 'min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
# 'subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
# }

# # Define stratified outer k-fold cross-validation
# #outer_cv = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)


# # Define stratified inner k-fold cross-validation
# class CustomSplit(StratifiedKFold):
#     def __init__(self, n_splits=5, shuffle=True, random_state=None):
#         super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

#     def split(self, X, y, groups=None):
#         try:
#             if y.shape[1]>1:
#                 y = y[:,0]
#         except:
#             pass
#         bins = np.sign(y)
#         return super().split(X, bins, groups=groups)

#     def get_n_splits(self, X=None, y=None, groups=None):
#         return self.n_splits

# outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
# inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)




# # Define models to apply
# loss_functions = ['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss']
# objective_functions = ['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective']
# scoring_functions = [breslow_likelihood, efron_likelihood, cind_loss, deephit_loss1_pycox]
# n_models = len(scoring_functions)

# # dict of outer scores
# outer_scores = {'breslow_loss':[], 'efron_loss':[], 'cind_loss':[], 'deephit_loss':[]} #, 'aft_loss':[]
# # Loop over models
# for model in range(n_models):
#     obj = objective_functions[model]
#     print('Current model being run:', obj)
#     # adapt for cind (learning rate should also be adapted)
#     if obj=='cind_objective':
#         n_iter = n_iter_cind
#     if obj=='deephit_objective':
#         y = y_deephit

#     # Define Scorer
#     scoring_function = make_scorer(scoring_functions[model], greater_is_better=False)
#     # Define custom cross-validation strategy
#     # custom_cv = custom_split(X, y)
#     # Define Model
#     estimator = XGBSurv(objective=objective_functions[model],eval_metric=loss_functions[model],
#                                             random_state=rand_state, disable_default_eval_metric=True,
#                                             early_stopping_rounds=early_stopping_rounds)
    
    
#     # Define RandomizedSearchCV object
#     rs = RandomizedSearchCV(estimator, param_grid, scoring = scoring_function, n_jobs=-1, 
#                             cv=inner_custom_cv, n_iter=n_iter, refit=True)

#     # loop over outer split
#     for train_index, test_index in outer_custom_cv.split(X, y):
#         # Split data into training and testing sets for outer fold
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
#         # Early stopping within model
#         rs.fit(X_train, y_train, eval_test_size=0.1)
#         preds_test = rs.best_estimator_.predict(X_test, output_margin = True)
#         score = cindex_censored(y_test, preds_test)
#         print('cindex score:', score)
#         outer_scores[loss_functions[model]] += [score]

# df = pd.DataFrame(outer_scores)
# df.columns = ['breslow', 'efron', 'cind', 'deephit']
# df.to_csv('benchmarking_results.csv', index=False)
# print(df.describe())


# # Print the mean and standard deviation of the outer scores
# print(f"Nested CV score: {np.mean(np.array(df).T, axis=1)} (+/- {np.std(np.array(df).T, axis=1)})")
