# Introduction to the xgbsurv package

This notebook introduces `xgbsurv` using a specific dataset. It structured by the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
from xgbsurv.datasets import load_metabric
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from xgbsurv.models.utils import sort_X_y, transform_back, transform
from xgbsurv import XGBSurv
from sklearn.model_selection import train_test_split
import numpy as np
%load_ext autoreload
%autoreload 2


## Load Data

In [2]:
data, target = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
# add discretizer

time, event = transform_back(target.to_numpy())
data['time'] = time
data['event'] = event
df = discretizer_df(data, n_cuts=100, type = 'equidistant', min_time=0.0)
df.head()

Unnamed: 0,MKI67,EGFR,PGR,ERBB2,hormone_treatment,radiotherapy,chemotherapy,ER_positive,age,time,event
0,5.818934,6.470783,10.672935,5.630679,0.0,0.0,0.0,1.0,75.33,1,1
1,10.117913,5.335095,9.717084,5.893656,0.0,0.0,0.0,0.0,54.1,1,0
2,5.705204,8.450347,10.859011,5.667925,0.0,0.0,0.0,1.0,73.64,1,0
3,5.18406,8.427523,10.361415,5.575082,1.0,0.0,0.0,1.0,73.98,1,0
4,5.621474,5.456216,9.500981,5.753597,1.0,0.0,0.0,1.0,34.68,1,0


In [3]:
target = transform(df.time.to_numpy(), df.event.to_numpy())
print(target)
data = df.iloc[:,:-2].to_numpy()
#target = target[:10]
#data = data[:10,:]
target_sign = np.sign(target)
X_train, X_test, y_train, y_test = train_test_split(data, target, stratify=target_sign)
X_train, y_train = sort_X_y(X_train, y_train)
X_test, y_test = sort_X_y(X_test, y_test)
n = len(np.unique(np.absolute(y_train)))
print(n)
y_train= np.tile(y_train, (n,1)).T
y_train.shape

[  1  -1  -1 ... -38  39  40]
39


(1427, 39)

In [4]:
y_test.shape

(476,)

In [5]:
event

array([1, 0, 0, ..., 0, 1, 1])

## Load Model

In [18]:
model = XGBSurv(n_estimators=2000, objective="deephit_objective",
                                             eval_metric="deephit_loss",
                                             learning_rate=0.10,
                                             random_state=7, disable_default_metric=True, base_score=0.3)
#model.get_params()

The options of loss and objective functions can be obtained like below:

In [19]:
print(model.get_loss_functions().keys())
print(model.get_objective_functions().keys())

dict_keys(['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss'])
dict_keys(['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective'])


## Fit Model

In [20]:
eval_set = [(X_train, y_train)]


In [21]:
model.fit(X_train, y_train, eval_set=eval_set)

Parameters: { "disable_default_metric" } are not used.

[0]	validation_0-rmse:16.51855	validation_0-deephit_loss1_pycox:3311.89805
[1]	validation_0-rmse:16.51156	validation_0-deephit_loss1_pycox:3186.06853
[2]	validation_0-rmse:16.50401	validation_0-deephit_loss1_pycox:3069.16019
[3]	validation_0-rmse:16.49772	validation_0-deephit_loss1_pycox:2961.46970
[4]	validation_0-rmse:16.49157	validation_0-deephit_loss1_pycox:2864.42747
[5]	validation_0-rmse:16.48562	validation_0-deephit_loss1_pycox:2774.60639
[6]	validation_0-rmse:16.48007	validation_0-deephit_loss1_pycox:2695.59851
[7]	validation_0-rmse:16.47480	validation_0-deephit_loss1_pycox:2613.73135
[8]	validation_0-rmse:16.46971	validation_0-deephit_loss1_pycox:2540.67455
[9]	validation_0-rmse:16.46416	validation_0-deephit_loss1_pycox:2468.68435
[10]	validation_0-rmse:16.45927	validation_0-deephit_loss1_pycox:2403.16207
[11]	validation_0-rmse:16.45427	validation_0-deephit_loss1_pycox:2343.38719
[12]	validation_0-rmse:16.44932	validation

In [22]:
target.shape

(1903,)

The model can be saved like below. Note that objective and eval_metric are not saved.

In [23]:
model.save_model("deephit_model.json")



## Predict & Evaluate

In [24]:
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge pycox
#from sksurv.metrics import concordance_index_censored
from xgbsurv.evaluation import cindex_censored, ibs
from pycox.evaluation import EvalSurv

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [37]:
import pandas as pd
preds_test = model.predict(X_test, output_margin=True, validate_features=False)
preds_train = model.predict(X_train, output_margin=True, validate_features=False)
preds_test.shape

(476, 39)

In [38]:
preds_test.T.shape[1]
cols = [str(i) for i in range(preds_test.T.shape[1])]
surv = pd.DataFrame(data=preds_test.T, columns=cols)
surv

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,466,467,468,469,470,471,472,473,474,475
0,-3.696275,-6.08521,-4.873325,-6.804355,-3.441493,-7.053183,-7.124068,-7.775833,-7.294406,-10.092867,...,-7.991036,-7.668395,-9.356113,-8.130119,-6.184692,-7.985476,-9.01732,-8.355248,-7.364399,-8.526234
1,-2.373668,-2.563405,-6.111287,-7.503165,1.075123,-9.682673,-10.311274,-7.615728,-8.718108,-3.577562,...,-9.411132,-8.968921,-8.836917,-1.820707,-1.609249,-9.534242,-8.873509,-9.707536,-7.876701,-8.185926
2,-1.190985,-4.545753,-7.834105,-4.718167,-3.600434,-5.992932,-8.408125,-7.349136,-6.381931,-5.549983,...,-3.935445,-8.77109,-4.207017,-8.569579,0.166331,-8.383279,-12.51571,-7.770042,-6.44883,-4.937863
3,0.568572,-3.995586,-8.657955,0.698108,-1.134283,-2.196686,-10.576089,-10.388279,-6.529315,-3.605409,...,-6.875849,-5.96582,-10.640692,-2.893686,-3.389629,-8.880622,-6.77885,-4.120806,-7.466785,-5.595185
4,0.510398,0.221514,-5.584674,2.552826,-2.305153,-4.47448,-7.424498,-4.753262,-8.469719,-6.342701,...,-3.491636,-8.657011,-5.745327,-8.498201,-8.242394,-3.366477,-8.172676,-9.674054,-10.396328,-4.639692
5,0.715203,-1.427269,-1.608162,-5.120244,3.399627,-5.658895,-6.643463,0.834838,-3.762678,-8.198701,...,-2.636532,-5.933687,-7.391525,-6.000797,-8.03896,-3.118515,-8.246271,-6.466876,-9.895689,-5.883349
6,-4.811928,-6.80028,-9.428293,-5.712794,-4.353967,-5.691224,-6.146769,-8.072568,-5.293762,-7.710203,...,-5.946919,-6.927227,-8.027557,-9.817085,-4.172598,-12.632052,-2.867439,-8.804027,-4.153743,-7.134697
7,-1.91884,-6.696613,-6.266142,-4.439771,-1.224747,-6.270138,-8.737864,-6.802374,-3.920429,-10.000623,...,-8.14519,-6.796229,-8.103961,-7.642014,-7.87173,-9.016355,-5.422179,-7.013284,-8.760154,-4.633595
8,0.499517,-0.689363,-2.405354,-2.893821,-1.349314,-5.128953,-9.973175,-7.410383,-4.011025,-9.171122,...,-7.093409,-5.162817,-8.082338,-8.913595,-8.374586,-7.171342,-3.428717,-9.190054,-9.376945,-6.228648
9,-0.278508,-6.820578,-4.641427,-6.015306,-1.820251,-3.815733,-6.454815,-8.566336,-7.192728,-3.754286,...,-3.276837,-6.952279,-4.201973,-4.908966,-3.929816,-11.490213,-7.086055,-7.181049,-7.166852,-4.598407


In [39]:
time, event = transform_back(y_test)
ev = EvalSurv(surv, time, event, censor_surv='km')
ev.concordance_td('antolini')

  assert pd.Series(self.index_surv).is_monotonic


0.42302839527048997

In [43]:
cols = [str(i) for i in range(preds_train.T.shape[1])]
surv = pd.DataFrame(data=preds_train.T, columns=cols)
time, event = transform_back(y_train[:,0])
ev = EvalSurv(surv, time, event, censor_surv='km')
ev.concordance_td('antolini')

  assert pd.Series(self.index_surv).is_monotonic


0.40957654807109717

In [41]:
event

array([[1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]])

In [34]:
surv.to_csv('surv_data.csv', index=False)

In [17]:
# # XGBsurv benchmark
# from xgbsurv.datasets import load_metabric
# from xgbsurv import XGBSurv
# from xgbsurv.evaluation import cindex_censored, ibs
# import numpy as np
# import pandas as pd
# from scipy.stats import uniform as scuniform
# from scipy.stats import randint as scrandint
# from scipy.stats import loguniform as scloguniform 
# from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.datasets import load_iris
# from sklearn.metrics import make_scorer
# # import models
# from xgbsurv.models.breslow_final import breslow_likelihood
# from xgbsurv.models.efron_final import efron_likelihood
# from xgbsurv.models.cind_final import cind_loss
# from xgbsurv.models.deephit_pycox_final import deephit_loss1_pycox

# # set parameters
# n_outer_splits = 5
# n_inner_splits = 5
# rand_state = 42
# n_iter = 40
# n_iter_cind = 200
# early_stopping_rounds=10

# # Load dataset
# X, y = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)

# # deephit data adaptation
# n = len(np.unique(np.absolute(y)))
# y_deephit = np.tile(y, (n,1)).T



# # Define parameter grid for random forest classifier
# param_grid = {
# 'alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
# 'reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
# 'learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
# 'n_estimators':  scrandint(1,100), # corresponds to num_rounds
# 'gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
# 'colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
# 'colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
# 'colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
# 'max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
# 'max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
# 'min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
# 'subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
# }

# # Define stratified outer k-fold cross-validation
# #outer_cv = StratifiedKFold(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)


# # Define stratified inner k-fold cross-validation
# class CustomSplit(StratifiedKFold):
#     def __init__(self, n_splits=5, shuffle=True, random_state=None):
#         super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

#     def split(self, X, y, groups=None):
#         try:
#             if y.shape[1]>1:
#                 y = y[:,0]
#         except:
#             pass
#         bins = np.sign(y)
#         return super().split(X, bins, groups=groups)

#     def get_n_splits(self, X=None, y=None, groups=None):
#         return self.n_splits

# outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
# inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)




# # Define models to apply
# loss_functions = ['breslow_loss', 'efron_loss', 'cind_loss', 'deephit_loss', 'aft_loss']
# objective_functions = ['breslow_objective', 'efron_objective', 'cind_objective', 'deephit_objective', 'aft_objective']
# scoring_functions = [breslow_likelihood, efron_likelihood, cind_loss, deephit_loss1_pycox]
# n_models = len(scoring_functions)

# # dict of outer scores
# outer_scores = {'breslow_loss':[], 'efron_loss':[], 'cind_loss':[], 'deephit_loss':[]} #, 'aft_loss':[]
# # Loop over models
# for model in range(n_models):
#     obj = objective_functions[model]
#     print('Current model being run:', obj)
#     # adapt for cind (learning rate should also be adapted)
#     if obj=='cind_objective':
#         n_iter = n_iter_cind
#     if obj=='deephit_objective':
#         y = y_deephit

#     # Define Scorer
#     scoring_function = make_scorer(scoring_functions[model], greater_is_better=False)
#     # Define custom cross-validation strategy
#     # custom_cv = custom_split(X, y)
#     # Define Model
#     estimator = XGBSurv(objective=objective_functions[model],eval_metric=loss_functions[model],
#                                             random_state=rand_state, disable_default_eval_metric=True,
#                                             early_stopping_rounds=early_stopping_rounds)
    
    
#     # Define RandomizedSearchCV object
#     rs = RandomizedSearchCV(estimator, param_grid, scoring = scoring_function, n_jobs=-1, 
#                             cv=inner_custom_cv, n_iter=n_iter, refit=True)

#     # loop over outer split
#     for train_index, test_index in outer_custom_cv.split(X, y):
#         # Split data into training and testing sets for outer fold
#         X_train, X_test = X[train_index], X[test_index]
#         y_train, y_test = y[train_index], y[test_index]
#         # Early stopping within model
#         rs.fit(X_train, y_train, eval_test_size=0.1)
#         preds_test = rs.best_estimator_.predict(X_test, output_margin = True)
#         score = cindex_censored(y_test, preds_test)
#         print('cindex score:', score)
#         outer_scores[loss_functions[model]] += [score]

# df = pd.DataFrame(outer_scores)
# df.columns = ['breslow', 'efron', 'cind', 'deephit']
# df.to_csv('benchmarking_results.csv', index=False)
# print(df.describe())


# # Print the mean and standard deviation of the outer scores
# print(f"Nested CV score: {np.mean(np.array(df).T, axis=1)} (+/- {np.std(np.array(df).T, axis=1)})")
