In [1]:
## XGBSurv EH Benchmark

In [2]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.breslow_final import breslow_likelihood,get_cumulative_hazard_function_breslow
from xgbsurv.models.eh_final import eh_likelihood
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df

## Set Parameters

In [3]:
# set parameters
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 100
n_iter_cind = 200
early_stopping_rounds=10
base_score = 0.0

# Define parameter grid for random forest classifier
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,100), # corresponds to num_rounds
'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

## Custom Splitting

In [4]:
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        y = y.to_numpy()
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        #print('bins', bins.shape)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)

## Scoring Function

In [7]:
scoring_function = make_scorer(eh_likelihood, greater_is_better=False)

## Set Model & Train Test Evaluate

In [9]:
data_set_fns = [load_metabric,  load_flchain, load_rgbsg, load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer'], 'load_rgbsg':['grade']}

for idx, dataset in enumerate(data_set_fns):
    model = 'xgbsurv_eh_'
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    filename = data.filename
    X  = data.data
    y  = data.target #.astype(np.float32)
    X, y = sort_X_y_pandas(X, y)
    y = pd.concat([data.target,data.target], axis=1) #.values #.to_numpy()
    #print('x y shape', X.shape,y.shape)
    
    print(data_set_fns_str[idx])
    if data_set_fns_str[idx] in one_hot_dict.keys():
        X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])
    
    #print(X.dtypes)
    dataset_name = filename.split('_')[0]
    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                    'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
            (StandardScaler(), make_column_selector(dtype_include=['float32']))
            ,remainder='passthrough')
    
    estimator = XGBSurv(
        objective='eh_objective',
        eval_metric='eh_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score
                        )
    pipe = Pipeline([('scaler',ct),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=2, refit=True)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        #print(X_train.shape, type(X_train))
        #print(y_train.shape, type(y_train))
        #print(X_test.shape, type(X_test))
        #print(y_test.shape, type(y_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=0.2)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
    #     try:
    #         cum_hazard_train = get_cumulative_hazard_function_breslow(
    #                 X_train.values, X_train.values, y_train.values, y_train.values,
    #                 best_preds_train.reshape(-1), best_preds_train.reshape(-1)
    #                 )

    #         df_survival_train = np.exp(-cum_hazard_train)
    #         durations_train, events_train = transform_back(y_train.values)
    #         time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
    #         ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
    #         print('Concordance Index',ev.concordance_td('antolini'))
    #         print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
    #         cindex_score_train = ev.concordance_td('antolini')
    #         ibs_score_train = ev.integrated_brier_score(time_grid_train)

    #         outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
    #         outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

    #     except:
    #         outer_scores['cindex_train_'+dataset_name] += [np.nan]
    #         outer_scores['ibs_train_'+dataset_name] += [np.nan]
            
    #     try:
    #         cum_hazard_test = get_cumulative_hazard_function_breslow(
    #                 X_train.values, X_test.values, y_train.values, y_test.values,
    #                 best_preds_train.reshape(-1), best_preds_test.reshape(-1)
    #                 )
    #         df_survival_test = np.exp(-cum_hazard_test)
    #         durations_test, events_test = transform_back(y_test.values)
    #         print('durations',durations_test.min(), durations_test.max())
    #         time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
    #         ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
    #         print('Concordance Index',ev.concordance_td('antolini'))
    #         print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
    #         cindex_score_test = ev.concordance_td('antolini')
    #         ibs_score_test = ev.integrated_brier_score(time_grid_test)

    #         outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
    #         outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
    #     except: 
    #         outer_scores['cindex_test_'+dataset_name] += [np.nan]
    #         outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
    # df_best_params = pd.DataFrame(best_params)
    # df_best_model = pd.DataFrame(best_model)
    # df_outer_scores = pd.DataFrame(outer_scores)
    # df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
    # df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
    

    


load_metabric
bins (1903,)
bins (1522,)
[0]	validation_0-eh_likelihood:3.09534	validation_1-eh_likelihood:3.21812
[1]	validation_0-eh_likelihood:3.09534	validation_1-eh_likelihood:3.21812
[2]	validation_0-eh_likelihood:3.09534	validation_1-eh_likelihood:3.21812
[0]	validation_0-eh_likelihood:3.09364	validation_1-eh_likelihood:3.22469
[0]	validation_0-eh_likelihood:3.09031	validation_1-eh_likelihood:3.23914
[0]	validation_0-eh_likelihood:3.12291	validation_1-eh_likelihood:3.13984
[1]	validation_0-eh_likelihood:3.09364	validation_1-eh_likelihood:3.22469
[3]	validation_0-eh_likelihood:3.09534	validation_1-eh_likelihood:3.21812
[0]	validation_0-eh_likelihood:3.12291	validation_1-eh_likelihood:3.13984
[1]	validation_0-eh_likelihood:3.09031	validation_1-eh_likelihood:3.23914
[1]	validation_0-eh_likelihood:3.12291	validation_1-eh_likelihood:3.13984
[4]	validation_0-eh_likelihood:3.09534	validation_1-eh_likelihood:3.21812
[2]	validation_0-eh_likelihood:3.09364	validation_1-eh_likelihood:3.2246

## TCGA Train, Test, Evaluation

In [10]:
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,100), # corresponds to num_rounds
'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
'pca__n_components': [8, 10, 12, 14, 16]
}

In [11]:
cancer_types = ['BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD']


for idx, cancer_type in enumerate(cancer_types):
    model = 'xgbsurv_breslow_'
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type=cancer_type, as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    filename = data.filename
    dataset_name = filename.split('_')[0]
    X, y = sort_X_y_pandas(X, y)
    y = pd.concat([data.target,data.target], axis=1)
    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                    'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
            (StandardScaler(), make_column_selector(dtype_include=['float32']))
            ,remainder='passthrough')
    
    estimator = XGBSurv(
        objective='breslow_objective',
        eval_metric='breslow_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score
                        )
    
    pipe = Pipeline([('scaler',ct),
                    ('pca', PCA()),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=2, refit=True)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        print(X_train.shape, type(X_train))
        print(y_train.shape, type(y_train))
        print(X_test.shape, type(X_test))
        print(y_test.shape, type(y_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=0.2)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
        try:
            cum_hazard_train = get_cumulative_hazard_function_breslow(
                    X_train.values, X_train.values, y_train.values, y_train.values,
                    best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                    )

            df_survival_train = np.exp(-cum_hazard_train)
            durations_train, events_train = transform_back(y_train.values)
            time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
            ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
            print('Concordance Index',ev.concordance_td('antolini'))
            print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
            cindex_score_train = ev.concordance_td('antolini')
            ibs_score_train = ev.integrated_brier_score(time_grid_train)

            outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
            outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

        except:
            outer_scores['cindex_train_'+dataset_name] += [np.nan]
            outer_scores['ibs_train_'+dataset_name] += [np.nan]
            
        try:
            cum_hazard_test = get_cumulative_hazard_function_breslow(
                    X_train.values, X_test.values, y_train.values, y_test.values,
                    best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                    )
            df_survival_test = np.exp(-cum_hazard_test)
            durations_test, events_test = transform_back(y_test.values)
            print('durations',durations_test.min(), durations_test.max())
            time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
            ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
            print('Concordance Index',ev.concordance_td('antolini'))
            print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
            cindex_score_test = ev.concordance_td('antolini')
            ibs_score_test = ev.integrated_brier_score(time_grid_test)

            outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
            outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
        except: 
            outer_scores['cindex_test_'+dataset_name] += [np.nan]
            outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    df_outer_scores = pd.DataFrame(outer_scores)
    df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
    df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
    


    

bins (406,)
(324, 20531) <class 'pandas.core.frame.DataFrame'>
(324, 2) <class 'pandas.core.frame.DataFrame'>
(82, 20531) <class 'pandas.core.frame.DataFrame'>
(82, 2) <class 'pandas.core.frame.DataFrame'>
bins (324,)


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgbsurv/base.py", line 94, in fit
    return super(XGBSurv, self).fit(X_train, y_train, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgboost/sklearn.py", line 1025, in fit
    self._Booster = train(
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
    return func(**kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgboost/training.py", line 185, in train
    bst.update(dtrain, i, obj)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgboost/core.py", line 1923, in update
    grad, hess = fobj(pred, dtrain)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgboost/sklearn.py", line 107, in inner
    return func(labels, preds)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/numba/core/dispatcher.py", line 468, in _compile_for_args
    error_rewrite(e, 'typing')
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/numba/core/dispatcher.py", line 409, in error_rewrite
    raise e.with_traceback(None)
numba.core.errors.TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mNo implementation of function Function(<built-in function iadd>) found for signature:
 
 >>> iadd(Literal[int](0), array(float32, 1d, C))
 
There are 18 candidate implementations:
[1m     - Of which 16 did not match due to:
     Overload of function 'iadd': File: <numerous>: Line N/A.
       With argument(s): '(int64, array(float32, 1d, C))':[0m
[1m      No match.[0m
[1m     - Of which 2 did not match due to:
     Operator Overload in function 'iadd': File: unknown: Line unknown.
       With argument(s): '(int64, array(float32, 1d, C))':[0m
[1m      No match for registered cases:
       * (int64, int64) -> int64
       * (int64, uint64) -> int64
       * (uint64, int64) -> int64
       * (uint64, uint64) -> uint64
       * (float32, float32) -> float32
       * (float64, float64) -> float64
       * (complex64, complex64) -> complex64
       * (complex128, complex128) -> complex128[0m
[0m
[0m[1mDuring: typing of intrinsic-call at /Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgbsurv/models/breslow_final.py (119)[0m
[1m
File "../../../../miniconda3/envs/xgbsurv/lib/python3.10/site-packages/xgbsurv/models/breslow_final.py", line 119:[0m
[1mdef breslow_objective(y: npt.NDArray[float], log_partial_hazard: npt.NDArray[float]) -> tuple[npt.NDArray[float], npt.NDArray[float]]:
    <source elided>

[1m    grad = np.empty(samples)
[0m    [1m^[0m[0m

