## XGBSurv EH Benchmark

In [17]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.eh_final import eh_likelihood, get_cumulative_hazard_function_eh
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y, sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from sklearn.utils.fixes import loguniform

## Set Path

In [18]:
current_path = os.getcwd()  # Get the current working directory path
two_levels_up = os.path.abspath(os.path.join(current_path, "..", ".."))
data_path = two_levels_up+'/xgbsurv/datasets/data/'

## Set Parameters

In [19]:
# set parameters
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 1 #50 #0
early_stopping_rounds=10
base_score = 0.0
validation_size = 0.2
model = 'eh_'

# set seed for scipy
np.random.seed(rand_state)

# Define parameter grid for random forest classifier
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,40), # corresponds to num_rounds
'estimator__gamma': loguniform(0.001,1.0),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

## Custom Splitting

In [20]:
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        try:
            #if y.shape[1]>1:
            if y.ndim>1:
                y = y[:,0]
        except:
            pass
        try:
            if y.ndim>1:
                    y = y.iloc[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits )#shuffle=True, random_state=rand_state

## Scoring Function

In [21]:
scoring_function = make_scorer(eh_likelihood, greater_is_better=False)

#scoring_function = make_scorer(aft_likelihood, greater_is_better=False) #changed here
def custom_scoring_function(y_true, y_pred):

        if not isinstance(y_true, np.ndarray):
            y_true = y_true.values
        if not isinstance(y_pred, np.ndarray):
            y_pred = y_pred.values
        # change order of this later
        score = eh_likelihood(y_true, y_pred)
        return score #.numpy()

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

In [22]:
## Set Basic Elements

ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=['float32'])),
        #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
        remainder='passthrough')

estimator = XGBSurv(
    objective='eh_objective',
    eval_metric='eh_loss',
    random_state=rand_state, 
    disable_default_eval_metric=True,
    early_stopping_rounds=early_stopping_rounds, 
    base_score=base_score,
                    )
pipe = Pipeline([('scaler',ct),
                ('estimator', estimator)])
    
rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, 
                             random_state=rand_state, verbose=1,
                             error_score = 'raise')

In [24]:
## Save Results

In [25]:
metrics_sum = {}
agg_metrics_cindex = []
agg_metrics_ibs = []

## METABRIC

In [23]:

data = load_metabric(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


2d
Values have been sorted!
2d
Values have been sorted!
Fitting 5 folds for each of 1 candidates, totalling 5 fits
integration_values.shape[0] 1315
Concordance Index Test 0.6051545573588349
Integrated Brier Score Test 0.18794617113315845
2d
Values have been sorted!
2d
Values have been sorted!
Fitting 5 folds for each of 1 candidates, totalling 5 fits
integration_values.shape[0] 1513
Concordance Index Test 0.6064032724340696
Integrated Brier Score Test 0.18499940653868646
2d
Values have been sorted!
2d
Values have been sorted!
Fitting 5 folds for each of 1 candidates, totalling 5 fits
integration_values.shape[0] 1738
Concordance Index Test 0.6262117637729715
Integrated Brier Score Test 0.1926349623925296
2d
Values have been sorted!
2d
Values have been sorted!
Fitting 5 folds for each of 1 candidates, totalling 5 fits
integration_values.shape[0] 1485


KeyboardInterrupt: 

## FLCHAIN

In [None]:

data = load_flchain(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


## RGBSG

In [None]:

data = load_rgbsg(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


In [None]:
## SUPPORT

In [None]:

data = load_support(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


In [12]:
durations_test, events_test = transform_back(y_test.values[:,0])
df_survival_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,371,372,373,374,375,376,377,378,379,380
0.766667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1.766667,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2.400000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2.533333,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
5.066667,0.996091,0.996144,0.996171,0.995482,0.995230,0.996218,0.996729,0.995073,0.996724,0.995632,...,0.996362,0.995931,0.996787,0.995797,0.995990,0.996609,0.996304,0.995907,0.996513,0.996546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296.866669,0.255709,0.261741,0.263207,0.205819,0.190055,0.267344,0.317183,0.176312,0.318913,0.214946,...,0.278027,0.241700,0.323125,0.230536,0.244580,0.305344,0.271432,0.238843,0.293077,0.300484
307.633331,0.242254,0.248127,0.249802,0.193200,0.177494,0.253686,0.302936,0.164665,0.305228,0.202158,...,0.264582,0.228386,0.309309,0.217495,0.231356,0.291242,0.257561,0.225809,0.279250,0.286171
318.200012,0.229448,0.235878,0.236872,0.181726,0.167008,0.241080,0.289581,0.154305,0.291660,0.190316,...,0.251564,0.216204,0.295928,0.205260,0.218868,0.277893,0.245234,0.213245,0.265772,0.273244
335.600006,0.210975,0.216628,0.217416,0.164388,0.149863,0.221516,0.269100,0.137716,0.271039,0.172412,...,0.232678,0.197266,0.274790,0.188211,0.200379,0.257748,0.226216,0.194749,0.246095,0.252811


In [11]:
y_test

Unnamed: 0,target1,target2
0,-0.766667,-0.766667
1,-1.766667,-1.766667
2,-2.400000,-2.400000
3,-2.533333,-2.533333
4,5.066667,5.066667
...,...,...
376,-296.866669,-296.866669
377,-307.633331,-307.633331
378,-318.200012,-318.200012
379,335.600006,335.600006


In [13]:
ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')

## Set Model & Train Test Evaluate

In [None]:
data_set_fns = [load_metabric,  load_flchain, load_rgbsg, load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer', 'race'], 'load_rgbsg':['grade']}
agg_metrics_cindex = []
agg_metrics_ibs = []

for idx, dataset in enumerate([load_metabric]):#data_set_fns
    model = 'xgbsurv_eh_'
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    filename = data.filename
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    X, y = sort_X_y_pandas(X, y)
    y = pd.concat([y,y], axis=1)
    print(data_set_fns_str[idx])
    #if data_set_fns_str[idx] in one_hot_dict.keys():
    #   X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])

    dataset_name = filename.split('_')[0]
    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                    'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            (StandardScaler(), make_column_selector(dtype_include=['float32'])),
            #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
            remainder='passthrough')
    
    estimator = XGBSurv(
        objective='eh_objective',
        eval_metric='eh_loss',
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score,
        random_state=rand_state, 
                        )
    pipe = Pipeline([('scaler',ct),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, random_state=rand_state)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        # print('input',X_train, y_train)
        # print(X_train.shape, type(X_train))
        # print(y_train.shape, type(y_train))
        # print(y_test.shape, type(y_test))
        # print(X_test.shape, type(X_test))
        # # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        #np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        #np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        #np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        #np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train.values, estimator__eval_test_size=validation_size)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')


        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
        try:
            cum_hazard_train = get_cumulative_hazard_function_eh(
                    X_train.values, X_train.values, y_train.values, y_train.values,
                    best_preds_train, best_preds_train
                    )

            df_survival_train = np.exp(-cum_hazard_train)
            durations_train, events_train = transform_back(y_train.values)
            time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
            ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
            print('Concordance Index',ev.concordance_td('antolini'))
            print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
            cindex_score_train = ev.concordance_td('antolini')
            ibs_score_train = ev.integrated_brier_score(time_grid_train)

            outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
            outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

        except:
            outer_scores['cindex_train_'+dataset_name] += [np.nan]
            outer_scores['ibs_train_'+dataset_name] += [np.nan]
            
        try:
            cum_hazard_test = get_cumulative_hazard_function_eh(
                    X_train.values, X_test.values, y_train.values, y_test.values,
                    best_preds_train, best_preds_test
                    )
            df_survival_test = np.exp(-cum_hazard_test)
            durations_test, events_test = transform_back(y_test.values)
            time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
            ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
            print('Concordance Index',ev.concordance_td('antolini'))
            print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
            cindex_score_test = ev.concordance_td('antolini')
            ibs_score_test = ev.integrated_brier_score(time_grid_test)

            outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
            outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
        except: 
            outer_scores['cindex_test_'+dataset_name] += [np.nan]
            outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    df_outer_scores = pd.DataFrame(outer_scores)
    df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
    df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
    # cindex
    df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                            'cindex_train_mean':df_outer_scores['cindex_train_'+dataset_name].mean(),
                                            'cindex_train_std':df_outer_scores['cindex_train_'+dataset_name].std(),
                                            'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                            'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
    # IBS
    df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                            'ibs_train_mean':df_outer_scores['ibs_train_'+dataset_name].mean(),
                                            'ibs_train_std':df_outer_scores['ibs_train_'+dataset_name].std(),
                                            'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                            'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
    agg_metrics_cindex.append(df_agg_metrics_cindex)
    agg_metrics_ibs.append(df_agg_metrics_ibs)

df_final_breslow_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_breslow_1_cindex.to_csv('metrics/final_gbdt_eh_1_cindex.csv', index=False)
df_final_breslow_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_eh_1_cindex.csv', index=False)  #


df_final_breslow_1_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_breslow_1_ibs.to_csv('metrics/final_gbdt_eh_1_ibs.csv', index=False)
df_final_breslow_1_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_eh_1_ibs.csv', index=False) 



load_metabric
[0]	validation_0-eh_likelihood:1762.92477	validation_1-eh_likelihood:439.71873
[0]	validation_0-eh_likelihood:1760.03405	validation_1-eh_likelihood:441.78919
[0]	validation_0-eh_likelihood:1765.52884	validation_1-eh_likelihood:440.28058
[1]	validation_0-eh_likelihood:1762.86759	validation_1-eh_likelihood:439.70309
[0]	validation_0-eh_likelihood:1759.67685	validation_1-eh_likelihood:440.04797
[0]	validation_0-eh_likelihood:1762.31200	validation_1-eh_likelihood:439.95551
[1]	validation_0-eh_likelihood:1760.03405	validation_1-eh_likelihood:441.78919
[1]	validation_0-eh_likelihood:1765.48037	validation_1-eh_likelihood:440.27266
[1]	validation_0-eh_likelihood:1759.67132	validation_1-eh_likelihood:440.04563
[2]	validation_0-eh_likelihood:1762.86051	validation_1-eh_likelihood:439.70033
[1]	validation_0-eh_likelihood:1762.23113	validation_1-eh_likelihood:439.94790
[2]	validation_0-eh_likelihood:1760.00994	validation_1-eh_likelihood:441.78479
[2]	validation_0-eh_likelihood:1765.48

KeyboardInterrupt: 

## TCGA Train, Test, Evaluation

In [None]:
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,100), # corresponds to num_rounds
'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
'pca__n_components': [8, 16, 32, 64]
}

In [None]:
cancer_types = ['BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD']
agg_metrics_cindex = []
agg_metrics_ibs = []


for idx, cancer_type in enumerate(cancer_types):
    model = 'xgbsurv_eh_'
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type=cancer_type, as_frame=True)
    filename = data.filename
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    X, y = sort_X_y_pandas(X, y)
    y = pd.concat([data.target,data.target], axis=1)
    dataset_name = filename.split('_')[0]

    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                    'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            (StandardScaler(), make_column_selector(dtype_include=['float32']))
            ,remainder='passthrough')
    
    estimator = XGBSurv(
        objective='eh_objective',
        eval_metric='eh_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score
                        )
    
    pipe = Pipeline([('scaler',ct),
                    ('pca', PCA()),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, random_state=rand_state)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        #print(y_train.shape, type(y_train))
        #print(X_train.shape, type(X_train))
        #print(X_test.shape, type(X_test))
        #print(y_test.shape, type(y_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=0.2)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
        try:
            cum_hazard_train = get_cumulative_hazard_function_eh(
                    X_train.values, X_train.values, y_train.values, y_train.values,
                    best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                    )

            df_survival_train = np.exp(-cum_hazard_train)
            durations_train, events_train = transform_back(y_train.values)
            time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
            ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
            print('Concordance Index',ev.concordance_td('antolini'))
            print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
            cindex_score_train = ev.concordance_td('antolini')
            ibs_score_train = ev.integrated_brier_score(time_grid_train)

            outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
            outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

        except:
            outer_scores['cindex_train_'+dataset_name] += [np.nan]
            outer_scores['ibs_train_'+dataset_name] += [np.nan]
            
        try:
            cum_hazard_test = get_cumulative_hazard_function_eh(
                    X_train.values, X_test.values, y_train.values, y_test.values,
                    best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                    )
            df_survival_test = np.exp(-cum_hazard_test)
            durations_test, events_test = transform_back(y_test.values)
            print('durations',durations_test.min(), durations_test.max())
            time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
            ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
            print('Concordance Index',ev.concordance_td('antolini'))
            print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
            cindex_score_test = ev.concordance_td('antolini')
            ibs_score_test = ev.integrated_brier_score(time_grid_test)

            outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
            outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
        except: 
            outer_scores['cindex_test_'+dataset_name] += [np.nan]
            outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    df_outer_scores = pd.DataFrame(outer_scores)
    df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
    df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
            # cindex
    df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                            'cindex_train_mean':df_outer_scores['cindex_train_'+dataset_name].mean(),
                                            'cindex_train_std':df_outer_scores['cindex_train_'+dataset_name].std(),
                                            'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                            'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
    # IBS
    df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                            'ibs_train_mean':df_outer_scores['ibs_train_'+dataset_name].mean(),
                                            'ibs_train_std':df_outer_scores['ibs_train_'+dataset_name].std(),
                                            'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                            'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })

    agg_metrics_cindex.append(df_agg_metrics_cindex)
    agg_metrics_ibs.append(df_agg_metrics_ibs)


    

y errory error       target  target
346 -1582.0 -1582.0
320 -1110.0 -1110.0
343 -1542.0 -1542.0
124  -372.0  -372.0
234   599.0   599.0
..      ...     ...
20    -68.0   -68.0
225  -578.0  -578.0
146   406.0   406.0
242   623.0   623.0
256   674.0   674.0

[207 rows x 2 columns] <class 'pandas.core.frame.DataFrame'>
     target  target
355 -1761.0 -1761.0
322 -1127.0 -1127.0
353 -1714.0 -1714.0
120  -369.0  -369.0
243   630.0   630.0
..      ...     ...
25    -82.0   -82.0
237  -610.0  -610.0
162   434.0   434.0
255   665.0   665.0
264   706.0   706.0

[207 rows x 2 columns] <class 'pandas.core.frame.DataFrame'>
y error      target  target
350 -1649.0 -1649.0
179  -475.0  -475.0
382 -2380.0 -2380.0
117  -366.0  -366.0
385  2641.0  2641.0
..      ...     ...
358 -1806.0 -1806.0
229  -582.0  -582.0
136   386.0   386.0
18     65.0    65.0
259   690.0   690.0

[207 rows x 2 columns] <class 'pandas.core.frame.DataFrame'>
y error      target  target
358 -1806.0 -1806.0
189  -495.0  -495.0
38

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.11/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.11/site-packages/xgbsurv/base.py", line 107, in fit
    X_train, y_train = self._sort_X_y(X_train, y_train)
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.11/site-packages/xgbsurv/base.py", line 175, in _sort_X_y
    raise ValueError(f'y is not numpy.ndarray. Got {type(y)}.')
ValueError: y is not numpy.ndarray. Got <class 'pandas.core.frame.DataFrame'>.


In [None]:
df_final_breslow_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_breslow_1_cindex.to_csv('metrics/final_gbdt_tcga_eh_1_cindex.csv', index=False)
df_final_breslow_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_eh_cindex.csv', index=False)  #
df_final_breslow_1_cindex

Unnamed: 0,dataset,cindex_train_mean,cindex_train_std,cindex_test_mean,cindex_test_std
0,BLCA,0.744,0.0122,0.5822,0.0175
0,BRCA,0.8445,0.0161,0.5264,0.0513
0,HNSC,0.7756,0.0118,0.5219,0.0441
0,KIRC,0.8115,0.0121,0.6186,0.0546
0,LGG,0.8461,0.0136,0.7583,0.0493
0,LIHC,0.7482,0.0622,0.5116,0.051
0,LUAD,0.7718,0.0075,0.547,0.0468
0,LUSC,0.6836,0.081,0.4753,0.059
0,OV,0.5916,0.0198,0.4774,0.0339
0,STAD,0.699,0.1153,0.4875,0.0582


In [None]:
df_final_breslow_1_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_breslow_1_ibs.to_csv('metrics/final_gbdt_tcga_eh_ibs.csv', index=False)
df_final_breslow_1_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_eh_ibs.csv', index=False) 
df_final_breslow_1_ibs

Unnamed: 0,dataset,ibs_train_mean,ibs_train_std,ibs_test_mean,ibs_test_std
0,BLCA,0.209,0.0047,0.2224,0.0066
0,BRCA,0.1854,0.0027,0.1962,0.0097
0,HNSC,0.1803,0.0054,0.2038,0.0062
0,KIRC,0.1969,0.0026,0.1998,0.0065
0,LGG,0.169,0.0054,0.1912,0.0127
0,LIHC,0.2023,0.0074,0.2181,0.0136
0,LUAD,0.1851,0.0046,0.199,0.0108
0,LUSC,0.1892,0.0039,0.2037,0.0119
0,OV,0.1266,0.0077,0.1442,0.0183
0,STAD,0.2056,0.0038,0.2166,0.0081
