## XGBSurv EH Benchmark

In [1]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.eh_final import eh_likelihood, get_cumulative_hazard_function_eh
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y, sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from sklearn.utils.fixes import loguniform
from sklearn.preprocessing import MinMaxScaler

## Set Path

In [2]:
current_path = os.getcwd()  # Get the current working directory path
two_levels_up = os.path.abspath(os.path.join(current_path, "..", ".."))
data_path = two_levels_up+'/xgbsurv/datasets/data/'

## Set Parameters

In [3]:
# set parameters
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 50 #0
early_stopping_rounds=10
base_score = 0.0
validation_size = 0.2
model = 'eh_'

# set seed for scipy
np.random.seed(rand_state)

# Define parameter grid for random forest classifier
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.01,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,4000), # corresponds to num_rounds
'estimator__gamma': loguniform(0.001,1.0),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

## Custom Splitting

In [4]:
# Define stratified inner k-fold cross-validation
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        print('split', X.dtypes)
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)



## Scoring Function

In [5]:
scoring_function = make_scorer(eh_likelihood, greater_is_better=False)

#scoring_function = make_scorer(eh_likelihood, greater_is_better=False) #changed here
def custom_scoring_function(y_true, y_pred):

        if not isinstance(y_true, np.ndarray):
            y_true = y_true.values
        if not isinstance(y_pred, np.ndarray):
            y_pred = y_pred.values
        # change order of this later
        score = eh_likelihood(y_true, y_pred)
        return score #.numpy()

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

In [6]:
## Set Basic Elements

ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=['float32'])),
        #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
        remainder='passthrough')

estimator = XGBSurv(
    objective='eh_objective',
    eval_metric='eh_loss',
    random_state=rand_state, 
    disable_default_eval_metric=True,
    early_stopping_rounds=early_stopping_rounds, 
    base_score=base_score,
                    )
pipe = Pipeline([('scaler',ct),
                ('estimator', estimator)])
    
rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, 
                             random_state=rand_state, verbose=1,
                             error_score = 'raise')

In [7]:
## Save Results

In [8]:
metrics_sum = {}
agg_metrics_cindex = []
agg_metrics_ibs = []

## METABRIC

In [9]:

data = load_metabric(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 12.09
integration_values.shape[0] 122315
Concordance Index Test 0.6638427046545772
Integrated Brier Score Test 0.1691472224981181
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 12.38
integration_values.shape[0] 122765
Concordance Index Test 0.6386752935160966
Integrated Brier Score Test 0.17077684376423446
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 5.85
integration_values.shape[0] 52846
Concordance Index Test 0.6451241995916183
Integrated Brier Score Test 0.1696888596360053
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.82
integration_values.shape[0] 40693
Concordance Index Test 0.6773601632888687
Integrated Brier Score Test 0.15841707841018862
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.52
integration_values.shape[0] 37062
Concordance Index Test 0.6570622486343718
Integrated Brier Score Test 0.16526817336

## FLCHAIN

In [10]:

data = load_flchain(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')
        #minmaxscaler = MinMaxScaler(feature_range=(-0.75, 0.75))
        #best_preds_train = minmaxscaler.fit_transform(best_preds_train)
        #best_preds_test = minmaxscaler.fit_transform(best_preds_test)
        #print('best_preds_train shape', best_preds_train.shape)
        #print('best_preds_test shape', best_preds_test.shape)
        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 45.17
integration_values.shape[0] 235547
Concordance Index Test 0.7814489818877264
Integrated Brier Score Test 0.09838964127361889
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 69.78
integration_values.shape[0] 357473
Concordance Index Test 0.7980671574588969
Integrated Brier Score Test 0.09524479211291466
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 97.47
integration_values.shape[0] 503528
Concordance Index Test 0.780494537559029
Integrated Brier Score Test 0.09637501585477062
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 104.44
integration_values.shape[0] 540043
Concordance Index Test 0.7813475521815916
Integrated Brier Score Test 0.09746942126990302
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 34.68
integration_values.shape[0] 179864
Concordance Index Test 0.7923975295618376
Integrated Brier Score Test 0.092

In [11]:
np.max(best_preds_test)

3.546034

## RGBSG

In [13]:

data = load_rgbsg(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 4.15
integration_values.shape[0] 11021
Concordance Index Test 0.6964656684203426
Integrated Brier Score Test 0.16795334524586725
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 4.09
integration_values.shape[0] 1
Concordance Index Test 0.0
Integrated Brier Score Test 0.4148120690987607
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.56
integration_values.shape[0] 9286
Concordance Index Test 0.6696174644502303
Integrated Brier Score Test 0.18234981759269647
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.59
integration_values.shape[0] 9164
Concordance Index Test 0.6650380898543702
Integrated Brier Score Test 0.17868397513183334
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 4.07
integration_values.shape[0] 10443
Concordance Index Test 0.6836862766042674
Integrated Brier Score Test 0.17359570641400074


In [None]:
## SUPPORT

In [15]:

data = load_support(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 
y = pd.concat([y,y], axis=1)
y.columns = ['target1', 'target2']
for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_eh(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values[:,0])
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.62
integration_values.shape[0] 7327
Concordance Index Test 0.607529630354571
Integrated Brier Score Test 0.19631841731342417
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.19
integration_values.shape[0] 6475
Concordance Index Test 0.6162894519182714
Integrated Brier Score Test 0.191192744951117
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 4.57
integration_values.shape[0] 9269
Concordance Index Test 0.6155759415030738
Integrated Brier Score Test 0.19157509192810113
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.44
integration_values.shape[0] 6977
Concordance Index Test 0.6179995329056271
Integrated Brier Score Test 0.19064629199737224
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 3.31
integration_values.shape[0] 6716
Concordance Index Test 0.617449857197777
Integrated Brier Score Test 0.19039351074494174


## Summarize Information

In [16]:
df_final_eh_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_eh_1_cindex.to_csv('metrics/final_gbdt_eh_1_cindex.csv', index=False)
df_final_eh_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_eh_1_cindex.csv', index=False)  #
df_final_eh_1_cindex

Unnamed: 0,dataset,cindex_test_mean,cindex_test_std
0,METABRIC,0.6564,0.0153
0,FLCHAIN,0.7868,0.008
0,RGBSG,0.543,0.3038
0,SUPPORT,0.615,0.0043


In [17]:
df_final_eh_1_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_eh_1_ibs.to_csv('metrics/final_gbdt_eh_1_ibs.csv', index=False)
df_final_eh_1_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_eh_1_ibs.csv', index=False) 
df_final_eh_1_ibs

Unnamed: 0,dataset,ibs_test_mean,ibs_test_std
0,METABRIC,0.1667,0.0051
0,FLCHAIN,0.096,0.0023
0,RGBSG,0.2235,0.1071
0,SUPPORT,0.192,0.0024


## TCGA

In [8]:
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.01,1.0), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,4000),#00), # corresponds to num_rounds
'estimator__gamma': loguniform(0.001,1.0),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
#'pca__n_components': [8, 16, 32, 64]
}
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 50
early_stopping_rounds=10
base_score = 0.0
validation_size = 0.2
model = 'eh_'
data = 'tcga'

In [9]:
## Set Basic Elements

ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=['float32'])),
        #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
        remainder='passthrough')

estimator = XGBSurv(
    objective='eh_objective',
    eval_metric='eh_loss',
    random_state=rand_state, 
    disable_default_eval_metric=True,
    early_stopping_rounds=early_stopping_rounds, 
    base_score=base_score,
                    )
pipe = Pipeline([('scaler',ct),
                ('estimator', estimator)])
    
rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, 
                             random_state=rand_state, verbose=1,
                             error_score = 'raise')

In [14]:
cancer_types = [
    #'BLCA',
    #'BRCA',
    #'HNSC',
    #'KIRC',
    #'LGG',
    #'LIHC',
    #'LUAD',
    'LUSC',
    'OV',
    'STAD']
agg_metrics_cindex = []
agg_metrics_ibs = []

In [15]:
for idx, cancer_type in enumerate(cancer_types):
        print(cancer_type)
        data = load_tcga(path=data_path, cancer_type=cancer_type, as_frame=True)
        filename = data.filename
        dataset_name = filename.split('_')[0]
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
        X  = data.data 
        y = data.target 
        y = pd.concat([y,y], axis=1)
        y.columns = ['target1', 'target2']
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold

                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                
                np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                X_train, y_train = sort_X_y_pandas(X_train, y_train)
                X_test, y_test = sort_X_y_pandas(X_test, y_test)
                rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)

                np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
                np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

                cum_hazard_test = get_cumulative_hazard_function_eh(
                        X_train.values, X_test.values, y_train.values, y_test.values,
                        best_preds_train, best_preds_test
                        )
                df_survival_test = np.exp(-cum_hazard_test)
                durations_test, events_test = transform_back(y_test.values[:,0])
                time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                print('Concordance Index Test',ev.concordance_td('antolini'))
                print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
                cindex_score_test = ev.concordance_td('antolini')
                ibs_score_test = ev.integrated_brier_score(time_grid_test)
                outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
        # cindex
        df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                                'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                                'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
        # IBS
        df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                                'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                                'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
        agg_metrics_cindex.append(df_agg_metrics_cindex)
        agg_metrics_ibs.append(df_agg_metrics_ibs)

LUSC
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 4.33
integration_values.shape[0] 16611
Concordance Index Test 0.5647712697734074
Integrated Brier Score Test 0.20369363606093652
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 1.1
integration_values.shape[0] 4419
Concordance Index Test 0.46093366093366095
Integrated Brier Score Test 0.20043111806552955
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 1.09
integration_values.shape[0] 5754
Concordance Index Test 0.47786850021486893
Integrated Brier Score Test 0.20553288127384972
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 1.22
integration_values.shape[0] 5814
Concordance Index Test 0.5572333170969314
Integrated Brier Score Test 0.19827713556109514
Fitting 5 folds for each of 50 candidates, totalling 250 fits
max round 1.07
integration_values.shape[0] 5038
Concordance Index Test 0.5588610284742881
Integrated Brier Score Test 0.21997156203

In [16]:
df_final_eh_tcga_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_eh_tcga_cindex.to_csv('metrics/final_gbdt_tcga_eh_cindex.csv', index=False)
df_final_eh_tcga_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_eh_cindex.csv', index=False)  #
df_final_eh_tcga_cindex

Unnamed: 0,dataset,cindex_test_mean,cindex_test_std
0,LUSC,0.5239,0.0502
0,OV,0.5129,0.0615
0,STAD,0.5527,0.0687


In [17]:
df_final_eh_tcga_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_eh_tcga_ibs.to_csv('metrics/final_gbdt_tcga_eh_ibs.csv', index=False)
df_final_eh_tcga_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_eh_ibs.csv', index=False) 
df_final_eh_tcga_ibs

Unnamed: 0,dataset,ibs_test_mean,ibs_test_std
0,LUSC,0.2056,0.0085
0,OV,0.1434,0.0188
0,STAD,0.23,0.0326
