# Introduction to the xgbsurv package

This notebook introduces `xgbsurv` using a specific dataset. It is structured in the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.eh_aft_final import aft_likelihood, get_cumulative_hazard_function_aft
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from sklearn.utils.fixes import loguniform
import os
np.random.seed(42)

## Set Path

In [2]:
# Locate the packaged datasets relative to the directory the notebook runs in:
# climb two directory levels, then descend into xgbsurv/datasets/data/.
current_path = os.getcwd()
two_levels_up = os.path.abspath(os.path.join(current_path, os.pardir, os.pardir))
# The trailing slash is kept on purpose so the string matches the original form.
data_path = f"{two_levels_up}/xgbsurv/datasets/data/"

In [3]:
# --- Experiment configuration -------------------------------------------------
n_outer_splits = 5          # folds of the outer (evaluation) cross-validation
n_inner_splits = 5          # intended folds of the inner (tuning) cross-validation
rand_state = 42             # seed shared by the CV splitters, search and estimator
n_iter = 50                 # number of candidates drawn by the randomized search
early_stopping_rounds = 10  # forwarded to XGBSurv
base_score = 0.0            # forwarded to XGBSurv
validation_size = 0.2       # forwarded to fit() as estimator__eval_test_size
model = 'aft_'              # prefix of every file written by the dataset cells

# --- Hyperparameter search space ----------------------------------------------
# Keys carry the 'estimator__' prefix so they address the 'estimator' pipeline step.
# scipy conventions used below:
#   scuniform(loc, scale) samples from [loc, loc + scale]
#   scloguniform(a, b)    samples from [a, b]
#   scrandint(low, high)  samples integers from low .. high - 1
# All log-uniform entries now use the scipy distribution (scloguniform); the
# original mixed in the `loguniform` name from sklearn.utils.fixes for gamma only.
param_grid = {
    # L1 regularisation
    'estimator__reg_alpha': scloguniform(1e-10, 1),
    # L2 regularisation
    'estimator__reg_lambda': scloguniform(1e-10, 1),
    # learning rate (eta)
    'estimator__learning_rate': scloguniform(0.01, 1.0),
    # number of boosting rounds, 1 .. 3999
    'estimator__n_estimators': scrandint(1, 4000),
    # minimum loss reduction required to make a further partition on a leaf node
    'estimator__gamma': scloguniform(0.001, 1.0),
    # column subsampling per level / node / tree
    'estimator__colsample_bylevel': scuniform(0.1, 1 - 0.1),  # [0.1, 1]
    'estimator__colsample_bynode': scuniform(0.1, 1 - 0.1),   # [0.1, 1]
    'estimator__colsample_bytree': scuniform(0.5, 1 - 0.5),   # [0.5, 1]
    # NOTE(review): randint excludes the upper bound, so depth is 1 .. 19 and
    # max_delta_step is 0 .. 9 - confirm this matches the intended [1, 20] / [0, 10].
    'estimator__max_depth': scrandint(1, 20),
    'estimator__max_delta_step': scrandint(0, 10),
    # NOTE(review): loguniform takes (a, b), not (loc, scale), so the upper bound
    # here is 19.9 rather than the 20 the original note mentions - kept unchanged.
    'estimator__min_child_weight': scloguniform(0.1, 20 - 0.1),
    # row subsampling, [0.01, 1]
    'estimator__subsample': scuniform(0.01, 1 - 0.01),
}


## Custom Splitting

In [4]:
class CustomSplit(StratifiedKFold):
    """Stratified K-fold that stratifies on the sign of the target.

    The folds are balanced with respect to ``np.sign(y)``.
    NOTE(review): presumably the sign of the xgbsurv target encodes the
    event / censoring status - confirm against the dataset loaders.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        """Yield (train, test) index arrays stratified by the sign of ``y``.

        Parameters
        ----------
        X : array-like
            Samples; forwarded unchanged to ``StratifiedKFold.split``.
        y : array-like
            Target. When it has more than one column, only the first column
            is used for stratification.
        groups : object, optional
            Forwarded unchanged to ``StratifiedKFold.split``.
        """
        # Convert explicitly instead of relying on a bare ``except`` to skip
        # 1-D targets: this also makes multi-column pandas targets work, where
        # ``y[:, 0]`` used to raise and was silently swallowed.
        y_arr = np.asarray(y)
        if y_arr.ndim > 1 and y_arr.shape[1] > 1:
            y_arr = y_arr[:, 0]
        bins = np.sign(y_arr)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds."""
        return self.n_splits

# Outer splitter: produces the train/test folds used for the reported scores.
outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
# Inner splitter: used by the hyperparameter search. It must follow
# n_inner_splits (the original passed n_outer_splits here by mistake).
inner_custom_cv = CustomSplit(n_splits=n_inner_splits, shuffle=True, random_state=rand_state)

## Scoring Function

In [5]:
def custom_scoring_function(y_true, y_pred):
    """Evaluate ``aft_likelihood`` on NumPy versions of target and prediction.

    Inputs that are not already ``np.ndarray`` are unwrapped through their
    ``.values`` attribute before being handed to ``aft_likelihood``.
    """
    targets = y_true if isinstance(y_true, np.ndarray) else y_true.values
    predictions = y_pred if isinstance(y_pred, np.ndarray) else y_pred.values
    return aft_likelihood(targets, predictions)


# greater_is_better=False: the value is treated as a loss, so the scorer
# flips its sign and the search prefers smaller likelihood values.
scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

In [6]:
## Set Basic Elements

# Preprocessing: standardise the float32 columns; every other column is
# passed through untouched (no categorical encoding is applied here).
ct = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=['float32'])),
    remainder='passthrough',
)

# Gradient-boosted AFT survival model using the package's custom objective
# and evaluation metric.
estimator = XGBSurv(
    objective='aft_objective',
    eval_metric='aft_loss',
    random_state=rand_state,
    disable_default_eval_metric=True,
    early_stopping_rounds=early_stopping_rounds,
    base_score=base_score,
)

# The step names matter: 'estimator' is the prefix used by param_grid and by
# the fit parameters passed in the dataset cells.
pipe = Pipeline(steps=[
    ('scaler', ct),
    ('estimator', estimator),
])

# Randomized hyperparameter search over the pipeline; the best candidate is
# refit on the full training fold, and any failing fit raises immediately.
rs = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_grid,
    scoring=scoring_function,
    n_jobs=-1,
    cv=inner_custom_cv,
    n_iter=n_iter,
    refit=True,
    random_state=rand_state,
    verbose=1,
    error_score='raise',
)

In [7]:
# saving results
metrics_sum = {}
agg_metrics_cindex = []
agg_metrics_ibs = []

## Metabric

In [8]:
# Nested cross-validation on METABRIC: the outer folds provide the test
# estimate, the randomized search `rs` tunes on each outer training fold.
# The data directory comes from `data_path` (set in the "Set Path" cell)
# instead of a machine-specific absolute path.
data = load_metabric(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name: []}
best_model = {'best_model_'+dataset_name: []}
outer_scores = {'cindex_test_'+dataset_name: [], 'ibs_test_'+dataset_name: []}
X = data.data
y = data.target

# np.savetxt / to_csv do not create missing directories.
for out_dir in ('splits', 'predictions', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
    # Split data into training and testing sets for the outer fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Persist the fold indices so the split can be reproduced later
    np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
    np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')

    # NOTE(review): sort_X_y_pandas presumably orders samples by survival
    # time as the loss requires - confirm in xgbsurv.models.utils.
    X_train, y_train = sort_X_y_pandas(X_train, y_train)
    X_test, y_test = sort_X_y_pandas(X_test, y_test)

    # Inner hyperparameter search; the fit parameters are routed to the
    # 'estimator' pipeline step.
    rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)

    best_params['best_params_'+dataset_name] += [rs.best_params_]
    best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
    best_preds_train = rs.best_estimator_.predict(X_train)
    best_preds_test = rs.best_estimator_.predict(X_test)

    np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
    np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

    # Survival curves from the cumulative hazard: S(t) = exp(-H(t))
    cum_hazard_test = get_cumulative_hazard_function_aft(
        X_train.values, X_test.values, y_train.values, y_test.values,
        best_preds_train, best_preds_test
    )
    df_survival_test = np.exp(-cum_hazard_test)
    durations_test, events_test = transform_back(y_test.values)
    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')

    # Compute each metric once and reuse it for printing and storing
    cindex_score_test = ev.concordance_td('antolini')
    ibs_score_test = ev.integrated_brier_score(time_grid_test)
    print('Concordance Index Test', cindex_score_test)
    print('Integrated Brier Score Test', ibs_score_test)
    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

# One row per outer fold: best parameters, full model parameters and scores
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params, df_best_model, df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset': [dataset_name],
                                      'cindex_test_mean': df_outer_scores['cindex_test_'+dataset_name].mean(),
                                      'cindex_test_std': df_outer_scores['cindex_test_'+dataset_name].std()})
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset': [dataset_name],
                                   'ibs_test_mean': df_outer_scores['ibs_test_'+dataset_name].mean(),
                                   'ibs_test_std': df_outer_scores['ibs_test_'+dataset_name].std()})
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.637573905797389
Integrated Brier Score Test 0.16907601308515136
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6261283407736632
Integrated Brier Score Test 0.17718532578351995
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6239716115219158
Integrated Brier Score Test 0.17568926581616545
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6754221388367729
Integrated Brier Score Test 0.15804626840844102
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6411149128599155
Integrated Brier Score Test 0.17161379447015274


## Flchain

In [9]:
# Nested cross-validation on FLCHAIN: the outer folds provide the test
# estimate, the randomized search `rs` tunes on each outer training fold.
data = load_flchain(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name: []}
best_model = {'best_model_'+dataset_name: []}
outer_scores = {'cindex_test_'+dataset_name: [], 'ibs_test_'+dataset_name: []}
X = data.data
y = data.target

# np.savetxt / to_csv do not create missing directories.
for out_dir in ('splits', 'predictions', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
    # Split data into training and testing sets for the outer fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Persist the fold indices so the split can be reproduced later
    np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
    np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')

    # NOTE(review): sort_X_y_pandas presumably orders samples by survival
    # time as the loss requires - confirm in xgbsurv.models.utils.
    X_train, y_train = sort_X_y_pandas(X_train, y_train)
    X_test, y_test = sort_X_y_pandas(X_test, y_test)

    # Inner hyperparameter search; the fit parameters are routed to the
    # 'estimator' pipeline step.
    rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)

    best_params['best_params_'+dataset_name] += [rs.best_params_]
    best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
    best_preds_train = rs.best_estimator_.predict(X_train)
    best_preds_test = rs.best_estimator_.predict(X_test)

    np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
    np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

    # Survival curves from the cumulative hazard: S(t) = exp(-H(t))
    cum_hazard_test = get_cumulative_hazard_function_aft(
        X_train.values, X_test.values, y_train.values, y_test.values,
        best_preds_train, best_preds_test
    )
    df_survival_test = np.exp(-cum_hazard_test)
    durations_test, events_test = transform_back(y_test.values)
    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')

    # Compute each metric once and reuse it for printing and storing
    cindex_score_test = ev.concordance_td('antolini')
    ibs_score_test = ev.integrated_brier_score(time_grid_test)
    print('Concordance Index Test', cindex_score_test)
    print('Integrated Brier Score Test', ibs_score_test)
    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

# One row per outer fold: best parameters, full model parameters and scores
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params, df_best_model, df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset': [dataset_name],
                                      'cindex_test_mean': df_outer_scores['cindex_test_'+dataset_name].mean(),
                                      'cindex_test_std': df_outer_scores['cindex_test_'+dataset_name].std()})
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset': [dataset_name],
                                   'ibs_test_mean': df_outer_scores['ibs_test_'+dataset_name].mean(),
                                   'ibs_test_std': df_outer_scores['ibs_test_'+dataset_name].std()})
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.803369332883339
Integrated Brier Score Test 0.09545824255375147
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.8016691887074464
Integrated Brier Score Test 0.09327320614548783
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.7880198674637859
Integrated Brier Score Test 0.09576841088559072
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.781409684331025
Integrated Brier Score Test 0.09716198006283183
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.809436969037225
Integrated Brier Score Test 0.08923811385187708


## RGBSG

In [10]:
# Nested cross-validation on RGBSG: the outer folds provide the test
# estimate, the randomized search `rs` tunes on each outer training fold.
# The data directory comes from `data_path` (set in the "Set Path" cell)
# instead of a machine-specific absolute path.
data = load_rgbsg(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name: []}
best_model = {'best_model_'+dataset_name: []}
outer_scores = {'cindex_test_'+dataset_name: [], 'ibs_test_'+dataset_name: []}
X = data.data
y = data.target

# np.savetxt / to_csv do not create missing directories.
for out_dir in ('splits', 'predictions', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
    # Split data into training and testing sets for the outer fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Persist the fold indices so the split can be reproduced later
    np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
    np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')

    # NOTE(review): sort_X_y_pandas presumably orders samples by survival
    # time as the loss requires - confirm in xgbsurv.models.utils.
    X_train, y_train = sort_X_y_pandas(X_train, y_train)
    X_test, y_test = sort_X_y_pandas(X_test, y_test)

    # Inner hyperparameter search; the fit parameters are routed to the
    # 'estimator' pipeline step.
    rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)

    best_params['best_params_'+dataset_name] += [rs.best_params_]
    best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
    best_preds_train = rs.best_estimator_.predict(X_train)
    best_preds_test = rs.best_estimator_.predict(X_test)

    np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
    np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

    # Survival curves from the cumulative hazard: S(t) = exp(-H(t))
    cum_hazard_test = get_cumulative_hazard_function_aft(
        X_train.values, X_test.values, y_train.values, y_test.values,
        best_preds_train, best_preds_test
    )
    df_survival_test = np.exp(-cum_hazard_test)
    durations_test, events_test = transform_back(y_test.values)
    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')

    # Compute each metric once and reuse it for printing and storing
    cindex_score_test = ev.concordance_td('antolini')
    ibs_score_test = ev.integrated_brier_score(time_grid_test)
    print('Concordance Index Test', cindex_score_test)
    print('Integrated Brier Score Test', ibs_score_test)
    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

# One row per outer fold: best parameters, full model parameters and scores
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
# Assign to df_metrics: the original bound this to the misspelled name
# `df_metricsmodel`, so the CSV and metrics_sum entry below received the
# stale df_metrics left over from the previously executed dataset cell.
df_metrics = pd.concat([df_best_params, df_best_model, df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset': [dataset_name],
                                      'cindex_test_mean': df_outer_scores['cindex_test_'+dataset_name].mean(),
                                      'cindex_test_std': df_outer_scores['cindex_test_'+dataset_name].std()})
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset': [dataset_name],
                                   'ibs_test_mean': df_outer_scores['ibs_test_'+dataset_name].mean(),
                                   'ibs_test_std': df_outer_scores['ibs_test_'+dataset_name].std()})
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics
        

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6931471738837178
Integrated Brier Score Test 0.16974930947328776
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6792871571337444
Integrated Brier Score Test 0.1821431537910619
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6680419253621737
Integrated Brier Score Test 0.18329090738873904
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6639344262295082
Integrated Brier Score Test 0.17589028407488536
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6875584158944642
Integrated Brier Score Test 0.17577932137755214


## SUPPORT

In [11]:
# Nested cross-validation on SUPPORT: the outer folds provide the test
# estimate, the randomized search `rs` tunes on each outer training fold.
# The data directory comes from `data_path` (set in the "Set Path" cell)
# instead of a machine-specific absolute path.
data = load_support(path=data_path, as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
best_params = {'best_params_'+dataset_name: []}
best_model = {'best_model_'+dataset_name: []}
outer_scores = {'cindex_test_'+dataset_name: [], 'ibs_test_'+dataset_name: []}
X = data.data
y = data.target

# np.savetxt / to_csv do not create missing directories.
for out_dir in ('splits', 'predictions', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
    # Split data into training and testing sets for the outer fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Persist the fold indices so the split can be reproduced later
    np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
    np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')

    # NOTE(review): sort_X_y_pandas presumably orders samples by survival
    # time as the loss requires - confirm in xgbsurv.models.utils.
    X_train, y_train = sort_X_y_pandas(X_train, y_train)
    X_test, y_test = sort_X_y_pandas(X_test, y_test)

    # Inner hyperparameter search; the fit parameters are routed to the
    # 'estimator' pipeline step.
    rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)

    best_params['best_params_'+dataset_name] += [rs.best_params_]
    best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
    best_preds_train = rs.best_estimator_.predict(X_train)
    best_preds_test = rs.best_estimator_.predict(X_test)

    np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
    np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

    # Survival curves from the cumulative hazard: S(t) = exp(-H(t))
    cum_hazard_test = get_cumulative_hazard_function_aft(
        X_train.values, X_test.values, y_train.values, y_test.values,
        best_preds_train, best_preds_test
    )
    df_survival_test = np.exp(-cum_hazard_test)
    durations_test, events_test = transform_back(y_test.values)
    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')

    # Compute each metric once and reuse it for printing and storing
    cindex_score_test = ev.concordance_td('antolini')
    ibs_score_test = ev.integrated_brier_score(time_grid_test)
    print('Concordance Index Test', cindex_score_test)
    print('Integrated Brier Score Test', ibs_score_test)
    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

# One row per outer fold: best parameters, full model parameters and scores
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params, df_best_model, df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset': [dataset_name],
                                      'cindex_test_mean': df_outer_scores['cindex_test_'+dataset_name].mean(),
                                      'cindex_test_std': df_outer_scores['cindex_test_'+dataset_name].std()})
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset': [dataset_name],
                                   'ibs_test_mean': df_outer_scores['ibs_test_'+dataset_name].mean(),
                                   'ibs_test_std': df_outer_scores['ibs_test_'+dataset_name].std()})
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.585412400197487
Integrated Brier Score Test 0.21406696429022382
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5621264507229138
Integrated Brier Score Test 0.21374750809853493
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5753438286244055
Integrated Brier Score Test 0.21200514988680844
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5899841202555695
Integrated Brier Score Test 0.21268441874136074
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5441383333014195
Integrated Brier Score Test 0.21281326988942734


## Summarize Information

In [12]:
# Stack the one-row C-index summaries of all datasets into a single table.
df_final_aft_1_cindex = pd.concat(agg_metrics_cindex).round(4)
df_final_aft_1_cindex.to_csv('metrics/final_gbdt_aft_1_cindex.csv', index=False)
# The extra copy goes to an absolute, machine-specific directory. Write it only
# when that directory exists so the notebook does not fail on other machines.
thesis_tables_dir = '/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables'
if os.path.isdir(thesis_tables_dir):
    df_final_aft_1_cindex.to_csv(os.path.join(thesis_tables_dir, 'final_gbdt_aft_1_cindex.csv'), index=False)
df_final_aft_1_cindex

Unnamed: 0,dataset,cindex_test_mean,cindex_test_std
0,METABRIC,0.6408,0.0207
0,FLCHAIN,0.7968,0.0116
0,RGBSG,0.6784,0.0124
0,SUPPORT,0.5714,0.0186


In [13]:
# Stack the one-row IBS summaries of all datasets into a single table.
df_final_aft_1_ibs = pd.concat(agg_metrics_ibs).round(4)
df_final_aft_1_ibs.to_csv('metrics/final_gbdt_aft_1_ibs.csv', index=False)
# The extra copy goes to an absolute, machine-specific directory. Write it only
# when that directory exists so the notebook does not fail on other machines.
thesis_tables_dir = '/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables'
if os.path.isdir(thesis_tables_dir):
    df_final_aft_1_ibs.to_csv(os.path.join(thesis_tables_dir, 'final_gbdt_aft_1_ibs.csv'), index=False)
df_final_aft_1_ibs

Unnamed: 0,dataset,ibs_test_mean,ibs_test_std
0,METABRIC,0.1703,0.0076
0,FLCHAIN,0.0942,0.0031
0,RGBSG,0.1774,0.0055
0,SUPPORT,0.2131,0.0008


## TCGA

In [14]:
# --- Hyperparameter search space for the TCGA runs ----------------------------
# NOTE: rebinding `param_grid` here does not alter the `rs` search object that
# was already constructed from the earlier dictionary; the ranges below are the
# same as in that earlier dictionary.
# scipy conventions used below:
#   scuniform(loc, scale) samples from [loc, loc + scale]
#   scloguniform(a, b)    samples from [a, b]
#   scrandint(low, high)  samples integers from low .. high - 1
# All log-uniform entries now use the scipy distribution (scloguniform); the
# original mixed in the `loguniform` name from sklearn.utils.fixes for gamma only.
param_grid = {
    # L1 regularisation
    'estimator__reg_alpha': scloguniform(1e-10, 1),
    # L2 regularisation
    'estimator__reg_lambda': scloguniform(1e-10, 1),
    # learning rate (eta)
    'estimator__learning_rate': scloguniform(0.01, 1.0),
    # number of boosting rounds, 1 .. 3999
    'estimator__n_estimators': scrandint(1, 4000),
    # minimum loss reduction required to make a further partition on a leaf node
    'estimator__gamma': scloguniform(0.001, 1.0),
    # column subsampling per level / node / tree
    'estimator__colsample_bylevel': scuniform(0.1, 1 - 0.1),  # [0.1, 1]
    'estimator__colsample_bynode': scuniform(0.1, 1 - 0.1),   # [0.1, 1]
    'estimator__colsample_bytree': scuniform(0.5, 1 - 0.5),   # [0.5, 1]
    # NOTE(review): randint excludes the upper bound, so depth is 1 .. 19 and
    # max_delta_step is 0 .. 9 - confirm this matches the intended [1, 20] / [0, 10].
    'estimator__max_depth': scrandint(1, 20),
    'estimator__max_delta_step': scrandint(0, 10),
    # NOTE(review): loguniform takes (a, b), not (loc, scale), so the upper bound
    # here is 19.9 rather than the 20 the original note mentions - kept unchanged.
    'estimator__min_child_weight': scloguniform(0.1, 20 - 0.1),
    # row subsampling, [0.01, 1]
    'estimator__subsample': scuniform(0.01, 1 - 0.01),
}

# --- Experiment configuration for the TCGA runs --------------------------------
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 50
early_stopping_rounds = 10
base_score = 0.0
validation_size = 0.2
model = 'aft_tcga_'  # prefix of every file written by the TCGA cells

In [8]:
# File prefix for the TCGA outputs, and fresh result containers so the TCGA
# summaries do not mix with the results collected for the earlier datasets.
model = 'aft_tcga_'
metrics_sum = dict()
agg_metrics_cindex, agg_metrics_ibs = [], []

# TCGA cohorts that are evaluated one after another.
cancer_types = [
    'BLCA', 'BRCA', 'HNSC', 'KIRC', 'LGG',
    'LIHC', 'LUAD', 'LUSC', 'OV', 'STAD',
]

In [10]:
# Nested cross-validation for every TCGA cohort: the outer folds provide the
# test estimate, the randomized search `rs` tunes on each outer training fold.

# np.savetxt / to_csv do not create missing directories.
for out_dir in ('splits', 'predictions', 'metrics'):
    os.makedirs(out_dir, exist_ok=True)

for cancer_type in cancer_types:
    # The data directory comes from `data_path` (set in the "Set Path" cell)
    # instead of a machine-specific absolute path.
    data = load_tcga(path=data_path, cancer_type=cancer_type, as_frame=True)
    filename = data.filename
    dataset_name = filename.split('_')[0]
    best_params = {'best_params_'+dataset_name: []}
    best_model = {'best_model_'+dataset_name: []}
    outer_scores = {'cindex_test_'+dataset_name: [], 'ibs_test_'+dataset_name: []}
    X = data.data
    y = data.target

    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for the outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Persist the fold indices so the split can be reproduced later
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')

        # NOTE(review): sort_X_y_pandas presumably orders samples by survival
        # time as the loss requires - confirm in xgbsurv.models.utils.
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        # Inner hyperparameter search; the fit parameters are routed to the
        # 'estimator' pipeline step.
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)

        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params()]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        # Survival curves from the cumulative hazard: S(t) = exp(-H(t))
        cum_hazard_test = get_cumulative_hazard_function_aft(
            X_train.values, X_test.values, y_train.values, y_test.values,
            best_preds_train, best_preds_test
        )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values)
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')

        # Compute each metric once and reuse it for printing and storing
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        print('Concordance Index Test', cindex_score_test)
        print('Integrated Brier Score Test', ibs_score_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

    # One row per outer fold: best parameters, full model parameters and scores
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    df_outer_scores = pd.DataFrame(outer_scores)
    df_metrics = pd.concat([df_best_params, df_best_model, df_outer_scores], axis=1)
    df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
    # cindex
    df_agg_metrics_cindex = pd.DataFrame({'dataset': [dataset_name],
                                          'cindex_test_mean': df_outer_scores['cindex_test_'+dataset_name].mean(),
                                          'cindex_test_std': df_outer_scores['cindex_test_'+dataset_name].std()})
    # IBS
    df_agg_metrics_ibs = pd.DataFrame({'dataset': [dataset_name],
                                       'ibs_test_mean': df_outer_scores['ibs_test_'+dataset_name].mean(),
                                       'ibs_test_std': df_outer_scores['ibs_test_'+dataset_name].std()})
    agg_metrics_cindex.append(df_agg_metrics_cindex)
    agg_metrics_ibs.append(df_agg_metrics_ibs)
    # Keyed by cancer type here (the other dataset cells key by dataset_name)
    metrics_sum[model+cancer_type] = df_metrics

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.38176795580110495
Integrated Brier Score Test 0.23095484217616288
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6264336428181322
Integrated Brier Score Test 0.21970910685247957
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5041816009557945
Integrated Brier Score Test 0.22499448739971434
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5715922107674685
Integrated Brier Score Test 0.21903317753266022
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.5339366515837104
Integrated Brier Score Test 0.23762356830954517
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6358506944444444
Integrated Brier Score Test 0.21577870827385198
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.

In [11]:
# Stack the one-row C-index summaries of all TCGA cohorts into a single table.
df_final_aft_tcga_cindex = pd.concat(agg_metrics_cindex).round(4)
df_final_aft_tcga_cindex.to_csv('metrics/final_gbdt_tcga_aft_cindex.csv', index=False)
# The extra copy goes to an absolute, machine-specific directory. Write it only
# when that directory exists so the notebook does not fail on other machines.
thesis_tables_dir = '/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables'
if os.path.isdir(thesis_tables_dir):
    df_final_aft_tcga_cindex.to_csv(os.path.join(thesis_tables_dir, 'final_gbdt_tcga_aft_cindex.csv'), index=False)
df_final_aft_tcga_cindex

Unnamed: 0,dataset,cindex_test_mean,cindex_test_std
0,BLCA,0.5236,0.0915
0,BRCA,0.5605,0.0484
0,HNSC,0.5296,0.083
0,KIRC,0.6567,0.078
0,LGG,0.8339,0.0261
0,LIHC,0.6119,0.1064
0,LUAD,0.5912,0.041
0,LUSC,0.4116,0.0313
0,OV,0.537,0.0786
0,STAD,0.5157,0.0554


In [12]:
# Stack the one-row IBS summaries of all TCGA cohorts into a single table.
df_final_aft_tcga_ibs = pd.concat(agg_metrics_ibs).round(4)
df_final_aft_tcga_ibs.to_csv('metrics/final_gbdt_tcga_aft_ibs.csv', index=False)
# The extra copy goes to an absolute, machine-specific directory. Write it only
# when that directory exists so the notebook does not fail on other machines.
thesis_tables_dir = '/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables'
if os.path.isdir(thesis_tables_dir):
    df_final_aft_tcga_ibs.to_csv(os.path.join(thesis_tables_dir, 'final_gbdt_tcga_aft_ibs.csv'), index=False)
df_final_aft_tcga_ibs

Unnamed: 0,dataset,ibs_test_mean,ibs_test_std
0,BLCA,0.2265,0.0079
0,BRCA,0.2129,0.0398
0,HNSC,0.2215,0.0295
0,KIRC,0.1827,0.0128
0,LGG,0.1603,0.0277
0,LIHC,0.2259,0.0264
0,LUAD,0.211,0.032
0,LUSC,0.2161,0.0203
0,OV,0.1464,0.0182
0,STAD,0.2217,0.0158
