# Introduction to the xgbsurv package

This notebook introduces `xgbsurv` using a specific dataset. It structured by the following steps:

- Load data
- Load model
- Fit model
- Predict and evaluate model

The syntax conveniently follows that of sklearn.

In [1]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.eh_aft_final import aft_likelihood, get_cumulative_hazard_function_aft
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from sklearn.utils.fixes import loguniform
np.random.seed(42)

In [2]:
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.01,1.0), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,500),#00), # corresponds to num_rounds
'estimator__gamma': loguniform(0.01,1-0.5),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 50 
early_stopping_rounds=10
base_score = 0.0
validation_size = 0.2
model = 'aft_'
# if only n_estimators it works well for flchain
# now testing all but gamma and l1:
# 0   0.657119	0.122939
# 1	0.764429	0.109948
# 2	0.758871	0.109038
# 3	0.758103	0.109116
# 4	0.746572	0.102472

In [10]:
#scoring_function = make_scorer(aft_likelihood, greater_is_better=False) #changed here
def custom_scoring_function(y_true, y_pred):

        if not isinstance(y_true, np.ndarray):
            y_true = y_true.values
        if not isinstance(y_pred, np.ndarray):
            y_pred = y_pred.values
        # change order of this later
        score = aft_likelihood(y_true, y_pred)
        return score

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

In [11]:
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)

In [12]:
## Set Basic Elements

ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=['float32'])),
        #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
        remainder='passthrough')

estimator = XGBSurv(
    objective='aft_objective',
    eval_metric='aft_loss',
    random_state=rand_state, 
    disable_default_eval_metric=True,
    early_stopping_rounds=early_stopping_rounds, 
    base_score=base_score,
                    )
pipe = Pipeline([('scaler',ct),
                ('estimator', estimator)])
    
rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, 
                             random_state=rand_state, verbose=1,
                             error_score = 'raise')

In [13]:
## Set Basic Elements
# no onehot following Kadra et al.

ct = make_column_transformer(
        (StandardScaler(), make_column_selector(dtype_include=['float32'])),
        #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
        remainder='passthrough')

estimator = XGBSurv(
    objective='aft_objective',
    eval_metric='aft_loss',
    random_state=rand_state, 
    disable_default_eval_metric=True,
    early_stopping_rounds=early_stopping_rounds, 
    base_score=base_score,
                    )
pipe = Pipeline([('scaler',ct),
                    ('pca', PCA()),
                    ('estimator', estimator)])
    
rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, 
                             random_state=rand_state, verbose=1,
                             error_score = 'raise') # state not preserved

In [14]:
# saving results
metrics_sum = {}
agg_metrics_cindex = []
agg_metrics_ibs = []

## Metabric

In [15]:
data = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_aft(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values)
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6372167771120194
Integrated Brier Score Test 0.16794372891930448
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6044956636315365
Integrated Brier Score Test 0.1789291972666372
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6360446444501715
Integrated Brier Score Test 0.17064991565117915
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.654949178401336
Integrated Brier Score Test 0.16311283482906555
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.6505592572583389
Integrated Brier Score Test 0.1627950482572053


## Flchain

In [16]:
data = load_flchain(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_aft(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values)
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.7948719391007613
Integrated Brier Score Test 0.09663385874241122
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.7946433710556365
Integrated Brier Score Test 0.09539319087916771
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Concordance Index Test 0.7855243924805985
Integrated Brier Score Test 0.09787810392450727
Fitting 5 folds for each of 50 candidates, totalling 250 fits


## RGBSG

In [None]:
data = load_rgbsg(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_aft(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values)
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metricsmodel = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],

                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.6053689464454337
Integrated Brier Score Test 0.20176212889495945
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.0
Integrated Brier Score Test 0.4148120690987607
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.620001335202617
Integrated Brier Score Test 0.20235223350294743
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.5951169614256103
Integrated Brier Score Test 0.19900222744356932
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.538160600315112
Integrated Brier Score Test 0.2025100392405161


## SUPPORT

In [None]:
data = load_support(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
filename = data.filename
dataset_name = filename.split('_')[0]
outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
best_params = {'best_params_'+dataset_name:[]}
best_model = {'best_model_'+dataset_name:[]}
outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
X  = data.data 
y = data.target 

for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)
        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)

        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

        cum_hazard_test = get_cumulative_hazard_function_aft(
                X_train.values, X_test.values, y_train.values, y_test.values,
                best_preds_train, best_preds_test
                )
        df_survival_test = np.exp(-cum_hazard_test)
        durations_test, events_test = transform_back(y_test.values)
        time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
        ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
        print('Concordance Index Test',ev.concordance_td('antolini'))
        print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
        cindex_score_test = ev.concordance_td('antolini')
        ibs_score_test = ev.integrated_brier_score(time_grid_test)
        outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
        outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
df_best_params = pd.DataFrame(best_params)
df_best_model = pd.DataFrame(best_model)
df_outer_scores = pd.DataFrame(outer_scores)
df_metricsmodel = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
# cindex
df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                        'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                        'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
# IBS
df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],

                                        'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                        'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
agg_metrics_cindex.append(df_agg_metrics_cindex)
agg_metrics_ibs.append(df_agg_metrics_ibs)
metrics_sum[model+dataset_name] = df_metrics


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.3221784336008367
Integrated Brier Score Test 0.21679043899496458
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.35969525685577086
Integrated Brier Score Test 0.2157368723879445
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.3434770945504766
Integrated Brier Score Test 0.21414484557213184
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.2998233387583636
Integrated Brier Score Test 0.21456145570239665
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.31727501837501526
Integrated Brier Score Test 0.21480511482061643


## Summarize Information

In [None]:
df_final_aft_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_aft_1_cindex.to_csv('metrics/final_gbdt_aft_1_cindex.csv', index=False)
df_final_aft_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_aft_1_cindex.csv', index=False)  #
df_final_aft_1_cindex

Unnamed: 0,dataset,cindex_test_mean,cindex_test_std
0,METABRIC,0.555,0.0238
0,FLCHAIN,0.6065,0.0621
0,RGBSG,0.4717,0.2655
0,SUPPORT,0.3285,0.0234


In [None]:
df_final_aft_1_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_aft_1_ibs.to_csv('metrics/final_gbdt_aft_1_ibs.csv', index=False)
df_final_aft_1_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_aft_1_ibs.csv', index=False) 
df_final_aft_1_ibs

Unnamed: 0,dataset,ibs_test_mean,ibs_test_std
0,METABRIC,0.184,0.0074
0,FLCHAIN,0.1326,0.0025
0,RGBSG,0.2441,0.0954
0,SUPPORT,0.2152,0.0011


## TCGA

In [None]:
param_grid = {
'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.01,1.0), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,500),#00), # corresponds to num_rounds
'estimator__gamma': loguniform(0.01,1-0.5),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
'pca__n_components': [8, 16, 32, 64]
}
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 50 #0
early_stopping_rounds=10
base_score = 0.0
validation_size = 0.2
model = 'aft_'
data = 'tcga'

In [None]:
cancer_types = ['BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD']
agg_metrics_cindex = []
agg_metrics_ibs = []

In [None]:
for idx, cancer_type in enumerate(cancer_types):
        data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type=cancer_type, as_frame=True)
        filename = data.filename
        dataset_name = filename.split('_')[0]
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        outer_scores = {'cindex_test_'+dataset_name:[],'ibs_test_'+dataset_name:[]}
        X  = data.data 
        y = data.target 

        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold

                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                
                np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                X_train, y_train = sort_X_y_pandas(X_train, y_train)
                X_test, y_test = sort_X_y_pandas(X_test, y_test)
                rs.fit(X_train, y_train, estimator__eval_test_size=validation_size, estimator__verbose=0)
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [rs.best_estimator_.get_params]
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)

                np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
                np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')

                cum_hazard_test = get_cumulative_hazard_function_aft(
                        X_train.values, X_test.values, y_train.values, y_test.values,
                        best_preds_train, best_preds_test
                        )
                df_survival_test = np.exp(-cum_hazard_test)
                durations_test, events_test = transform_back(y_test.values)
                time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                print('Concordance Index Test',ev.concordance_td('antolini'))
                print('Integrated Brier Score Test',ev.integrated_brier_score(time_grid_test))
                cindex_score_test = ev.concordance_td('antolini')
                ibs_score_test = ev.integrated_brier_score(time_grid_test)
                outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metricsmodel = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
        # cindex
        df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                                'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                                'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })
        # IBS
        df_agg_metrics_ibs = pd.DataFrame({'dataset':[dataset_name],
                                                'ibs_test_mean':df_outer_scores['ibs_test_'+dataset_name].mean(),
                                                'ibs_test_std':df_outer_scores['ibs_test_'+dataset_name].std() })
        agg_metrics_cindex.append(df_agg_metrics_cindex)
        agg_metrics_ibs.append(df_agg_metrics_ibs)
        metrics_sum[model+cancer_type] = df_metrics

BLCA_adapted.csv
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.287292817679558
Integrated Brier Score Test 0.22784199742966532
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.2599672310212998
Integrated Brier Score Test 0.22521729522927544
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.18040621266427717
Integrated Brier Score Test 0.2237839845058751
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.2829324169530355
Integrated Brier Score Test 0.22326588318606544
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.2850678733031674
Integrated Brier Score Test 0.21147425014035265
BRCA_adapted.csv
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance Index Test 0.030381944444444444
Integrated Brier Score Test 0.2078867308889194
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Concordance I

In [None]:
df_final_aft_tcga_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_aft_tcga_cindex.to_csv('metrics/final_gbdt_tcga_aft_cindex.csv', index=False)
df_final_aft_tcga_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_aft_cindex.csv', index=False)  #
df_final_aft_tcga_cindex

Unnamed: 0,dataset,cindex_test_mean,cindex_test_std
0,BLCA,0.2591,0.0454
0,BRCA,0.0091,0.0136
0,HNSC,0.3351,0.1329
0,KIRC,0.207,0.1053
0,LGG,0.4954,0.1182
0,LIHC,0.0889,0.0818
0,LUAD,0.415,0.078
0,LUSC,0.3574,0.1364
0,OV,0.3668,0.1437
0,STAD,0.2978,0.1493


In [None]:
df_final_aft_tcga_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_aft_tcga_ibs.to_csv('metrics/final_gbdt_tcga_aft_ibs.csv', index=False)
df_final_aft_tcga_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_aft_ibs.csv', index=False) 
df_final_aft_tcga_ibs

Unnamed: 0,dataset,ibs_test_mean,ibs_test_std
0,BLCA,0.2223,0.0063
0,BRCA,0.2015,0.0125
0,HNSC,0.2059,0.0057
0,KIRC,0.2031,0.0086
0,LGG,0.1953,0.0096
0,LIHC,0.2163,0.0125
0,LUAD,0.2,0.0113
0,LUSC,0.2044,0.0091
0,OV,0.1479,0.0148
0,STAD,0.2162,0.009
