## XGBSurv AFT Benchmark

In [2]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.eh_aft_final import aft_likelihood, get_cumulative_hazard_function_aft
from xgbsurv.models.eh_ah_final import ah_likelihood
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
from sklearn.utils.fixes import loguniform #https://scikit-learn.org/stable/modules/grid_search.html

## Set Parameters

In [8]:
# set parameters
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 1 #50
early_stopping_rounds=10
base_score = 0.0
validation_size = 0.2

# set seed for scipy
np.random.seed(rand_state)

# Define parameter grid for random forest classifier
param_grid = {
#'estimator__reg_alpha': loguniform(1e-10,1.0),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': loguniform(1e-10,1.0), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': loguniform(0.01,1.0), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,4000), # corresponds to num_rounds 4000
#'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : loguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

## Custom Splitting

In [9]:
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)

## Scoring Function

In [10]:
#scoring_function = make_scorer(aft_likelihood, greater_is_better=True) # changed here

def custom_scoring_function(y_true, y_pred):

    if not isinstance(y_true, np.ndarray):
        y_true = y_true.values
    if not isinstance(y_pred, np.ndarray):
        y_pred = y_pred.values
    # change order of this later
    score = aft_likelihood(y_true, y_pred)
    return score

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

## Set Model & Train Test Evaluate

In [12]:
data_set_fns = [load_flchain] #load_metabric, load_flchain, load_rgbsg, load_support]
data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer', 'race'], 'load_rgbsg':['grade']}
agg_metrics_cindex = []
agg_metrics_ibs = []
agg_best_settings = []

for idx, dataset in enumerate(data_set_fns):
    model = 'xgbsurv_aft_'
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    filename = data.filename
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()

    print(data_set_fns_str[idx])
    #if data_set_fns_str[idx] in one_hot_dict.keys():
    #   X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])
    #print(X)
    X, y = sort_X_y_pandas(X, y)
    #print(X.dtypes)
    dataset_name = filename.split('_')[0]
    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                    'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            (StandardScaler(), make_column_selector(dtype_include=['float32'], dtype_exclude=['category', 'object', 'int'])),
            #(OneHotEncoder(sparse_output=False, handle_unknown='infrequent_if_exist'), make_column_selector(dtype_include=['category', 'object'])),
            remainder='passthrough')
    
    estimator = XGBSurv(
        objective='aft_objective', 
        eval_metric='aft_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score,
        verbosity=1,
        )
    pipe = Pipeline([('scaler',ct),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, 
                             verbose = 3, random_state=rand_state)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        #print(X_train.shape, type(X_train))
        #print(y_train.shape, type(y_train))
        #print(y_test.shape, type(y_test))
        #print(X_test.shape, type(X_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        #np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        #np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        #np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        #np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=validation_size)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        #print(' best_preds_train', best_preds_train)
        #print(' best_preds_test', best_preds_test)
        np.savetxt('predictions/'+model+'best_preds_train_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
        np.savetxt('predictions/'+model+'best_preds_test_'+str(i)+'_'+filename, best_preds_test, delimiter=',')


        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
            
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    # df_outer_scores = pd.DataFrame(outer_scores)
    df_best_settings = pd.concat([df_best_params,df_best_model])
    df_best_settings.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
    agg_best_settings.append(df_best_settings)

df_final_settings = pd.concat([df for df in agg_best_settings])
df_final_settings.to_csv('metrics/final_settings_gbdt_aft_1.csv', index=False)




load_metabric
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[0]	validation_0-aft_likelihood:2.61267	validation_1-aft_likelihood:2.61300
[0]	validation_0-aft_likelihood:2.61287	validation_1-aft_likelihood:2.61027
[0]	validation_0-aft_likelihood:2.61237	validation_1-aft_likelihood:2.61036
[0]	validation_0-aft_likelihood:2.61160	validation_1-aft_likelihood:2.61510
[1]	validation_0-aft_likelihood:2.61267	validation_1-aft_likelihood:2.61300
[1]	validation_0-aft_likelihood:2.61287	validation_1-aft_likelihood:2.61027
[1]	validation_0-aft_likelihood:2.61237	validation_1-aft_likelihood:2.61036
[1]	validation_0-aft_likelihood:2.61160	validation_1-aft_likelihood:2.61510
[2]	validation_0-aft_likelihood:2.61267	validation_1-aft_likelihood:2.61300
[2]	validation_0-aft_likelihood:2.61287	validation_1-aft_likelihood:2.61027
[2]	validation_0-aft_likelihood:2.61237	validation_1-aft_likelihood:2.61036
[0]	validation_0-aft_likelihood:2.61307	validation_1-aft_likelihood:2.60917
[2]	validation_

SystemError: CPUDispatcher(<function aft_objective at 0x7f991249a5f0>) returned a result with an exception set

In [13]:
# df_final_aft_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
# df_final_aft_1_cindex.to_csv('metrics/final_gbdt_aft_1_cindex.csv', index=False)
# df_final_aft_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_aft_1_cindex.csv', index=False)  #


# df_final_aft_1_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
# df_final_aft_1_ibs.to_csv('metrics/final_gbdt_aft_1_ibs.csv', index=False)
# df_final_aft_1_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_aft_1_ibs.csv', index=False) 

best_preds_test


array([-1.0434934e-04,  9.2484122e-05, -1.0434934e-04, ...,
       -9.4871895e-05, -1.0434934e-04, -1.0434934e-04], dtype=float32)

## TCGA Train, Test, Evaluation

In [None]:
param_grid = {
#'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.001,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,1000), # corresponds to num_rounds
#'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
'pca__n_components': [8, 16, 32, 64]
}

In [None]:
cancer_types = ['BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD']
agg_metrics_cindex = []
agg_metrics_ibs = []
agg_best_settings = []


for idx, cancer_type in enumerate(cancer_types):
    model = 'xgbsurv_aft_'
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type=cancer_type, as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    filename = data.filename
    dataset_name = filename.split('_')[0]
    X, y = sort_X_y_pandas(X, y)

    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                    'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            (StandardScaler(), make_column_selector(dtype_include=['float32']))
            ,remainder='passthrough')
    
    estimator = XGBSurv(
        objective='aft_objective',
        eval_metric='aft_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score,
        verbose = 0
        )
    
    pipe = Pipeline([('scaler',ct),
                    ('pca', PCA()),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=n_iter, refit=True, random_state=rand_state)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        #print(y_train.shape, type(y_train))
        #print(X_train.shape, type(X_train))
        #print(X_test.shape, type(X_test))
        #print(y_test.shape, type(y_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        #np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        #np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        #np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        #np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=0.2)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        # df_outer_scores = pd.DataFrame(outer_scores)
        df_best_settings = pd.concat([df_best_params,df_best_model])
        df_best_settings.to_csv('metrics/'+model+'metric_summary_'+filename, index=False)
        
        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]

        agg_best_settings.append(df_best_settings)

df_final_settings = pd.concat([df for df in agg_best_settings])
df_final_settings.to_csv('metrics/final_settings_gbdt_aft_tcga.csv', index=False)


    

[0]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[1]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[2]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[0]	validation_0-aft_likelihood:3.23300	validation_1-aft_likelihood:3.26245
[3]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[1]	validation_0-aft_likelihood:3.23300	validation_1-aft_likelihood:3.26245
[4]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[2]	validation_0-aft_likelihood:3.23300	validation_1-aft_likelihood:3.26245
[5]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[3]	validation_0-aft_likelihood:3.23300	validation_1-aft_likelihood:3.26245
[6]	validation_0-aft_likelihood:3.24891	validation_1-aft_likelihood:3.31404
[0]	validation_0-aft_likelihood:3.24311	validation_1-aft_likelihood:3.30762
[4]	validation_0-aft_likelihood:3.23300	validation_1-aft_likelihood:3.26245
[7]	validati

In [None]:
df_final_aft_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_aft_1_cindex.to_csv('metrics/final_gbdt_tcga_aft_1_cindex.csv', index=False)
df_final_aft_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_aft_1_cindex.csv', index=False)  #
df_final_aft_1_cindex

Unnamed: 0,dataset,cindex_train_mean,cindex_train_std,cindex_test_mean,cindex_test_std
0,BLCA,0.744,0.0122,0.5822,0.0175
0,BRCA,0.8445,0.0161,0.5264,0.0513
0,HNSC,0.7756,0.0118,0.5219,0.0441
0,KIRC,0.8115,0.0121,0.6186,0.0546
0,LGG,0.8461,0.0136,0.7583,0.0493
0,LIHC,0.7482,0.0622,0.5116,0.051
0,LUAD,0.7718,0.0075,0.547,0.0468
0,LUSC,0.6836,0.081,0.4753,0.059
0,OV,0.5916,0.0198,0.4774,0.0339
0,STAD,0.699,0.1153,0.4875,0.0582


In [None]:
df_final_aft_1_ibs = pd.concat([df for df in agg_metrics_ibs]).round(4)
df_final_aft_1_ibs.to_csv('metrics/final_gbdt_tcga_aft_1_ibs.csv', index=False)
df_final_aft_1_ibs.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_aft_1_ibs.csv', index=False) 
df_final_aft_1_ibs

Unnamed: 0,dataset,ibs_train_mean,ibs_train_std,ibs_test_mean,ibs_test_std
0,BLCA,0.209,0.0047,0.2224,0.0066
0,BRCA,0.1854,0.0027,0.1962,0.0097
0,HNSC,0.1803,0.0054,0.2038,0.0062
0,KIRC,0.1969,0.0026,0.1998,0.0065
0,LGG,0.169,0.0054,0.1912,0.0127
0,LIHC,0.2023,0.0074,0.2181,0.0136
0,LUAD,0.1851,0.0046,0.199,0.0108
0,LUSC,0.1892,0.0039,0.2037,0.0119
0,OV,0.1266,0.0077,0.1442,0.0183
0,STAD,0.2056,0.0038,0.2166,0.0081
