In [1]:
## XGBSurv Cind Benchmark

In [2]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.breslow_final import breslow_likelihood,get_cumulative_hazard_function_breslow
from xgbsurv.models.cind_final import cind_loss

from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df

## Set Parameters

In [3]:
# set parameters
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 1,#50
early_stopping_rounds=10
base_score = 0.0

# Define parameter grid for random forest classifier
param_grid = {
#'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
# cind works only with very low learning rate
'estimator__learning_rate': scloguniform(0.01,1.0), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,8000),
#'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
}

## Custom Splitting

In [4]:
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)

In [5]:
## Scoring Function

In [6]:
scoring_function = make_scorer(cind_loss, greater_is_better=False)

In [7]:
## Set Model & Train Test Evaluate

In [8]:
data_set_fns = [load_metabric,  load_flchain, load_rgbsg, load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer'], 'load_rgbsg':['grade']}

agg_metrics_cindex = []
agg_metrics_ibs = []

for idx, dataset in enumerate(data_set_fns):
    model = 'xgbsurv_cind_'
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    filename = data.filename
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()

    print(data_set_fns_str[idx])
    if data_set_fns_str[idx] in one_hot_dict.keys():
        X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])
    X, y = sort_X_y_pandas(X, y)
    #print(X.dtypes)
    dataset_name = filename.split('_')[0]
    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
            (StandardScaler(), make_column_selector(dtype_include=['float32']))
            ,remainder='passthrough')
    
    estimator = XGBSurv(
        objective='cind_objective',
        eval_metric='cind_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score
                        )
    pipe = Pipeline([('scaler',ct),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=2, refit=True)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        print(X_train.shape, type(X_train))
        print(y_train.shape, type(y_train))
        print(X_test.shape, type(X_test))
        print(y_test.shape, type(y_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        #np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        #np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        #np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        #np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=0.2)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
        try:
            cindex_score_train = cindex_censored(y_train, best_preds_train)
            print('Concordance Index',cindex_score_train)
            outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
            #outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

        except:
            outer_scores['cindex_train_'+dataset_name] += [np.nan]
            
        try:

            cindex_score_test = cindex_censored(y_test,best_preds_test)


            outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]

        except: 
            outer_scores['cindex_test_'+dataset_name] += [np.nan]

            
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    df_outer_scores = pd.DataFrame(outer_scores)
    df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
    df_metrics.to_csv('metrics/xgbsurv_cind_metric_summary_'+str(i)+'_'+filename, index=False)
    # cindex
    df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                            'cindex_train_mean':df_outer_scores['cindex_train_'+dataset_name].mean(),
                                            'cindex_train_std':df_outer_scores['cindex_train_'+dataset_name].std(),
                                            'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                            'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })


    agg_metrics_cindex.append(df_agg_metrics_cindex)

df_final_cind_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_cind_1_cindex.to_csv('metrics/final_gbdt_cind_1_cindex.csv', index=False)
df_final_cind_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_cind_1_cindex.csv', index=False)  #
df_final_cind_1_cindex

load_metabric
(1522, 9) <class 'pandas.core.frame.DataFrame'>
(1522,) <class 'pandas.core.series.Series'>
(381, 9) <class 'pandas.core.frame.DataFrame'>
(381,) <class 'pandas.core.series.Series'>
[0]	validation_0-cind_loss:-282.26306	validation_1-cind_loss:-70.49092
[0]	validation_0-cind_loss:-291.59255	validation_1-cind_loss:-71.27930
[0]	validation_0-cind_loss:-282.63040	validation_1-cind_loss:-70.53056
[0]	validation_0-cind_loss:-282.61372	validation_1-cind_loss:-70.51151
[0]	validation_0-cind_loss:-291.02362	validation_1-cind_loss:-71.93473
[0]	validation_0-cind_loss:-291.96172	validation_1-cind_loss:-71.43569
[0]	validation_0-cind_loss:-282.21944	validation_1-cind_loss:-70.55330
[1]	validation_0-cind_loss:-283.78273	validation_1-cind_loss:-70.61300
[1]	validation_0-cind_loss:-294.06985	validation_1-cind_loss:-71.80454
[1]	validation_0-cind_loss:-283.88622	validation_1-cind_loss:-70.60574
[1]	validation_0-cind_loss:-283.81500	validation_1-cind_loss:-70.81695
[0]	validation_0-cind_l

KeyboardInterrupt: 

In [None]:
pd.DataFrame(outer_scores)

ValueError: All arrays must be of the same length

## TCGA Train, Test, Evaluation

In [None]:
param_grid = {
#'estimator__reg_alpha': scloguniform(1e-10,1),#[1e-10,1], # from hyp augmentation, L1 regularization
'estimator__reg_lambda': scloguniform(1e-10,1), #[1e-10,1], #alias l2_regularization, lambda in augmentation
'estimator__learning_rate': scloguniform(0.01,1), #[0.001,1], # assumed alias eta from augmentation,
'estimator__n_estimators':  scrandint(1,8000), # corresponds to num_rounds
#'estimator__gamma': scuniform(0.1,1-0.1),#[0.1,1], # minimum loss reduction required to make a further partition on a leaf node of the tree.
'estimator__colsample_bylevel': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation
'estimator__colsample_bynode': scuniform(0.1, 1-0.1), #[0.1,1], # from hyp augmentation, uniform(0.1,1),
'estimator__colsample_bytree': scuniform(0.5, 1-0.5),#[0.5,1], # from hyp augmentation, seems to exceed the bound, uniform(0.5,1)
'estimator__max_depth': scrandint(1,20),#[1,20], # from hyp augmentation
'estimator__max_delta_step': scrandint(0,10),#[0,10], # from hyp augmentation
'estimator__min_child_weight' : scloguniform(0.1,20-0.1),#[0.1,20], # from hyp augmentation
'estimator__subsample': scuniform(0.01,1-0.01),#[0.01,1], # from hyp augmentation
'pca__n_components': [8, 10, 12, 14, 16]
}

In [None]:
cancer_types = ['BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD']

agg_metrics_cindex = []
agg_metrics_ibs = []

for idx, cancer_type in enumerate(cancer_types):
    model = 'xgbsurv_cind_'
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type=cancer_type, as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    filename = data.filename
    dataset_name = filename.split('_')[0]
    X, y = sort_X_y_pandas(X, y)

    # add IBS later
    outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
    best_params = {'best_params_'+dataset_name:[]}
    best_model = {'best_model_'+dataset_name:[]}
    ct = make_column_transformer(
            #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
            (StandardScaler(), make_column_selector(dtype_include=['float32']))
            ,remainder='passthrough')
    
    estimator = XGBSurv(
        objective='cind_objective',
        eval_metric='cind_loss',
        random_state=rand_state, 
        disable_default_eval_metric=True,
        early_stopping_rounds=early_stopping_rounds, 
        base_score=base_score
                        )
    
    pipe = Pipeline([('scaler',ct),
                    ('pca', PCA()),
                    ('estimator', estimator)])
    
    rs = RandomizedSearchCV(pipe, param_grid, scoring = scoring_function, n_jobs=-1, 
                             cv=inner_custom_cv, n_iter=2, refit=True)
    
    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        # Split data into training and testing sets for outer fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train, y_train = sort_X_y_pandas(X_train, y_train)
        X_test, y_test = sort_X_y_pandas(X_test, y_test)

        print(X_train.shape, type(X_train))
        print(y_train.shape, type(y_train))
        print(X_test.shape, type(X_test))
        print(y_test.shape, type(y_test))
        # save splits and data
        np.savetxt('splits/'+model+'train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
        np.savetxt('splits/'+model+'test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
        
        #np.savetxt('splits/'+model+'X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
        #np.savetxt('splits/'+model+'X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

        #np.savetxt('splits/'+model+'y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
        #np.savetxt('splits/'+model+'y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

        rs.fit(X_train, y_train, estimator__eval_test_size=0.2)
        best_preds_train = rs.best_estimator_.predict(X_train)
        best_preds_test = rs.best_estimator_.predict(X_test)
        # save hyperparameter settings
        params = rs.best_estimator_.get_params
        best_params['best_params_'+dataset_name] += [rs.best_params_]
        best_model['best_model_'+dataset_name] += [params]
        try:

        
            cindex_score_train = cindex_censored(y_train, best_preds_train)
            print('Concordance Index',cindex_score_train)
            outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
            #outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

        except:
            outer_scores['cindex_train_'+dataset_name] += [np.nan]
            
        try:

            cindex_score_test = cindex_censored(y_test,best_preds_test)


            outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]

        except: 
            outer_scores['cindex_test_'+dataset_name] += [np.nan]

            
    df_best_params = pd.DataFrame(best_params)
    df_best_model = pd.DataFrame(best_model)
    df_outer_scores = pd.DataFrame(outer_scores)
    df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
    df_metrics.to_csv('metrics/metric_summary_'+str(i)+'_'+filename, index=False)
    # cindex
    df_agg_metrics_cindex = pd.DataFrame({'dataset':[dataset_name],
                                            'cindex_train_mean':df_outer_scores['cindex_train_'+dataset_name].mean(),
                                            'cindex_train_std':df_outer_scores['cindex_train_'+dataset_name].std(),
                                            'cindex_test_mean':df_outer_scores['cindex_test_'+dataset_name].mean(),
                                            'cindex_test_std':df_outer_scores['cindex_test_'+dataset_name].std() })


    agg_metrics_cindex.append(df_agg_metrics_cindex)

df_final_cind_1_cindex = pd.concat([df for df in agg_metrics_cindex]).round(4)
df_final_cind_1_cindex.to_csv('metrics/final_gbdt_tcga_cind_1_cindex.csv', index=False)
df_final_cind_1_cindex.to_csv('/Users/JUSC/Documents/644928e0fb7e147893e8ec15/05_thesis/tables/final_gbdt_tcga_cind_1_cindex.csv', index=False)  #
df_final_cind_1_cindex
    

(324, 20531) <class 'pandas.core.frame.DataFrame'>
(324,) <class 'pandas.core.series.Series'>
(82, 20531) <class 'pandas.core.frame.DataFrame'>
(82,) <class 'pandas.core.series.Series'>
[0]	validation_0-breslow_likelihood:4.23220	validation_1-breslow_likelihood:3.17639
[1]	validation_0-breslow_likelihood:4.13138	validation_1-breslow_likelihood:3.12117
[2]	validation_0-breslow_likelihood:4.10251	validation_1-breslow_likelihood:3.24152
[3]	validation_0-breslow_likelihood:4.06283	validation_1-breslow_likelihood:3.27751
[4]	validation_0-breslow_likelihood:3.96997	validation_1-breslow_likelihood:3.34330
[5]	validation_0-breslow_likelihood:3.93729	validation_1-breslow_likelihood:3.36013
[6]	validation_0-breslow_likelihood:3.89891	validation_1-breslow_likelihood:3.34272
[7]	validation_0-breslow_likelihood:3.85069	validation_1-breslow_likelihood:3.43695
[8]	validation_0-breslow_likelihood:3.79533	validation_1-breslow_likelihood:3.40617
[9]	validation_0-breslow_likelihood:3.73514	validation_1-b