## DL EH Benchmark

In [1]:
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
# import models
from xgbsurv.models.breslow_final import breslow_likelihood,get_cumulative_hazard_function_breslow
from xgbsurv.models.efron_final import efron_likelihood
from xgbsurv.models.cind_final import cind_loss
from xgbsurv.models.deephit_pycox_final import deephit_loss1_pycox
from xgbsurv.models.eh_aft_final import aft_likelihood
from xgbsurv.models.eh_ah_final import ah_likelihood
from pycox.evaluation import EvalSurv
from sklearn.utils.fixes import loguniform
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
import os
import sys
# Resolve the notebook's working directory plus its parent and grandparent.
current_path = os.getcwd()
one_level_up = os.path.abspath(os.path.join(current_path, os.pardir))
two_levels_up = os.path.abspath(os.path.join(current_path, os.pardir, os.pardir))
# Make the sibling `dl_pipeline` directory importable for the import that follows.
sys.path.append(f"{one_level_up}/dl_pipeline")
from dl_pipeline import train_dl_complete

## Set Parameters

In [2]:
# --- Benchmark configuration ---

# Cross-validation layout: n_outer_splits sizes the outer splitter built later
# in this notebook; n_inner_splits is not referenced in the visible cells.
n_outer_splits = n_inner_splits = 5

# Search budget forwarded to train_dl_complete.
n_iter = 50

# Not referenced in the visible cells of this notebook.
early_stopping_rounds = 10
base_score = 0.0
validation_size = 0.2

# Identifiers concatenated into the results file names.
model, method = 'eh', '_dl_'

# Seed NumPy's global RNG (the original note says this is meant for scipy).
rand_state = 42
np.random.seed(rand_state)


## Custom Splitting

In [3]:
class CustomSplit(StratifiedKFold):
    """Stratified K-fold splitter that stratifies on the sign of the target.

    The target is reduced to a single column and ``np.sign`` of that column
    is used as the stratification label, so every fold receives a similar
    share of negative, zero and positive target values.
    NOTE(review): presumably the sign encodes the event/censoring status of
    the survival target -- confirm against the xgbsurv target convention.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds.
    shuffle : bool, default=True
        Whether to shuffle samples before splitting.
    random_state : int or None, default=None
        Seed forwarded to ``StratifiedKFold``.
    """

    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    @staticmethod
    def _first_target_column(y):
        """Return the values used for stratification.

        DataFrames and multi-column arrays contribute their first column,
        a Series contributes its underlying values, and anything else is
        passed through untouched. Dimensionality is checked explicitly
        rather than probed with a bare ``except``, so unrelated errors are
        no longer silently swallowed.
        """
        if isinstance(y, pd.DataFrame):
            return y.values[:, 0]
        if isinstance(y, pd.Series):
            # A Series is one-dimensional, so its values are used as they are.
            return y.values
        if isinstance(y, np.ndarray) and y.ndim >= 2 and y.shape[1] > 1:
            return y[:, 0]
        return y

    def split(self, X, y, groups=None):
        """Generate train/test indices stratified by the sign of ``y``.

        Parameters
        ----------
        X : array-like
            Samples, forwarded unchanged to ``StratifiedKFold.split``.
        y : DataFrame, Series, ndarray or array-like
            Target; only its first column is used when it has several.
        groups : object, default=None
            Forwarded unchanged to ``StratifiedKFold.split``.

        Returns
        -------
        The index generator produced by ``StratifiedKFold.split``.
        """
        print('split type',type(y))
        bins = np.sign(self._first_target_column(y))
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Return the configured number of folds; all arguments are ignored."""
        return self.n_splits

# Outer cross-validation splitter reused by the benchmark loops in this notebook.
outer_custom_cv = CustomSplit(
    n_splits=n_outer_splits,
    shuffle=True,
    random_state=rand_state,
)


## Training

In [4]:
# Run the benchmark on every tabular dataset, once per outer CV fold.
data_set_fns = [load_metabric, load_flchain, load_rgbsg, load_support]
metrics_list = []
# Model identifier handed to train_dl_complete and reused in the results file name.
model = 'eh'

for dataset in data_set_fns:
    data = dataset(path=two_levels_up+"/xgbsurv/datasets/data/", as_frame=True)
    filename = data.filename
    print(filename)

    # NOTE(review): sort_X_y_pandas presumably orders samples by the target -- confirm.
    X, y = sort_X_y_pandas(data.data, data.target)
    # Dataset name is the file-name prefix before the first underscore,
    # e.g. 'METABRIC_adapted.csv' -> 'METABRIC'.
    dataset_name = filename.split('_')[0]

    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        print(i)
        # One metrics record per (dataset, outer fold).
        metric = train_dl_complete(dataset_name, X, y, i,  train_index, test_index, model, n_iter)
        print(metric)
        metrics_list.append(metric)



METABRIC_adapted.csv
split type <class 'pandas.core.series.Series'>
0
model: eh
gradient clipping
Fitting 5 folds for each of 50 candidates, totalling 250 fits
split type <class 'pandas.core.series.Series'>
integration_values.shape[0] 162563
integration_values.shape[0] 36569
Concordance Index 0.6144002222134042
Integrated Brier Score: 0.17987197753810902
{'model': 'eh', 'dataset': 'METABRIC', 'cindex_train': [0.628591309649033], 'cindex_test': [0.6144002222134042], 'ibs_train': [0.1681395842696041], 'ibs_test': [0.17987197753810902]}
1
model: eh
gradient clipping
Fitting 5 folds for each of 50 candidates, totalling 250 fits
split type <class 'pandas.core.series.Series'>
integration_values.shape[0] 260502
integration_values.shape[0] 48807
Concordance Index 0.6378689846407992
Integrated Brier Score: 0.17609990324299007
{'model': 'eh', 'dataset': 'METABRIC', 'cindex_train': [0.655902794726704], 'cindex_test': [0.6378689846407992], 'ibs_train': [0.16562406302309612], 'ibs_test': [0.1760999

In [5]:
# Turn the per-fold metric records into a table; metric values arrive wrapped
# in one-element lists, so unwrap them to plain scalars.
df0 = pd.DataFrame(metrics_list)
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# recent pandas releases; the element-wise result is the same.
df = df0.apply(lambda column: column.map(lambda x: x[0] if isinstance(x, list) else x))
# to_csv does not create missing parent directories, so make sure it exists.
metrics_dir = current_path+'/metrics/'
os.makedirs(metrics_dir, exist_ok=True)
df.to_csv(metrics_dir+model+method+'results.csv',index=False)

In [6]:
# Show the raw per-fold metric records collected by the training loop.
metrics_list

[{'model': 'eh',
  'dataset': 'METABRIC',
  'cindex_train': [0.628591309649033],
  'cindex_test': [0.6144002222134042],
  'ibs_train': [0.1681395842696041],
  'ibs_test': [0.17987197753810902]},
 {'model': 'eh',
  'dataset': 'METABRIC',
  'cindex_train': [0.655902794726704],
  'cindex_test': [0.6378689846407992],
  'ibs_train': [0.16562406302309612],
  'ibs_test': [0.17609990324299007]},
 {'model': 'eh',
  'dataset': 'METABRIC',
  'cindex_train': [0.6581634183283263],
  'cindex_test': [0.6383046210574311],
  'ibs_train': [0.16648272650907364],
  'ibs_test': [0.17111224055246768]},
 {'model': 'eh',
  'dataset': 'METABRIC',
  'cindex_train': [0.6539120191641424],
  'cindex_test': [0.649382512421912],
  'ibs_train': [0.16877596695447547],
  'ibs_test': [0.16252767213748265]},
 {'model': 'eh',
  'dataset': 'METABRIC',
  'cindex_train': [0.6639391447368421],
  'cindex_test': [0.6621245773055605],
  'ibs_train': [0.16308124031183518],
  'ibs_test': [0.16946390749184967]},
 {'model': 'eh',
  

In [7]:
# Show the flattened metrics table that was written to CSV.
df

Unnamed: 0,model,dataset,cindex_train,cindex_test,ibs_train,ibs_test
0,eh,METABRIC,0.628591,0.6144,0.16814,0.179872
1,eh,METABRIC,0.655903,0.637869,0.165624,0.1761
2,eh,METABRIC,0.658163,0.638305,0.166483,0.171112
3,eh,METABRIC,0.653912,0.649383,0.168776,0.162528
4,eh,METABRIC,0.663939,0.662125,0.163081,0.169464
5,eh,FLCHAIN,0.777999,0.804189,0.097038,0.095277
6,eh,FLCHAIN,0.779689,0.800297,0.096556,0.095578
7,eh,FLCHAIN,0.787571,0.78311,0.095605,0.096962
8,eh,FLCHAIN,0.784487,0.774276,0.098979,0.099957
9,eh,FLCHAIN,0.774976,0.790985,0.100795,0.094921


## TCGA Train, Test, Evaluation

In [10]:
# TCGA cancer-type codes to benchmark.
cancer_types = [
    'BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD'
    ]

# Start a fresh record list; this replaces any list previously bound to the name.
metrics_list = []
# Model identifier handed to train_dl_complete and reused in the results file name.
model = 'eh'

# The cancer types are iterated without an index: the original outer index `i`
# was never used and was shadowed by the fold index of the inner loop.
for cancer_type in cancer_types:
    data = load_tcga(path=two_levels_up+"/xgbsurv/datasets/data/",cancer_type=cancer_type, as_frame=True)
    filename = data.filename
    print(filename)

    # NOTE(review): sort_X_y_pandas presumably orders samples by the target -- confirm.
    X, y = sort_X_y_pandas(data.data, data.target)
    # Dataset name is the file-name prefix before the first underscore,
    # e.g. 'BLCA_adapted.csv' -> 'BLCA'.
    dataset_name = filename.split('_')[0]

    for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
        print(i)
        # One metrics record per (cancer type, outer fold).
        metric = train_dl_complete(dataset_name, X, y, i,  train_index, test_index, model, n_iter, tcga=True)
        print(metric)
        metrics_list.append(metric)

BLCA_adapted.csv
split type <class 'pandas.core.series.Series'>
0
model: eh
gradient clipping
Fitting 5 folds for each of 50 candidates, totalling 250 fits
split type <class 'pandas.core.series.Series'>
integration_values.shape[0] 12849
integration_values.shape[0] 2405
Concordance Index 0.6204419889502762
Integrated Brier Score: 0.24596764387000708
{'model': 'eh', 'dataset': 'BLCA', 'cindex_train': [0.791179615767543], 'cindex_test': [0.6204419889502762], 'ibs_train': [0.1760107559334629], 'ibs_test': [0.24596764387000708]}
1
model: eh
gradient clipping
Fitting 5 folds for each of 50 candidates, totalling 250 fits
split type <class 'pandas.core.series.Series'>
integration_values.shape[0] 16436
integration_values.shape[0] 2800
Concordance Index 0.7056253413435282
Integrated Brier Score: 0.22415904967079636
{'model': 'eh', 'dataset': 'BLCA', 'cindex_train': [0.8241142645149323], 'cindex_test': [0.7056253413435282], 'ibs_train': [0.1764604437662181], 'ibs_test': [0.22415904967079636]}
2
m

In [11]:
# Turn the per-fold TCGA metric records into a table; metric values arrive
# wrapped in one-element lists, so unwrap them to plain scalars.
df0 = pd.DataFrame(metrics_list)
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# recent pandas releases; the element-wise result is the same.
df = df0.apply(lambda column: column.map(lambda x: x[0] if isinstance(x, list) else x))
# to_csv does not create missing parent directories, so make sure it exists.
metrics_dir = current_path+'/metrics/'
os.makedirs(metrics_dir, exist_ok=True)
df.to_csv(metrics_dir+model+method+'tcga_results.csv',index=False)

In [12]:
# Show the flattened TCGA metrics table that was written to CSV.
df

Unnamed: 0,model,dataset,cindex_train,cindex_test,ibs_train,ibs_test
0,eh,BLCA,0.79118,0.620442,0.176011,0.245968
1,eh,BLCA,0.824114,0.705625,0.17646,0.224159
2,eh,BLCA,0.753801,0.601553,0.196014,0.226888
3,eh,BLCA,0.75769,0.583047,0.201993,0.254122
4,eh,BLCA,0.689354,0.616031,0.21882,0.197789
5,eh,BRCA,0.606254,0.513455,0.180804,0.280214
6,eh,BRCA,0.736036,0.557277,0.177967,0.350095
7,eh,BRCA,0.757199,0.644403,0.119979,0.2471
8,eh,BRCA,0.740518,0.558731,0.117316,0.251842
9,eh,BRCA,0.67796,0.561938,0.188917,0.329513
