In [1]:
# XGBsurv benchmark

#from xgbsurv.datasets import (load_flchain, load_rgbsg, load_support,
#load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
from numpy import savetxt
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import load_iris
#from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
# import models
#from xgbsurv.models.breslow_final import breslow_likelihood, breslow_estimator
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
import sys
#sys.path.append('/Users/JUSC/Documents/xgbsurv_benchmarking/deep_learning/')
from loss_functions_pytorch import BreslowLoss, breslow_likelihood_torch
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from skorch.dataset import CVSplit, ValidSplit
import torch
from torch import nn
from xgbsurv.datasets import load_metabric, load_flchain, load_rgbsg, load_support
#import torch.nn.init as init

/Users/JUSC/Documents/xgbsurv/experiments/deep_learning


## Set Parameters
deep learning hyperparameter spaces follow pycox paper
Time-to-Event Prediction with Neural Networks and Cox Regression
page 21

In [2]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 10 # set to 50
#n_iter_cind = 200
early_stopping_rounds=15
base_score = 0.0

param_grid_breslow = {
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__num_nodes': [64, 128, 256, 512],
    'estimator__module__dropout': scuniform(0.0,0.7),
    'estimator__optimizer__weight_decay': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01, 0],
    'estimator__batch_size': [64, 128, 256, 512, 1024],
    #lr not in paper because of learning rate finder
    # note: setting learning rate higher would make exp(partial_hazard) explode
    'estimator__lr': scloguniform(0.001,0.01), # scheduler unten einbauen
    #'max_epochs':  scrandint(10,20), # corresponds to num_rounds
}

## Set MLP Model

In [3]:

class SurvivalModel(nn.Module):
    def __init__(self, n_layers, in_features, num_nodes, dropout, out_features):
        super(SurvivalModel, self).__init__()
        self.n_layers = n_layers
        self.in_features = in_features
        self.num_nodes = num_nodes
        self.dropout = dropout
        self.out_features = out_features
        model = []
        # first layer
        model.append(torch.nn.Linear(in_features, num_nodes))
        model.append(torch.nn.ReLU())
        model.append(torch.nn.Dropout(dropout))
        model.append(torch.nn.BatchNorm1d(num_nodes))

        for i in range(n_layers-1):
            model.append(torch.nn.Linear(num_nodes, num_nodes))
            #init.kaiming_normal_(model[-1].weight, nonlinearity='relu')
            model.append(torch.nn.ReLU())
            model.append(torch.nn.Dropout(dropout))
            model.append(torch.nn.BatchNorm1d(num_nodes))

        # output layer
        model.append(torch.nn.Linear(num_nodes, out_features))
    
        self.layers = nn.Sequential(*model)

        # for layer in self.layers:
        #     if isinstance(layer, nn.Linear):
        #         #nn.init.uniform_(layer.weight, a=-0.5, b=0.5)
        #         nn.init.kaiming_normal_(layer.weight)


    def forward(self, x):
        res = self.layers(x)
        #print(res)
        return res


## Set Splitting Strategy

In [4]:
# Define stratified inner k-fold cross-validation
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        print('split', X.dtypes)
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)



## Set Scoring Function

In [5]:
# Define Scorer
def custom_scoring_function(y_true, y_pred):

        #y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, np.ndarray):
            y_pred = torch.from_numpy(y_pred)
        if isinstance(y_true, np.ndarray):
            y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, pd.Series):
            y_pred = torch.tensor(y_pred.values)
        if isinstance(y_true, pd.Series):
            y_true = torch.tensor(y_true.values)
        score = breslow_likelihood_torch(y_true, y_pred) 
        return score.numpy()

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

## Set Data Loading Functions

In [6]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector
import pandas as pd

# Define a custom transformer to convert True and False values to 1 and 0, respectively
class BoolToNumericTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy[X_copy == True] = 1
        X_copy[X_copy == False] = 0
        return X_copy.astype(float)

# Let's say you have a pandas DataFrame with columns of different types:
df = pd.DataFrame({
    'numeric_feature': [1.0, 2.0, 3.0],
    'bool_feature': [True, False, True]
})

# Define the column transformer using make_column_selector and the custom transformer
column_transformer = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=['float64'])),
    (BoolToNumericTransformer(), make_column_selector(dtype_include=['bool']))
)

# Apply the transformation to the DataFrame
transformed_df = column_transformer.fit_transform(df)

print(transformed_df)


[[-1.22474487  1.        ]
 [ 0.          0.        ]
 [ 1.22474487  1.        ]]


In [7]:
class TorchStandardScaler(StandardScaler):
    def transform(self, X, copy=None):
        X_scaled = super().transform(X, copy=copy)
        return torch.from_numpy(X_scaled)

    def fit_transform(self, X, y=None):
        X_transformed = super().fit_transform(X, y=y)
        return torch.from_numpy(X_transformed)
    
ct = make_column_transformer(
       (StandardScaler(),
        make_column_selector(dtype_include=float)),)

In [8]:
class BoolToNumericTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy[X_copy == True] = 1
        X_copy[X_copy == False] = 0
        return X_copy.astype(np.float32)

# adapt this for bool case
class CustomStandardScaler(StandardScaler):
    
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        # Add your own code here
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().transform(X, y)
        # Add your own code here
        return X_transformed.astype(np.float32)
    
    def fit_transform(self, X, y=None):
        # Add your own code here
        
        #to_add = X[X.columns[X.columns.isin(['sex','D'])]]
        #X = X[X.columns[~X.columns.isin(['sex','D'])]]
        #df.columns.get_loc('age')
        X_transformed = super().fit_transform(X, y)
        #to_add = to_add.values.reshape(-1, 1)

        # Horizontally stack the two arrays
        #result = np.hstack((X_transformed, to_add))
        
        # Add your own code here
        return X_transformed.astype(np.float32)

class CustomStandardScaler2(StandardScaler):
    """Just to change the datatype of bool variables."""
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        # Add your own code here
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().transform(X, y)
        # Add your own code here
        return X_transformed.astype(np.float32)
    
    def fit_transform(self, X, y=None):
        # Add your own code here
        
        to_return = X
        #X = X[X.columns[~X.columns.isin(['sex','D'])]]
        #df.columns.get_loc('age')
        X_transformed = super().fit_transform(X, y)
        #to_add = to_add.values.reshape(-1, 1)

        # Horizontally stack the two arrays
        #result = np.hstack((X_transformed, to_add))
        
        # Add your own code here
        return to_return.astype(np.float32)


class CustomOneHotEncoder(OneHotEncoder):
    def __init__(self, categories='auto', drop=None, sparse=True,
                 dtype=np.float32, handle_unknown='error'):
        super().__init__(categories=categories, drop=drop, sparse=sparse,
                         dtype=dtype, handle_unknown=handle_unknown)

    def fit(self, X, y=None):
        # Add your custom fit logic here
        # You can modify or extend the behavior of the base class fit() method
        return super().fit(X, y)

    def transform(self, X):
        # Add your custom transform logic here
        # You can modify or extend the behavior of the base class transform() method
        return super().transform(X) #.astype(np.float32)

    def fit_transform(self, X, y=None):
        
        # Add your custom fit_transform logic here
        # You can modify or extend the behavior of the base class fit_transform() method
        print('transformed columns', X.columns)
        to_return = super().fit_transform(X, y)
        
        return to_return #.astype(np.float32)

    def inverse_transform(self, X):
        # Add your custom inverse_transform logic here
        # You can modify or extend the behavior of the base class inverse_transform() method
        return super().inverse_transform(X)

    def get_feature_names(self, input_features=None):
        # Add your custom get_feature_names logic here
        # You can modify or extend the behavior of the base class get_feature_names() method
        return super().get_feature_names(input_features)

    
class CustomLabelBinarizer(LabelBinarizer):
    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        super().__init__(neg_label=0, pos_label=1, sparse_output=False)

    def fit(self, X, y=None):
        # Add custom fit logic here
        # Call the parent class's fit method
        print('fit',X)
        super().fit(X, y)

        return self

    def transform(self, X, y=None):
        # Add custom transform logic here

        # Call the parent class's transform method
        print('transform',X)
        transformed_X = super().transform(X)

        return transformed_X
    
    def fit_transform(self, X, y=None):
        # Add custom transform logic here

        # Call the parent class's transform method
        print(y)
        print('fit_transform',X.values)
        to_add = X[X.columns[X.columns.isin(['sex','D'])]]
        X = X[X.columns[~X.columns.isin(['sex','D'])]]
        transformed_X = super().fit_transform(X.values)
    
        return transformed_X.astype(np.float32)

    def inverse_transform(self, X, y=None):
        # Add custom inverse_transform logic here

        # Call the parent class's inverse_transform method
        inverse_X = super().inverse_transform(X)

        return inverse_X.astype(np.float32)

column_names = ['age', 'sex', 'race', 'n_comorbidities', 'diabetes', 'dementia',
       'cancer', 'blood_pressure', 'heart_rate', 'respiration_rate',
       'temperature', 'white_blood_cell', 'serum_sodium', 'serum_creatinine']
def select_columns_by_name(columns):
    def selector(X):
        return X[:, [column_names.index(col) for col in columns]]
    return selector
    
ct = make_column_transformer(
        (CustomStandardScaler(), make_column_selector(dtype_include=['float32'])),
        (CustomOneHotEncoder(dtype=np.float32), make_column_selector(dtype_include=['category'],dtype_exclude=['float32'])), remainder='passthrough')

# ct = make_column_transformer(
#         #(CustomStandardScaler(), make_column_selector(dtype_include=['float32'])),
#         (CustomOneHotEncoder(), select_columns_by_name(['sex'])), remainder='passthrough')

#ct = CustomOneHotEncoder()
data = load_support(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
#X = X.dtype(np.float32)
#X = X.convert_dtypes()
print(X.columns)
print(X.shape)
print(X.dtypes)
X = ct.fit_transform(X)
X

Index(['age', 'sex', 'race', 'n_comorbidities', 'diabetes', 'dementia',
       'cancer', 'blood_pressure', 'heart_rate', 'respiration_rate',
       'temperature', 'white_blood_cell', 'serum_sodium', 'serum_creatinine'],
      dtype='object')
(8873, 14)
age                  float32
sex                 category
race                 float32
n_comorbidities      float32
diabetes            category
dementia            category
cancer              category
blood_pressure       float32
heart_rate           float32
respiration_rate     float32
temperature          float32
white_blood_cell     float32
serum_sodium         float32
serum_creatinine     float32
dtype: object
transformed columns Index(['sex', 'diabetes', 'dementia', 'cancer'], dtype='object')


array([[-2.4593287 , -0.6470795 ,  1.2134316 , ...,  0.        ,
         0.        ,  1.        ],
       [ 0.14505765, -0.6470795 , -0.40320182, ...,  1.        ,
         0.        ,  0.        ],
       [ 0.21530288,  0.8430675 ,  1.2134316 , ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 1.1050085 , -0.6470795 , -0.40320182, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.3469174 ,  0.097994  , -0.40320182, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.2037778 , -0.6470795 ,  1.2134316 , ...,  0.        ,
         1.        ,  0.        ]], dtype=float32)

In [9]:
data = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
data.target.to_numpy()

array([ 1.0000000e-01, -7.6666665e-01, -1.2333333e+00, ...,
       -3.3703333e+02,  3.5100000e+02,  3.5520001e+02], dtype=float32)

In [10]:
from xgbsurv.models.breslow_final import get_cumulative_hazard_function_breslow
import numba
# get_cumulative_hazard_function_breslow(X_train: np.array, 
#         X_test: np.array, y_train: np.array, y_test: np.array,
#         predictor_train: np.array, predictor_test: np.array
#     )

## Set Training Function

In [57]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 1 # # set to 50
early_stopping_rounds=10
base_score = 0.0

def train_eval(X, y, net, n_iter, filename):

        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                        'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}

        ct = make_column_transformer(
        (CustomStandardScaler(), make_column_selector(dtype_include=['float32'])),
        (OneHotEncoder(dtype=np.float32), make_column_selector(dtype_include=['category'], dtype_exclude=['float32'])), remainder='passthrough')

        pipe = Pipeline(
        [('scaler', ct),
        ('estimator', net)]
        )
        # pipe = Pipeline([
        # ('scale', StandardScaler()),
        # ('estimator', net),
        # ])
        rs = RandomizedSearchCV(pipe, param_grid_breslow, scoring = scoring_function, n_jobs=-1, 
                            cv=inner_custom_cv, n_iter=n_iter, refit=True)
        
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]

                print(X_train.shape, type(X_train))
                print(y_train.shape, type(y_train))
                print(X_test.shape, type(X_test))
                print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')
                
                #savetxt('splits/test_index_'+str(i)+'.csv', test_index, delimiter=',')
                
                # create validation dataset for early stopping
                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)

                #print(X_train.dtypes, X_test.dtypes)
                # train
                rs.fit(X_train, y_train)
                
                # predict
                #scaler = StandardScaler()
                #X_train = scaler.fit_transform(X_train)
                #X_test = scaler.transform(X_test)
                print(rs.best_estimator_.predict(X_train))
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)

                #print(best_preds_test, type(best_preds_train))
                #print(best_preds_test, type(best_preds_test))
                # predict survival function
                # d = predict_survival_function(X_test, dataframe=True)

                # save predictions, get dataset name
                savetxt('predictions/train_preds_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
                savetxt('predictions/test_preds_'+str(i)+'_'+filename, best_preds_test, delimiter=',')
                
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                #print(rs.best_params_)
                # save c-index, ibs values
                for var in [X_train, X_train, y_train, y_train, best_preds_train, best_preds_test]:
                        if not isinstance(var, np.ndarray):
                                #print(type(var))
                                var = var.values #.to_numpy()
                                #print(type(var))
                #print('y_train',y_train)
                # training data
                #try: 
                # train data
                cum_hazard_train = get_cumulative_hazard_function_breslow(
                        X_train, X_train, y_train, y_train,
                        best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                        )
                df_survival_train = np.exp(-cum_hazard_train)
                durations_train, events_train = transform_back(y_train)
                time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
                ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
                print('Concordance Index',ev.concordance_td('antolini'))
                cindex_score_train = ev.concordance_td('antolini')
                ibs_score_train = ev.integrated_brier_score(time_grid_train)
                outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
                outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

                # test data
                cum_hazard_test = get_cumulative_hazard_function_breslow(
                        X_train, X_test, y_train, y_test,
                        best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                        )
                df_survival_test = np.exp(-cum_hazard_test)
                durations_test, events_test = transform_back(y_test)
                print('durations',durations_test.min(), durations_test.max())
                time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                print('Concordance Index',ev.concordance_td('antolini'))
                print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
                cindex_score_test = ev.concordance_td('antolini')
                ibs_score_test = ev.integrated_brier_score(time_grid_test)

                outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

                # except:
                #         outer_scores['cindex_train_'+dataset_name] += [np.nan]
                #         outer_scores['cindex_test_'+dataset_name] += [np.nan]
                #         outer_scores['ibs_train_'+dataset_name] += [np.nan]
                #         outer_scores['ibs_test_'+dataset_name] += [np.nan]
                # try:
                #         score_train = cindex_censored(y_train, best_preds_train.reshape(-1))
                #         score_test = cindex_censored(y_test, best_preds_test.reshape(-1))
                #         outer_scores['cindex_train_'+dataset_name] += [score_train]
                #         outer_scores['cindex_test_'+dataset_name] += [score_test]
                # except:
                #         outer_scores['cindex_train_'+dataset_name] += [np.nan]
                #         outer_scores['cindex_test_'+dataset_name] += [np.nan]
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/metric_summary_'+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test
                

In [58]:
data = load_support(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X.dtypes

age                  float32
sex                 category
race                 float32
n_comorbidities      float32
diabetes            category
dementia            category
cancer              category
blood_pressure       float32
heart_rate           float32
respiration_rate     float32
temperature          float32
white_blood_cell     float32
serum_sodium         float32
serum_creatinine     float32
dtype: object

In [60]:
data_set_fns = [load_metabric] #, load_flchain, load_rgbsg, load_support, load_tcga]
for dataset in data_set_fns:
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    print('Dataset:',data.filename)
    #print(X.dtypes)
    
    net = NeuralNet(
    SurvivalModel, 
    module__n_layers = 1,
    module__in_features = X.shape[1],
    #module__num_nodes = 32,
    #module__dropout = 0.1, # these could also be removed
    module__out_features = 1,
    # for split sizes when result size = 1
    iterator_train__drop_last=True,
    #iterator_valid__drop_last=True,
    criterion=BreslowLoss,
    optimizer=torch.optim.AdamW,
    optimizer__weight_decay = 0.4,
    batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
    callbacks=[EarlyStopping(patience=10)],
    #TODO: enable stratification, verify
    train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
    #lr=0.001,
    #max_epochs=1, #0,#100
    #train_split=None,
    verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)
    
#train eval function here


Dataset: METABRIC_adapted.csv
split MKI67                float32
EGFR                 float32
PGR                  float32
ERBB2                float32
hormone_treatment    float32
radiotherapy         float32
chemotherapy         float32
ER_positive          float32
age                  float32
dtype: object
(1522, 9) <class 'pandas.core.frame.DataFrame'>
(1522,) <class 'pandas.core.series.Series'>
(381, 9) <class 'pandas.core.frame.DataFrame'>
(381,) <class 'pandas.core.series.Series'>
split MKI67                float32
EGFR                 float32
PGR                  float32
ERBB2                float32
hormone_treatment    float32
radiotherapy         float32
chemotherapy         float32
ER_positive          float32
age                  float32
dtype: object


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1230, in fit
    self.partial_fit(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1189, in partial_fit
    self.fit_loop(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1101, in fit_loop
    self.run_single_epoch(iterator_train, training=True, prefix="train",
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1137, in run_single_epoch
    step = step_fn(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1016, in train_step
    self._step_optimizer(step_fn)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 972, in _step_optimizer
    optimizer.step(step_fn)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/optim/optimizer.py", line 113, in wrapper
    return func(*args, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/optim/adamw.py", line 119, in step
    loss = closure()
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1006, in step_fn
    step = self.train_step_single(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 907, in train_step_single
    loss.backward()
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/autograd/__init__.py", line 166, in backward
    grad_tensors_ = _make_grads(tensors, grad_tensors_, is_grads_batched=False)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/autograd/__init__.py", line 67, in _make_grads
    raise RuntimeError("grad can be implicitly created only for scalar outputs")
RuntimeError: grad can be implicitly created only for scalar outputs


In [None]:
pd.DataFrame(outer_scores)

Unnamed: 0,cindex_train_METABRIC,cindex_test_METABRIC,ibs_train_METABRIC,ibs_test_METABRIC
0,0.211087,0.145828,0.185698,0.200548
1,0.198378,0.182186,0.178083,0.188852
2,0.195783,0.159745,0.175659,0.192298
3,0.207634,0.171123,0.191138,0.176814
4,0.209188,0.17408,0.175741,0.181442


## TCGA and PCA

In [17]:
from sklearn.decomposition import PCA
from xgbsurv.datasets import load_tcga


In [34]:
class SurvivalModel(nn.Module):
    def __init__(self, n_layers, in_features, num_nodes, dropout, out_features):
        super(SurvivalModel, self).__init__()
        self.n_layers = n_layers
        self.in_features = in_features
        self.num_nodes = num_nodes
        self.dropout = dropout
        self.out_features = out_features
        model = []
        # first layer
        model.append(torch.nn.Linear(in_features, num_nodes))
        model.append(torch.nn.ReLU())
        model.append(torch.nn.Dropout(dropout))
        model.append(torch.nn.BatchNorm1d(num_nodes))

        for i in range(n_layers-1):
            model.append(torch.nn.Linear(num_nodes, num_nodes))
            #init.kaiming_normal_(model[-1].weight, nonlinearity='relu')
            model.append(torch.nn.ReLU())
            model.append(torch.nn.Dropout(dropout))
            model.append(torch.nn.BatchNorm1d(num_nodes))

        # output layer
        model.append(torch.nn.Linear(num_nodes, out_features))
    
        self.layers = nn.Sequential(*model)

        # for layer in self.layers:
        #     if isinstance(layer, nn.Linear):
        #         #nn.init.uniform_(layer.weight, a=-0.5, b=0.5)
        #         nn.init.kaiming_normal_(layer.weight)


    def forward(self, x):
        res = self.layers(x)
        #print(res)
        return res

In [35]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 1 # # set to 50
early_stopping_rounds=10
base_score = 0.0

def train_eval(X, y, net, n_iter, filename):

        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                        'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}

        ct = make_column_transformer(
        (CustomStandardScaler(), make_column_selector(dtype_include=['float32'])), 
        remainder='passthrough')
        #,
        #(OneHotEncoder(dtype=np.float32), make_column_selector(dtype_include=['category'], dtype_exclude=['float32'])),
        pca = PCA()

        pipe = Pipeline(
        [#('scaler', ct),
         ('pca', pca),
        ('estimator', net)]
        )
        # pipe = Pipeline([
        # ('scale', StandardScaler()),
        # ('estimator', net),
        # ])
        rs = RandomizedSearchCV(pipe, param_grid_breslow, scoring = scoring_function, n_jobs=-1, 
                            cv=inner_custom_cv, n_iter=n_iter, refit=True)
        
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]

                print(X_train.shape, type(X_train))
                print(y_train.shape, type(y_train))
                print(X_test.shape, type(X_test))
                print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')
                
                #savetxt('splits/test_index_'+str(i)+'.csv', test_index, delimiter=',')
                
                # create validation dataset for early stopping
                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)

                #print(X_train.dtypes, X_test.dtypes)
                # train
                rs.fit(X_train, y_train)
                
                # predict
                #scaler = StandardScaler()
                #X_train = scaler.fit_transform(X_train)
                #X_test = scaler.transform(X_test)
                print(rs.best_estimator_.predict(X_train))
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)

                #print(best_preds_test, type(best_preds_train))
                #print(best_preds_test, type(best_preds_test))
                # predict survival function
                # d = predict_survival_function(X_test, dataframe=True)

                # save predictions, get dataset name
                savetxt('predictions/train_preds_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
                savetxt('predictions/test_preds_'+str(i)+'_'+filename, best_preds_test, delimiter=',')
                
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                #print(rs.best_params_)
                # save c-index, ibs values
                for var in [X_train, X_train, y_train, y_train, best_preds_train, best_preds_test]:
                        if not isinstance(var, np.ndarray):
                                #print(type(var))
                                var = var.values #.to_numpy()
                                #print(type(var))
                #print('y_train',y_train)
                # training data
                #try: 
                # train data
                cum_hazard_train = get_cumulative_hazard_function_breslow(
                        X_train, X_train, y_train, y_train,
                        best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                        )
                df_survival_train = np.exp(-cum_hazard_train)
                durations_train, events_train = transform_back(y_train)
                time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
                ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
                print('Concordance Index',ev.concordance_td('antolini'))
                cindex_score_train = ev.concordance_td('antolini')
                ibs_score_train = ev.integrated_brier_score(time_grid_train)
                outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
                outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

                # test data
                cum_hazard_test = get_cumulative_hazard_function_breslow(
                        X_train, X_test, y_train, y_test,
                        best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                        )
                df_survival_test = np.exp(-cum_hazard_test)
                durations_test, events_test = transform_back(y_test)
                print('durations',durations_test.min(), durations_test.max())
                time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                print('Concordance Index',ev.concordance_td('antolini'))
                print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
                cindex_score_test = ev.concordance_td('antolini')
                ibs_score_test = ev.integrated_brier_score(time_grid_test)

                outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]

                # except:
                #         outer_scores['cindex_train_'+dataset_name] += [np.nan]
                #         outer_scores['cindex_test_'+dataset_name] += [np.nan]
                #         outer_scores['ibs_train_'+dataset_name] += [np.nan]
                #         outer_scores['ibs_test_'+dataset_name] += [np.nan]
                # try:
                #         score_train = cindex_censored(y_train, best_preds_train.reshape(-1))
                #         score_test = cindex_censored(y_test, best_preds_test.reshape(-1))
                #         outer_scores['cindex_train_'+dataset_name] += [score_train]
                #         outer_scores['cindex_test_'+dataset_name] += [score_test]
                # except:
                #         outer_scores['cindex_train_'+dataset_name] += [np.nan]
                #         outer_scores['cindex_test_'+dataset_name] += [np.nan]
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/metric_summary_'+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test
                

In [36]:
from xgbsurv.datasets import load_tcga

cancer_types = ['BLCA']
for cancer_type in cancer_types:
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type= cancer_type, as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target.to_numpy()
    print('Dataset:',data.filename)
    print('shape X',X.shape[1])
    
    net = NeuralNet(
    SurvivalModel, 
    module__n_layers = 1,
    module__in_features = X.shape[1],
    #module__num_nodes = 32,
    #module__dropout = 0.1, # these could also be removed
    module__out_features = 1,
    # for split sizes when result size = 1
    iterator_train__drop_last=True,
    #iterator_valid__drop_last=True,
    criterion=BreslowLoss,
    optimizer=torch.optim.AdamW,
    optimizer__weight_decay = 0.4,
    batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
    callbacks=[EarlyStopping(patience=10)],
    #TODO: enable stratification, verify
    train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
    #lr=0.001,
    #max_epochs=1, #0,#100
    #train_split=None,
    verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)
    
#train eval function here


Dataset: BLCA_adapted.csv
shape X 20531
split gex_?|100130426      float32
gex_?|100133144      float32
gex_?|100134869      float32
gex_?|10357          float32
gex_?|10431          float32
                      ...   
gex_ZYG11A|440590    float32
gex_ZYG11B|79699     float32
gex_ZYX|7791         float32
gex_ZZEF1|23140      float32
gex_ZZZ3|26009       float32
Length: 20531, dtype: object
(324, 20531) <class 'pandas.core.frame.DataFrame'>
(324,) <class 'numpy.ndarray'>
(82, 20531) <class 'pandas.core.frame.DataFrame'>
(82,) <class 'numpy.ndarray'>
split gex_?|100130426      float32
gex_?|100133144      float32
gex_?|100134869      float32
gex_?|10357          float32
gex_?|10431          float32
                      ...   
gex_ZYG11A|440590    float32
gex_ZYG11B|79699     float32
gex_ZYX|7791         float32
gex_ZZEF1|23140      float32
gex_ZZZ3|26009       float32
Length: 20531, dtype: object


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1230, in fit
    self.partial_fit(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1189, in partial_fit
    self.fit_loop(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1104, in fit_loop
    self.run_single_epoch(iterator_valid, training=False, prefix="valid",
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1137, in run_single_epoch
    step = step_fn(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 873, in validation_step
    y_pred = self.infer(Xi, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1427, in infer
    return self.module_(x, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/jr/dh6mkdzs31lc5pkqymtdbh180000gp/T/ipykernel_44752/2043477712.py", line 35, in forward
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (52x259 and 20531x512)

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1230, in fit
    self.partial_fit(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1189, in partial_fit
    self.fit_loop(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1104, in fit_loop
    self.run_single_epoch(iterator_valid, training=False, prefix="valid",
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1137, in run_single_epoch
    step = step_fn(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 873, in validation_step
    y_pred = self.infer(Xi, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1427, in infer
    return self.module_(x, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/jr/dh6mkdzs31lc5pkqymtdbh180000gp/T/ipykernel_44752/2043477712.py", line 35, in forward
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (52x260 and 20531x512)


In [None]:
from xgbsurv.datasets import load_tcga, load_metabric
data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
data.data

feature names Index(['gex_?|100130426', 'gex_?|100133144', 'gex_?|100134869', 'gex_?|10357',
       'gex_?|10431', 'gex_?|136542', 'gex_?|155060', 'gex_?|26823',
       'gex_?|280660', 'gex_?|317712',
       ...
       'gex_ZWILCH|55055', 'gex_ZWINT|11130', 'gex_ZXDA|7789',
       'gex_ZXDB|158586', 'gex_ZXDC|79364', 'gex_ZYG11A|440590',
       'gex_ZYG11B|79699', 'gex_ZYX|7791', 'gex_ZZEF1|23140',
       'gex_ZZZ3|26009'],
      dtype='object', length=20531)


Unnamed: 0,gex_?|100130426,gex_?|100133144,gex_?|100134869,gex_?|10357,gex_?|10431,gex_?|136542,gex_?|155060,gex_?|26823,gex_?|280660,gex_?|317712,...,gex_ZWILCH|55055,gex_ZWINT|11130,gex_ZXDA|7789,gex_ZXDB|158586,gex_ZXDC|79364,gex_ZYG11A|440590,gex_ZYG11B|79699,gex_ZYX|7791,gex_ZZEF1|23140,gex_ZZZ3|26009
0,0.0,4.195434,2.953228,7.231068,10.469998,0.0,8.592659,0.000000,0.000000,0.0,...,9.124426,9.885547,4.177870,7.457742,9.985372,1.820975,9.417243,11.921384,9.587230,9.403817
1,0.0,0.802896,2.237013,8.397589,10.222879,0.0,7.607020,0.000000,0.000000,0.0,...,8.218655,10.034551,4.426969,8.462707,10.125465,4.302100,10.112739,11.189874,10.041892,9.783275
2,0.0,4.183391,4.877784,7.097548,9.959911,0.0,8.428193,0.507820,0.000000,0.0,...,9.498281,10.776918,4.646773,8.687873,10.588546,5.514933,10.055676,10.842696,9.509474,10.133065
3,0.0,2.755529,3.916639,7.398145,10.496434,0.0,7.073135,0.000000,3.128986,0.0,...,9.329610,11.361143,5.144968,8.649199,9.498885,2.767888,9.776481,11.223296,8.811940,9.527567
4,0.0,4.009168,5.235065,7.518291,9.919535,0.0,7.992802,0.835358,0.000000,0.0,...,8.989721,9.594618,6.332848,8.745624,10.302148,1.565987,9.332981,10.934199,10.580070,9.562055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
401,0.0,5.035584,3.977573,7.714362,11.225461,0.0,5.968162,0.000000,0.000000,0.0,...,9.727070,11.610245,3.802255,8.037893,9.997673,3.000072,10.538354,11.404152,9.240267,9.791479
402,0.0,3.903704,4.024435,7.142873,9.576339,0.0,7.139347,0.000000,0.000000,0.0,...,8.713679,9.481592,5.497338,8.256657,9.711478,1.667892,9.908784,12.348994,10.086043,9.755756
403,0.0,0.000000,0.892585,8.571294,9.936660,0.0,7.619978,0.000000,0.000000,0.0,...,9.092707,10.606433,5.140010,7.794318,9.973999,5.122383,9.017769,12.651590,10.371994,9.364669
404,0.0,3.126940,3.091852,7.668268,10.308020,0.0,6.358922,0.000000,0.000000,0.0,...,9.262647,10.148044,5.441171,8.328657,9.829008,6.142242,9.544802,12.455296,9.361354,8.944604


In [None]:
data

{'data': array([[ 0.        ,  4.19543422,  2.95322798, ..., 11.92138352,
          9.58722996,  9.40381732],
        [ 0.        ,  0.80289629,  2.23701297, ..., 11.18987397,
         10.04189183,  9.78327493],
        [ 0.        ,  4.18339123,  4.87778351, ..., 10.84269605,
          9.50947416, 10.13306513],
        ...,
        [ 0.        ,  0.        ,  0.89258532, ..., 12.65158991,
         10.37199439,  9.36466873],
        [ 0.        ,  3.12693983,  3.09185213, ..., 12.45529639,
          9.36135386,  8.94460373],
        [ 0.        ,  2.86554275,  3.52418908, ..., 12.38008763,
          9.5812326 ,  8.723463  ]]),
 'target': array([  -13.,   -15.,   -17.,    19.,   -20.,    20.,   -28.,   -35.,
          -37.,   -46.,   -55.,    56.,    56.,    57.,   -59.,    62.,
          -64.,   -64.,    65.,   -67.,   -68.,    68.,    69.,    76.,
           81.,   -82.,   -84.,    88.,   -89.,    90.,    92.,    93.,
          -95.,    98.,    99.,  -105.,   106.,  -110.,  -117.,   1

In [None]:
d  = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame = True)
d.data

Unnamed: 0,MKI67,EGFR,PGR,ERBB2,hormone_treatment,radiotherapy,chemotherapy,ER_positive,age
0,5.818934,6.470783,10.672935,5.630679,0.0,0.0,0.0,1.0,75.330002
1,10.117913,5.335094,9.717084,5.893656,0.0,0.0,0.0,0.0,54.099998
2,5.705204,8.450347,10.859011,5.667925,0.0,0.0,0.0,1.0,73.639999
3,5.184060,8.427523,10.361415,5.575082,1.0,0.0,0.0,1.0,73.980003
4,5.621474,5.456216,9.500981,5.753597,1.0,0.0,0.0,1.0,34.680000
...,...,...,...,...,...,...,...,...,...
1898,5.567494,5.818224,11.343552,5.574574,0.0,1.0,0.0,1.0,58.799999
1899,7.803252,5.352677,10.012809,6.017503,0.0,0.0,0.0,1.0,67.459999
1900,6.100280,7.107530,10.501780,6.268520,0.0,0.0,0.0,1.0,29.980000
1901,5.490514,7.606261,12.297510,6.313382,0.0,1.0,0.0,1.0,63.169998


In [None]:
def breslow_estimator(log_hazard, time, event):
    #time, event = transform_back(y)
    risk_score = np.exp(log_hazard)
    print(risk_score.shape)
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    if is_sorted(time) == False:
        order = np.argsort(time, kind="mergesort")
        time = time[order]
        event = event[order]
        risk_score = risk_score[order]

    uniq_times = np.unique(time)
    idx = np.digitize(time, np.unique(time))
    breaks = np.flatnonzero(np.concatenate(([1], np.diff(idx))))
    # numpy diff nth discrete difference over index, add 1 at the beginning
    # flatnonzero return indices that are nonzero in flattened version
    n_events = np.add.reduceat(event, breaks, axis=0)

    # consider removing zero rows
    risk_matrix = np.unique((np.outer(time,time)>=np.square(time)).astype(int).T, axis=0)
    print(risk_matrix.shape)
    denominator = np.sum(risk_score.reshape(-1)*risk_matrix,axis=1)[::-1] 

    baseline_hazard = n_events / denominator
    cum_hazard_baseline = np.cumsum(n_events / denominator)
    baseline_survival = np.exp(-cum_hazard_baseline)
    return uniq_times, baseline_hazard, cum_hazard_baseline, baseline_survival

In [None]:
time, event = transform_back(y_train)
uniq_times_old, baseline_hazard, cum_hazard_baseline_old, baseline_survival_old = breslow_estimator(best_preds_train, time, event)

(1523, 1)
(1381, 1523)


In [None]:
p1 = np.exp(-(cum_hazard_baseline_old*np.exp(best_preds_train[20])))

In [None]:
import plotly_express as px

px.line(x =uniq_times_old, y = p1)

In [None]:
from sksurv.util import Surv
from sksurv.metrics import integrated_brier_score
data = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)
X  = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# use from arrays and then build the two survival struc arrays
# do test predictions in shape of (n_samples, n_times) and put in unique times
# for the last time point the unique time point has to be decremente by a tiny bit!
time, event = transform_back(y_train)
time_train, event_train = transform_back(y_train)
y_train = Surv.from_arrays( event_train.astype(dtype=bool),time_train)
time_test, event_test = transform_back(y_test)
y_test = Surv.from_arrays(event_test.astype(dtype=bool),time_test)
uniq_times_old, baseline_hazard, cum_hazard_baseline_old, baseline_survival_old = breslow_estimator(best_preds_train, time, event)
print('cum_hazard_baseline_old', cum_hazard_baseline_old.shape)
# cumbaseline hazard for each (unique) time step of the test data
cum_hazard_baseline_old_new = np.interp(time_test, np.unique(time_train), cum_hazard_baseline_old)
best_preds_test = np.ones(y_test.shape[0])
print('cum_hazard_baseline_old_new.shape',cum_hazard_baseline_old_new[:, None].shape)
#preds = np.exp(-(cum_hazard_baseline_old_new[:,None]*np.exp(best_preds_test)))
# as many as there unique time steps
#cum_hazard_baseline_old_new = np.ones((334))
preds = np.exp(-(np.outer(np.exp(best_preds_test), np.exp(cum_hazard_baseline_old_new))))
print('preds',preds.shape)
preds = np.ones((381,374))
uni_times = np.unique(time_test)
print('uni_times.shape',uni_times.shape)
times = np.unique(time_test)
#times = np.arange(time_test.min(),time_test.max())
times = np.delete(times,-1)
# the highest time has to be decremented, but at the same time only a range seems to work
times = np.append(times,337.03333-0.1)
print('y_train',y_train.shape)
print('y_test',y_test.shape)
print('preds',preds.shape)
print('times',times.shape)
ibs = integrated_brier_score(y_train, y_test, preds, times)
print("Integrated Brier Score: {:.3f}".format(ibs))

(1523, 1)
(1381, 1522)
cum_hazard_baseline_old (1381,)
cum_hazard_baseline_old_new.shape (381, 1)
preds (381, 381)
uni_times.shape (374,)
y_train (1522,)
y_test (381,)
preds (381, 374)
times (374,)
Integrated Brier Score: 0.551


In [None]:
## process
hazard for each unique time step multiplied by the predicted hazard
applying surv function exp(-x)
preds hence (individuals, unique times)
unique times with the last value slightly decremented

SyntaxError: invalid syntax (2733391844.py, line 2)

In [None]:
print(time_test.min(),time_test.max())
times

3.7666667 337.03333


array([  3.76666665,   4.76666665,   5.76666665,   6.76666665,
         7.76666665,   8.76666665,   9.76666665,  10.76666665,
        11.76666665,  12.76666665,  13.76666665,  14.76666665,
        15.76666665,  16.76666665,  17.76666665,  18.76666665,
        19.76666665,  20.76666665,  21.76666665,  22.76666665,
        23.76666665,  24.76666665,  25.76666665,  26.76666665,
        27.76666665,  28.76666665,  29.76666665,  30.76666665,
        31.76666665,  32.76666665,  33.76666665,  34.76666665,
        35.76666665,  36.76666665,  37.76666665,  38.76666665,
        39.76666665,  40.76666665,  41.76666665,  42.76666665,
        43.76666665,  44.76666665,  45.76666665,  46.76666665,
        47.76666665,  48.76666665,  49.76666665,  50.76666665,
        51.76666665,  52.76666665,  53.76666665,  54.76666665,
        55.76666665,  56.76666665,  57.76666665,  58.76666665,
        59.76666665,  60.76666665,  61.76666665,  62.76666665,
        63.76666665,  64.76666665,  65.76666665,  66.76

In [None]:
np.unique(time_test)

array([  3.7666667,   5.5      ,   5.8333335,   7.8      ,   9.133333 ,
        10.633333 ,  10.833333 ,  11.866667 ,  12.266666 ,  13.4      ,
        14.7      ,  15.2      ,  16.566668 ,  16.6      ,  16.7      ,
        17.666666 ,  17.833334 ,  18.266666 ,  18.833334 ,  20.2      ,
        20.433332 ,  21.       ,  22.133333 ,  22.233334 ,  23.2      ,
        23.8      ,  23.833334 ,  23.9      ,  23.933332 ,  24.633333 ,
        24.866667 ,  25.233334 ,  25.433332 ,  25.633333 ,  26.333334 ,
        27.466667 ,  27.866667 ,  28.566668 ,  28.833334 ,  29.066668 ,
        29.3      ,  30.8      ,  31.166666 ,  32.066666 ,  32.733334 ,
        32.833332 ,  32.866665 ,  33.133335 ,  34.3      ,  34.333332 ,
        34.566666 ,  34.633335 ,  34.7      ,  34.766666 ,  35.       ,
        35.2      ,  36.4      ,  36.633335 ,  37.       ,  37.5      ,
        37.733334 ,  38.133335 ,  38.8      ,  41.466667 ,  41.833332 ,
        42.633335 ,  42.666668 ,  42.9      ,  43.1      ,  43.2

In [None]:
new_time, new_event = transform_back(y_test)
old_time, old_event = transform_back(y_train)
cum_hazard_baseline_old_new = np.interp(np.unique(new_time), np.unique(old_time), cum_hazard_baseline_old)

In [None]:
np.unique(time).shape # should I take the unique one?

(1381,)

In [None]:
px.line(x =np.unique(new_time), y = cum_hazard_baseline_old_new )

In [None]:
pd.DataFrame(outer_scores)

Unnamed: 0,cindex_train_METABRIC,cindex_test_METABRIC
0,0.60615,0.559541
1,0.609315,0.611044
2,0.617208,0.608548
3,0.446914,0.460198
4,0.643331,0.649819


In [None]:
t = np.array([2,4,5,6,9,14])
new_times = np.array([3,4,5])
y = np.array([0.2,0.3,0.7,0.7,0.8,0.9])
np.interp(new_times, t, y)

array([0.25, 0.3 , 0.7 ])

In [None]:
import numpy as np

a = np.array([1, 2, 3, 4.2])
b = np.array([2.5, 3.5, 4.5, 5.5, 6.5])

closest_indices = np.abs(a[:, np.newaxis] - b).argmin(axis=1)

print(closest_indices)


[0 0 0 2]


In [None]:
np.abs(a[:, np.newaxis] - b)

array([[1.5, 2.5, 3.5, 4.5, 5.5],
       [0.5, 1.5, 2.5, 3.5, 4.5],
       [0.5, 0.5, 1.5, 2.5, 3.5],
       [1.5, 0.5, 0.5, 1.5, 2.5],
       [2.5, 1.5, 0.5, 0.5, 1.5]])

In [None]:
a[closest_indices]

array([1, 1, 1, 2, 3])

In [None]:
## Training Procedure

In [None]:

# Define models to apply
loss_functions = ['breslow_loss', 'efron_loss']#, 'deephit_loss'] #, 'cind_loss', 'aft_loss', 'efron_loss', 'aft_loss', 'ah_loss', 'deephit_loss'
criterion_functions = [BreslowLoss, EfronLoss] #, deephit_loss1_pycox] #, cind_loss, efron_likelihood, aft_likelihood, ah_likelihood, deephit_loss1_pycox
scoring_functions = [breslow_likelihood_torch, efron_likelihood_torch] #, deephit_loss1_pycox] #, cind_loss, efron_likelihood, aft_likelihood, ah_likelihood, deephit_loss1_pycox]

n_models = len(criterion_functions)

# dict of outer scores
outer_scores = {'breslow_loss':[], 'efron_loss':[]} #, 'efron_loss':[],  'aft_loss':[], 'ah_loss':[], 'deephit_loss':[]#,'deephit_loss' 'cind_loss':[],'aft_loss':[]

# Load dataset
data, target = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X, y = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)

# deephit data adaptation
time, event = transform_back(target.to_numpy())
data['time'] = time
data['event'] = event
df = discretizer_df(data, n_cuts=100, type = 'equidistant', min_time=0.0)

y_deep = transform(df.time.to_numpy(), df.event.to_numpy())
n = len(np.unique(np.absolute(y_deep)))
y_deephit = np.tile(y_deep, (n,1)).T
X_deephit = df.iloc[:,:-2].to_numpy()

NameError: name 'EfronLoss' is not defined

In [None]:
# examples
# example
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np

# Create a sample dataframe
df = pd.DataFrame({'age': [25, 30, 35],
                   'income': [50000, 60000, 70000],
                   'gender': ['male', 'female', 'male'],
                   'is_married': [True, False, True],
                   'num_children': [2, 0, 1],
                   'target': [0, 1, 1]})

# Select columns by data type
numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns
boolean_cols = df.select_dtypes(include=bool).columns

# Define the feature mapper
mapper = DataFrameMapper([
    (numeric_cols, StandardScaler()),
    (categorical_cols, OneHotEncoder()),
    (boolean_cols, None)
])

# Create the pipeline
pipeline = make_pipeline(mapper, LogisticRegression())

# Split the data into input features and target variable
X = df.drop('target', axis=1)
y = df['target']

# Fit the pipeline to the data
pipeline.fit(X, y)


In [None]:


class CustomStandardScaler(StandardScaler):
    
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        # Add your own code here
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().transform(X, y)
        # Add your own code here
        return X_transformed
    
    def fit_transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().fit_transform(X, y)
        # Add your own code here
        return X_transformed
    
ct = make_column_transformer(
        [(StandardScaler(), make_column_selector(dtype_include=['float32'])), 
        (LabelBinarizer(), make_column_selector(dtype_include='bool'))],remainder='passthrough'
        #(BoolToNumericTransformer(), make_column_selector(dtype_include=['bool']))
        )

data = load_rgbsg(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X = ct.fit_transform(X)
print(X.shape)
np.savetxt('testX.csv',X,delimiter=',')