In [36]:
# XGBsurv benchmark
from xgbsurv.datasets import load_metabric, load_flchain, load_rgbsg, load_support
#from xgbsurv.datasets import (load_flchain, load_rgbsg, load_support,
#load_tcga)
from xgbsurv import XGBSurv
from xgbsurv.evaluation import cindex_censored, ibs
from xgbsurv.models.utils import sort_X_y
import os
import numpy as np
from numpy import savetxt
import pandas as pd
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import load_iris
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.preprocessing import StandardScaler, LabelEncoder,LabelBinarizer
# import models
from xgbsurv.models.breslow_final import breslow_likelihood, breslow_estimator
from pycox.evaluation import EvalSurv
from xgbsurv.models.utils import transform_back, transform
from xgbsurv.preprocessing.dataset_preprocessing import discretizer_df
import sys
#sys.path.append('/Users/JUSC/Documents/xgbsurv_benchmarking/deep_learning/')
from loss_functions_pytorch import BreslowLoss, breslow_likelihood_torch
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from skorch.dataset import CVSplit, ValidSplit
import torch
from torch import nn
import torch.nn.init as init

## Set Parameters
deep learning hyperparameter spaces follow pycox paper
Time-to-Event Prediction with Neural Networks and Cox Regression
page 21

In [37]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 10 # set to 50
#n_iter_cind = 200
early_stopping_rounds=15
base_score = 0.0

param_grid_breslow = {
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__num_nodes': [64, 128, 256, 512],
    'estimator__module__dropout': scuniform(0.0,0.7),
    'estimator__optimizer__weight_decay': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01, 0],
    'estimator__batch_size': [64, 128, 256, 512, 1024],
    #lr not in paper because of learning rate finder
    # note: setting learning rate higher would make exp(partial_hazard) explode
    'estimator__lr': scloguniform(0.001,0.01), # scheduler unten einbauen
    #'max_epochs':  scrandint(10,20), # corresponds to num_rounds
}

## Set MLP Model

In [38]:

class SurvivalModel(nn.Module):
    def __init__(self, n_layers, in_features, num_nodes, dropout, out_features):
        super(SurvivalModel, self).__init__()
        self.n_layers = n_layers
        self.in_features = in_features
        self.num_nodes = num_nodes
        self.dropout = dropout
        self.out_features = out_features
        model = []
        # first layer
        model.append(torch.nn.Linear(in_features, num_nodes))
        model.append(torch.nn.ReLU())
        model.append(torch.nn.Dropout(dropout))
        model.append(torch.nn.BatchNorm1d(num_nodes))

        for i in range(n_layers-1):
            model.append(torch.nn.Linear(num_nodes, num_nodes))
            #init.kaiming_normal_(model[-1].weight, nonlinearity='relu')
            model.append(torch.nn.ReLU())
            model.append(torch.nn.Dropout(dropout))
            model.append(torch.nn.BatchNorm1d(num_nodes))

        # output layer
        model.append(torch.nn.Linear(num_nodes, out_features))
    
        self.layers = nn.Sequential(*model)

        # for layer in self.layers:
        #     if isinstance(layer, nn.Linear):
        #         #nn.init.uniform_(layer.weight, a=-0.5, b=0.5)
        #         nn.init.kaiming_normal_(layer.weight)


    def forward(self, x):
        res = self.layers(x)
        #print(res)
        return res


## Set Splitting Strategy

In [39]:
# Define stratified inner k-fold cross-validation
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)



## Set Scoring Function

In [40]:
# Define Scorer
def custom_scoring_function(y_true, y_pred):

        #y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, np.ndarray):
            y_pred = torch.from_numpy(y_pred)
        if isinstance(y_true, np.ndarray):
            y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, pd.Series):
            y_pred = torch.tensor(y_pred.values)
        if isinstance(y_true, pd.Series):
            y_true = torch.tensor(y_true.values)
        score = breslow_likelihood_torch(y_true, y_pred) 
        return score.numpy()

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

## Set Data Loading Functions

In [41]:


data = load_flchain(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X.dtypes
class BoolToNumericTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy[X_copy == True] = 1
        X_copy[X_copy == False] = 0
        return X_copy.astype(float)
    
ct = make_column_transformer(
(StandardScaler(), make_column_selector(dtype_include=['float32'])),
(BoolToNumericTransformer(), make_column_selector(dtype_include=['bool'])),remainder='passthrough'
)
X = ct.fit_transform(X)
X.shape

(7871, 8)

In [42]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_selector
import pandas as pd

# Define a custom transformer to convert True and False values to 1 and 0, respectively
class BoolToNumericTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy[X_copy == True] = 1
        X_copy[X_copy == False] = 0
        return X_copy.astype(float)

# Let's say you have a pandas DataFrame with columns of different types:
df = pd.DataFrame({
    'numeric_feature': [1.0, 2.0, 3.0],
    'bool_feature': [True, False, True]
})

# Define the column transformer using make_column_selector and the custom transformer
column_transformer = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=['float64'])),
    (BoolToNumericTransformer(), make_column_selector(dtype_include=['bool']))
)

# Apply the transformation to the DataFrame
transformed_df = column_transformer.fit_transform(df)

print(transformed_df)


[[-1.22474487  1.        ]
 [ 0.          0.        ]
 [ 1.22474487  1.        ]]


In [43]:
class TorchStandardScaler(StandardScaler):
    def transform(self, X, copy=None):
        X_scaled = super().transform(X, copy=copy)
        return torch.from_numpy(X_scaled)

    def fit_transform(self, X, y=None):
        X_transformed = super().fit_transform(X, y=y)
        return torch.from_numpy(X_transformed)
    
ct = make_column_transformer(
       (StandardScaler(),
        make_column_selector(dtype_include=float)),)

In [44]:
class BoolToNumericTransformer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        X_copy[X_copy == True] = 1
        X_copy[X_copy == False] = 0
        return X_copy.astype(np.float32)

# adapt this for bool case
class CustomStandardScaler(StandardScaler):
    
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        # Add your own code here
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().transform(X, y)
        # Add your own code here
        return X_transformed.astype(np.float32)
    
    def fit_transform(self, X, y=None):
        # Add your own code here
        
        #to_add = X[X.columns[X.columns.isin(['sex','D'])]]
        #X = X[X.columns[~X.columns.isin(['sex','D'])]]
        #df.columns.get_loc('age')
        X_transformed = super().fit_transform(X, y)
        #to_add = to_add.values.reshape(-1, 1)

        # Horizontally stack the two arrays
        #result = np.hstack((X_transformed, to_add))
        
        # Add your own code here
        return X_transformed.astype(np.float32)

class CustomStandardScaler2(StandardScaler):
    """Just to change the datatype of bool variables."""
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        # Add your own code here
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().transform(X, y)
        # Add your own code here
        return X_transformed.astype(np.float32)
    
    def fit_transform(self, X, y=None):
        # Add your own code here
        
        to_return = X
        #X = X[X.columns[~X.columns.isin(['sex','D'])]]
        #df.columns.get_loc('age')
        X_transformed = super().fit_transform(X, y)
        #to_add = to_add.values.reshape(-1, 1)

        # Horizontally stack the two arrays
        #result = np.hstack((X_transformed, to_add))
        
        # Add your own code here
        return to_return.astype(np.float32)
    
class CustomLabelBinarizer(LabelBinarizer):
    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        super().__init__(neg_label=0, pos_label=1, sparse_output=False)

    def fit(self, X, y=None):
        # Add custom fit logic here
        # Call the parent class's fit method
        print('fit',X)
        super().fit(X, y)

        return self

    def transform(self, X, y=None):
        # Add custom transform logic here

        # Call the parent class's transform method
        print('transform',X)
        transformed_X = super().transform(X)

        return transformed_X
    
    def fit_transform(self, X, y=None):
        # Add custom transform logic here

        # Call the parent class's transform method
        print(y)
        print('fit_transform',X.values)
        to_add = X[X.columns[X.columns.isin(['sex','D'])]]
        X = X[X.columns[~X.columns.isin(['sex','D'])]]
        transformed_X = super().fit_transform(X.values)
    
        return transformed_X.astype(np.float32)

    def inverse_transform(self, X, y=None):
        # Add custom inverse_transform logic here

        # Call the parent class's inverse_transform method
        inverse_X = super().inverse_transform(X)

        return inverse_X.astype(np.float32)
    
# ct = make_column_transformer(
#         (CustomStandardScaler(), make_column_selector(dtype_include=['float32'])),
#         #(CustomLabelBinarizer(), make_column_selector(dtype_include=['bool'])), remainder='passthrough')

ct = CustomStandardScaler()
data = load_flchain(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X = ct.fit_transform(X)
X.shape

(7871, 8)

## Set Training Function

In [45]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 1 # # set to 50
early_stopping_rounds=10
base_score = 0.0

def train_eval(X, y, net, n_iter, filename):

        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        # ct = make_column_transformer(
        #  (StandardScaler(),
        #  make_column_selector(dtype_include=float,dtype_exclude=bool)),)
        # (LabelBinarizer(),
        # make_column_selector(dtype_include=bool, dtype_exclude=float)))
        ct = make_column_transformer(
        (CustomStandardScaler(), make_column_selector(dtype_include=['float32'])),
        (CustomStandardScaler2(), make_column_selector(dtype_include=['category'])), remainder='passthrough')
        #columns_to_transform = [col for col in X.columns if col != 'sex']
        #ct = make_column_transformer(
        #(StandardScaler(), columns_to_transform), 
        #(LabelBinarizer(), make_column_selector(dtype_include='bool')),
        #remainder='passthrough'
        #)
        pipe = Pipeline(
        [('scaler', ct),
        ('estimator', net)]
        )
        # pipe = Pipeline([
        # ('scale', StandardScaler()),
        # ('estimator', net),
        # ])
        rs = RandomizedSearchCV(pipe, param_grid_breslow, scoring = scoring_function, n_jobs=-1, 
                            cv=inner_custom_cv, n_iter=n_iter, refit=True)
        
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y[train_index], y[test_index]

                print(X_train.shape, type(X_train))
                print(y_train.shape, type(y_train))
                print(X_test.shape, type(X_test))
                print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')
                
                #savetxt('splits/test_index_'+str(i)+'.csv', test_index, delimiter=',')
                
                # create validation dataset for early stopping
                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)

                # train
                rs.fit(X_train, y_train)
                
                # predict
                #scaler = StandardScaler()
                #X_train = scaler.fit_transform(X_train)
                #X_test = scaler.transform(X_test)
                print(rs.best_estimator_.predict(X_train))
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)

                #print(best_preds_test, type(best_preds_train))
                #print(best_preds_test, type(best_preds_test))
                # predict survival function
                # d = predict_survival_function(X_test, dataframe=True)

                # save predictions, get dataset name
                savetxt('predictions/train_preds_'+str(i)+'_'+filename, best_preds_train, delimiter=',')
                savetxt('predictions/test_preds_'+str(i)+'_'+filename, best_preds_test, delimiter=',')
                
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                #print(rs.best_params_)
                # save c-index values
                try:
                        score_train = cindex_censored(y_train, best_preds_train.reshape(-1))
                        score_test = cindex_censored(y_test, best_preds_test.reshape(-1))
                        outer_scores['cindex_train_'+dataset_name] += [score_train]
                        outer_scores['cindex_test_'+dataset_name] += [score_test]
                except:
                        outer_scores['cindex_train_'+dataset_name] += [np.nan]
                        outer_scores['cindex_test_'+dataset_name] += [np.nan]
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/metric_summary_'+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test
                

In [46]:
data = load_support(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X.dtypes

age                  float32
sex                 category
race                 float32
n_comorbidities      float32
diabetes            category
dementia            category
cancer              category
blood_pressure       float32
heart_rate           float32
respiration_rate     float32
temperature          float32
white_blood_cell     float32
serum_sodium         float32
serum_creatinine     float32
dtype: object

In [47]:
data_set_fns = [load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
for dataset in data_set_fns:
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target
    print('Dataset:',data.filename)
    print(X.dtypes)
    print(y.dtypes)
    
    net = NeuralNet(
    SurvivalModel, 
    module__n_layers = 1,
    module__in_features = X.shape[1],
    #module__num_nodes = 32,
    #module__dropout = 0.1, # these could also be removed
    module__out_features = 1,
    # for split sizes when result size = 1
    iterator_train__drop_last=True,
    #iterator_valid__drop_last=True,
    criterion=BreslowLoss,
    optimizer=torch.optim.AdamW,
    optimizer__weight_decay = 0.4,
    batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
    callbacks=[EarlyStopping(patience=10)],
    #TODO: enable stratification, verify
    train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
    #lr=0.001,
    #max_epochs=1, #0,#100
    #train_split=None,
    verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)
    
#train eval function here


Dataset: SUPPORT_adapted.csv
age                  float32
sex                 category
race                 float32
n_comorbidities      float32
diabetes            category
dementia            category
cancer              category
blood_pressure       float32
heart_rate           float32
respiration_rate     float32
temperature          float32
white_blood_cell     float32
serum_sodium         float32
serum_creatinine     float32
dtype: object
float32
(7098, 14) <class 'pandas.core.frame.DataFrame'>
(7098,) <class 'pandas.core.series.Series'>
(1775, 14) <class 'pandas.core.frame.DataFrame'>
(1775,) <class 'pandas.core.series.Series'>
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m4.0003[0m        [32m3.9078[0m  0.3023
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m3.9697[0m        [32m3.9037[0m  0.3030
  epoch    train_loss    valid_loss     dur
-------  ----

In [23]:
pd.DataFrame(params)

Unnamed: 0,best_params_SUPPORT
0,"{'estimator__batch_size': 512, 'estimator__lr'..."
1,"{'estimator__batch_size': 64, 'estimator__lr':..."
2,"{'estimator__batch_size': 128, 'estimator__lr'..."
3,"{'estimator__batch_size': 256, 'estimator__lr'..."
4,"{'estimator__batch_size': 512, 'estimator__lr'..."


In [18]:
def breslow_estimator(log_hazard, time, event):
    #time, event = transform_back(y)
    risk_score = np.exp(log_hazard)
    print(risk_score.shape)
    is_sorted = lambda a: np.all(a[:-1] <= a[1:])

    if is_sorted(time) == False:
        order = np.argsort(time, kind="mergesort")
        time = time[order]
        event = event[order]
        risk_score = risk_score[order]

    uniq_times = np.unique(time)
    idx = np.digitize(time, np.unique(time))
    breaks = np.flatnonzero(np.concatenate(([1], np.diff(idx))))
    # numpy diff nth discrete difference over index, add 1 at the beginning
    # flatnonzero return indices that are nonzero in flattened version
    n_events = np.add.reduceat(event, breaks, axis=0)

    # consider removing zero rows
    risk_matrix = np.unique((np.outer(time,time)>=np.square(time)).astype(int).T, axis=0)
    print(risk_matrix.shape)
    denominator = np.sum(risk_score.reshape(-1)*risk_matrix,axis=1)[::-1] 

    baseline_hazard = n_events / denominator
    cum_hazard_baseline = np.cumsum(n_events / denominator)
    baseline_survival = np.exp(-cum_hazard_baseline)
    return uniq_times, baseline_hazard, cum_hazard_baseline, baseline_survival

In [95]:
time, event = transform_back(y_train)
uniq_times_old, baseline_hazard, cum_hazard_baseline_old, baseline_survival_old = breslow_estimator(best_preds_train, time, event)

(1523, 1)
(1381, 1523)


In [96]:
p1 = np.exp(-(cum_hazard_baseline_old*np.exp(best_preds_train[20])))

In [97]:
import plotly_express as px

px.line(x =uniq_times_old, y = p1)

In [152]:
from sksurv.util import Surv
from sksurv.metrics import integrated_brier_score
data = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)
X  = data.data
y = data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# use from arrays and then build the two survival struc arrays
# do test predictions in shape of (n_samples, n_times) and put in unique times
# for the last time point the unique time point has to be decremente by a tiny bit!
time, event = transform_back(y_train)
time_train, event_train = transform_back(y_train)
y_train = Surv.from_arrays( event_train.astype(dtype=bool),time_train)
time_test, event_test = transform_back(y_test)
y_test = Surv.from_arrays(event_test.astype(dtype=bool),time_test)
uniq_times_old, baseline_hazard, cum_hazard_baseline_old, baseline_survival_old = breslow_estimator(best_preds_train, time, event)
print('cum_hazard_baseline_old', cum_hazard_baseline_old.shape)
# cumbaseline hazard for each (unique) time step of the test data
cum_hazard_baseline_old_new = np.interp(time_test, np.unique(time_train), cum_hazard_baseline_old)
best_preds_test = np.ones(y_test.shape[0])
print('cum_hazard_baseline_old_new.shape',cum_hazard_baseline_old_new[:, None].shape)
#preds = np.exp(-(cum_hazard_baseline_old_new[:,None]*np.exp(best_preds_test)))
# as many as there unique time steps
#cum_hazard_baseline_old_new = np.ones((334))
preds = np.exp(-(np.outer(np.exp(best_preds_test), np.exp(cum_hazard_baseline_old_new))))
print('preds',preds.shape)
preds = np.ones((381,374))
uni_times = np.unique(time_test)
print('uni_times.shape',uni_times.shape)
times = np.unique(time_test)
#times = np.arange(time_test.min(),time_test.max())
times = np.delete(times,-1)
# the highest time has to be decremented, but at the same time only a range seems to work
times = np.append(times,337.03333-0.1)
print('y_train',y_train.shape)
print('y_test',y_test.shape)
print('preds',preds.shape)
print('times',times.shape)
ibs = integrated_brier_score(y_train, y_test, preds, times)
print("Integrated Brier Score: {:.3f}".format(ibs))

(1523, 1)
(1381, 1522)
cum_hazard_baseline_old (1381,)
cum_hazard_baseline_old_new.shape (381, 1)
preds (381, 381)
uni_times.shape (374,)
y_train (1522,)
y_test (381,)
preds (381, 374)
times (374,)
Integrated Brier Score: 0.551


In [151]:
## process
hazard for each unique time step multiplied by the predicted hazard
applying surv function exp(-x)
preds hence (individuals, unique times)
unique times with the last value slightly decremented

337.0333251953125

In [115]:
print(time_test.min(),time_test.max())
times

3.7666667 337.03333


array([  3.76666665,   4.76666665,   5.76666665,   6.76666665,
         7.76666665,   8.76666665,   9.76666665,  10.76666665,
        11.76666665,  12.76666665,  13.76666665,  14.76666665,
        15.76666665,  16.76666665,  17.76666665,  18.76666665,
        19.76666665,  20.76666665,  21.76666665,  22.76666665,
        23.76666665,  24.76666665,  25.76666665,  26.76666665,
        27.76666665,  28.76666665,  29.76666665,  30.76666665,
        31.76666665,  32.76666665,  33.76666665,  34.76666665,
        35.76666665,  36.76666665,  37.76666665,  38.76666665,
        39.76666665,  40.76666665,  41.76666665,  42.76666665,
        43.76666665,  44.76666665,  45.76666665,  46.76666665,
        47.76666665,  48.76666665,  49.76666665,  50.76666665,
        51.76666665,  52.76666665,  53.76666665,  54.76666665,
        55.76666665,  56.76666665,  57.76666665,  58.76666665,
        59.76666665,  60.76666665,  61.76666665,  62.76666665,
        63.76666665,  64.76666665,  65.76666665,  66.76

In [114]:
np.unique(time_test)

array([  3.7666667,   5.5      ,   5.8333335,   7.8      ,   9.133333 ,
        10.633333 ,  10.833333 ,  11.866667 ,  12.266666 ,  13.4      ,
        14.7      ,  15.2      ,  16.566668 ,  16.6      ,  16.7      ,
        17.666666 ,  17.833334 ,  18.266666 ,  18.833334 ,  20.2      ,
        20.433332 ,  21.       ,  22.133333 ,  22.233334 ,  23.2      ,
        23.8      ,  23.833334 ,  23.9      ,  23.933332 ,  24.633333 ,
        24.866667 ,  25.233334 ,  25.433332 ,  25.633333 ,  26.333334 ,
        27.466667 ,  27.866667 ,  28.566668 ,  28.833334 ,  29.066668 ,
        29.3      ,  30.8      ,  31.166666 ,  32.066666 ,  32.733334 ,
        32.833332 ,  32.866665 ,  33.133335 ,  34.3      ,  34.333332 ,
        34.566666 ,  34.633335 ,  34.7      ,  34.766666 ,  35.       ,
        35.2      ,  36.4      ,  36.633335 ,  37.       ,  37.5      ,
        37.733334 ,  38.133335 ,  38.8      ,  41.466667 ,  41.833332 ,
        42.633335 ,  42.666668 ,  42.9      ,  43.1      ,  43.2

In [None]:
new_time, new_event = transform_back(y_test)
old_time, old_event = transform_back(y_train)
cum_hazard_baseline_old_new = np.interp(np.unique(new_time), np.unique(old_time), cum_hazard_baseline_old)

In [71]:
np.unique(time).shape # should I take the unique one?

(1381,)

In [73]:
px.line(x =np.unique(new_time), y = cum_hazard_baseline_old_new )

In [None]:
pd.DataFrame(outer_scores)

Unnamed: 0,cindex_train_METABRIC,cindex_test_METABRIC
0,0.60615,0.559541
1,0.609315,0.611044
2,0.617208,0.608548
3,0.446914,0.460198
4,0.643331,0.649819


In [3]:
t = np.array([2,4,5,6,9,14])
new_times = np.array([3,4,5])
y = np.array([0.2,0.3,0.7,0.7,0.8,0.9])
np.interp(new_times, t, y)

array([0.25, 0.3 , 0.7 ])

In [2]:
import numpy as np

a = np.array([1, 2, 3, 4.2])
b = np.array([2.5, 3.5, 4.5, 5.5, 6.5])

closest_indices = np.abs(a[:, np.newaxis] - b).argmin(axis=1)

print(closest_indices)


[0 0 0 2]


In [118]:
np.abs(a[:, np.newaxis] - b)

array([[1.5, 2.5, 3.5, 4.5, 5.5],
       [0.5, 1.5, 2.5, 3.5, 4.5],
       [0.5, 0.5, 1.5, 2.5, 3.5],
       [1.5, 0.5, 0.5, 1.5, 2.5],
       [2.5, 1.5, 0.5, 0.5, 1.5]])

In [120]:
a[closest_indices]

array([1, 1, 1, 2, 3])

In [98]:
## Training Procedure

In [99]:

# Define models to apply
loss_functions = ['breslow_loss', 'efron_loss']#, 'deephit_loss'] #, 'cind_loss', 'aft_loss', 'efron_loss', 'aft_loss', 'ah_loss', 'deephit_loss'
criterion_functions = [BreslowLoss, EfronLoss] #, deephit_loss1_pycox] #, cind_loss, efron_likelihood, aft_likelihood, ah_likelihood, deephit_loss1_pycox
scoring_functions = [breslow_likelihood_torch, efron_likelihood_torch] #, deephit_loss1_pycox] #, cind_loss, efron_likelihood, aft_likelihood, ah_likelihood, deephit_loss1_pycox]

n_models = len(criterion_functions)

# dict of outer scores
outer_scores = {'breslow_loss':[], 'efron_loss':[]} #, 'efron_loss':[],  'aft_loss':[], 'ah_loss':[], 'deephit_loss':[]#,'deephit_loss' 'cind_loss':[],'aft_loss':[]

# Load dataset
data, target = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X, y = load_metabric(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=False)

# deephit data adaptation
time, event = transform_back(target.to_numpy())
data['time'] = time
data['event'] = event
df = discretizer_df(data, n_cuts=100, type = 'equidistant', min_time=0.0)

y_deep = transform(df.time.to_numpy(), df.event.to_numpy())
n = len(np.unique(np.absolute(y_deep)))
y_deephit = np.tile(y_deep, (n,1)).T
X_deephit = df.iloc[:,:-2].to_numpy()

NameError: name 'EfronLoss' is not defined

In [None]:
# examples
# example
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import pandas as pd
import numpy as np

# Create a sample dataframe
df = pd.DataFrame({'age': [25, 30, 35],
                   'income': [50000, 60000, 70000],
                   'gender': ['male', 'female', 'male'],
                   'is_married': [True, False, True],
                   'num_children': [2, 0, 1],
                   'target': [0, 1, 1]})

# Select columns by data type
numeric_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns
boolean_cols = df.select_dtypes(include=bool).columns

# Define the feature mapper
mapper = DataFrameMapper([
    (numeric_cols, StandardScaler()),
    (categorical_cols, OneHotEncoder()),
    (boolean_cols, None)
])

# Create the pipeline
pipeline = make_pipeline(mapper, LogisticRegression())

# Split the data into input features and target variable
X = df.drop('target', axis=1)
y = df['target']

# Fit the pipeline to the data
pipeline.fit(X, y)


In [None]:


class CustomStandardScaler(StandardScaler):
    
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        # Add your own code here
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().transform(X, y)
        # Add your own code here
        return X_transformed
    
    def fit_transform(self, X, y=None):
        # Add your own code here
        X_transformed = super().fit_transform(X, y)
        # Add your own code here
        return X_transformed
    
ct = make_column_transformer(
        [(StandardScaler(), make_column_selector(dtype_include=['float32'])), 
        (LabelBinarizer(), make_column_selector(dtype_include='bool'))],remainder='passthrough'
        #(BoolToNumericTransformer(), make_column_selector(dtype_include=['bool']))
        )

data = load_rgbsg(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X = ct.fit_transform(X)
print(X.shape)
np.savetxt('testX.csv',X,delimiter=',')