In [117]:
import pandas as pd
import numpy as np
from numpy import savetxt
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.models.breslow_final import get_cumulative_hazard_function_breslow, breslow_estimator_loop
import torch
from torch import nn
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
from loss_functions_pytorch import BreslowLoss, breslow_likelihood_torch
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping
from skorch.dataset import ValidSplit
from pycox.evaluation import EvalSurv
from scipy.stats import uniform as scuniform
#from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform 
#torch.set_default_dtype(torch.float64)
#torch.set_default_tensor_type(torch.DoubleTensor)

## Set Parameters

In [84]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 10 # set to 50
#n_iter_cind = 200
early_stopping_rounds=15
base_score = 0.0

param_grid_breslow = {
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__num_nodes': [64, 128, 256, 512],
    'estimator__module__dropout': scuniform(0.0,0.7),
    'estimator__optimizer__weight_decay': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01, 0],
    'estimator__batch_size': [64, 128, 256, 512, 1024],
    #lr not in paper because of learning rate finder
    # note: setting learning rate higher would make exp(partial_hazard) explode
    'estimator__lr': scloguniform(0.001,0.01), # scheduler unten einbauen
    #'max_epochs':  scrandint(10,20), # corresponds to num_rounds
}

## Load Data

In [115]:
data = load_flchain(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
print(type(data.data),type(data.target))
X, y = data.data, data.target
#X, y = data.data.astype(np.float64), data.target.astype(np.float64)
#one_hot_dict = {'load_flchain':[''], 'load_support':['cancer'], 'load_rgbsg':['grade']}
#X = pd.get_dummies(X, columns=['cancer'])
#X = pd.get_dummies(X, columns=['grade'])
#X = pd.get_dummies(X, columns=['mgus'])
X.mgus.value

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>


age            float32
sex           category
sample_yr      float32
kappa          float32
lambda         float32
flc_grp        float32
creatinine     float32
mgus          category
dtype: object

## Set Loss Function

In [86]:
# Define Scorer
def custom_scoring_function(y_true, y_pred):

        #y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, np.ndarray):
            y_pred = torch.from_numpy(y_pred)
        if isinstance(y_true, np.ndarray):
            y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, pd.Series):
            y_pred = torch.tensor(y_pred.values)
        if isinstance(y_true, pd.Series):
            y_true = torch.tensor(y_true.values)
        score = breslow_likelihood_torch(y_true, y_pred).to(torch.float32)
        return score.numpy()

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

In [87]:
## Set up Custom Splitter

## Set Torch Model

In [88]:

class SurvivalModel(nn.Module):
    def __init__(self, n_layers, in_features, num_nodes, dropout, out_features):
        super(SurvivalModel, self).__init__()
        self.n_layers = n_layers
        self.in_features = in_features
        self.num_nodes = num_nodes
        self.dropout = dropout
        self.out_features = out_features
        model = []
        # first layer
        model.append(torch.nn.Linear(in_features, num_nodes))
        model.append(torch.nn.ReLU())
        model.append(torch.nn.Dropout(dropout))
        model.append(torch.nn.BatchNorm1d(num_nodes))

        for i in range(n_layers-1):
            model.append(torch.nn.Linear(num_nodes, num_nodes))
            #init.kaiming_normal_(model[-1].weight, nonlinearity='relu')
            model.append(torch.nn.ReLU())
            model.append(torch.nn.Dropout(dropout))
            model.append(torch.nn.BatchNorm1d(num_nodes))

        # output layer
        model.append(torch.nn.Linear(num_nodes, out_features))
    
        self.layers = nn.Sequential(*model)

        # for layer in self.layers:
        #     if isinstance(layer, nn.Linear):
        #         #nn.init.uniform_(layer.weight, a=-0.5, b=0.5)
        #         nn.init.kaiming_normal_(layer.weight)


    def forward(self, X):
        X = X.to(torch.float32)
        res = self.layers(X)
        #print(res)
        return res


## Set up Scaler

In [89]:
class CustomStandardScaler(StandardScaler):
    
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        X_transformed = super().transform(X, y)
        return X_transformed.astype(np.float32)
    
    def fit_transform(self, X, y=None):
        X_transformed = super().fit_transform(X, y)
        return X_transformed.astype(np.float32)

## Custom Split

In [90]:
# Define stratified inner k-fold cross-validation
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        print('split', X.dtypes)
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)



## Setting Net Model

In [91]:
# if torch.cuda.is_available():
#     module = SurvivalModel().to(device='cuda', dtype=torch.float64)
# else:
#     module = SurvivalModel().to(device='cpu', dtype=torch.float64)

net = NeuralNet(
    SurvivalModel, 
    module__n_layers = 1,
    module__in_features = X.shape[1],
    module__num_nodes = 32,
    module__dropout = 0.1, # these could also be removed
    module__out_features = 1,
    #device = 'cuda',
    # for split sizes when result size = 1
    iterator_train__drop_last=True,
    #iterator_valid__drop_last=True,
    criterion=BreslowLoss,
    optimizer=torch.optim.AdamW,
    optimizer__weight_decay = 0.4,
    batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
    callbacks=[EarlyStopping(patience=10)],
    #TODO: enable stratification, verify
    train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
    lr=0.001,
    #max_epochs=1, #0,#100
    #train_split=None,
    verbose=1
    )



In [134]:
## Set up Randomized Search
#param_grid_breslow = {
#    'estimator__module__n_layers': [1, 2, 4]}

def train_eval(X, y, net, n_iter, filename):

        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                        'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        ct = make_column_transformer(
                #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
                (StandardScaler(), make_column_selector(dtype_include=['float32']))
                ,remainder='passthrough')

        pipe = Pipeline([('scaler',ct),
                        ('estimator', net)])
        rs = RandomizedSearchCV(pipe, param_grid_breslow, scoring = scoring_function, n_jobs=-1, 
                                    n_iter=2, refit=True)
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                X_train, y_train = sort_X_y_pandas(X_train, y_train)
                X_test, y_test = sort_X_y_pandas(X_test, y_test)

                print(X_train.shape, type(X_train))
                print(y_train.shape, type(y_train))
                print(X_test.shape, type(X_test))
                print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)




                rs.fit(X_train, y_train)
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                try:
                    cum_hazard_train = get_cumulative_hazard_function_breslow(
                            X_train.values, X_train.values, y_train.values, y_train.values,
                            best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                            )

                    df_survival_train = np.exp(-cum_hazard_train)
                    durations_train, events_train = transform_back(y_train.values)
                    time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
                    ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
                    cindex_score_train = ev.concordance_td('antolini')
                    ibs_score_train = ev.integrated_brier_score(time_grid_train)

                    outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
                    outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

                except:
                    outer_scores['cindex_train_'+dataset_name] += [np.nan]
                    outer_scores['ibs_train_'+dataset_name] += [np.nan]
                    
                try:
                    cum_hazard_test = get_cumulative_hazard_function_breslow(
                            X_train.values, X_test.values, y_train.values, y_test.values,
                            best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                            )
                    df_survival_test = np.exp(-cum_hazard_test)
                    durations_test, events_test = transform_back(y_test.values)
                    print('durations',durations_test.min(), durations_test.max())
                    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
                    cindex_score_test = ev.concordance_td('antolini')
                    ibs_score_test = ev.integrated_brier_score(time_grid_test)

                    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
                except: 
                    outer_scores['cindex_test_'+dataset_name] += [np.nan]
                    outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/metric_summary_'+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test

                
#cv=inner_custom_cv,pipe

In [135]:
data_set_fns = [load_metabric,  load_flchain, load_rgbsg, load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer'], 'load_rgbsg':['grade']}
#'load_flchain':[''],
#X = pd.get_dummies(X, columns=['cancer'])
#X = pd.get_dummies(X, columns=['grade'])
#X = pd.get_dummies(X, columns=['mgus'])
for idx, dataset in enumerate(data_set_fns):
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    #dataset_name = data.filename.split('_')[0]
    #print(str(data_set_fns))
    print(data_set_fns_str[idx])
    if data_set_fns_str[idx] in one_hot_dict.keys():
        print(data_set_fns_str[idx],one_hot_dict.keys() )
        X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])
    print(X.columns)
    X, y = sort_X_y_pandas(X, y)


    #print('Dataset:',data.filename)
    #print(X.dtypes)
    
    net = NeuralNet(
        SurvivalModel, 
        module__n_layers = 1,
        module__in_features = X.shape[1],
        #module__num_nodes = 32,
        #module__dropout = 0.1, # these could also be removed
        module__out_features = 1,
        # for split sizes when result size = 1
        iterator_train__drop_last=True,
        #iterator_valid__drop_last=True,
        criterion=BreslowLoss,
        optimizer=torch.optim.AdamW,
        optimizer__weight_decay = 0.4,
        batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
        callbacks=[EarlyStopping(patience=10)],
        # add extensive callback, and random number seed
        #TODO: enable stratification, verify
        train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
        #lr=0.001,
        #max_epochs=1, #0,#100
        #train_split=None,
        verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)
    
#train eval function here





load_metabric
Index(['MKI67', 'EGFR', 'PGR', 'ERBB2', 'hormone_treatment', 'radiotherapy',
       'chemotherapy', 'ER_positive', 'age'],
      dtype='object')
split MKI67                float32
EGFR                 float32
PGR                  float32
ERBB2                float32
hormone_treatment    float32
radiotherapy         float32
chemotherapy         float32
ER_positive          float32
age                  float32
dtype: object
(1522, 9) <class 'pandas.core.frame.DataFrame'>
(1522,) <class 'pandas.core.series.Series'>
(381, 9) <class 'pandas.core.frame.DataFrame'>
(381,) <class 'pandas.core.series.Series'>
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m4.7083[0m        [32m4.5451[0m  0.1175
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1        [36m4.6540[0m        [32m4.5027[0m  0.1038
  epoch    train_loss    valid_loss     dur
-------  ------------  -----------

## TCGA

In [136]:
## Set up Randomized Search
#param_grid_breslow = {
#    'estimator__module__n_layers': [1, 2, 4]}

def train_eval(X, y, net, n_iter, filename):

        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                        'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        ct = make_column_transformer(
                #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
                (StandardScaler(), make_column_selector(dtype_include=['float32']))
                ,remainder='passthrough')
        pca = PCA()
        pipe = Pipeline([('scaler',ct),
                         ('pca', PCA(n_components=10)),
                        ('estimator', net)])
        rs = RandomizedSearchCV(pipe, param_grid_breslow, scoring = scoring_function, n_jobs=-1, 
                                    n_iter=2, refit=True)
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                X_train, y_train = sort_X_y_pandas(X_train, y_train)
                X_test, y_test = sort_X_y_pandas(X_test, y_test)

                print(X_train.shape, type(X_train))
                print(y_train.shape, type(y_train))
                print(X_test.shape, type(X_test))
                print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)




                rs.fit(X_train, y_train)
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                try:
                    cum_hazard_train = get_cumulative_hazard_function_breslow(
                            X_train.values, X_train.values, y_train.values, y_train.values,
                            best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                            )

                    df_survival_train = np.exp(-cum_hazard_train)
                    durations_train, events_train = transform_back(y_train.values)
                    time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
                    ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
                    cindex_score_train = ev.concordance_td('antolini')
                    ibs_score_train = ev.integrated_brier_score(time_grid_train)

                    outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
                    outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

                except:
                    outer_scores['cindex_train_'+dataset_name] += [np.nan]
                    outer_scores['ibs_train_'+dataset_name] += [np.nan]
                    
                try:
                    cum_hazard_test = get_cumulative_hazard_function_breslow(
                            X_train.values, X_test.values, y_train.values, y_test.values,
                            best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                            )
                    df_survival_test = np.exp(-cum_hazard_test)
                    durations_test, events_test = transform_back(y_test.values)
                    print('durations',durations_test.min(), durations_test.max())
                    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
                    cindex_score_test = ev.concordance_td('antolini')
                    ibs_score_test = ev.integrated_brier_score(time_grid_test)

                    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
                except: 
                    outer_scores['cindex_test_'+dataset_name] += [np.nan]
                    outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/metric_summary_'+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test

                
#cv=inner_custom_cv,pipe

In [137]:
#data_set_fns = [load_metabric,  load_flchain, load_rgbsg, load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
#data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
#one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer'], 'load_rgbsg':['grade']}
#'load_flchain':[''],
#X = pd.get_dummies(X, columns=['cancer'])
#X = pd.get_dummies(X, columns=['grade'])
#X = pd.get_dummies(X, columns=['mgus'])
for idx, dataset in enumerate(data_set_fns):
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()
    #dataset_name = data.filename.split('_')[0]
    #print(str(data_set_fns))
    # print(data_set_fns_str[idx])
    # if data_set_fns_str[idx] in one_hot_dict.keys():
    #     print(data_set_fns_str[idx],one_hot_dict.keys() )
    #     X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])
    # print(X.columns)
    X, y = sort_X_y_pandas(X, y)


    #print('Dataset:',data.filename)
    #print(X.dtypes)
    
    net = NeuralNet(
        SurvivalModel, 
        module__n_layers = 1,
        module__in_features = X.shape[1],
        #module__num_nodes = 32,
        #module__dropout = 0.1, # these could also be removed
        module__out_features = 1,
        # for split sizes when result size = 1
        iterator_train__drop_last=True,
        #iterator_valid__drop_last=True,
        criterion=BreslowLoss,
        optimizer=torch.optim.AdamW,
        optimizer__weight_decay = 0.4,
        batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
        callbacks=[EarlyStopping(patience=10)],
        #TODO: enable stratification, verify
        train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
        #lr=0.001,
        #max_epochs=1, #0,#100
        #train_split=None,
        verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)
    
#train eval function here





split gex_?|100130426      float32
gex_?|100133144      float32
gex_?|100134869      float32
gex_?|10357          float32
gex_?|10431          float32
                      ...   
gex_ZYG11A|440590    float32
gex_ZYG11B|79699     float32
gex_ZYX|7791         float32
gex_ZZEF1|23140      float32
gex_ZZZ3|26009       float32
Length: 20531, dtype: object
(324, 20531) <class 'pandas.core.frame.DataFrame'>
(324,) <class 'pandas.core.series.Series'>
(82, 20531) <class 'pandas.core.frame.DataFrame'>
(82,) <class 'pandas.core.series.Series'>


ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1230, in fit
    self.partial_fit(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1189, in partial_fit
    self.fit_loop(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1104, in fit_loop
    self.run_single_epoch(iterator_valid, training=False, prefix="valid",
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1137, in run_single_epoch
    step = step_fn(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 873, in validation_step
    y_pred = self.infer(Xi, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1427, in infer
    return self.module_(x, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/jr/dh6mkdzs31lc5pkqymtdbh180000gp/T/ipykernel_55994/3268685940.py", line 36, in forward
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (52x10 and 20531x64)

--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1230, in fit
    self.partial_fit(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1189, in partial_fit
    self.fit_loop(X, y, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1101, in fit_loop
    self.run_single_epoch(iterator_train, training=True, prefix="train",
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1137, in run_single_epoch
    step = step_fn(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1016, in train_step
    self._step_optimizer(step_fn)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 972, in _step_optimizer
    optimizer.step(step_fn)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/optim/optimizer.py", line 113, in wrapper
    return func(*args, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
    return func(*args, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/optim/adamw.py", line 119, in step
    loss = closure()
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1006, in step_fn
    step = self.train_step_single(batch, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 905, in train_step_single
    y_pred = self.infer(Xi, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1427, in infer
    return self.module_(x, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/var/folders/jr/dh6mkdzs31lc5pkqymtdbh180000gp/T/ipykernel_55994/3268685940.py", line 36, in forward
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/container.py", line 139, in forward
    input = module(input)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (128x10 and 20531x512)


In [142]:
data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
for i in data.data.dtypes:
    print(i) #.creatinine.value_counts()


float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32
float32


In [100]:
## Train Fit Evaluate
X_train, X_test, y_train, y_test = train_test_split(X, y)
# sort data
X_train, y_train = sort_X_y_pandas(X_train, y_train)
X_test, y_test = sort_X_y_pandas(X_test, y_test)
rs.fit(X_train, y_train)
best_preds_train = rs.best_estimator_.predict(X_train)
best_preds_test = rs.best_estimator_.predict(X_test)

cum_hazard_train = get_cumulative_hazard_function_breslow(
        X_train.values, X_train.values, y_train.values, y_train.values,
        best_preds_train.reshape(-1), best_preds_train.reshape(-1)
        )

df_survival_train = np.exp(-cum_hazard_train)
durations_train, events_train = transform_back(y_train.values)
time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
print('Concordance Index',ev.concordance_td('antolini'))
print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
cindex_score_train = ev.concordance_td('antolini')
ibs_score_train = ev.integrated_brier_score(time_grid_train)

cum_hazard_test = get_cumulative_hazard_function_breslow(
        X_train.values, X_test.values, y_train.values, y_test.values,
        best_preds_train.reshape(-1), best_preds_test.reshape(-1)
        )
df_survival_test = np.exp(-cum_hazard_test)
durations_test, events_test = transform_back(y_test.values)
print('durations',durations_test.min(), durations_test.max())
time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
print('Concordance Index',ev.concordance_td('antolini'))
print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
cindex_score_test = ev.concordance_td('antolini')
ibs_score_test = ev.integrated_brier_score(time_grid_test)

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 1228, in fit
    self.initialize()
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 815, in initialize
    self._initialize_module()
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 712, in _initialize_module
    self.initialize_module()
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 564, in initialize_module
    module = self.initialized_instance(self.module, kwargs)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/skorch/net.py", line 541, in initialized_instance
    return instance_or_cls(**kwargs)
  File "/var/folders/jr/dh6mkdzs31lc5pkqymtdbh180000gp/T/ipykernel_55994/3268685940.py", line 3, in __init__
TypeError: super(type, obj): obj must be an instance or subtype of type


In [None]:
def breslow_estimator_breslow(    
    predictor: np.array,
    time: np.array,
    event: np.array

):
    exp_predictor: np.array = np.exp(predictor)
    local_risk_set: float = np.sum(exp_predictor)
    event_mask: np.array = event.astype(np.bool_)
    n_unique_events: int = np.unique(time[event_mask]).shape[0]
    cumulative_baseline_hazards: np.array = np.zeros(np.unique(time).shape)#np.zeros(n_unique_events)
    n_events_counted: int = 0
    local_death_set: int = 0
    accumulated_risk_set: float = 0
    previous_time: float = time[0]
    print(cumulative_baseline_hazards.shape)
    #print('cumulative_baseline_hazards',cumulative_baseline_hazards)
    for _ in range(len(time)):
        print(_)
        sample_time: float = time[_]
        sample_event: int = event[_]
        sample_predictor: float = exp_predictor[_]
        #print(sample_time, sample_event)
        if sample_time > previous_time and local_death_set:
            print('n_events_counted', n_events_counted)
            print('local death set', local_death_set)
            #print('local risk set', local_risk_set)
            cumulative_baseline_hazards[n_events_counted] = local_death_set / (
                local_risk_set
            )

            local_death_set = 0
            local_risk_set -= accumulated_risk_set
            #print('local_risk_set', local_risk_set)
            accumulated_risk_set = 0
            n_events_counted += 1

        if sample_event:
            local_death_set += 1
        accumulated_risk_set += sample_predictor
        previous_time = sample_time

    #print('cumulative_baseline_hazards',cumulative_baseline_hazards.shape)
    #print('n_events_counted',n_events_counted)
    # the minus 1 in the indicator is one factor
    cumulative_baseline_hazards[n_events_counted] = local_death_set / (
        local_risk_set
    )
    print('test')
    return (
        np.unique(time[event_mask]),
        np.cumsum(cumulative_baseline_hazards),
    )

In [None]:
time_train, event_train = transform_back(y_train.values)
#breslow_estimator_loop(time=time_train, event=event_train, predictor=best_preds_train)
#time_train_unique, ind = np.unique(time_train, return_index=True)
#event_train_unique = event_train[ind]
# test data
preds = best_preds_train[:4]
#print(preds)
time = np.array([1.,1.,3.,3.])
event = np.array([1,1,1,1]) #[1,1,1,1]
(time_uni, cumhazards) = breslow_estimator_breslow( predictor=preds, time=time, event=event)

(2,)
0
1
2
n_events_counted 0
local death set 2
3
test


In [None]:
type(best_preds_train)

numpy.ndarray

In [None]:
for i in range(5500,5600):
    try:
        breslow_estimator_breslow( predictor=best_preds_train.reshape(-1)[:i], time=time_train[:i], event=event_train[:i])
    except:
        print('error from observation', i)
        break

(1293,)
0
1
2
3
n_events_counted 0
local death set 1
4
5
n_events_counted 1
local death set 1
6
7
8
n_events_counted 2
local death set 1
9
10
11
n_events_counted 3
local death set 2
12
13
n_events_counted 4
local death set 1
14
15
16
17
18
19
20
n_events_counted 5
local death set 1
21
22
23
24
n_events_counted 6
local death set 2
25
26
27
28
29
n_events_counted 7
local death set 1
30
31
32
33
34
35
n_events_counted 8
local death set 3
36
n_events_counted 9
local death set 1
37
38
39
40
41
n_events_counted 10
local death set 2
42
43
44
45
46
n_events_counted 11
local death set 3
47
48
49
n_events_counted 12
local death set 2
50
51
52
n_events_counted 13
local death set 1
53
54
n_events_counted 14
local death set 1
55
56
57
58
59
n_events_counted 15
local death set 2
60
61
62
63
n_events_counted 16
local death set 2
64
n_events_counted 17
local death set 1
65
66
67
68
69
70
n_events_counted 18
local death set 1
71
72
73
n_events_counted 19
local death set 1
74
n_events_counted 20
local d

In [None]:
best_preds_train.reshape(-1).shape

(1427,)

In [None]:
time_train.shape

(1427,)

In [None]:
event_train.shape

(1427,)

In [None]:
event_train.sum()

833

In [None]:
best_preds_train.reshape(-1)[5575]

IndexError: index 5575 is out of bounds for axis 0 with size 1427