In [5]:
import pandas as pd
import numpy as np
from numpy import savetxt
from xgbsurv.datasets import (load_metabric, load_flchain, load_rgbsg, load_support, load_tcga)
from xgbsurv.models.utils import sort_X_y_pandas, transform_back, transform
from xgbsurv.models.efron_final import get_cumulative_hazard_function_efron, efron_estimator
import torch
from torch import nn
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.decomposition import PCA
from loss_functions_pytorch import EfronLoss, efron_likelihood_torch
from skorch import NeuralNet
from skorch.callbacks import EarlyStopping, Callback, LRScheduler
from skorch.dataset import ValidSplit
from pycox.evaluation import EvalSurv
from scipy.stats import uniform as scuniform
from scipy.stats import randint as scrandint
from scipy.stats import loguniform as scloguniform
import random
import os
#torch.set_default_dtype(torch.float64)
#torch.set_default_tensor_type(torch.DoubleTensor)

  from .autonotebook import tqdm as notebook_tqdm


## Set Parameters

In [6]:
# set parameters, put into function
n_outer_splits = 5
n_inner_splits = 5
rand_state = 42
n_iter = 10 # set to 50
#n_iter_cind = 200
early_stopping_rounds=15
base_score = 0.0

param_grid_breslow = {
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__num_nodes': [64, 128, 256, 512],
    'estimator__module__dropout': scuniform(0.0,0.7),
    'estimator__optimizer__weight_decay': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01, 0],
    'estimator__batch_size': [64, 128, 256, 512, 1024],
    #lr not in paper because of learning rate finder
    # note: setting learning rate higher would make exp(partial_hazard) explode
    #'estimator__lr': scloguniform(0.001,0.01), # scheduler unten einbauen
    # use callback instead
    'estimator__lr':[0.01]
    #'max_epochs':  scrandint(10,20), # corresponds to num_rounds
}

## Set Seed

In [7]:
def seed_torch(seed=42):
    """Sets all seeds within torch and adjacent libraries.

    Args:
        seed: Random seed to be used by the seeding functions.

    Returns:
        None
    """
    random.seed(seed)
    #os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    #torch.backends.cudnn.benchmark = False
    #torch.backends.cudnn.deterministic = True
    return None


class FixSeed(Callback):
    def __init__(self, seed):
        self.seed = seed

    def initialize(self):
        seed_torch(self.seed)
        return super().initialize()

## Set Loss Function

In [15]:
# Define Scorer
def custom_scoring_function(y_true, y_pred):

        #y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, np.ndarray):
            y_pred = torch.from_numpy(y_pred)
        if isinstance(y_true, np.ndarray):
            y_true = torch.from_numpy(y_true)
        if isinstance(y_pred, pd.Series):
            y_pred = torch.tensor(y_pred.values)
        if isinstance(y_true, pd.Series):
            y_true = torch.tensor(y_true.values)
        score = efron_likelihood_torch(y_true, y_pred) #.to(torch.float32)
        return score.numpy()

scoring_function = make_scorer(custom_scoring_function, greater_is_better=False)

In [16]:
## Set up Custom Splitter

## Set Torch Model

In [17]:

class SurvivalModel(nn.Module):
    def __init__(self, n_layers, input_units, num_nodes, dropout, out_features):
        super(SurvivalModel, self).__init__()
        self.n_layers = n_layers
        self.in_features = input_units
        self.num_nodes = num_nodes
        self.dropout = dropout
        self.out_features = out_features
        model = []
        # first layer
        model.append(torch.nn.Linear(input_units, num_nodes))
        model.append(torch.nn.ReLU())
        model.append(torch.nn.Dropout(dropout))
        model.append(torch.nn.BatchNorm1d(num_nodes))

        for i in range(n_layers-1):
            model.append(torch.nn.Linear(num_nodes, num_nodes))
            #init.kaiming_normal_(model[-1].weight, nonlinearity='relu')
            model.append(torch.nn.ReLU())
            model.append(torch.nn.Dropout(dropout))
            model.append(torch.nn.BatchNorm1d(num_nodes))

        # output layer
        model.append(torch.nn.Linear(num_nodes, out_features))
    
        self.layers = nn.Sequential(*model)

        # for layer in self.layers:
        #     if isinstance(layer, nn.Linear):
        #         #nn.init.uniform_(layer.weight, a=-0.5, b=0.5)
        #         nn.init.kaiming_normal_(layer.weight)


    def forward(self, X):
        X = X.to(torch.float32)
        res = self.layers(X)
        #print(res)
        return res


## Set up Scaler

In [18]:
class CustomStandardScaler(StandardScaler):
    
    def __init__(self, copy=True, with_mean=True, with_std=True):
        super().__init__(copy=copy, with_mean=with_mean, with_std=with_std)
        
    def fit(self, X, y=None):
        return super().fit(X, y)
    
    def transform(self, X, y=None):
        X_transformed = super().transform(X, y)
        return X_transformed.astype(np.float32)
    
    def fit_transform(self, X, y=None):
        X_transformed = super().fit_transform(X, y)
        return X_transformed.astype(np.float32)

## Custom Split

In [19]:
# Define stratified inner k-fold cross-validation
class CustomSplit(StratifiedKFold):
    def __init__(self, n_splits=5, shuffle=True, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def split(self, X, y, groups=None):
        print('split', X.dtypes)
        try:
            if y.shape[1]>1:
                y = y[:,0]
        except:
            pass
        bins = np.sign(y)
        return super().split(X, bins, groups=groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

outer_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)
inner_custom_cv = CustomSplit(n_splits=n_outer_splits, shuffle=True, random_state=rand_state)



## Setting Training Procedure

In [20]:

def train_eval(X, y, net, n_iter, filename):
        model = '_efron_'
        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                        'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        ct = make_column_transformer(
                #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
                (StandardScaler(), make_column_selector(dtype_include=['float32']))
                ,remainder='passthrough')

        pipe = Pipeline([('scaler',ct),
                        ('estimator', net)])
        rs = RandomizedSearchCV(pipe, param_grid_breslow, scoring = scoring_function, n_jobs=-1, 
                                    n_iter=2, refit=True)
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                X_train, y_train = sort_X_y_pandas(X_train, y_train)
                X_test, y_test = sort_X_y_pandas(X_test, y_test)

                print(X_train.shape, type(X_train))
                print(y_train.shape, type(y_train))
                print(X_test.shape, type(X_test))
                print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)




                rs.fit(X_train, y_train)
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                try:
                    cum_hazard_train = get_cumulative_hazard_function_efron(
                            X_train.values, X_train.values, y_train.values, y_train.values,
                            best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                            )

                    df_survival_train = np.exp(-cum_hazard_train)
                    durations_train, events_train = transform_back(y_train.values)
                    time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
                    ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
                    cindex_score_train = ev.concordance_td('antolini')
                    ibs_score_train = ev.integrated_brier_score(time_grid_train)

                    outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
                    outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

                except:
                    outer_scores['cindex_train_'+dataset_name] += [np.nan]
                    outer_scores['ibs_train_'+dataset_name] += [np.nan]
                    
                try:
                    cum_hazard_test = get_cumulative_hazard_function_efron(
                            X_train.values, X_test.values, y_train.values, y_test.values,
                            best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                            )
                    df_survival_test = np.exp(-cum_hazard_test)
                    durations_test, events_test = transform_back(y_test.values)
                    print('durations',durations_test.min(), durations_test.max())
                    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
                    cindex_score_test = ev.concordance_td('antolini')
                    ibs_score_test = ev.integrated_brier_score(time_grid_test)

                    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
                except: 
                    outer_scores['cindex_test_'+dataset_name] += [np.nan]
                    outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
        df_best_params = pd.DataFrame(best_params)
        df_best_model = pd.DataFrame(best_model)
        df_outer_scores = pd.DataFrame(outer_scores)
        df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
        df_metrics.to_csv('metrics/metric_summary'+model+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test

                

In [21]:
data_set_fns = [load_metabric,  load_flchain, load_rgbsg, load_support] #, load_flchain, load_rgbsg, load_support, load_tcga]
data_set_fns_str = ['load_metabric', 'load_flchain', 'load_rgbsg', 'load_support'] 
one_hot_dict = {'load_flchain': ['mgus'], 'load_support':['cancer'], 'load_rgbsg':['grade']}

for idx, dataset in enumerate(data_set_fns):
    # get name of current dataset
    data = dataset(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()

    print(data_set_fns_str[idx])
    if data_set_fns_str[idx] in one_hot_dict.keys():
        X = pd.get_dummies(X, columns=one_hot_dict[data_set_fns_str[idx]])
    X, y = sort_X_y_pandas(X, y)
    
    net = NeuralNet(
        SurvivalModel, 
        module__n_layers = 1,
        module__input_units = X.shape[1],
        #module__num_nodes = 32,
        #module__dropout = 0.1, # these could also be removed
        module__out_features = 1,
        # for split sizes when result size = 1
        iterator_train__drop_last=True,
        #iterator_valid__drop_last=True,
        criterion=EfronLoss,
        optimizer=torch.optim.AdamW,
        optimizer__weight_decay = 0.4,
        batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
        callbacks=[
            (
                "sched",
                LRScheduler(
                    torch.optim.lr_scheduler.ReduceLROnPlateau,
                    monitor="valid_loss",
                    patience=5,
                ),
            ),
            (
                "es",
                EarlyStopping(
                    monitor="valid_loss",
                    patience=10,
                    load_best=True,
                ),
            ),
            ("seed", FixSeed(seed=42)),
        ],
        
        #[EarlyStopping(patience=10)],
        # add extensive callback, and random number seed
        #TODO: enable stratification, verify
        train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
        #lr=0.001,
        #max_epochs=1, #0,#100
        #train_split=None,
        verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)


load_metabric
split MKI67                float32
EGFR                 float32
PGR                  float32
ERBB2                float32
hormone_treatment    float32
radiotherapy         float32
chemotherapy         float32
ER_positive          float32
age                  float32
dtype: object
(1522, 9) <class 'pandas.core.frame.DataFrame'>
(1522,) <class 'pandas.core.series.Series'>
(381, 9) <class 'pandas.core.frame.DataFrame'>
(381,) <class 'pandas.core.series.Series'>
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1442.8818[0m      [32m559.0554[0m  0.0404
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1553.1927[0m      [32m616.2332[0m  0.0416
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1716.5940[0m      [32m617.3030[0m  0.0468
  epoch    train_loss    valid_loss     dur
-------  ------------  ------

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


      6      516.4738  0.0116
      3      743.8466  0.0075
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m271.3890[0m      [32m222.2282[0m  0.0933
      7      606.7001  0.0103
      7      712.1039  0.0081
      5      776.0490  0.0103
      4      743.8466  0.0073
      8      712.1039  0.0071
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m287.2089[0m      [32m263.8742[0m  0.0989
      7      516.4738  0.0127
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m304.2133[0m      [32m322.8533[0m  0.0931
      8      606.7001  0.0123
      6      776.0490  0.0105
      5      743.8466  0.0093
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m299.3781[0m      [32m306.7532[0m  0.1011
      9      712.1039  0.0093
      8      516.4738  0.0079
      9   

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


      8      713.9573  0.0077
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m305.2678[0m      [32m303.0040[0m  0.1053
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m311.5889[0m      [32m303.7360[0m  0.0990
      9      703.7762  0.0104
      7      807.8309  0.0093
      9      713.9573  0.0088
     10      703.7762  0.0084
Restoring best model from epoch 1.
      8      807.8309  0.0087
     10      713.9573  0.0073
Restoring best model from epoch 1.
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m317.1685[0m      [32m343.5718[0m  0.1124
      9      807.8309  0.0090
     10      807.8309  0.0088
Restoring best model from epoch 1.
      2      [36m272.9078[0m      [32m256.4216[0m  0.0984
      2      [36m254.6189[0m      [32m220.8305[0m  0.0943
      2      [36m300.8359[0m      304.2607  0.092

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


      6      632.6849  0.0110
      5      674.6183  0.0095
      4      718.4949  0.0099
      8      606.8842  0.0079
      5      718.4949  0.0072
      7      632.6849  0.0083
      9      558.2759  0.0170
      6      674.6183  0.0134
      6      718.4949  0.0069
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m252.1219[0m      [32m233.0888[0m  0.1120
      8      632.6849  0.0073
      9      606.8842  0.0181
      7      674.6183  0.0077
     10      558.2759  0.0167
Restoring best model from epoch 1.
      7      718.4949  0.0167
     10      606.8842  0.0134
Restoring best model from epoch 1.
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m277.5827[0m      [32m255.4323[0m  0.1331
      8      674.6183  0.0196
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m311.5094[0m      [32m289.0778[0m  0.1335


  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


      7      706.8821  0.0071
      5      712.3888  0.0084
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m296.8741[0m      [32m306.2928[0m  0.0948
      9      629.9693  0.0109
      6      676.3395  0.0088
     10      568.2552  0.0085
Restoring best model from epoch 1.
      6      712.3888  0.0073
     10      629.9693  0.0069
Restoring best model from epoch 1.
      7      676.3395  0.0071
      8      706.8821  0.0084
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m324.7582[0m      [32m305.5092[0m  0.0942
      9      706.8821  0.0071
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m319.4371[0m      [32m288.9980[0m  0.1064
      7      712.3888  0.0108
      8      676.3395  0.0121
      8      712.3888  0.0072
     10      706.8821  0.0095
Restoring best model from epoch 1.
      9      676.3395  0.

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1132.7626[0m      [32m934.3302[0m  0.2093
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2247.2872[0m     [32m1354.1697[0m  0.2184
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2553.7782[0m     [32m1860.8048[0m  0.2272
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2715.9485[0m     [32m1997.5682[0m  0.2459
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2676.3245[0m     [32m2063.7544[0m  0.2349
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m76.5684[0m       [32m79.8806[0m  0.3255
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.7892602373896233
Integrated Brier Score: 0.10264072727155817
durations 1.0 5215.0
Concordance Index 0.7999081261484231
Integrated Brier Score: 0.10374214047695596
(6297, 9) <class 'pandas.core.frame.DataFrame'>
(6297,) <class 'pandas.core.series.Series'>
(1574, 9) <class 'pandas.core.frame.DataFrame'>
(1574,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m32.6089[0m       [32m31.6413[0m  0.4244
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m44.8537[0m       [32m45.2798[0m  0.4336
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m60.2694[0m       [32m58.6035[0m  0.4341
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m66.1613[0m       [32m63.6303[0m  0.4606
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m67.7624[0m       [32m63.8072[0m  0.4610
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m188.8795[0m      [32m181.1588[0m  0.4677
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.7988506378373427
Integrated Brier Score: 0.09912649479415031
durations 1.0 5123.0
Concordance Index 0.8048386048627422
Integrated Brier Score: 0.09776734913552253
(6297, 9) <class 'pandas.core.frame.DataFrame'>
(6297,) <class 'pandas.core.series.Series'>
(1574, 9) <class 'pandas.core.frame.DataFrame'>
(1574,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m31.9348[0m       [32m30.8790[0m  0.3957
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m61.6850[0m       [32m55.2658[0m  0.4109
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m46.9085[0m       [32m40.2280[0m  0.4324
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m66.7236[0m       [32m62.2651[0m  0.4378
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m68.6133[0m       [32m63.8959[0m  0.4331
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1           nan      [32m185.4028[0m  0.4611
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m301

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.8015267130949442
Integrated Brier Score: 0.09801181463394257
durations 1.0 5166.0
Concordance Index 0.7869990755441599
Integrated Brier Score: 0.10219977438413154
(6297, 9) <class 'pandas.core.frame.DataFrame'>
(6297,) <class 'pandas.core.series.Series'>
(1574, 9) <class 'pandas.core.frame.DataFrame'>
(1574,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m44.2740[0m       [32m46.1580[0m  0.3926
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m30.4116[0m       [32m31.8677[0m  0.3959
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m66.2189[0m       [32m62.3383[0m  0.4357
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m67.6241[0m       [32m65.2236[0m  0.4337
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m60.9366[0m       [32m57.2915[0m  0.4740
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m359.4213[0m      [32m337.0289[0m  0.4958
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.8068622647283072
Integrated Brier Score: 0.0975460191416738
durations 1.0 5171.0
Concordance Index 0.778009361243848
Integrated Brier Score: 0.10135364249261401
(6297, 9) <class 'pandas.core.frame.DataFrame'>
(6297,) <class 'pandas.core.series.Series'>
(1574, 9) <class 'pandas.core.frame.DataFrame'>
(1574,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m31.2599[0m       [32m30.1748[0m  0.3788
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m45.8949[0m       [32m43.6916[0m  0.3961
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m61.8435[0m       [32m56.2228[0m  0.4015
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m66.8608[0m       [32m63.2777[0m  0.4277
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       [36m67.9818[0m       [32m63.2852[0m  0.4353
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m279.1763[0m      [32m275.9826[0m  0.4683
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.7963675977542859
Integrated Brier Score: 0.10133199504691294
durations 1.0 5187.0
Concordance Index 0.7944194432910247
Integrated Brier Score: 0.09900474177011949
load_rgbsg
split horm_treatment      category
menopause           category
age                  float32
n_positive_nodes     float32
progesterone         float32
estrogene            float32
grade_0.0              uint8
grade_1.0              uint8
grade_2.0              uint8
dtype: object
(1785, 9) <class 'pandas.core.frame.DataFrame'>
(1785,) <class 'pandas.core.series.Series'>
(447, 9) <class 'pandas.core.frame.DataFrame'>
(447,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m106.4342[0m       [32m90.5647[0m  0.1168
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m112.1225[0m       [32m92.7967[0m  0.1223
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m118.9879[0m       [32m99.9257[0m  0.1300
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m134.5665[0m      [32m115.6042[0m  0.1326
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m153.2099[0m      [32m134.9823[0m  0.1351
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m715.1565[0m      [32m518.3607[0m  0.1405
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m103.7590[0m       [32m92.6279[0m  0.0787
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m107.6924[0m      [32m101.0999[0m  0.0808
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m115.9881[0m      [32m104.0756[0m  0.0922
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m131.2614[0m      [32m118.7590[0m  0.0918
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m149.4000[0m      [32m141.0423[0m  0.0957
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m257.8528[0m      [32m217.2874[0m  0.0972
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/var/folders/jr/dh6mkdzs31lc5pkqymtdbh180000gp/T/ipykernel_83010/1822012356.py", line 14, in custom_scoring_function
AttributeError: 'int' object has no attribute 'numpy'

Traceback (most recent call last):
  File "/Users/JUSC/miniconda3/envs/xgbsurv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Users/JUSC/miniconda3/envs/xgbsurv/l

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m119.0548[0m      [32m109.9539[0m  0.1150
      2      [36m116.2713[0m      [32m109.0147[0m  0.1072
      3      [36m115.5427[0m      [32m108.7427[0m  0.1109
      4      [36m115.1563[0m      108.7938  0.1062
      5      [36m114.9808[0m      [32m108.6775[0m  0.1068
      6      115.3082      108.7091  0.1057
      7      [36m114.6407[0m      108.6842  0.1058
      8      114.7449      108.6929  0.1063
      9      115.1023      108.8166  0.1059
     10      114.8224      [32m108.4576[0m  0.1037
Concordance Index 0.6724687505494662
Integrated Brier Score: 0.17672688701514408
durations 1.87269 84.0
Concordance Index 0.6529077184775675
Integrated Brier Score: 0.18988832939225028
(1786, 9) <class 'pandas.core.frame.DataFrame'>
(1786,) <class 'pandas.core.series.Series'>
(446, 9) <class 'pandas.core.frame.DataFrame'>
(446,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m104.8007[0m       [32m93.2880[0m  0.0881
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m106.6483[0m       [32m95.9274[0m  0.0881
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m116.8657[0m      [32m104.2338[0m  0.0884
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m258.2902[0m      [32m220.0420[0m  0.0809
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m130.6302[0m      [32m120.1868[0m  0.0959
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m258.5566[0m      [32m223.8082[0m  0.1022
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m108.6369[0m       [32m95.2676[0m  0.0881
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m105.0174[0m       [32m89.4833[0m  0.0955
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m115.6564[0m      [32m100.0909[0m  0.0919
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m132.6349[0m      [32m118.2267[0m  0.0945
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m151.0288[0m      [32m136.1281[0m  0.0944
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m250.8967[0m      [32m212.7052[0m  0.0977
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m101.7282[0m       [32m87.4560[0m  0.0829
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m116.8231[0m       [32m99.8416[0m  0.0833
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m110.2602[0m       [32m93.0723[0m  0.0901
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m261.6484[0m      [32m208.3373[0m  0.0919
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m132.5224[0m      [32m120.3552[0m  0.1036
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m285.9399[0m      [32m238.1077[0m  0.0878
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic
  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m124.0263[0m      [32m124.0201[0m  0.4027
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m131.3084[0m      [32m130.6045[0m  0.3984
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m123.6923[0m      [32m124.2135[0m  0.4292
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m166.0955[0m      [32m165.3362[0m  0.4335
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m302.9884[0m      [32m298.4634[0m  0.4317
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m301.5585[0m      [32m298.2152[0m  0.4425
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.5975996427595404
Integrated Brier Score: 0.19579720445357937
durations 3.0 2024.0
Concordance Index 0.5871250741306627
Integrated Brier Score: 0.20000452814229397
(7098, 16) <class 'pandas.core.frame.DataFrame'>
(7098,) <class 'pandas.core.series.Series'>
(1775, 16) <class 'pandas.core.frame.DataFrame'>
(1775,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1631.6097[0m     [32m1522.2317[0m  0.3463
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1631.7936[0m     [32m1519.3457[0m  0.3516
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1720.1249[0m     [32m1605.3218[0m  0.3682
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2098.0450[0m     [32m1914.6310[0m  0.4138
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2197.3352[0m     [32m1998.2247[0m  0.4333
      2     [36m1607.6857[0m     1522.6580  0.3414
      2     [36m1605.9679[0m     [32m1518.1426[0m  0.3684
      2     [36m1692.4287[0m     1606.2024  0.3564
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  -

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.6022286148330224
Integrated Brier Score: 0.1982350939466629
durations 3.0 2026.0
Concordance Index 0.5964557206194817
Integrated Brier Score: 0.19771296061445162
(7098, 16) <class 'pandas.core.frame.DataFrame'>
(7098,) <class 'pandas.core.series.Series'>
(1775, 16) <class 'pandas.core.frame.DataFrame'>
(1775,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1654.3765[0m     [32m1519.6995[0m  0.3524
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1654.6725[0m     [32m1523.7416[0m  0.3603
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1730.8246[0m     [32m1593.3090[0m  0.3562
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2087.5662[0m     [32m1924.8121[0m  0.3583
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2200.7100[0m     [32m2009.9210[0m  0.4210
      2     [36m1620.5692[0m     1519.8488  0.3299
      2     [36m1622.0004[0m     1524.3564  0.3623
      2     [36m1700.0514[0m     [32m1592.9277[0m  0.3755
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  -

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.5994081344585557
Integrated Brier Score: 0.1962765548198142
durations 3.0 2029.0
Concordance Index 0.5858071847691031
Integrated Brier Score: 0.19820621552043144
(7099, 16) <class 'pandas.core.frame.DataFrame'>
(7099,) <class 'pandas.core.series.Series'>
(1774, 16) <class 'pandas.core.frame.DataFrame'>
(1774,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1654.3526[0m     [32m1499.3692[0m  0.3417
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1658.1699[0m     [32m1503.6561[0m  0.3483
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1744.4500[0m     [32m1583.7843[0m  0.3373
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2102.7757[0m     [32m1890.9436[0m  0.3745
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2194.1981[0m     [32m2024.7054[0m  0.3940
      2     [36m1626.7612[0m     [32m1496.3441[0m  0.3344
      2     [36m1630.1081[0m     [32m1500.7872[0m  0.3542
      2     [36m1714.7822[0m     [32m1582.0932[0m  0.3789
  epoch    train_loss    valid_loss     dur
-------  -----------

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.5868860448721398
Integrated Brier Score: 0.19866395689762062
durations 3.0 2029.0
Concordance Index 0.5967965232950506
Integrated Brier Score: 0.1995458058993539
(7099, 16) <class 'pandas.core.frame.DataFrame'>
(7099,) <class 'pandas.core.series.Series'>
(1774, 16) <class 'pandas.core.frame.DataFrame'>
(1774,) <class 'pandas.core.series.Series'>


  assert pd.Series(self.index_surv).is_monotonic


  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1639.3902[0m     [32m1508.9588[0m  0.3510
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1638.4174[0m     [32m1508.9292[0m  0.3437
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2095.6031[0m     [32m1929.1237[0m  0.3689
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m2211.3889[0m     [32m2045.1043[0m  0.3799
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1     [36m1719.3617[0m     [32m1585.9002[0m  0.4193
      2     [36m1607.8524[0m     1509.5791  0.3552
      2     [36m1608.0039[0m     1509.3027  0.4345
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1      [36m316.3619[0m      [32m296.5996[0m  0

  assert pd.Series(self.index_surv).is_monotonic


Concordance Index 0.5918104177904105
Integrated Brier Score: 0.19864858066171123
durations 3.0 2029.0
Concordance Index 0.5972852111690634
Integrated Brier Score: 0.19577887728238652


  assert pd.Series(self.index_surv).is_monotonic


## TCGA

In [None]:
param_grid_breslow_tcga = {
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__n_layers': [1, 2, 4],
    'estimator__module__num_nodes': [64, 128, 256, 512],
    'estimator__module__dropout': scuniform(0.0,0.7),
    'estimator__optimizer__weight_decay': [0.4, 0.2, 0.1, 0.05, 0.02, 0.01, 0],
    'estimator__batch_size': [64, 128, 256, 512, 1024],
    #lr not in paper because of learning rate finder
    # note: setting learning rate higher would make exp(partial_hazard) explode
    #'estimator__lr': scloguniform(0.001,0.01), # scheduler unten einbauen
    # use callback instead
    'estimator__lr':[0.01],
    #'estimator__max_epochs':  scrandint(10,20), # corresponds to num_rounds
    'pca__n_components': [8, 10, 12, 14, 16]
}

In [None]:

def train_eval(X, y, net, n_iter, filename):
        model = '_efron_'
        dataset_name = filename.split('_')[0]
        # add IBS later
        outer_scores = {'cindex_train_'+dataset_name:[], 'cindex_test_'+dataset_name:[],
                        'ibs_train_'+dataset_name:[], 'ibs_test_'+dataset_name:[]}
        best_params = {'best_params_'+dataset_name:[]}
        best_model = {'best_model_'+dataset_name:[]}
        ct = make_column_transformer(
                #(OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=['category', 'object']))
                (StandardScaler(), make_column_selector(dtype_include=['float32']))
                ,remainder='drop')
        pipe = Pipeline([('scaler',ct),
                         ('pca', PCA()),#n_components=10
                        ('estimator', net)])
        rs = RandomizedSearchCV(pipe, param_grid_breslow_tcga, scoring = scoring_function, n_jobs=-1, 
                                    n_iter=2, refit=True)
        for i, (train_index, test_index) in enumerate(outer_custom_cv.split(X, y)):
                # Split data into training and testing sets for outer fold
                X_train, X_test = X.iloc[train_index], X.iloc[test_index]
                y_train, y_test = y.iloc[train_index], y.iloc[test_index]
                X_train, y_train = sort_X_y_pandas(X_train, y_train)
                X_test, y_test = sort_X_y_pandas(X_test, y_test)

                #print(X_train.shape, type(X_train))
                #print(y_train.shape, type(y_train))
                #print(X_test.shape, type(X_test))
                #print(y_test.shape, type(y_test))
                # save splits and data
                savetxt('splits/train_index_'+str(i)+'_'+filename, train_index, delimiter=',')
                savetxt('splits/test_index_'+str(i)+'_'+filename, test_index, delimiter=',')
                
                savetxt('splits/X_train_'+str(i)+'_'+filename, X_train, delimiter=',')
                savetxt('splits/X_test_'+str(i)+'_'+filename, X_test, delimiter=',')

                savetxt('splits/y_train_'+str(i)+'_'+filename, y_train, delimiter=',')
                savetxt('splits/y_test_'+str(i)+'_'+filename, y_test, delimiter=',')

                strat = np.sign(y_train)
                valid_split = ValidSplit(cv=0.1, stratified=strat, random_state=42)




                rs.fit(X_train, y_train)
                best_preds_train = rs.best_estimator_.predict(X_train)
                best_preds_test = rs.best_estimator_.predict(X_test)
                # save hyperparameter settings
                params = rs.best_estimator_.get_params
                best_params['best_params_'+dataset_name] += [rs.best_params_]
                best_model['best_model_'+dataset_name] += [params]
                try:
                    cum_hazard_train = get_cumulative_hazard_function_efron(
                            X_train.values, X_train.values, y_train.values, y_train.values,
                            best_preds_train.reshape(-1), best_preds_train.reshape(-1)
                            )

                    df_survival_train = np.exp(-cum_hazard_train)
                    durations_train, events_train = transform_back(y_train.values)
                    time_grid_train = np.linspace(durations_train.min(), durations_train.max(), 100)
                    ev = EvalSurv(df_survival_train, durations_train, events_train, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_train))
                    cindex_score_train = ev.concordance_td('antolini')
                    ibs_score_train = ev.integrated_brier_score(time_grid_train)

                    outer_scores['cindex_train_'+dataset_name] += [cindex_score_train]
                    outer_scores['ibs_train_'+dataset_name] += [ibs_score_train]

                except:
                    outer_scores['cindex_train_'+dataset_name] += [np.nan]
                    outer_scores['ibs_train_'+dataset_name] += [np.nan]
                    
                try:
                    cum_hazard_test = get_cumulative_hazard_function_efron(
                            X_train.values, X_test.values, y_train.values, y_test.values,
                            best_preds_train.reshape(-1), best_preds_test.reshape(-1)
                            )
                    df_survival_test = np.exp(-cum_hazard_test)
                    durations_test, events_test = transform_back(y_test.values)
                    print('durations',durations_test.min(), durations_test.max())
                    time_grid_test = np.linspace(durations_test.min(), durations_test.max(), 100)
                    ev = EvalSurv(df_survival_test, durations_test, events_test, censor_surv='km')
                    print('Concordance Index',ev.concordance_td('antolini'))
                    print('Integrated Brier Score:',ev.integrated_brier_score(time_grid_test))
                    cindex_score_test = ev.concordance_td('antolini')
                    ibs_score_test = ev.integrated_brier_score(time_grid_test)

                    outer_scores['cindex_test_'+dataset_name] += [cindex_score_test]
                    outer_scores['ibs_test_'+dataset_name] += [ibs_score_test]
                except: 
                    outer_scores['cindex_test_'+dataset_name] += [np.nan]
                    outer_scores['ibs_test_'+dataset_name] += [np.nan]
            
                df_best_params = pd.DataFrame(best_params)
                df_best_model = pd.DataFrame(best_model)
                df_outer_scores = pd.DataFrame(outer_scores)
                df_metrics = pd.concat([df_best_params,df_best_model,df_outer_scores], axis=1)
                df_metrics.to_csv('metrics/metric_summary'+model+str(i)+'_'+filename, index=False)
        return best_model, best_params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test

                
#cv=inner_custom_cv,pipe

In [None]:
cancer_types = [
    'BLCA',
    'BRCA',
    'HNSC',
    'KIRC',
    'LGG',
    'LIHC',
    'LUAD',
    'LUSC',
    'OV',
    'STAD']
import skorch.callbacks

class InputShapeSetter(skorch.callbacks.Callback):
    def on_train_begin(self, net, X, y):
        net.set_params(module__input_units=X.shape[-1])

for idx, cancer_type in enumerate(cancer_types):
    # get name of current dataset
    data = load_tcga(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", cancer_type=cancer_type, as_frame=True)
    X  = data.data #.astype(np.float32)
    y = data.target #.values #.to_numpy()

    X, y = sort_X_y_pandas(X, y)

    net = NeuralNet(
        SurvivalModel, 
        module__n_layers = 1,
        module__input_units = X.shape[1],
        #module__num_nodes = 32,
        #module__dropout = 0.1, # these could also be removed
        module__out_features = 1,
        # for split sizes when result size = 1
        iterator_train__drop_last=True,
        #iterator_valid__drop_last=True,
        criterion=EfronLoss,
        optimizer=torch.optim.AdamW,
        optimizer__weight_decay = 0.4,
        batch_size=32, # separate train and valid->iterator_train__batch_size=128 and iterator_valid__batch_size=128 ?
        callbacks=[
            (
                "sched",
                LRScheduler(
                    torch.optim.lr_scheduler.ReduceLROnPlateau,
                    monitor="valid_loss",
                    patience=5,
                ),
            ),
            (
                "es",
                EarlyStopping(
                    monitor="valid_loss",
                    patience=10,
                    load_best=True,
                ),
            ),
            ("seed", FixSeed(seed=42)),
            ("Input Shape Setter",InputShapeSetter())
        ],#[EarlyStopping(patience=10)],#,InputShapeSetter()],
        #TODO: enable stratification, verify
        train_split=ValidSplit(0.2), # might cause lower performance in metrics, explain in thesis
        #lr=0.001,
        #max_epochs=1, #0,#100
        #train_split=None,
        verbose=1
    )
    best_model,params, outer_scores, best_preds_train, best_preds_test, X_train, X_test, y_train, y_test = train_eval(X, y, net, n_iter, data.filename)

split gex_?|100130426      float32
gex_?|100133144      float32
gex_?|100134869      float32
gex_?|10357          float32
gex_?|10431          float32
                      ...   
gex_ZYG11A|440590    float32
gex_ZYG11B|79699     float32
gex_ZYX|7791         float32
gex_ZZEF1|23140      float32
gex_ZZZ3|26009       float32
Length: 20531, dtype: object
Re-initializing module because the following parameters were re-set: module__input_units.
Re-initializing criterion.
Re-initializing optimizer.
  epoch    valid_loss     dur
-------  ------------  ------
      1      [36m111.4011[0m  0.0027
      2      111.4011  0.0016
      3      111.4011  0.0015
      4      111.4011  0.0015
      5      111.4011  0.0015
      6      111.4011  0.0015
      7      111.4011  0.0015
      8      111.4011  0.0015
      9      111.4011  0.0015
     10      111.4011  0.0022
Restoring best model from epoch 1.
Re-initializing module because the following parameters were re-set: module__input_units.
Re-initi