In [1]:
from xgbsurv.datasets import load_metabric, load_flchain, load_rgbsg, load_support
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

## FLCHAIN

In [2]:
data = load_flchain(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X.dtypes

age           float32
sex              bool
sample_yr     float32
kappa         float32
lambda        float32
flc_grp       float32
creatinine    float32
mgus          float32
dtype: object

In [3]:
y  = data.target
y.dtypes

dtype('float32')

In [4]:
y.shape

(7871,)

## RGBSG

In [5]:

def rgbsg_preprocess(path="add your path here"):
    filename="original_data/RGBSG_pycox_full.csv"
    df = pd.read_csv(path+filename)
    # drop death cause column
    # name columns
    df.columns = [
        "horm_treatment",
        "grade",
        "menopause",
        "age",
        "n_positive_nodes",
        "progesterone",
        "estrogene",
        "time",
        "event"
        ]
    # remove zero time observations
    df = df[df.time!=0]
    # sort data
    df.sort_values(by='time', ascending=True, inplace=True)
    # save data
    df.to_csv(path+"data/RGBSG_adapted.csv", index=False)
    return

In [6]:
rgbsg_preprocess(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/")
data = load_rgbsg(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
X  = data.data
X.dtypes


horm_treatment         bool
grade               float32
menopause              bool
age                 float32
n_positive_nodes    float32
progesterone        float32
estrogene           float32
dtype: object

In [7]:
X.horm_treatment.value_counts()

False    1647
True      585
Name: horm_treatment, dtype: int64

In [8]:
X

Unnamed: 0,horm_treatment,grade,menopause,age,n_positive_nodes,progesterone,estrogene
0,False,1.0,False,50.0,2.0,90.0,30.0
1,True,2.0,True,57.0,18.0,11.0,13.0
2,False,2.0,False,44.0,19.0,28.0,31.0
3,False,0.0,False,50.0,1.0,1.0,4.0
4,False,1.0,False,51.0,5.0,360.0,57.0
...,...,...,...,...,...,...,...
2227,False,1.0,True,80.0,1.0,875.0,534.0
2228,True,1.0,True,59.0,4.0,4.0,3.0
2229,False,1.0,False,43.0,1.0,22.0,0.0
2230,True,1.0,True,57.0,4.0,16.0,5.0


## Metabric

## SUPPORT

In [14]:
data = load_support(path="/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/", as_frame=True)
#/Users/JUSC/Documents/xgbsurv/xgbsurv/datasets/data/SUPPORT_adapted.csv
X  = data.data
X.dtypes

age                  float32
sex                 category
race                 float32
n_comorbidities      float32
diabetes            category
dementia            category
cancer              category
blood_pressure       float32
heart_rate           float32
respiration_rate     float32
temperature          float32
white_blood_cell     float32
serum_sodium         float32
serum_creatinine     float32
dtype: object

In [15]:
X.isnull().sum()

age                 0
sex                 0
race                0
n_comorbidities     0
diabetes            0
dementia            0
cancer              0
blood_pressure      0
heart_rate          0
respiration_rate    0
temperature         0
white_blood_cell    0
serum_sodium        0
serum_creatinine    0
dtype: int64

In [11]:
X.diabetes.value_counts()

0.0    7150
1.0    1723
Name: diabetes, dtype: int64

In [12]:
X.dementia.value_counts()

0.0    8586
1.0     287
Name: dementia, dtype: int64

In [13]:
X.cancer.value_counts()

1.0    5857
0.0    1786
2.0    1230
Name: cancer, dtype: int64

In [26]:
import numpy as np
from sklearn.datasets import make_regression
import torch
from torch import nn
import torch.nn.functional as F


# This is a toy dataset for binary classification, 1000 data points with 20 features each
X, y = make_regression(1000, 20, n_informative=10, random_state=0, n_targets=2)
X, y = X.astype(np.float32), y.astype(np.float32)

class ClassifierModule(nn.Module):
    def __init__(
            self,
            num_units=10,
            nonlin=F.relu,
            dropout=0.5,
    ):
        super(ClassifierModule, self).__init__()
        self.num_units = num_units
        self.nonlin = nonlin
        self.dropout = dropout

        self.dense0 = nn.Linear(20, num_units)
        self.nonlin = nonlin
        self.dropout = nn.Dropout(dropout)
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 2)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = F.relu(self.dense1(X))
        X = F.softmax(self.output(X), dim=-1)
        return X
    


from skorch import NeuralNetRegressor

net = NeuralNetRegressor(
    ClassifierModule,
    max_epochs=20,
    lr=0.1,
#     device='cuda',  # uncomment this to train with CUDA
)



# Training the network
net.fit(X, y)






  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1    [36m32074.7522[0m    [32m31853.7147[0m  0.0321
      2    [36m32054.3299[0m    31856.0969  0.0092
      3    32055.2980    [32m31851.9863[0m  0.0098
      4    [36m32053.1770[0m    31852.8633  0.0096
      5    32053.5234    [32m31851.8642[0m  0.0093
      6    32053.3120    31852.2989  0.0093
      7    [36m32052.7355[0m    [32m31851.8258[0m  0.0091
      8    [36m32052.5515[0m    [32m31851.6427[0m  0.0101
      9    [36m32052.1894[0m    [32m31851.6353[0m  0.0093
     10    32052.3781    31851.6450  0.0086
     11    [36m32052.1515[0m    31851.9789  0.0092
     12    32053.4512    [32m31851.5747[0m  0.0089
     13    [36m32051.5519[0m    31851.6219  0.0097
     14    [36m32051.4684[0m    31851.6134  0.0093
     15    32051.7034    [32m31851.5312[0m  0.0090
     16    32051.7598    31851.5341  0.0099
     17    32051.5032    31851.5541  0.0093
     18    

<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=ClassifierModule(
    (dense0): Linear(in_features=20, out_features=10, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
    (dense1): Linear(in_features=10, out_features=10, bias=True)
    (output): Linear(in_features=10, out_features=2, bias=True)
  ),
)

In [25]:
y

array([  57, -144,  -61,  -18,  -45,  -47,  307, -150,    1,  -20,  -15,
        291, -105,  -14,  151,  156, -373,   67,   19,   -4,  128,    9,
         42, -150,   91,   67,  173,   28,  135,  146,  151,  -94,  389,
         27, -210,   21, -308, -241,   97, -311,  197,  -65,  -75,  -97,
       -145,  399,   43,   -3,   61,    1,  324, -482,   86,   94,  -42,
        194,   31, -261,  -31, -242,  337, -424, -196,  154,  -22,   28,
       -150,  135,  -94,  246,  103,   23,   19, -300,  110,   80,  105,
        129,  158,  134,  -86, -149, -160,   83,   44, -356,   62, -264,
        578, -264,   22, -633,   28,  137,   97,   12,  231,   67,  -29,
       -166,  187, -277,   35,  -12, -286,  120, -128, -151, -320,   75,
       -308,   32, -157, -168, -336,  244,  190,  115, -108,   19,   47,
        175,  286,  105, -293,  286,  137, -169, -384,  204,  172, -142,
       -129,   16, -134,  279,  -56, -277,   -4,  238,  280,   66,  -60,
       -460,  128,  131, -283,  615,  165,  -15,  -