## Setup

In [85]:
import pandas as pd
import numpy as np
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from random_survival_forest import RandomSurvivalForest, concordance_index

np.random.seed(1337)

In [86]:
D = sio.loadmat('data.mat')
#dict_keys(['__header__', '__version__', '__globals__', 'Censored', 'Integ_Symb_Types', 'Integ_Symbs', 'Integ_X', 'Integ_X_raw', 'Patients', 'Subtypes', 'Survival'])

T = np.asarray([t[0] for t in D['Survival']]).astype('float32')
O = 1 - np.asarray([c[0] for c in D['Censored']]).astype('int32')
X = D['Integ_X_raw'].astype('float32') # not sure if this should be 'Integ_X_raw' or 'Integ_X'
X_headers = list(D['Integ_Symbs'])

df = pd.DataFrame(data=X, columns=X_headers)
df['Survival'] = T
df['Censored'] = O
print(df.shape)
df.head()

(560, 401)


Unnamed: 0,age_at_initial_pathologic_diagnosis_Clinical,gender-Is-male_Clinical,histological_type-Is-oligoastrocytoma_Clinical,histological_type-Is-astrocytoma_Clinical,histological_type-Is-oligodendroglioma_Clinical,histological_type-Is-glioblastoma multiforme (gbm)_Clinical,histological_type-Is-treated primary gbm_Clinical,histological_type-Is-untreated primary (de novo) gbm_Clinical,radiation_therapy-Is-yes_Clinical,ACADS_Mut,...,CDKN1B_Protein,MAPK14_Protein,TP53_Protein,SQSTM1_Protein,RPS6KB1_Protein,RPS6KB1_Protein.1,RPS6KA1_Protein,RPS6KA1_Protein.1,Survival,Censored
0,50.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.161241,-0.097997,0.108144,0.661336,-0.11131,0.474086,0.377963,0.057217,144.0,1
1,57.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.740407,-0.118854,0.269457,0.116623,-0.573753,-0.587442,-0.52398,0.0,393.0,1
2,53.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.049271,-0.204173,0.426592,-0.379803,-0.1583,0.10067,-0.481545,0.098332,470.0,0
3,86.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,-0.126167,-0.064162,-0.223155,-0.188487,0.292586,-0.044545,-0.018513,0.085689,211.0,1
4,66.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.262911,-0.594671,0.878369,0.137746,-1.202437,0.260542,-0.026632,1.196287,691.0,1


In [87]:
y = df.iloc[:,-2:]
X = df.iloc[:,:-2]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
print("shape of X Train :"+str(X_train.shape))
print("shape of X Test :"+str(X_test.shape))
print("shape of Y Train :"+str(y_train.shape))
print("shape of Y Test :"+str(y_test.shape))

shape of X Train :(420, 399)
shape of X Test :(140, 399)
shape of Y Train :(420, 2)
shape of Y Test :(140, 2)


## Establish a baseline using Logistic Regression


In [117]:
#logistic Regression

#logistic Regression
y = df[["Survival", "Censored"]]
#y = df[["Survival"]]
y["Survival"] = y["Survival"].values.astype(int)
X = df.drop(columns=["Survival", "Censored"])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train["Survival"])
y_pred=logreg.predict(X_test)
#cnf_matrix = metrics.confusion_matrix(y_test["Survival"], y_pred["Censored"])


#c_index = concordance_index(logreg, y_test, y_pred)
#print('C-index: {:.2f}'.format(c_index))
#print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#print("Precision:",metrics.precision_score(y_test, y_pred))
#print("Recall:",metrics.recall_score(y_test, y_pred))
c_val = concordance_index(y_time=y_test["Survival"], y_pred=y_pred, y_event=y_test["Censored"])
print('C-index: {:.2f}'.format(c_val))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


C-index: 0.38


## Establish a baseline using Support Vector Machine (SVM)

In [None]:
!pip install pysurvival

In [None]:
from pysurvival.models.svm import LinearSVMModel
from pysurvival.models.simulations import SimulationModel
from pysurvival.utils.metrics import concordance_index
from scipy.stats.stats import pearsonr

In [None]:
svm_model = LinearSVMModel()
svm_model.fit(X=X_train, T=y_train["Survival"], E=y_train["Censored"], init_method='he_normal', lr = 0.5,  
              tol = 1e-3,  l2_reg = 1e-3, verbose = False)


In [None]:
c_index = concordance_index(svm_model, X=X_test, T=y_test["Survival"], E=y_test["Censored"])
print('C-index: {:.2f}'.format(c_index))

## Establish a baseline using Random Survival Forest (RSF)

In [None]:
from random_survival_forest import RandomSurvivalForest, concordance_index

y = df[["Survival", "Censored"]]
y["Survival"] = y["Survival"].values.astype(int)
X = df.drop(columns=["Survival", "Censored"])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)


rsf = RandomSurvivalForest(n_estimators=10)
rsf = rsf.fit(X_train, y_train)
y_pred = rsf.predict(X_test)
c_val = concordance_index(y_time=y_test["Survival"], y_pred=y_pred, y_event=y_test["Censored"])

In [None]:
print("C-index", round(c_val, 3))

## Neural Network SurvivalNet Reimplementation

In [88]:
import keras
from keras import layers
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation

In [109]:
class partial_log_likelihood(keras.losses.Loss):
    def __init__(self, name="partial_log_likelihood"):
        super().__init__(name=name)
    def call(self, y_true, y_pred):
        censored = y_true[:,1]
        y_true = y_true[:,0]
        y_pred = tf.reshape(y_pred, [-1])
        sorted_survival, sorted_indices = tf.math.top_k(y_true, len(y_true), True) 
        sorted_censored = keras.backend.gather(censored, sorted_indices)
        sorted_preds = keras.backend.gather(y_pred, sorted_indices)
        exp = keras.backend.exp(sorted_preds)
        sums = keras.backend.cumsum(exp) + tf.convert_to_tensor(1.0)
        logsums = keras.backend.log(sums)
        
        return keras.backend.sum(sorted_censored * sorted_preds - sorted_censored*logsums) * -1

In [110]:
model = Sequential()
model.add(Dense(128, activation='relu', input_dim=len(X_train.columns)))
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(1))

In [116]:
y_train.loc[:,"Censored"] = np.array(y_train.loc[:,"Censored"].values).astype(np.float32)
model.compile(optimizer=keras.optimizers.Adam(),loss=partial_log_likelihood())
model.fit(X_train, y_train.values,batch_size=32, epochs=10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


<keras.callbacks.History at 0x7f1918330df0>

In [118]:
concordance_index(y_time=y_test["Survival"], y_pred=model.predict(X_test), y_event=y_test["Censored"])

0.8667285095856524