## Setup

In [None]:
import pandas as pd
import numpy as np
import scipy.io as sio
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from random_survival_forest import RandomSurvivalForest, concordance_index

np.random.seed(1337)

In [None]:
D = sio.loadmat('data.mat')
#dict_keys(['__header__', '__version__', '__globals__', 'Censored', 'Integ_Symb_Types', 'Integ_Symbs', 'Integ_X', 'Integ_X_raw', 'Patients', 'Subtypes', 'Survival'])

T = np.asarray([t[0] for t in D['Survival']]).astype('float32')
O = 1 - np.asarray([c[0] for c in D['Censored']]).astype('int32')
X = D['Integ_X_raw'].astype('float32') # not sure if this should be 'Integ_X_raw' or 'Integ_X'
X_headers = list(D['Integ_Symbs'])

df = pd.DataFrame(data=X, columns=X_headers)
df['Survival'] = T
df['Censored'] = O
print(df.shape)
df.head()

In [None]:
y = df.iloc[:,-2:]
X = df.iloc[:,:-2]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
print("shape of X Train :"+str(X_train.shape))
print("shape of X Test :"+str(X_test.shape))
print("shape of Y Train :"+str(y_train.shape))
print("shape of Y Test :"+str(y_test.shape))

## Establish a baseline using Logistic Regression


In [42]:
#logistic Regression

#logistic Regression
y = df[["Survival"]]
y["Survival"] = y["Survival"].values.astype(int)

X = df.drop(columns=["Survival"])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred=logreg.predict(X_test)
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)


#c_index = concordance_index(logreg, y_test, y_pred)
#print('C-index: {:.2f}'.format(c_index))
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
#print("Precision:",metrics.precision_score(y_test, y_pred))
#print("Recall:",metrics.recall_score(y_test, y_pred))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
  return f(**kwargs)


Accuracy: 0.007142857142857143


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

## Establish a baseline using Support Vector Machine (SVM)

In [6]:
!pip install pysurvival

Collecting pysurvival
  Downloading pysurvival-0.1.2-cp37-cp37m-macosx_10_13_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 4.4 MB/s eta 0:00:01
Collecting torch
  Downloading torch-1.9.0-cp37-none-macosx_10_9_x86_64.whl (127.9 MB)
[K     |████████████████████████████████| 127.9 MB 8.8 MB/s eta 0:00:011
Collecting pyarrow
  Downloading pyarrow-5.0.0-cp37-cp37m-macosx_10_13_x86_64.whl (17.6 MB)
[K     |████████████████████████████████| 17.6 MB 15.0 MB/s eta 0:00:01     |████████████████████████████▉   | 15.8 MB 15.0 MB/s eta 0:00:01
Collecting progressbar
  Downloading progressbar-2.5.tar.gz (10 kB)
Building wheels for collected packages: progressbar
  Building wheel for progressbar (setup.py) ... [?25ldone
[?25h  Created wheel for progressbar: filename=progressbar-2.5-py3-none-any.whl size=12082 sha256=68a42346469d994fd8ff1b5d9fd9e1998f59df94d306b89f75d5c38986a1a010
  Stored in directory: /Users/ellahayashi/Library/Caches/pip/wheels/f0/fd/1f/3e35ed57e94cd8ced

In [7]:
from pysurvival.models.svm import LinearSVMModel
from pysurvival.models.simulations import SimulationModel
from pysurvival.utils.metrics import concordance_index
from scipy.stats.stats import pearsonr

ModuleNotFoundError: No module named 'pysurvival'

In [None]:
svm_model = LinearSVMModel()
svm_model.fit(X=X_train, T=y_train["Survival"], E=y_train["Censored"], init_method='he_normal', lr = 0.5,  
              tol = 1e-3,  l2_reg = 1e-3, verbose = False)


In [11]:
c_index = concordance_index(svm_model, X=X_test, T=y_test["Survival"], E=y_test["Censored"])
print('C-index: {:.2f}'.format(c_index))

NameError: name 'svm_model' is not defined

## Establish a baseline using Random Survival Forest (RSF)

In [None]:
from random_survival_forest import RandomSurvivalForest, concordance_index

y = df[["Survival", "Censored"]]
y["Survival"] = y["Survival"].values.astype(int)
X = df.drop(columns=["Survival", "Censored"])
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)


rsf = RandomSurvivalForest(n_estimators=10)
rsf = rsf.fit(X_train, y_train)
y_pred = rsf.predict(X_test)
c_val = concordance_index(y_time=y_test["Survival"], y_pred=y_pred, y_event=y_test["Censored"])

In [None]:
print("C-index", round(c_val, 3))

## Neural Network SurvivalNet Reimplementation

In [None]:
import keras
from keras import layers
import tensorflow as tf

In [None]:
class partial_log_likelihood(keras.losses.Loss):
    def __init__(self, name="partial_log_likelihood"):
        super().__init__(name=name)
    def call(self, y_true, y_pred):
        censored = y_true[:,1]
        y_true = y_true[:,0]
        y_pred = tf.reshape(y_pred, [-1])
        sorted_survival, sorted_indices = tf.math.top_k(y_pred, len(y_true), True)  
        sorted_censored = keras.backend.gather(censored, sorted_indices)
        sorted_preds = keras.backend.gather(y_pred, sorted_indices)

        exp = keras.backend.exp(sorted_preds)
        sums = keras.backend.cumsum(exp)
        sums = sums + tf.convert_to_tensor(1e-2)
        logsums = keras.backend.log(sums)

        return keras.backend.sum(sorted_censored * sorted_preds - logsums) * -1

In [None]:
model = keras.Sequential([
    layers.InputLayer(input_shape=(len(X_train.columns),)),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1),
])

In [None]:
y_train.loc[:,"Censored"] = np.array(y_train.loc[:,"Censored"].values).astype(np.float32)
model.compile(optimizer=keras.optimizers.Adam(),loss=partial_log_likelihood())
model.fit(X_train, y_train.values,batch_size=32, epochs=10)

In [None]:
model.predict(X_test)