In [222]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import ShuffleSplit, GridSearchCV,train_test_split

from sksurv.datasets import load_veterans_lung_cancer
from sksurv.column import encode_categorical
from sksurv.metrics import concordance_index_censored
from sksurv.svm import FastSurvivalSVM,FastKernelSurvivalSVM
from sksurv.kernels import clinical_kernel

In this notebook, we create the different features that we want to put in our evaluation model with a dictionary. After that,
we use a gridsearch on SVM survival to find a good set of parameters and we train our model 25 times with the best parameters find using the training and validation set. Finally, we evaluate 25 times our 
model on a separate testing set.

In [223]:
def score_survival_model(model, X, y):
    prediction = model.predict(X)
    result = concordance_index_censored(y['Status'], y['Survival_in_days'], prediction)
    return result[0]

In [224]:
all_features = list(range(177))
eln_clin = [0]+list(range(168,175))
eln_clin_demo = [0]+list(range(168,177))
eln_clin_demo_cyto = [0]+list(range(84,153))+list(range(168,177))
eln_clin_demo_gen = list(range(84))+list(range(168,177))
eln_clin_demo_cyto_gen = list(range(153))+list(range(168,177))
eln_clin_demo_comp = [0]+list(range(153,177))
eln_cyto_gen = list(range(153))
eln_cyto_gen_comp = list(range(168))
eln_cyto_comp = [0] + list(range(84,168))
eln_gen_comp = list(range(84)) + list(range(153,168))

clin_demo = list(range(168,177))
clin_demo_cyto = list(range(84,153))+list(range(168,177))
clin_demo_gen = list(range(1,84))+list(range(168,177))
clin_demo_cyto_gen = list(range(1,153))+ list(range(168,177))
clin_demo_comp = list(range(153,177))
cyto_gen = list(range(1,153))
cyto_gen_comp = list(range(1,168))
cyto_comp = list(range(84,168))
gen_comp = list(range(1,84))+list(range(153,168))
clin_demo_cyto_gen_comp = list(range(1,177))
gen = list(range(1,84))
cyto = list(range(84,153))
comp = list(range(153,168))

dict_features_type = dict(zip(("all_features","eln_clin","eln_clin_demo",
         "eln_clin_demo_cyto","eln_clin_demo_gen","eln_clin_demo_cyto_gen",
         "eln_clin_demo_comp","eln_cyto_gen","eln_cyto_gen_comp",
         "eln_cyto_comp","eln_gen_comp","clin_demo",
         "clin_demo_cyto","clin_demo_gen","clin_demo_cyto_gen",
         "clin_demo_comp","cyto_gen","cyto_gen_comp",
         "cyto_comp","gen_comp","clin_demo_cyto_gen_comp",
         "gen","cyto","comp"), (all_features,eln_clin,eln_clin_demo,
         eln_clin_demo_cyto,eln_clin_demo_gen,eln_clin_demo_cyto_gen,
         eln_clin_demo_comp,eln_cyto_gen,eln_cyto_gen_comp,
         eln_cyto_comp,eln_gen_comp,clin_demo,
         clin_demo_cyto,clin_demo_gen,clin_demo_cyto_gen,
         clin_demo_comp,cyto_gen,cyto_gen_comp,
         cyto_comp,gen_comp,clin_demo_cyto_gen_comp,
         gen,cyto,comp)))

df_final = pd.read_table("df_prognosis_features_ready.tsv")

In [225]:
estimator = FastSurvivalSVM(max_iter=1000, tol=1e-6, random_state=17)

In [226]:
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}
cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, iid=False, refit=True,
                   cv=cv)
#refit=True to have the best params for the prediction of next step

In [227]:
df=pd.DataFrame(columns=dict_features_type.keys())

for key,item in dict_features_type.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    ci=[]
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    df[key] = ci


{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939, 0.7229673649613226]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939, 0.7229673649613226, 0.7117386562675567]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939, 0.7229673649613226, 0.7117386562675567, 0.7447334178841434]
{'alpha': 3.1622776601683795e-05, 'optimizer

In [228]:
df.to_csv("SVM_different_features_type.csv")

In [229]:
param_grid = {'alpha': 10. ** np.array([-6,-5.5,-5,-4.5,-2.5,-1,0]),'optimizer':["avltree"]}

cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, iid=False, refit=True,
                   cv=cv)
cv = ShuffleSplit(n_splits=5,random_state=17)
gcv = GridSearchCV(estimator, param_grid, scoring=score_survival_model,
                   n_jobs=4, iid=False, refit=True,
                   cv=cv)
df1=pd.DataFrame(columns=dict_features_type.keys())

for key,item in dict_features_type.items():
    x = df_final.iloc[:,item]
    y = np.array(list(zip(df_final.os_status, df_final.os)),dtype=[('Status', '?'), ('Survival_in_days', '<f8')])
    ci=[]
    for i in range(25):
        X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=i)
        gcv = gcv.fit(X_train,y_train)
        print(gcv.best_params_)
        ci.append(concordance_index_censored(y_test['Status'], y_test['Survival_in_days'], gcv.predict(X_test))[0])
        print(ci)
    df1[key] = ci
df1.to_csv("SVM_bis")

{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939, 0.7229673649613226]
{'alpha': 1e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939, 0.7229673649613226, 0.7117386562675567]
{'alpha': 3.1622776601683795e-05, 'optimizer': 'avltree'}
[0.7029707684006729, 0.6958428009088873, 0.7334145427286357, 0.7185412963051939, 0.7229673649613226, 0.7117386562675567, 0.7447334178841434]
{'alpha': 3.1622776601683795e-05, 'optimizer

In [233]:
pd.read_csv("SVM_bis.csv").describe()

Unnamed: 0.1,Unnamed: 0,all_features,eln_clin,eln_clin_demo,eln_clin_demo_cyto,eln_clin_demo_gen,eln_clin_demo_cyto_gen,eln_clin_demo_comp,eln_cyto_gen,eln_cyto_gen_comp,...,clin_demo_cyto_gen,clin_demo_comp,cyto_gen,cyto_gen_comp,cyto_comp,gen_comp,clin_demo_cyto_gen_comp,gen,cyto,comp
count,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,...,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0,25.0
mean,12.0,0.718816,0.657932,0.709827,0.716638,0.716859,0.719273,0.715505,0.65765,0.656148,...,0.718798,0.709417,0.655246,0.654332,0.627455,0.646887,0.718583,0.640107,0.589054,0.618704
std,7.359801,0.012479,0.010542,0.01297,0.01617,0.012339,0.012535,0.012589,0.013913,0.014747,...,0.01257,0.012908,0.015493,0.015626,0.017246,0.015773,0.012827,0.015338,0.016857,0.015088
min,0.0,0.695843,0.638461,0.692016,0.688085,0.691474,0.696531,0.698339,0.626882,0.626043,...,0.694019,0.683843,0.625299,0.625146,0.595688,0.615428,0.693475,0.611725,0.562938,0.583101
25%,6.0,0.711241,0.650984,0.699252,0.703671,0.707827,0.710476,0.704257,0.647108,0.645983,...,0.71239,0.702739,0.644894,0.645812,0.614527,0.636409,0.712234,0.632081,0.577935,0.60901
50%,12.0,0.718045,0.6569,0.708106,0.718232,0.716004,0.718192,0.71306,0.661636,0.658469,...,0.718317,0.710678,0.657205,0.655822,0.63063,0.647532,0.717708,0.63912,0.588302,0.620948
75%,18.0,0.725477,0.66771,0.715398,0.725119,0.724654,0.725352,0.720912,0.666472,0.665428,...,0.725587,0.720825,0.664503,0.660895,0.637697,0.656618,0.724889,0.653605,0.593923,0.627701
max,24.0,0.744733,0.675592,0.73966,0.74554,0.745557,0.745886,0.743037,0.681882,0.68725,...,0.745079,0.731886,0.683553,0.685734,0.658944,0.679995,0.744585,0.664225,0.628693,0.643295
