# ML survival model for eye cancer patient data

In [1]:
import pandas as pd

# Read the raw patient table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='eye_cancer_patients.csv')

# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Patient_ID            5000 non-null   object
 1   Age                   5000 non-null   int64 
 2   Gender                5000 non-null   object
 3   Cancer_Type           5000 non-null   object
 4   Laterality            5000 non-null   object
 5   Date_of_Diagnosis     5000 non-null   object
 6   Stage_at_Diagnosis    5000 non-null   object
 7   Treatment_Type        5000 non-null   object
 8   Surgery_Status        5000 non-null   bool  
 9   Radiation_Therapy     5000 non-null   int64 
 10  Chemotherapy          5000 non-null   int64 
 11  Outcome_Status        5000 non-null   object
 12  Survival_Time_Months  5000 non-null   int64 
 13  Genetic_Markers       2503 non-null   object
 14  Family_History        5000 non-null   bool  
 15  Country               5000 non-null   

### Data cleaning

In [6]:
# Build the modelling frame WITHOUT touching the raw `df`.
# A plain `df_new = df` only creates a second name for the same object, so
# adding a column through `df_new` would also add it to `df`. `.assign`
# returns a new frame, which keeps this cell free of side effects on `df`.
df_new = (
    df
    # True where Genetic_Markers is present (non-null).
    # NOTE(review): the flag is set for ANY non-null marker value; the name
    # assumes BRAF is the only marker recorded -- confirm against the data.
    .assign(has_BRAF=df['Genetic_Markers'].notna())
    # Drop the identifier, the raw marker column (replaced by the flag above),
    # the diagnosis date and the country; none of them are used as features.
    .drop(columns=['Patient_ID', 'Genetic_Markers', 'Date_of_Diagnosis', 'Country'])
)

# One-hot encode the categorical columns. drop_first=True removes the first
# level of each variable, so the remaining indicators are read relative to
# that dropped level.
df_new = pd.get_dummies(
    df_new,
    columns=['Gender', 'Cancer_Type', 'Laterality', 'Stage_at_Diagnosis', 'Treatment_Type', 'Outcome_Status'],
    drop_first=True,
)
df_new.head()

Unnamed: 0,Age,Surgery_Status,Radiation_Therapy,Chemotherapy,Survival_Time_Months,Family_History,has_BRAF,Gender_M,Gender_Other,Cancer_Type_Melanoma,Cancer_Type_Retinoblastoma,Laterality_Left,Laterality_Right,Stage_at_Diagnosis_Stage II,Stage_at_Diagnosis_Stage III,Stage_at_Diagnosis_Stage IV,Treatment_Type_Radiation,Treatment_Type_Surgery,Outcome_Status_Deceased,Outcome_Status_In Remission
0,58,False,15,3,85,True,False,False,False,False,True,True,False,False,False,True,True,False,True,False
1,15,True,69,6,10,True,False,False,True,False,True,False,True,False,True,False,False,False,False,True
2,64,False,47,6,3,False,True,True,False,False,True,False,False,False,False,True,False,True,False,True
3,33,True,36,6,40,False,False,True,False,True,False,False,True,True,False,False,True,False,False,False
4,8,False,14,14,26,True,True,False,True,False,False,True,False,False,False,False,False,False,False,True


### Preparing the model

In [15]:
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sksurv.util import Surv

# Feature matrix: everything except the survival time and the two outcome
# indicator columns, which would otherwise leak the target into the features.
X = df_new.drop(
    ["Survival_Time_Months", "Outcome_Status_Deceased", "Outcome_Status_In Remission"],
    axis="columns",
)

# Survival target: the event indicator comes from Outcome_Status_Deceased and
# the duration from Survival_Time_Months.
y = Surv.from_dataframe(
    data=df_new,
    time="Survival_Time_Months",
    event="Outcome_Status_Deceased",
)


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
