In [1]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('BrainTumor_recurrance.csv')
data.head()

Unnamed: 0,Patient ID,Age,Gender,Tumor Type,Tumor Grade,Tumor Location,Treatment,Treatment Outcome,Time to Recurrence (months),Recurrence Site,Survival Time (months)
0,1,45,Male,Glioblastoma,IV,Frontal lobe,Surgery,Partial response,10.0,Temporal lobe,18
1,2,55,Female,Meningioma,I,Parietal lobe,Surgery,Complete response,,,36
2,3,60,Male,Astrocytoma,III,Occipital lobe,Surgery + Chemotherapy,Progressive disease,14.0,Frontal lobe,22
3,4,50,Female,Glioblastoma,IV,Temporal lobe,Surgery + Radiation therapy,Complete response,,,12
4,5,65,Male,Astrocytoma,II,Frontal lobe,Surgery + Radiation therapy,Partial response,24.0,Frontal lobe,48


In [3]:
data = data.drop('Patient ID', axis=1)

In [4]:
data['Time to Recurrence (months)'].unique() #As there are no zero values, we will put 0 for nan as non recurrence

array([10., nan, 14., 24.,  8., 12., 18.,  6.,  9., 16., 20., 36., 22.,
       26.])

In [5]:
data['Time to Recurrence (months)'] = data['Time to Recurrence (months)'].fillna(0.0)

In [6]:
for i in data.columns:
    if data[i].dtypes == 'object':
        print(data[i].unique())

['Male' 'Female']
['Glioblastoma' 'Meningioma' 'Astrocytoma']
['IV' 'I' 'III' 'II']
['Frontal lobe' 'Parietal lobe' 'Occipital lobe' 'Temporal lobe']
['Surgery' 'Surgery + Chemotherapy' 'Surgery + Radiation therapy'
 'Surgery + Radiation' 'Chemotherapy' 'Radiation'
 'Chemotherapy + Radiation']
['Partial response' 'Complete response' 'Progressive disease'
 'Stable disease']
['Temporal lobe' nan 'Frontal lobe' 'Parietal lobe' 'Occipital lobe']


In [7]:
treatment = data['Treatment']
treatments = (treatment.unique())
t = []
for i in treatments:
    j = i.split(' + ')
    t += j
t = set(t)
t.remove('Radiation therapy')
for val in t:
    data[val] = data['Treatment'].apply(lambda x: float(val in ['Radiation' if i == 'Radiation therapy' else i for i in x.split(' + ')]))
data = data.drop('Treatment', axis=1)

In [8]:
d = pd.get_dummies(data['Recurrence Site'], dummy_na=False, dtype='float')
data = pd.concat([data,d], axis=1)
data = data.drop('Recurrence Site', axis=1)

In [9]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
data['Gender'] = encoder.fit_transform(data['Gender'])
data['Tumor Grade'] = encoder.fit_transform(data['Tumor Grade'])

In [10]:
cols_to_encode = ['Tumor Location', 'Treatment Outcome', 'Tumor Type']
for col in cols_to_encode:
    d = pd.get_dummies(data[col], dtype='float')
    data = pd.concat([data,d], axis=1)
    data = data.drop(col, axis=1)

In [11]:
data.head(20)

Unnamed: 0,Age,Gender,Tumor Grade,Time to Recurrence (months),Survival Time (months),Radiation,Surgery,Chemotherapy,Frontal lobe,Occipital lobe,...,Occipital lobe.1,Parietal lobe,Temporal lobe,Complete response,Partial response,Progressive disease,Stable disease,Astrocytoma,Glioblastoma,Meningioma
0,45,1,3,10.0,18,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,55,0,0,0.0,36,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,60,1,2,14.0,22,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,50,0,3,0.0,12,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,65,1,1,24.0,48,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
5,45,1,3,10.0,18,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6,55,0,0,0.0,36,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
7,60,1,2,14.0,22,0.0,1.0,1.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8,50,0,3,0.0,12,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,65,1,1,24.0,48,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [14]:
from sklearn.model_selection import train_test_split

features1 = data.drop('Survival Time (months)', axis=1) #To predict survival time
features2 = data.drop(['Survival Time (months)', 'Time to Recurrence (months)'], axis=1) #To predict Time to Recurrance
target1 = data['Survival Time (months)']
target2 = data['Time to Recurrence (months)']

x_train, x_test, y_train, y_test = train_test_split(features1, target1, test_size=0.2, random_state=42)

In [71]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=3000)
model = rf.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [72]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print(r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_squared_error(y_test, y_pred))

0.8230654678123599 1.4814666338405191 13.498671636206222
