In [54]:
import os
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib
import scipy as sp
import matplotlib.pyplot as plt
from matplotlib import rcParams
from scipy.stats import sem
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import r2_score
from sklearn import linear_model
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
master = pd.read_csv('/content/drive/My Drive/HNC/Data/Master_file')
master.head()

Unnamed: 0,patient_num,replanned_or_not,R_fx,R_fx-determined,R-vs-SHBR,BODY,Body-1,Body-2,Body-3,Body-4,...,neck_area_fx-26,neck_area_fx-27,neck_area_fx-28,neck_area_fx-29,neck_area_fx-30,neck_area_fx-31,neck_area_fx-32,neck_area_fx-33,neck_area_fx-34,neck_area_fx-35
0,19,R,21.0,15.0,r,21.484754,20.805747,,,,...,,,62.0672,,59.0119,,59.6073,,,
1,35,R,26.0,20.0,r,18.636867,21.230902,,19.124266,,...,69.6107,,68.9219,,,68.8933,,,67.8577,
2,115,R,12.0,12.0,shbr,21.957419,21.883293,,,22.405124,...,62.7572,,61.3708,,,60.883,,62.6707,,
3,272,R,25.0,19.0,r,22.138647,20.795968,,,,...,,62.0832,,,,60.9116,,60.0142,,
4,447,R,23.0,19.0,r,19.8423,21.966858,,,21.909313,...,,,,,,,57.2289,,,


In [23]:
# Keeping only replanned patients
replanned = pd.DataFrame(master.loc[master['replanned_or_not']=='R'])
len(replanned)


31

In [24]:
len(replanned.columns)

420

In [25]:
# Deleting columns with more than 30% null values

for col in replanned.columns:
  if replanned[col].isna().sum() >= 0.3 * len(replanned):
    del replanned[col]
cols = replanned.columns.tolist()
len(cols)

231

In [26]:
# Dropping numerical features with < 3  unique values
for col in replanned.columns:
  if replanned[col].nunique() < 3 and replanned[col].dtype == 'float64':
    print(f"{col} {replanned[col].dtype}: {replanned[col].unique()}\n\n")
    del replanned[col]
cols = replanned.columns.tolist()
len(cols)

volume_outer-PTV_BODY float64: [0.         1.56789567]


volume-ratio_outer-PTV_BODY float64: [0.00000000e+00 7.16429681e-08]


total-dose_prescribed_course float64: [70. nan]


dose_prescribed_fx float64: [ 2. nan]


PEG float64: [ 1.  0. nan]




226

In [27]:
# List of columns having categorical data
obj_df = replanned.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,replanned_or_not,R-vs-SHBR,sex,cancer_category,diagnosis_id,stage_summary,T,N,M,chemo
0,R,r,Male,Oropharyngeal cancer,C01,IVA,T4,N2c,M0,Chemo
1,R,r,Female,Oropharyngeal cancer,C09.9,IVA,T4,N2c,M0,Chemo
2,R,shbr,Male,Nasopharyngeal Cancer,C11.9,III,T1,N2,M0,Chemo
3,R,r,Male,Oropharyngeal cancer,C01,X,T2,N2b,M0,Chemo
4,R,r,Male,Oropharyngeal cancer,C01,X,T2,N3,MX,Chemo


In [28]:
# List of columns having numerical data
num_df = replanned.select_dtypes(include=['int64','float64']).copy()
num_df.head()

Unnamed: 0,patient_num,R_fx,R_fx-determined,BODY,Body-1,Body-6,xmin-slope_Body-1,xmin-slope_Body-2,xmin-slope_Body-3,xmin-slope_Body-4,...,volume-ratio-slope_outer-PTV_Body-23,volume-ratio-slope_outer-PTV_Body-24,volume-ratio-slope_outer-PTV_Body-25,volume-ratio-slope_outer-PTV_Body-26,age_start,cancer_category_id,num_chemo,first_weight_kg,total_weight-loss_kg,neck_area_fx-1
0,19,21.0,15.0,21.484754,20.805747,20.842635,-5.599176,-5.599176,-5.599176,-5.599176,...,0.000282,0.000282,0.000282,0.000282,56.5421,2.0,3.0,99.3,12.7895,70.5077
1,35,26.0,20.0,18.636867,21.230902,17.781444,-3.322,-3.322,-0.553786,-0.553786,...,1.4e-05,1.4e-05,1.4e-05,1.4e-05,54.1246,2.0,3.0,64.7,10.6646,63.9857
2,115,12.0,12.0,21.957419,21.883293,21.081882,-1.557421,-1.557421,-1.557421,-0.319167,...,1.1e-05,7e-06,7e-06,4e-06,49.87,4.0,3.0,69.5,12.518,67.4203
3,272,25.0,19.0,22.138647,20.795968,18.100129,-6.80347,-6.80347,-6.80347,-6.80347,...,0.000568,0.000568,0.000568,0.000568,68.2574,2.0,3.0,98.7,10.537,70.6577
4,447,23.0,19.0,19.8423,21.966858,,-4.473957,-4.473957,-4.473957,-0.970603,...,0.000405,0.000405,0.000405,0.000405,65.0157,2.0,3.0,72.0,4.0278,76.4357


In [29]:
# Filled Nan with mode
print(replanned['cancer_category_id'].value_counts())
replanned['cancer_category_id'].fillna(replanned['cancer_category_id'].mode()[0],inplace=True)
print(replanned['cancer_category_id'].isna().sum())

2.0    20
4.0     5
5.0     2
1.0     1
9.0     1
Name: cancer_category_id, dtype: int64
0


In [30]:
# filling numerical variables with median
for col in num_df.columns:
  if replanned[col].isna().sum() > 0:
    print(f"{col} {replanned[col].dtype}: {replanned[col].isna().sum()}" )
    replanned[col].fillna(replanned[col].median(),inplace=True)
    print(f"{col} {replanned[col].dtype}: {replanned[col].isna().sum()}" )

Body-6 float64: 9
Body-6 float64: 0
volume_body_Body-6 float64: 9
volume_body_Body-6 float64: 0
volume_outer-PTV_Body-6 float64: 9
volume_outer-PTV_Body-6 float64: 0
volume-ratio_inner-PTV_Body-6 float64: 9
volume-ratio_inner-PTV_Body-6 float64: 0
volume-ratio_outer-PTV_Body-6 float64: 9
volume-ratio_outer-PTV_Body-6 float64: 0
age_start float64: 2
age_start float64: 0
num_chemo float64: 2
num_chemo float64: 0
first_weight_kg float64: 2
first_weight_kg float64: 0
total_weight-loss_kg float64: 2
total_weight-loss_kg float64: 0
neck_area_fx-1 float64: 8
neck_area_fx-1 float64: 0


In [31]:
# filling categorical variables with mode
for col in obj_df.columns:
  if replanned[col].isna().sum() > 0:
    print(f"{col} {replanned[col].dtype}: {replanned[col].isna().sum()}" )
    replanned[col].fillna(replanned[col].mode()[0],inplace=True)
    print(f"{col} {replanned[col].dtype}: {replanned[col].isna().sum()}" )

sex object: 2
sex object: 0
cancer_category object: 2
cancer_category object: 0
diagnosis_id object: 2
diagnosis_id object: 0
stage_summary object: 2
stage_summary object: 0
T object: 2
T object: 0
N object: 2
N object: 0
M object: 2
M object: 0
chemo object: 2
chemo object: 0


In [32]:
for col in obj_df:
  replanned[col] = replanned[col].astype('category')
  replanned[col] = replanned[col].cat.codes
replanned.head()


Unnamed: 0,patient_num,replanned_or_not,R_fx,R_fx-determined,R-vs-SHBR,BODY,Body-1,Body-6,xmin-slope_Body-1,xmin-slope_Body-2,...,diagnosis_id,stage_summary,T,N,M,chemo,num_chemo,first_weight_kg,total_weight-loss_kg,neck_area_fx-1
0,19,0,21.0,15.0,0,21.484754,20.805747,20.842635,-5.599176,-5.599176,...,0,2,3,4,0,0,3.0,99.3,12.7895,70.5077
1,35,0,26.0,20.0,0,18.636867,21.230902,17.781444,-3.322,-3.322,...,4,2,3,4,0,0,3.0,64.7,10.6646,63.9857
2,115,0,12.0,12.0,1,21.957419,21.883293,21.081882,-1.557421,-1.557421,...,6,1,0,2,0,0,3.0,69.5,12.518,67.4203
3,272,0,25.0,19.0,0,22.138647,20.795968,18.100129,-6.80347,-6.80347,...,0,6,1,3,0,0,3.0,98.7,10.537,70.6577
4,447,0,23.0,19.0,0,19.8423,21.966858,20.461444,-4.473957,-4.473957,...,0,6,1,5,2,0,3.0,72.0,4.0278,76.4357


In [33]:
cols = replanned.columns.tolist()
list(enumerate(cols))

[(0, 'patient_num'),
 (1, 'replanned_or_not'),
 (2, 'R_fx'),
 (3, 'R_fx-determined'),
 (4, 'R-vs-SHBR'),
 (5, 'BODY'),
 (6, 'Body-1'),
 (7, 'Body-6'),
 (8, 'xmin-slope_Body-1'),
 (9, 'xmin-slope_Body-2'),
 (10, 'xmin-slope_Body-3'),
 (11, 'xmin-slope_Body-4'),
 (12, 'xmin-slope_Body-5'),
 (13, 'xmin-slope_Body-6'),
 (14, 'xmin-slope_Body-7'),
 (15, 'xmin-slope_Body-8'),
 (16, 'xmin-slope_Body-9'),
 (17, 'xmin-slope_Body-10'),
 (18, 'xmin-slope_Body-11'),
 (19, 'xmin-slope_Body-12'),
 (20, 'xmin-slope_Body-13'),
 (21, 'xmin-slope_Body-14'),
 (22, 'xmin-slope_Body-15'),
 (23, 'xmin-slope_Body-16'),
 (24, 'xmin-slope_Body-17'),
 (25, 'xmin-slope_Body-18'),
 (26, 'xmin-slope_Body-19'),
 (27, 'xmin-slope_Body-20'),
 (28, 'xmin-slope_Body-21'),
 (29, 'xmin-slope_Body-22'),
 (30, 'xmin-slope_Body-23'),
 (31, 'xmin-slope_Body-24'),
 (32, 'xmin-slope_Body-25'),
 (33, 'xmin-slope_Body-26'),
 (34, 'res-vector_x'),
 (35, 'res-vector_y'),
 (36, 'res-vector_z'),
 (37, 'res-vector_radial'),
 (38, 're

In [34]:
# Deleting data after 12th fraction
for i in range(13,27):
  del replanned[f'xmin-slope_Body-{i}']
  del replanned[f'xmed-slope_Body-{i}']
  del replanned[f'xave-slope_Body-{i}']
  del replanned[f'volume-slope_body_Body-{i}']
  del replanned[f'volume-slope_outer-PTV_Body-{i}']
  del replanned[f'volume-ratio-slope_inner-PTV_Body-{i}']
  del replanned[f'volume-ratio-slope_outer-PTV_Body-{i}']

In [35]:
# Deleting replanned_or_not column because we are only using replanned data here
del replanned['replanned_or_not']

In [36]:
# Creating the input of training and testing data
Rx = replanned.copy()
del Rx['patient_num']
del Rx['R_fx-determined']
Rx.head()

Unnamed: 0,R_fx,R-vs-SHBR,BODY,Body-1,Body-6,xmin-slope_Body-1,xmin-slope_Body-2,xmin-slope_Body-3,xmin-slope_Body-4,xmin-slope_Body-5,...,diagnosis_id,stage_summary,T,N,M,chemo,num_chemo,first_weight_kg,total_weight-loss_kg,neck_area_fx-1
0,21.0,0,21.484754,20.805747,20.842635,-5.599176,-5.599176,-5.599176,-5.599176,-5.599176,...,0,2,3,4,0,0,3.0,99.3,12.7895,70.5077
1,26.0,0,18.636867,21.230902,17.781444,-3.322,-3.322,-0.553786,-0.553786,-0.553786,...,4,2,3,4,0,0,3.0,64.7,10.6646,63.9857
2,12.0,1,21.957419,21.883293,21.081882,-1.557421,-1.557421,-1.557421,-0.319167,-0.319167,...,6,1,0,2,0,0,3.0,69.5,12.518,67.4203
3,25.0,0,22.138647,20.795968,18.100129,-6.80347,-6.80347,-6.80347,-6.80347,-6.80347,...,0,6,1,3,0,0,3.0,98.7,10.537,70.6577
4,23.0,0,19.8423,21.966858,20.461444,-4.473957,-4.473957,-4.473957,-0.970603,-1.415517,...,0,6,1,5,2,0,3.0,72.0,4.0278,76.4357


In [38]:
# Data Split
# dividing into 70%(train) and 30%(test)
X_train, X_test, y_train, y_test = train_test_split(Rx, Ry, test_size=0.3)
X_train.shape, y_train.shape

((21, 125), (21, 1))

In [50]:
names = ["Random_Forest", "Decesion_Tree","SVR"]

regressors = [
    RandomForestRegressor(),
    DecisionTreeRegressor(),
    SVR(),
    ]

In [51]:
scores = []
for name, clf in zip(names, regressors):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = r2_score(y_test,y_pred)
    scores.append(score)

  This is separate from the ipykernel package so we can avoid doing imports until
  y = column_or_1d(y, warn=True)


In [52]:
scores

[-0.07180289672544093, -2.243073047858942, -0.01450118643912579]