# RTOG-0617 Trial - Analysis

        * Loading dataset
        * preprocessing dataset

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from plotly.offline import *

init_notebook_mode()

import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.express as px
import plotly.io as pio
pio.templates.default = 'plotly_white'
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
%matplotlib inline

# Variable
### analysis_inclusion 
    0=Excluded, 1=Included
### age
    Age at study entry (years)
### gender 
    1=Male 2=Female
### ethnicity 
    1=Hispanic or Latino 2=Not Hispanic or Latino 9=Unknown
### Assigned treatment arm
    60 Gy, no cetuximab = 1, 74 Gy, no cetuximab = 2, 60 Gy + cetuximab = 3, 74 Gy + cetuximab = 4
    Note: Analyses of 60 Gy vs 74 Gy compare arms 1+3 vs 2+4. Analyses of Cetuximab vs No cetuximab compare arms 
    1+2 vs 3+4.
### race
    1=American Indian/Alaskan Native 2=Asian 3=Black or African American 4=Native Hawaiian/Other Pacific Islander
    5=White 9=Unknown
### zubrod
    0=Normal activity 1=Symptoms, but nearly fully ambulatory
### histology
    1=Squamous cell carcinoma 2=Adenocarcinoma 3=Large cell undifferentiated 5=Non-small cell lung cancer NOS
### nonsquam_squam
    1=Non-squamous histology 2=Squamous histology
### ajcc_stage_grp
    1=IIIA, or N2 with an undetectable primary, 2=IIIB, or N3 with an undetectable primary
### pet_staging
    1=No use of PET in staging 2=PET used in staging
### rt_technique
    1=3D-CRT, 2=IMRT
### has_egfr_hscore
    0=No H-Score (no tissue, insufficient tissue), 1=H-Score able to be determined
### egfr_hscore_200
    1=H-Score < 200, 2=H-Score ≥ 200
    Note: Blank for patients with has_egfr_hscore = 0
### smoke_hx
    1=Non-smoker (<100 cigarettes in lifetime), 2=Former light smoker (≤10 pack years and quit ≥1 year ago),
    3=Former heavy smoker (>10 pack years), 4=Current smoker (quit <1 year ago or currently smoke), 9=Unknown
### rt_dose
    1=Received assigned dose (60 Gy/74 Gy), 2=Received less than assigned dose 3=Received more than assigned dose
    Note: Blank for patients not receiving RT
### overall_rt_review
    1=Per protocol, 2=Acceptable variation 3=Unacceptable deviation 5=Incomplete RT – Death during RT 6=Incomplete
    RT – Progression 7=Incomplete RT – Refusal, 8=No RT given, 9=Not evaluable
### ptv_review
    1=Per protocol, 2=Acceptable variation 3=Unacceptable deviation, 9=Not evaluable,
    Note: Blank indicates missing due to lack of contour to review
### Dmin_PTV_CTV_MARGIN
    Minimum margin between PTV and CTV (mm) 
### Dmax_PTV_CTV_MARGIN
    Maximum margin between PTV and CTV (mm)
### Dmean_PTV_CTV_MARGIN
    Mean margin between PTV and CTV (mm)
### rt_compliance_ptv90
    0= <90% of PTV covered by at least 95% of prescription dose
    1= ≥90% of PTV covered by at least 95% of prescription dose
### grade3_esophagitis
    0=Grade 3+ esophagitis not reported, 1=Grade 3+ esophagitis reported
### survival_status
    0=Alive 1=Dead
    Time since randomization to death/last follow-up (months)
### cod
    1=Lung cancer under study 2=Second primary 3=Protocol treatment 4=Other cause 9=Unknown
    Note: This will be blank for patients who are alive.
### local_failure
    0=Alive at last follow-up without report of local failure at any time, 1=Local failure,
    2=Dead without report of local failure at any time
    Time since randomization to local failure/date of death or last-follow-up if no failure (months)
### distant_failure
    0=Alive at last follow-up without report of distant failure at any time, 1=Distant failure, 2=Dead without
    report of distant failure at any time
    Time since randomization to distant failure/date of death or last-follow-up if no failure (months)
### progression_free_survival
    0=Alive without progression 1=Progressed or death due to any cause
    Time since randomization to progression or date of death, or date of last- follow-up if alive without
    progression (months)
### lost_to_followup
    0=Not lost to follow-up 1=Lost to follow-up

In [None]:
df = pd.read_csv("./datasets_0617/1D1-Dataset.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.age

In [None]:
newdf4age = df[(df.arm == 1) & (df['age'].notnull())] # selecting arm 1 with cases having non null values for age

In [None]:
newdf4age.head(20)

In [None]:
trace0 = go.Scatter(x = newdf4age.age, y = newdf4age.survival_months,
                   mode='markers',marker = {"size":10})
data=[trace0]

iplot({"data":data})

In [None]:
df.dtypes

In [None]:
df.arm.value_counts()

In [None]:
df_egfr=df[df['egfr_hscore_200'].notnull()]

In [None]:
# Build the mask from df_egfr itself: a mask taken from the full `df` carries a
# different index, so pandas has to re-align it (and warns) before filtering.
df_egfr[df_egfr['egfr_hscore_200'] == 2]

In [None]:
df.age.describe()

In [None]:
# Keep only patients flagged for inclusion in the analysis (analysis_inclusion == 1).
# The explicit copy makes `df` an independent frame, so the later column
# assignment (df['v95_ptv'] = ...) does not write to a slice of the raw data.
df = df[df.analysis_inclusion == 1].copy()

In [None]:
# Call describe(); without the parentheses the cell only shows the bound method.
df.v95_ptv.describe()

In [None]:
df.v95_ptv.max()

In [None]:
df.egfr_hscore_200.value_counts()

In [None]:
df[df.egfr_hscore_200 ==1].v95_ptv.mean()

In [None]:
df[df.egfr_hscore_200 ==2].v95_ptv.mean()

In [None]:
# Min / quartiles / max of v95_ptv. nanpercentile skips missing values;
# np.percentile returns NaN for every quantile as soon as one value is NaN.
np.nanpercentile(df.v95_ptv.values, [0,25,50,75,100])

In [None]:
df.v95_ptv.notnull().describe()

In [None]:
df['v95_ptv'] = df['v95_ptv'].astype('str').astype('float')

In [None]:
df_v95 = df[df.v95_ptv.notnull()]

In [None]:
df_v95.v95_ptv.notnull().describe()

In [None]:
np.percentile(df_v95.v95_ptv.values, [0,25,50,75,100])

In [None]:
trace0 = go.Scatter(y = df_v95.volume_ptv, 
                    x = df_v95.survival_months,
                   mode='markers',
                   marker = {'size':10,'color':'rgba(255,70,0,0.5)'},
                   name = 'Scatter Plot')
data = [trace0]

iplot({'data':data})

In [None]:
# PTV volume against survival, one trace per dose level
# (arms 1 and 3 -> 'Standard Dose', arms 2 and 4 -> 'High Dose').
std_rows = df[df['arm'].isin([1, 3])]
high_rows = df[df['arm'].isin([2, 4])]

trace0 = go.Scatter(
    x=std_rows['volume_ptv'],
    y=std_rows['survival_months'],
    mode='markers',
    marker=dict(size=10, color='rgba(255,70,0,0.5)'),
    name='Standard Dose',
)
trace1 = go.Scatter(
    x=high_rows['volume_ptv'],
    y=high_rows['survival_months'],
    mode='markers',
    marker=dict(size=10, color='rgba(0,190,255,0.5)'),
    name='High Dose',
)
data = [trace0, trace1]

layout = dict(
    title='Correlation between PTV volume and Survival in months',
    xaxis=dict(title='Volume of PTV', zeroline=False),
    yaxis=dict(title='Survival in Months', zeroline=False),
)

iplot(dict(data=data, layout=layout))

In [None]:
# Lung V20Gy against PTV volume, one trace per dose level
# (arms 1+3 are plotted as 'Standard Dose', arms 2+4 as 'High Dose').
standard_dose = df[(df['arm'] == 1) | (df['arm'] == 3)]
high_dose = df[(df['arm'] == 2) | (df['arm'] == 4)]

trace0 = go.Scatter(x = standard_dose['volume_ptv'],
                    y = standard_dose['v20_lung'],
                   mode='markers',
                   marker = {'size':10,'color':'rgba(255,70,0,0.5)'},
                   name = 'Standard Dose')

trace1 = go.Scatter(x = high_dose['volume_ptv'],
                    y = high_dose['v20_lung'],
                   mode='markers',
                   marker = {'size':10,'color':'rgba(0,190,255,0.5)'},
                   name = 'High Dose')

data = [trace0,trace1]

# The title names the plotted quantities; it previously read
# "... and Survival in months", carried over from the survival plot.
layout = {'title':'Correlation between PTV volume and Lung Doses V20Gy',
         'xaxis':{'title':'Volume of PTV',
                 'zeroline':False},
         'yaxis':{'title':'Lung Doses V20Gy',
                 'zeroline':False}
         }

iplot({'data':data,'layout':layout})

In [None]:
df.columns

In [None]:
# Heart V30Gy against survival time, one trace per dose level
# (arms 1+3 are plotted as 'Standard Dose', arms 2+4 as 'High Dose').
standard_dose = df[(df['arm'] == 1) | (df['arm'] == 3)]
high_dose = df[(df['arm'] == 2) | (df['arm'] == 4)]

trace0 = go.Scatter(x = standard_dose['survival_months'],
                    y = standard_dose['v30_heart'],
                   mode='markers',
                   marker = {'size':10,'color':'rgba(255,70,0,0.5)'},
                   name = 'Standard Dose')

trace1 = go.Scatter(x = high_dose['survival_months'],
                    y = high_dose['v30_heart'],
                   mode='markers',
                   marker = {'size':10,'color':'rgba(0,190,255,0.5)'},
                   name = 'High Dose')

data = [trace0,trace1]

# The title names the plotted quantities; it previously mentioned PTV volume,
# which is not on either axis of this figure.
layout = {'title':'Correlation between Survival in months and Heart Doses V30Gy',
         'xaxis':{'title':'Survival in Months',
                 'zeroline':False},
         'yaxis':{'title':'Heart Doses V30Gy',
                 'zeroline':False}
         }

iplot({'data':data,'layout':layout})

In [None]:
# Heart V30Gy against survival time, with each marker coloured by PTV volume.
trace1 = go.Scatter(x = df.survival_months,
                    y = df.v30_heart,
                   mode='markers',
                   marker = {'size':10,
                             'color':df.volume_ptv,
                            'colorscale':'Portland',
                            'showscale':True})

# Plot the trace built above; the cell previously passed `trace0`, which is
# whatever an earlier cell happened to leave behind.
data = [trace1]

iplot({'data':data})


In [None]:
df_1.survival_months.notnull().value_counts()

In [None]:
df_1 = df[(df.volume_gtv.notnull()) & (df.v30_heart.notnull())]

In [None]:
# Heart V30Gy against survival time, marker size taken from the GTV volume
# (rounded up to a whole number).
trace1 = go.Scatter(x = df_1['survival_months'],
                    y = df_1['v30_heart'],
                    mode='markers',
                    marker = {'size':np.ceil(df_1.volume_gtv)}
                    )

# Plot the trace built above; the cell previously passed the stale `trace0`.
data = [trace1]

iplot({'data':data})

In [None]:
# Box plot of heart V30Gy grouped by study arm.
# x and y both come from df_1 so each arm label is paired with the value of
# the same row (y was previously taken from the unfiltered df).
trace0 = go.Box(x=df_1.arm,
               y=df_1.v30_heart,
               boxpoints='all',
               jitter=0.3,
               pointpos=-1.5,
               boxmean='sd')

data = [trace0]

iplot({'data':data})

In [None]:
# One box per study arm for heart V30Gy.
# Both x and y are taken from the rows of that arm; previously y was the full
# df_1 column for every trace, so the boxes did not show per-arm values.
data = []
for arm in (1, 2, 3, 4):
    arm_rows = df_1[df_1['arm'] == arm]
    data.append(go.Box(x=arm_rows['arm'],
                       y=arm_rows['v30_heart'],
                       boxpoints='all',
                       jitter=0.3,
                       pointpos=-1.5,
                       boxmean='sd'))

# Keep the individual trace names available, as the original cell defined them.
trace0, trace1, trace2, trace3 = data

layout = {'title':'V30 Heart Doses in different arms',
         'xaxis':{'title':'Study Arms',
                 'zeroline':False},
         'yaxis':{'title':'Heart Doses V30Gy',
                 'zeroline':False}
         }

iplot({'data':data,'layout':layout})

In [None]:
trace0 = go.Histogram(x = df_1.volume_gtv)

data = [trace0]

iplot({'data':data})

In [None]:
trace0 = go.Histogram(x = df_1.volume_gtv,
                     histnorm='probability')

data = [trace0]

iplot({'data':data})

In [None]:
trace0 = go.Histogram(x = df_1.volume_gtv,
                     histnorm='probability',
                     xbins ={'start':df_1.volume_gtv.min(),
                            'end':df_1.volume_gtv.max(),
                            'size':40})

data = [trace0]

iplot({'data':data})

In [None]:
df_SD  = df_1[(df_1.arm==1)]
df_SDT = df_1[(df_1.arm==3)]
df_HD  = df_1[(df_1.arm==2)]
df_HDT = df_1[(df_1.arm==4)]

In [None]:
df_SD.shape

In [None]:
# Overlaid, probability-normalised histograms of GTV volume for df_SD and df_HD.
trace0 = go.Histogram(x = df_SD.volume_gtv,
                     histnorm='probability')

trace1 = go.Histogram(x = df_HD.volume_gtv,
                     histnorm='probability',
                     opacity=0.6)

# Title typo fixed ("gropus"); the y axis is labelled 'Probability' because
# histnorm='probability' plots bin fractions, not raw counts.
layout = {'title':'GTV volumes in high dose versus standard dose groups',
         'xaxis':{'title':'GTV Volume',
                 'zeroline':False},
         'yaxis':{'title':'Probability',
                 'zeroline':False},
          'barmode':'overlay'
         }


data = [trace0,trace1]

iplot({'data':data,'layout':layout})

In [None]:
# Overlaid, probability-normalised histograms of GTV volume for df_SD and df_HD.
# NOTE(review): this cell repeats the previous one — confirm whether a
# different pair of groups (e.g. df_SDT / df_HDT) was intended here.
trace0 = go.Histogram(x = df_SD.volume_gtv,
                     histnorm='probability')

trace1 = go.Histogram(x = df_HD.volume_gtv,
                     histnorm='probability',
                     opacity=0.6)

# Title typo fixed ("gropus"); the y axis is labelled 'Probability' because
# histnorm='probability' plots bin fractions, not raw counts.
layout = {'title':'GTV volumes in high dose versus standard dose groups',
         'xaxis':{'title':'GTV Volume',
                 'zeroline':False},
         'yaxis':{'title':'Probability',
                 'zeroline':False},
          'barmode':'overlay'
         }


data = [trace0,trace1]

iplot({'data':data,'layout':layout})

In [None]:
df.arm.unique().tolist()

In [None]:
np.sort(df.arm.unique()).tolist()

In [None]:
df.groupby('arm').age.mean()

In [None]:
df.groupby('arm').age.mean().tolist()

In [None]:
np.sort(df.rt_technique.unique()).tolist()

In [None]:
df.groupby('rt_technique').v30_heart.mean().tolist() # 1 = 3DCRT and 2 = IMRT 

In [None]:
df.groupby('rt_technique').v60_esophagus.mean().tolist()

In [None]:
df.rt_technique.value_counts()

In [None]:
df_v60eso = df[df.v60_esophagus.notnull()]

In [None]:
result1 = df_v60eso.describe().transpose()

In [None]:
result1[13:14]

In [None]:
result1[35:51]

In [None]:
df_v60eso = df[(df.v60_esophagus.notnull())& (df.rt_technique.notnull())]

In [None]:
df_v60eso.notnull().transpose()[0]['rt_technique']

In [None]:
allcol = df.columns

In [None]:
# Columns at positions 38..53 of df, keyed by their offset within that slice.
col = df.columns[38:54]
strNames = dict(enumerate(col))

In [None]:
strNames

In [None]:
# For each name in strNames, store one not-null flag per row of df.
# Position `pos` in strNames corresponds to column 38 + pos of df.
new_strNames = {
    strNames[pos]: df[allcol[38 + pos]].notnull().tolist()
    for pos in range(len(strNames))
}

In [None]:
new_strNames.keys()

In [None]:
# List every case whose v95_ptv value is missing and report the total.
# The not-null flags are built here from df, because `bol_v95_ptv` is not
# assigned in any earlier visible cell and the loop would raise NameError.
bol_v95_ptv = df['v95_ptv'].notnull().tolist()

count = 0
for num, bol in enumerate(bol_v95_ptv):
    if not bol:
        print(f'Case no {num} has value {bol}')
        count += 1
if count > 0:
    print(f'Found {count} missing values for variable: "v95_ptv"')
else:
    print('No empty values found')

# 18/04/2020

In [None]:
df.info()

In [None]:
fig = ff.create_distplot(hist_data =[df.age[df.age.notna().values].tolist()],
                        group_labels = ['Age'])
fig['layout'].update(title="Age Distribution",showlegend=False)
iplot(fig)

In [None]:
df.age[df['age'].notna()].describe() # to check missing values

In [None]:
# Non-missing, non-zero v95_ptv values as a plain list.
v95 = [
    value
    for value in df.v95_ptv[df['v95_ptv'].notnull()].values.tolist()
    if value != 0
]

In [None]:
fig = ff.create_distplot(hist_data =[v95],
                        group_labels = ['V95'])
fig['layout'].update(title="V95 Distribution",showlegend=False)
iplot(fig)

In [None]:
df.columns

In [None]:
# Continuous variables (plus the H-score group) for patients whose H-score
# could be determined (has_egfr_hscore == 1).
# Rows and columns are selected in one .loc call and copied, so the in-place
# drop()/dropna() calls that follow act on an independent frame rather than
# on a slice of df.
df_egfr = df.loc[df.has_egfr_hscore == 1,
                 ['age','egfr_hscore_200','volume_gtv','volume_itv', 'volume_ptv', 'dmax_ptv',
                  'v100_ptv', 'v95_ptv','v5_lung', 'v20_lung', 'dmean_lung',
                  'v5_heart', 'v30_heart','v20_esophagus', 'v60_esophagus',
                  'Dmin_PTV_CTV_MARGIN','Dmax_PTV_CTV_MARGIN',
                  'Dmean_PTV_CTV_MARGIN',]].copy()

In [None]:
df_egfr.drop(axis=1,columns=['volume_itv','volume_gtv','Dmin_PTV_CTV_MARGIN',
                             'Dmax_PTV_CTV_MARGIN','Dmean_PTV_CTV_MARGIN'],
            inplace=True)

In [None]:
df_egfr.dropna(axis=0,how='any',inplace=True)

In [None]:
# Column index of the selected frame, used by the loops below.
col = df_egfr.columns

# Distribution plots for all continuous variables in the selected dataframe

In [None]:
# One distribution plot per variable: each egfr_hscore_200 group plus the
# whole selected cohort. The grouping column itself is skipped.
# NOTE(review): the variable dictionary at the top of the notebook codes
# egfr_hscore_200 as 1 = H-Score < 200 and 2 = H-Score >= 200 — confirm that
# labelling code 1 as "EGFR positive" is intended.
for name in col:
    if name == 'egfr_hscore_200':
        continue
    values = df_egfr[name]
    group_one = values[df_egfr['egfr_hscore_200'] == 1].tolist()
    group_two = values[df_egfr['egfr_hscore_200'] == 2].tolist()
    fig = ff.create_distplot(
        hist_data=[group_one, group_two, values.tolist()],
        group_labels=['EGFR positive ' + name,
                      'EGFR negative ' + name,
                      'Whole Group ' + name],
    )
    fig['layout'].update(title=name + " Distribution", showlegend=True)
    iplot(fig)

# QQ Plot to check normality

In [None]:
# Normal probability (QQ) plot for every variable except the grouping column.
for name in col:
    if name == 'egfr_hscore_200':
        continue
    plt.figure(figsize=(8, 6))
    sp.stats.probplot(df_egfr[name].tolist(), dist='norm', plot=plt, rvalue=True)
    plt.title('QQ plot for ' + name)

# Shapiro–Wilk test

    1. The Shapiro–Wilk test tests the null hypothesis that a sample x1, ..., xn came from a normally distributed population
    2. If the p-value > 0.05, we cannot reject the null hypothesis, so the sample is consistent with having come from a normally distributed population.

In [None]:
# Shapiro-Wilk test on each variable for the whole selected cohort.
# Each entry is [statistic, p-value, label].
Results_Shapiro = [
    [*sp.stats.shapiro(df_egfr[name]), name + ' Whole Group']
    for name in col
    if name != 'egfr_hscore_200'
]

In [None]:
# Print, for every collected Shapiro-Wilk result, whether normality is
# rejected at the 0.05 level. Entries are [statistic, p-value, label].
for _, p_value, label in Results_Shapiro:
    if p_value > 0.05:
        print(f'\n\t{label} has a normal distribution.(p value {p_value})\n')
    else:
        print(f'{label} does not have a normal distribution.(p value {p_value})')

In [None]:
# Shapiro-Wilk test on each variable, restricted to egfr_hscore_200 == 1
# (the group this notebook labels EGFR+).
# Each entry is [statistic, p-value, label].
Results_Shapiro = [
    [*sp.stats.shapiro(df_egfr[name][df_egfr.egfr_hscore_200 == 1]),
     name + ' in EGFR+ Group']
    for name in col
    if name != 'egfr_hscore_200'
]

In [None]:
# Print, for every collected Shapiro-Wilk result, whether normality is
# rejected at the 0.05 level. Entries are [statistic, p-value, label].
for _, p_value, label in Results_Shapiro:
    if p_value > 0.05:
        # Closing parenthesis goes before the trailing newline; it used to be
        # printed on its own line after it.
        print(f'\n\t{label} has a normal distribution.(p value {p_value})\n')
    else:
        print(f'{label} does not have a normal distribution.(p value {p_value})')

In [None]:
# Shapiro-Wilk test on each variable, restricted to egfr_hscore_200 == 2
# (the group this notebook labels EGFR-).
# The filter previously used == 1, so this cell silently re-tested the EGFR+
# group under an "EGFR-" label.
# Each entry is [statistic, p-value, label].
Results_Shapiro = []
for name in col:
    if name == 'egfr_hscore_200':
        continue
    s_st = list(sp.stats.shapiro(df_egfr[name][df_egfr.egfr_hscore_200 == 2]))
    s_st.append(name + ' in EGFR- Group')
    Results_Shapiro.append(s_st)

In [None]:
# Print, for every collected Shapiro-Wilk result, whether normality is
# rejected at the 0.05 level. Entries are [statistic, p-value, label].
for _, p_value, label in Results_Shapiro:
    if p_value > 0.05:
        print(f'\n\t{label} has a normal distribution.p value {p_value}\n')
    else:
        print(f'{label} does not have a normal distribution.(p value {p_value})')

# Levene's Test
    Levene's test is an inferential statistic used to assess the equality of variances for a variable calculated for two or more groups. Some common statistical procedures assume that the variances of the populations from which the samples are drawn are equal; Levene's test assesses this assumption. It tests the null hypothesis that the population variances are equal (called homogeneity of variance or homoscedasticity).

In [None]:
# Levene's test for each variable, comparing the egfr_hscore_200 == 1 rows
# with the egfr_hscore_200 == 2 rows.
# Each entry is [statistic, p-value, variable name].
Results_Levene = []
for name in col:
    if name == 'egfr_hscore_200':
        continue
    first_group = df_egfr[name][df_egfr.egfr_hscore_200 == 1]
    second_group = df_egfr[name][df_egfr.egfr_hscore_200 == 2]
    statistic, p_value = sp.stats.levene(first_group, second_group)
    Results_Levene.append([statistic, p_value, name])

In [None]:
# Report, per variable, whether Levene's test rejects equal variances between
# the two egfr_hscore_200 groups at the 0.05 level.
# Entries of Results_Levene are [statistic, p-value, variable name].
# The p-value is read from Results_Levene; the cell previously tested and
# printed the Shapiro-Wilk p-values from Results_Shapiro.
for _, p_value, name in Results_Levene:
    if p_value > 0.05:
        print(f'\n\t{name} : has a equal population variance.(p value {p_value})\n')
    else:
        print(f'{name} : does not have a equal population variance.(p value {p_value})')

# Detecting Outliers

In [None]:
# Box plots (outlier points only) of each variable for the two
# egfr_hscore_200 groups. The grouping column itself is skipped.
# This cell no longer resets Results_Shapiro: nothing here uses it, and
# clearing it discarded the results collected earlier.
for name in col:
    if name == 'egfr_hscore_200':
        continue
    trace0 = go.Box(y=df_egfr[name][df_egfr.egfr_hscore_200 == 1],
                    name="EGFR+ve Group",
                    boxpoints='outliers')
    trace1 = go.Box(y=df_egfr[name][df_egfr.egfr_hscore_200 == 2],
                    name="EGFR-ve Group",
                    boxpoints='outliers')
    layout = {'title': 'Detecting Outliers for ' + name + ' in both the groups',
              'xaxis': {'title': name,
                        'zeroline': False},
              'yaxis': {'title': name + ' Values',
                        'zeroline': False}
              }
    data = [trace0, trace1]
    iplot({'data': data, 'layout': layout})

In [None]:
# z-scores of the column at position i of `col`, sorted ascending so the most
# extreme cases sit at both ends (candidates for dropping as outliers).
i = 6
val = df_egfr[col[i]]
mean = val.mean()
std = val.std()
z_scores = (val - mean) / std
z_scores.sort_values()

# Correlation

In [None]:
#df.info()

In [None]:
colfull = list(df.columns)

In [None]:
# One Series per variable with that variable's own missing values removed.
# Each Series is filtered independently, so lengths may differ between them.
age = df['age'].dropna()
rt_dose = df['rt_dose'].dropna()
volume_ptv = df['volume_ptv'].dropna()
dmax_ptv = df['dmax_ptv'].dropna()
v95_ptv = df['v95_ptv'].dropna()
v5_lung = df['v5_lung'].dropna()
v20_lung = df['v20_lung'].dropna()
dmean_lung = df['dmean_lung'].dropna()
v5_heart = df['v5_heart'].dropna()
v30_heart = df['v30_heart'].dropna()
v20_esophagus = df['v20_esophagus'].dropna()
v60_esophagus = df['v60_esophagus'].dropna()
Dmean_PTV_CTV_MARGIN = df['Dmean_PTV_CTV_MARGIN'].dropna()
survival_months = df['survival_months'].dropna()

In [None]:
# Scatter of mean lung dose against survival time.
# x and y are taken from one jointly filtered frame. The separate
# `survival_months` and `dmean_lung` Series were each filtered on their own
# missing values, so pairing them by position could match values that belong
# to different patients.
paired = df[['survival_months', 'dmean_lung']].dropna()
trace0 = go.Scatter(x=paired['survival_months'],
                    y=paired['dmean_lung'],
                    mode='markers',
                    # four components need the rgba() form
                    marker = dict(size=12,color='rgba(255,70,0,0.9)'))
data = [trace0]
# Labels describe the plotted columns; they previously named PTV volume and
# lung V5Gy, neither of which is on this figure.
# NOTE(review): if PTV volume vs lung dose was the intended plot, change the
# data columns instead.
layout = {'title':'Correlation between Survival and Mean Lung Dose',
         'xaxis':{'title':'Survival in Months','zeroline':False},
         'yaxis':{'title':'Mean Lung Dose','zeroline':False}
         }
iplot({'data':data,'layout':layout})

     7   age                               495 non-null    float64
     22  rt_dose                           482 non-null    float64
     40  volume_ptv                        474 non-null    float64
     41  dmax_ptv                          474 non-null    float64
     42  v100_ptv                          474 non-null    float64
     43  v95_ptv                           474 non-null    float64
     44  v5_lung                           474 non-null    float64
     45  v20_lung                          474 non-null    float64
     46  dmean_lung                        474 non-null    float64
     47  v5_heart                          472 non-null    float64
     48  v30_heart                         472 non-null    float64
     49  v20_esophagus                     471 non-null    float64
     50  v60_esophagus                     471 non-null    float64
     51  Dmin_PTV_CTV_MARGIN               448 non-null    float64
     52  Dmax_PTV_CTV_MARGIN               448 non-null    float64
     53  Dmean_PTV_CTV_MARGIN              448 non-null    float64
     79  survival_months                   495 non-null    float64

In [None]:
# Pearson correlation between PTV volume and mean lung dose, computed on rows
# where both values are present so the inputs have equal length and each
# pair comes from the same patient.
ptv_lung = df[['volume_ptv', 'dmean_lung']].dropna()
stats.pearsonr(ptv_lung['volume_ptv'], ptv_lung['dmean_lung'])

In [None]:
# Simple linear regression of mean lung dose on PTV volume, computed on rows
# where both values are present so the inputs have equal length and each
# pair comes from the same patient.
ptv_lung = df[['volume_ptv', 'dmean_lung']].dropna()
stats.linregress(ptv_lung['volume_ptv'], ptv_lung['dmean_lung'])

$$ y = -0.009x + 97.467 $$

In [None]:
# Scatter of lung V20Gy against PTV volume with the fitted regression line.
# Points come from rows where both values are present, and the line is fitted
# from those same rows instead of using hand-copied coefficients.
paired = df[['volume_ptv', 'v20_lung']].dropna()
fit = stats.linregress(paired['volume_ptv'], paired['v20_lung'])
line_x = np.linspace(0, paired['volume_ptv'].max(), len(df))

trace0 = go.Scatter(x=paired['volume_ptv'],
                    y=paired['v20_lung'],
                    name = "Data",
                    mode='markers',
                    # four components need the rgba() form
                    marker = dict(size=12,color='rgba(255,70,0,0.9)'))
trace1 = go.Scatter(x=line_x,
                   y = fit.slope * line_x + fit.intercept,
                   name='Regression Line',
                   mode = 'lines')
data = [trace0,trace1]
# Labels name the plotted column (v20_lung); they previously said
# "V95% Coverage".
layout = {'title':'Correlation between PTV Volume and Lung Doses V20Gy',
         'xaxis':{'title':'PTV volume','zeroline':False},
         'yaxis':{'title':'Lung Doses V20Gy','zeroline':False}
         }
iplot({'data':data,'layout':layout})

In [None]:
x = df[['v20_lung','v60_esophagus']][df['volume_ptv'].notna() & df['v20_lung'].notna() & df['v60_esophagus'].notna()]

In [None]:
# Add the constant (intercept) column to the predictor frame used by the
# OLS fit below.
x = sm.add_constant(x)
# Plain NumPy copies of the two predictors, kept for single-variable checks.
x1 = x['v20_lung'].to_numpy()
x2 = x['v60_esophagus'].to_numpy()

In [None]:
y = df.volume_ptv[df['volume_ptv'].notna() & df['v20_lung'].notna() & df['v60_esophagus'].notna()]

In [None]:
y

In [None]:
model = sm.OLS(y,x).fit()

In [None]:
print(model.summary())

    x1 = v20_lung
    x2 = v60_esophagus
    y = volume_ptv
$$y = 4.444*x1 + 2.45*x2 + 335.556$$

# Linear Regression

In [None]:
stats.pearsonr(x1,y)

In [None]:
model=sm.OLS(y,x)
result = model.fit()
result.summary()

In [None]:
x.iloc[0]

In [None]:
y.iloc[0]

$$v20lung*4.444 + v60esophagus*2.45 + 335.556$$

In [None]:
27.19*4.444 + 1.31*2.45 + 335.556

In [None]:
list(zip(np.round(result.predict(x)[0:5], 1), y[0:5]))

# Comparing means --Parametric test

In [None]:
v20_lung.mean()

In [None]:
stats.ttest_1samp(v20_lung, 29.45) # one sample t test

In [None]:
col

In [None]:
# Values of the column at position i of `col`, split into arms 1+3 (var1)
# and arms 2+4 (var2), using only rows where that column is present.
i=12
df_1 = df[df[col[i]].notna()]
var1 = df_1[(df_1.arm == 1) | (df_1.arm == 3)][col[i]]
# Both halves of the mask come from df_1; `df.arm == 2` was previously mixed
# in, which has a different index from the frame being filtered.
var2 = df_1[(df_1.arm == 2) | (df_1.arm == 4)][col[i]]
print(col[i])

In [None]:
# student t test
stats.ttest_ind(var1,var2)

In [None]:
var1.mean()

In [None]:
var2.mean()

### Variables having significant difference in mean among high dose versus standard dose
           v95_ptv, v20_lung, dmean_lung, v30_heart, v20_esophagus,v60_esophagus

# Oneway Anova

In [None]:
# Values of the column at position i of `col`, one sample per study arm,
# using only rows where that column is present.
# Every mask is built from df_1; arms 2 and 4 previously used `df.arm`, which
# has a different index from the frame being filtered.
i=10
df_1 = df[df[col[i]].notna()]
var1 = df_1[(df_1.arm == 1)][col[i]]
var2 = df_1[(df_1.arm == 2)][col[i]]
var3 = df_1[(df_1.arm == 3)][col[i]]
var4 = df_1[(df_1.arm == 4)][col[i]]
print(col[i])

In [None]:
stats.f_oneway(var1,var2,var3,var4)

# Comparing Categorical variables

In [None]:
pd.crosstab(df.arm,df.rt_compliance_ptv90)

In [None]:
stats.chi2_contingency(pd.crosstab(df.arm,df.rt_compliance_ptv90))