In [None]:
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import shutil
import os

# Variables
### analysis_inclusion 
0=Excluded, 1=Included
### age
Age at study entry (years)
### gender 
1=Male 2=Female
### ethnicity 
1=Hispanic or Latino 2=Not Hispanic or Latino 9=Unknown
### arm
Assigned treatment arm:
1=60 Gy, no cetuximab
2=74 Gy, no cetuximab
3=60 Gy + cetuximab
4=74 Gy + cetuximab
Note: Analyses of 60 Gy vs 74 Gy compare arms 1+3 vs 2+4. Analyses of Cetuximab vs No cetuximab compare arms 1+2 vs 3+4.
### race
1=American Indian/Alaskan Native 2=Asian 3=Black or African American 4=Native Hawaiian/Other Pacific Islander
5=White 9=Unknown
### zubrod
0=Normal activity 1=Symptoms, but nearly fully ambulatory
### histology
1=Squamous cell carcinoma 2=Adenocarcinoma 3=Large cell undifferentiated 5=Non-small cell lung cancer NOS
### nonsquam_squam
1=Non-squamous histology 2=Squamous histology
### ajcc_stage_grp
1=IIIA, or N2 with an undetectable primary
2=IIIB, or N3 with an undetectable primary
### pet_staging
1=No use of PET in staging 2=PET used in staging
### rt_technique
1=3D-CRT
2=IMRT
### has_egfr_hscore
0=No H-Score (no tissue, insufficient tissue)
1=H-Score able to be determined
### egfr_hscore_200
1=H-Score < 200
2=H-Score ≥ 200
Note: Blank for patients with has_egfr_hscore = 0
### smoke_hx
1=Non-smoker (<100 cigarettes in lifetime)
2=Former light smoker (≤10 pack years and quit ≥1 year ago)
3=Former heavy smoker (>10 pack years)
4=Current smoker (quit <1 year ago or currently smoke)
9=Unknown
### rt_dose
1=Received assigned dose (60 Gy/74 Gy)
2=Received less than assigned dose 3=Received more than assigned dose
Note: Blank for patients not receiving RT
### overall_rt_review
1=Per protocol
2=Acceptable variation 3=Unacceptable deviation 5=Incomplete RT – Death during RT 6=Incomplete RT – Progression 7=Incomplete RT – Refusal
8=No RT given
9=Not evaluable
### ptv_review
1=Per protocol
2=Acceptable variation 3=Unacceptable deviation
9=Not evaluable
Note: Blank indicates missing due to lack of contour to review
### Dmin_PTV_CTV_MARGIN
Minimum margin between PTV and CTV (mm) 
### Dmax_PTV_CTV_MARGIN
Maximum margin between PTV and CTV (mm)
### Dmean_PTV_CTV_MARGIN
Mean margin between PTV and CTV (mm)
### rt_compliance_ptv90
0= <90% of PTV covered by at least 95% of prescription dose
1= ≥90% of PTV covered by at least 95% of prescription dose
### grade3_esophagitis
0=Grade 3+ esophagitis not reported 
1=Grade 3+ esophagitis reported
### survival_status
0=Alive 1=Dead
### survival_months
Time since randomization to death/last follow-up (months)
### cod
1=Lung cancer under study 2=Second primary 3=Protocol treatment 4=Other cause 9=Unknown
Note: This will be blank for patients who are alive.
### local_failure
0=Alive at last follow-up without report of local failure at any time
1=Local failure
2=Dead without report of local failure at any time
### local_failure_months
Time since randomization to local failure/date of death or last follow-up if no failure (months)
### distant_failure
0=Alive at last follow-up without report of distant failure at any time
1=Distant failure
2=Dead without report of distant failure at any time
### distant_failure_months
Time since randomization to distant failure/date of death or last follow-up if no failure (months)
### progression_free_survival
0=Alive without progression 1=Progressed or death due to any cause
### progression_free_survival_months
Time since randomization to progression or date of death, or date of last follow-up if alive without progression (months)
### lost_to_followup
0=Not lost to follow-up 1=Lost to follow-up

In [None]:
# Read the main dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("NCT00533949-D1-Dataset.csv")

In [None]:
# Keep only the rows whose `analysis_inclusion` flag is positive
# (0=Excluded, 1=Included according to the variable list).
df = df.loc[df['analysis_inclusion'].gt(0)]

In [None]:
# Derive the image file name of each patient: strip the first five characters
# of `patid` and append the ".npy" extension used by the numpy image files.
ID = [patient_id[5:] + ".npy" for patient_id in df['patid'].values]

In [None]:
# Attach the image file names as a new `ID` column.
# `df` is a filtered slice of the frame read from disk, so assigning a column
# in place raises SettingWithCopyWarning on pre-3.0 pandas; `assign` builds a
# new frame instead and leaves the cell safe to re-run.
df = df.assign(ID=ID)

In [None]:
# Array of image file names (IDs) for all patients with survival_status == 0 (alive)
alive = df.loc[df['survival_status'] == 0, 'ID'].values

In [None]:
# Array of image file names (IDs) for all patients with survival_status == 1 (dead)
dead = df.loc[df['survival_status'] == 1, 'ID'].values

In [None]:
# List every entry in the 128x128x128 crop folder (relative to the working directory);
# the result is matched against the patient IDs in the cells below.
filenames = os.listdir('../crop128x128x128/')

In [None]:
# Move the image of every alive patient into the "alive" sub-folder.
#
# Files are moved from the same directory that was listed into `filenames`;
# the previous version listed '../crop128x128x128/' but moved from a
# hard-coded absolute Desktop path, which only works on one machine.
crop_dir = '../crop128x128x128/'
dst1 = os.path.join(crop_dir, 'alive')

# If the destination folder is missing, shutil.move() renames the first file
# to "alive" and later moves may overwrite it, so create the folder up front.
os.makedirs(dst1, exist_ok=True)

alive_ids = set(alive)  # constant-time membership test instead of a nested scan
for file in filenames:
    if file not in alive_ids:
        continue
    src1 = os.path.join(crop_dir, file)
    if not os.path.isfile(src1):
        # `filenames` is stale and the file was already moved by an earlier run.
        continue
    print(f'Alive patient:{file}')
    shutil.move(src1, dst1)

In [None]:
# Move the image of every dead patient into the "dead" sub-folder.
#
# Files are moved from the same directory that was listed into `filenames`;
# the previous version listed '../crop128x128x128/' but moved from a
# hard-coded absolute Desktop path, which only works on one machine.
crop_dir = '../crop128x128x128/'
dst2 = os.path.join(crop_dir, 'dead')

# If the destination folder is missing, shutil.move() renames the first file
# to "dead" and later moves may overwrite it, so create the folder up front.
os.makedirs(dst2, exist_ok=True)

dead_ids = set(dead)  # constant-time membership test instead of a nested scan
for file in filenames:
    if file not in dead_ids:
        continue
    src2 = os.path.join(crop_dir, file)
    if not os.path.isfile(src2):
        # `filenames` is stale and the file was already moved by an earlier run.
        continue
    print(f'Dead patient:{file}')
    shutil.move(src2, dst2)

In [None]:
# Build a reduced frame holding the patient id, the treatment arm, the
# inclusion flag and the outcome columns (status + time in months).
label_columns = [
    'patid', 'arm', 'analysis_inclusion',
    'survival_status', 'survival_months',
    'local_failure', 'local_failure_months',
    'distant_failure', 'distant_failure_months',
    'progression_free_survival', 'progression_free_survival_months',
]
rtogData = df[label_columns]

In [None]:
# Restrict the reduced frame to rows with a positive `analysis_inclusion` flag.
rtog_included_data = rtogData.loc[rtogData['analysis_inclusion'].gt(0)]

In [None]:
# Drop the inclusion flag, which is no longer needed after filtering.
# A new frame is returned instead of using inplace=True: the frame is a
# filtered slice, so an in-place drop raises SettingWithCopyWarning on
# pre-3.0 pandas. errors='ignore' keeps the cell re-runnable once the
# column is already gone.
rtog_included_data = rtog_included_data.drop(columns='analysis_inclusion', errors='ignore')

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the included-patient frame
rtog_included_data.info()

In [None]:
# saving the newly formed dataframe as a CSV file in the current working directory
#rtog_included_data.to_csv('rtog4labels.csv')

In [None]:
# Keep patients whose `survival_months` value exceeds 17.99999, i.e. effectively
# "at least 18 months" (the threshold sits just below 18, presumably to
# tolerate floating-point rounding in the recorded months - confirm).
rtog_18months = rtog_included_data.loc[rtog_included_data['survival_months'].gt(17.99999)]

In [None]:
#testdf = rtog_2year[(rtog_2year['arm'] == 2) | (rtog_2year['arm'] == 4)] #74Gy is arm 2 and 4

In [None]:
#testdf.info()

In [None]:
#testdf['egfr_hscore_200'].value_counts()

In [None]:
#df['egfr_hscore_200'].value_counts()

In [None]:
# Print a summary of the frame restricted by the survival-time threshold
rtog_18months.info()

In [None]:
# Derive the image file name of each remaining patient: strip the first five
# characters of `patid` and append the ".npy" extension.
ID = [patient_id[5:] + ".npy" for patient_id in rtog_18months['patid'].values]

In [None]:
# Attach the image file names as a new `ID` column.
# `rtog_18months` is a filtered slice, so assigning a column in place raises
# SettingWithCopyWarning on pre-3.0 pandas; `assign` returns a new frame
# instead and leaves the cell safe to re-run.
rtog_18months = rtog_18months.assign(ID=ID)

In [None]:
# Print the frame summary again, now that the `ID` column has been added
rtog_18months.info()

# Patients with a minimum survival of 18 months

In [None]:
# One-column frame (`ID`) of all patients with survival_status == 0 (alive)
alive_patient = rtog_18months.loc[rtog_18months['survival_status'] == 0, ['ID']]

In [None]:
# One-column frame (`ID`) of all patients with survival_status == 1 (dead)
dead_patient = rtog_18months.loc[rtog_18months['survival_status'] == 1, ['ID']]

In [None]:
# High-dose group = arms 2 and 4 (74 Gy); keep the IDs of its alive patients
H_dose = rtog_18months[rtog_18months['arm'].isin([2, 4])]
alive4HD = H_dose.loc[H_dose['survival_status'] == 0, ['ID']]

In [None]:
# High-dose group = arms 2 and 4 (74 Gy); keep the IDs of its dead patients
H_dose = rtog_18months[rtog_18months['arm'].isin([2, 4])]
dead4HD = H_dose.loc[H_dose['survival_status'] == 1, ['ID']]

In [None]:
# Standard-dose group = arms 1 and 3 (60 Gy); keep the IDs of its alive patients
S_dose = rtog_18months[rtog_18months['arm'].isin([1, 3])]
alive4SD = S_dose.loc[S_dose['survival_status'] == 0, ['ID']]

In [None]:
# Standard-dose group = arms 1 and 3 (60 Gy); keep the IDs of its dead patients
S_dose = rtog_18months[rtog_18months['arm'].isin([1, 3])]
dead4SD = S_dose.loc[S_dose['survival_status'] == 1, ['ID']]

In [None]:
# Turn the one-column frame into a plain list of IDs of dead patients (all arms 1-4).
# Note: the name is rebound from a frame to a list, so this cell runs only once per kernel.
dead_patient = dead_patient['ID'].tolist()

In [None]:
# Turn the one-column frame into a plain list of IDs of alive patients (all arms 1-4).
# Note: the name is rebound from a frame to a list, so this cell runs only once per kernel.
alive_patient = alive_patient['ID'].tolist()

In [None]:
# Replace the one-column frame with the array of its ID values
# (alive patients in the high-dose group, arms 2 and 4).
alive4HD = alive4HD['ID'].values

In [None]:
# Replace the one-column frame with the array of its ID values
# (dead patients in the high-dose group, arms 2 and 4).
dead4HD = dead4HD['ID'].values

In [None]:
# Replace the one-column frame with the array of its ID values
# (alive patients in the standard-dose group, arms 1 and 3).
alive4SD = alive4SD['ID'].values

In [None]:
# Replace the one-column frame with the array of its ID values
# (dead patients in the standard-dose group, arms 1 and 3).
dead4SD = dead4SD['ID'].values

In [None]:
# Report how many alive / dead patients remain across all four arms
print(f'All Group Alive Patient: {len(alive_patient)}, All Group Dead Patient: {len(dead_patient)}')

In [None]:
# Report how many alive / dead patients remain in the high-dose group (arms 2 and 4)
print(f'High Dose Group Alive Patient: {len(alive4HD)}, High Dose Group Dead Patient: {len(dead4HD)}')

In [None]:
# Report how many alive / dead patients remain in the standard-dose group (arms 1 and 3)
print(f'Standard Dose Group Alive Patient: {len(alive4SD)}, Standard Dose Group Dead Patient: {len(dead4SD)}')

In [None]:
# List every entry in the 64x64x64 crop folder (relative to the working directory);
# this rebinds `filenames`, which previously held the 128x128x128 listing.
filenames = os.listdir('../cropped_64X64X64')

In [None]:
# Move the image of every alive patient (minimum-survival cohort) into the
# "alive" sub-folder.
#
# Files are moved from the same directory that was listed into `filenames`;
# the previous version listed '../cropped_64X64X64' but moved from a
# hard-coded absolute Desktop path, which only works on one machine.
crop_dir = '../cropped_64X64X64'
dst1 = os.path.join(crop_dir, 'alive')

# If the destination folder is missing, shutil.move() renames the first file
# to "alive" and later moves may overwrite it, so create the folder up front.
os.makedirs(dst1, exist_ok=True)

alive_ids = set(alive_patient)  # constant-time membership test instead of a nested scan
for file in filenames:
    if file not in alive_ids:
        continue
    src1 = os.path.join(crop_dir, file)
    if not os.path.isfile(src1):
        # `filenames` is stale and the file was already moved by an earlier run.
        continue
    print(f'Alive patient:{file}')
    shutil.move(src1, dst1)

In [None]:
# Move the image of every dead patient (minimum-survival cohort) into the
# "dead" sub-folder.
#
# Files are moved from the same directory that was listed into `filenames`;
# the previous version listed '../cropped_64X64X64' but moved from a
# hard-coded absolute Desktop path, which only works on one machine.
crop_dir = '../cropped_64X64X64'
dst2 = os.path.join(crop_dir, 'dead')

# If the destination folder is missing, shutil.move() renames the first file
# to "dead" and later moves may overwrite it, so create the folder up front.
os.makedirs(dst2, exist_ok=True)

dead_ids = set(dead_patient)  # constant-time membership test instead of a nested scan
for file in filenames:
    if file not in dead_ids:
        continue
    src2 = os.path.join(crop_dir, file)
    if not os.path.isfile(src2):
        # `filenames` is stale and the file was already moved by an earlier run.
        continue
    print(f'Dead patient:{file}')
    shutil.move(src2, dst2)