In [None]:
import os
import numpy as np
import pandas as pd
import dicom
import SimpleITK as sitk

In [None]:
# Input locations (absolute paths; adjust them to wherever the data sets live).

# DSB 2017: parent folder of the per-patient DICOM directories, and the stage-1 label table.
dicom_path = "E:/DSB 2017/stage1/stage1"
stage1_labels_path = "E:/DSB 2017/stage1_labels.csv"

# LUNA16: folder that is scanned for .mhd files, and the annotation table.
raw_path = "E:/LUNA16"
annotations_path = "E:/LUNA16/annotations.csv"

# DSB Metadata

In [None]:
# Load the stage-1 label table. header=0 together with `names` discards the file's own
# header row and labels the two columns 'hex_id' and 'cancer' instead.
dsb_metadata = pd.read_csv(
    stage1_labels_path,
    header=0,
    names=['hex_id', 'cancer'],
)

In [None]:
# Order patients by hex id and renumber the rows 0..n-1. Without the reset, the row labels keep
# their pre-sort values, so the positional int_id assigned below could point at a different
# patient than the label-based lookups (`['hex_id'][p]`, `.loc[p, ...]`) used by the DICOM loop.
dsb_metadata = dsb_metadata.sort_values('hex_id', axis=0).reset_index(drop=True)
dsb_metadata.insert(0, "int_id", range(0, len(dsb_metadata)))

# Placeholder columns that the DICOM loop fills in per patient.
# Integer placeholders for the pixel counts ...
dsb_metadata.insert(3, "x_len (px)", 0)
dsb_metadata.insert(4, "y_len (px)", 0)
dsb_metadata.insert(5, "z_len (px)", 0)
# ... and float placeholders for the spacings and the rescale parameters.
dsb_metadata.insert(6, "x_mm/px", 0.0)
dsb_metadata.insert(7, "y_mm/px", 0.0)
dsb_metadata.insert(8, "z_mm/px", 0.0)
dsb_metadata.insert(9, "rescale_intercept", 0.0)
dsb_metadata.insert(10, "rescale_slope", 0.0)

In [None]:
# Replace the numeric id with a fixed-width text label: right-aligned and
# left-padded with '0' to at least four characters (e.g. 7 -> '0007').
dsb_metadata['int_id'] = dsb_metadata['int_id'].apply('{0:0>4}'.format)

In [None]:
# Display the table as this cell's output (ids, labels and the still-unfilled placeholder columns).
dsb_metadata

In [None]:
# Fill the placeholder columns of dsb_metadata from each patient's DICOM slices.
# The loop covers every patient: a leftover debugging range of (0, 1) previously filled only the
# first row, while the export and the summary metrics further down use the whole table.
# `p` is used as a row label, both for the hex-id lookup and for the .loc assignments.
for p in range(len(dsb_metadata)):
    dicom_patient_path = dicom_path + '/' + dsb_metadata['hex_id'][p]
    # read in the individual slices
    dicom_slices = [dicom.read_file(dicom_patient_path + '/' + s) for s in os.listdir(dicom_patient_path)]
    # Sort slices on the third component of ImagePositionPatient. The key is compared as a float:
    # truncating it to int makes slices less than one unit apart compare equal, which leaves them
    # in directory-listing order and can make slices 0 and 1 non-adjacent (wrong z spacing below).
    dicom_slices.sort(key=lambda x: float(x.ImagePositionPatient[2]))

    first_slice = dicom_slices[0]
    second_slice = dicom_slices[1]

    # NOTE(review): "x" is taken from Rows / PixelSpacing[0] and "y" from Columns / PixelSpacing[1];
    # kept exactly as in the original - confirm this is the intended axis naming.
    dsb_metadata.loc[p, "x_len (px)"] = first_slice.Rows
    dsb_metadata.loc[p, "y_len (px)"] = first_slice.Columns
    dsb_metadata.loc[p, "z_len (px)"] = len(dicom_slices)

    dsb_metadata.loc[p, "x_mm/px"] = first_slice.PixelSpacing[0]
    dsb_metadata.loc[p, "y_mm/px"] = first_slice.PixelSpacing[1]
    # Slice spacing = distance between the first two sorted slices; fall back to SliceLocation
    # if the position-based computation fails. `except Exception` (rather than a bare except)
    # keeps the fallback but lets KeyboardInterrupt / SystemExit stop a long run.
    try:
        dsb_metadata.loc[p, "z_mm/px"] = np.abs(first_slice.ImagePositionPatient[2] - second_slice.ImagePositionPatient[2])
    except Exception:
        dsb_metadata.loc[p, "z_mm/px"] = np.abs(first_slice.SliceLocation - second_slice.SliceLocation)

    dsb_metadata.loc[p, "rescale_intercept"] = first_slice.RescaleIntercept
    dsb_metadata.loc[p, "rescale_slope"] = first_slice.RescaleSlope
    if p % 10 == 0:
        # Parenthesised single-argument form prints the same text under Python 2 and Python 3.
        print('patient 0 through ' + str(p) + ' complete')

    # Drop the references to this patient's slices before loading the next patient.
    del dicom_slices, first_slice, second_slice

In [None]:
#Add Calculated Columns
# Extent along each axis in mm = number of pixels * spacing (mm per pixel).
x_extent_mm = dsb_metadata["x_len (px)"] * dsb_metadata["x_mm/px"]
y_extent_mm = dsb_metadata["y_len (px)"] * dsb_metadata["y_mm/px"]
z_extent_mm = dsb_metadata["z_len (px)"] * dsb_metadata["z_mm/px"]
dsb_metadata["x_len (mm)"] = x_extent_mm
dsb_metadata["y_len (mm)"] = y_extent_mm
dsb_metadata["z_len (mm)"] = z_extent_mm

# Scan volume, first as a voxel count, then as a physical volume:
# the product of three mm lengths is mm^3, and dividing by 10**9 converts mm^3 to m^3.
voxel_count = dsb_metadata["x_len (px)"] * dsb_metadata["y_len (px)"] * dsb_metadata["z_len (px)"]
dsb_metadata["volume (voxels)"] = voxel_count
dsb_metadata["volume (m^3)"] = x_extent_mm * y_extent_mm * z_extent_mm / 10**9

In [None]:
# Display the table, now including the calculated length and volume columns, as this cell's output.
dsb_metadata

In [None]:
# Write the table to 'DSB Metadata.csv' (relative path, so it lands in the current working
# directory); index=False leaves the row labels out of the file.
dsb_metadata.to_csv(
    'DSB Metadata.csv',
    index=False,
)

In [None]:
#Key Metrics
# Cohort size and share of patients labelled with cancer.
# The share is a plain ratio (no * 100), despite the "perc" in its name.
num_patients = len(dsb_metadata)
num_patients_w_cancer = dsb_metadata['cancer'].sum()
perc_patients_w_cancer = num_patients_w_cancer / float(num_patients)

# Slice counts per scan.
slices_per_scan = dsb_metadata['z_len (px)']
total_num_slices = slices_per_scan.sum()
avg_num_slices = slices_per_scan.mean()

# Voxel counts per scan.
voxels_per_scan = dsb_metadata['volume (voxels)']
total_num_px = voxels_per_scan.sum()
avg_num_px = voxels_per_scan.mean()

# Physical scan volume, taken from the 'volume (m^3)' column.
volume_per_scan = dsb_metadata['volume (m^3)']
avg_volume = volume_per_scan.mean()
max_volume = volume_per_scan.max()

In [None]:
# Print every DSB metric on its own line as "<label><value>".
# The single-argument parenthesised print emits the same text under Python 2 and Python 3.
for label, value in [
    ('num_patients = ', num_patients),
    ('num_patients_w_cancer = ', num_patients_w_cancer),
    ('perc_patients_w_cancer = ', perc_patients_w_cancer),
    ('total_num_slices = ', total_num_slices),
    ('avg_num_slices = ', avg_num_slices),
    ('total_num_px = ', total_num_px),
    ('avg_num_pix = ', avg_num_px),
    ('avg_volume = ', avg_volume),
    ('max_volume = ', max_volume),
]:
    print(label + str(value))

# LUNA Metadata

In [None]:
# Per-scan nodule statistics from the annotation table: nodule count and summed nodule volume.
cancer_annotations = pd.read_csv(annotations_path, header=0)  # location comes from annotations_path

# Volume of each nodule, modelled as a sphere of the annotated diameter: V = 4/3 * pi * r**3.
# The factor is written as 4.0 / 3.0 on purpose: under Python 2 the integer expression 4/3
# evaluates to 1, which silently made every volume 25% too small. Dividing the diameter by 2.0
# likewise avoids integer division should the column ever be read as integers.
# Assigned directly (no int placeholder first), so the column is float from the start.
cancer_annotations['vol'] = 4.0 / 3.0 * np.pi * (cancer_annotations['diameter_mm'] / 2.0) ** 3

# count() gives the number of annotation rows per series, sum() the total volume per series.
counts = cancer_annotations.groupby('seriesuid').count()
volumes = cancer_annotations.groupby('seriesuid').sum()

# Joining the two on the series id suffixes the clashing column names with _x (counts) and
# _y (sums); only the two 'vol' columns are kept and given descriptive names.
cancer_annotations_processed = pd.merge(counts, volumes, left_index=True, right_index=True)[['vol_x', 'vol_y']]
cancer_annotations_processed.rename(index=str, columns={"vol_x": "nodule_count", "vol_y": "sum_nodule_volume"}, inplace=True)
cancer_annotations_processed

In [None]:
# Build the LUNA table with one row per .mhd header file found in raw_path.
all_files = os.listdir(raw_path)

# Keep only names that END in '.mhd'. A substring test ('.mhd' in name) would also accept
# names such as 'scan.mhd.bak', and stripping four characters from those yields a bogus id.
raw_patients = sorted(name for name in all_files if name.endswith('.mhd'))

# Drop the '.mhd' extension so the id is just the series identifier.
# splitext only splits at the last dot, so dots inside the identifier are preserved.
patient_ids = [os.path.splitext(name)[0] for name in raw_patients]
luna_metadata = pd.DataFrame({'hex_id': patient_ids}, dtype='object')

In [None]:
# Order the scans by id and number them sequentially.
luna_metadata.sort_values('hex_id', axis=0, inplace=True)
luna_metadata.insert(0, "int_id", range(0, len(luna_metadata)))

# Placeholder columns, inserted at positions 2..7 right after int_id and hex_id:
# integer zeros for the pixel counts, float zeros for the spacings.
for position, (column, placeholder) in enumerate([
    ("x_len (px)", 0),
    ("y_len (px)", 0),
    ("z_len (px)", 0),
    ("x_mm/px", 0.0),
    ("y_mm/px", 0.0),
    ("z_mm/px", 0.0),
], 2):
    luna_metadata.insert(position, column, placeholder)

In [None]:
# Display the table (ids plus the still-unfilled placeholder columns) as this cell's output.
luna_metadata

In [None]:
# Fill the size and spacing columns of luna_metadata from each scan's .mhd file.
# `p` is used as a row label for both the id lookup and the .loc assignments.
for p in range(len(raw_patients)):
    raw_slices = sitk.ReadImage(raw_path + '/' + luna_metadata['hex_id'][p] + '.mhd')

    # Image size per axis -> the "*_len (px)" columns.
    size = raw_slices.GetSize()
    luna_metadata.loc[p, 'x_len (px)'] = size[0]
    luna_metadata.loc[p, 'y_len (px)'] = size[1]
    luna_metadata.loc[p, 'z_len (px)'] = size[2]

    # Image spacing per axis -> the "*_mm/px" columns.
    spacing = raw_slices.GetSpacing()
    luna_metadata.loc[p, 'x_mm/px'] = spacing[0]
    luna_metadata.loc[p, 'y_mm/px'] = spacing[1]
    luna_metadata.loc[p, 'z_mm/px'] = spacing[2]

    # Progress message on every tenth scan.
    if p % 10 == 0:
        print('patient 0 through ' + str(p) + ' complete')

    # Drop the reference to this image before reading the next one.
    del raw_slices

In [None]:
#Add Calculated Columns   
luna_metadata["x_len (mm)"] = luna_metadata["x_len (px)"] * luna_metadata["x_mm/px"]
luna_metadata["y_len (mm)"] = luna_metadata["y_len (px)"] * luna_metadata["y_mm/px"]
luna_metadata["z_len (mm)"] = luna_metadata["z_len (px)"] * luna_metadata["z_mm/px"] 

luna_metadata["volume (voxels)"] = luna_metadata["x_len (px)"] * luna_metadata["y_len (px)"] * luna_metadata["z_len (px)"]
luna_metadata["volume (m^3)"] = luna_metadata["x_len (mm)"] * luna_metadata["y_len (mm)"] * luna_metadata["z_len (mm)"] / 10**9

In [None]:
# Attach the per-scan nodule statistics. It is a left join on the scan id, so every scan is
# kept and scans without a matching annotation row get NaN in the nodule columns.
luna_metadata = luna_metadata.merge(
    cancer_annotations_processed,
    how='left',
    left_on='hex_id',
    right_index=True,
)

# Average nodule volume per scan = summed volume / number of nodules.
nodule_volume_total = luna_metadata['sum_nodule_volume']
nodule_total = luna_metadata['nodule_count']
luna_metadata['mean_nodule_volume'] = nodule_volume_total / nodule_total

In [None]:
# Write the table to 'LUNA Metadata.csv' (relative path, so it lands in the current working
# directory); index=False leaves the row labels out of the file.
luna_metadata.to_csv(
    'LUNA Metadata.csv',
    index=False,
)

In [None]:
# Display the table, now including the calculated columns and the merged nodule statistics.
luna_metadata

In [None]:
#Key Metrics
num_patients = len(luna_metadata)

total_num_slices = luna_metadata['z_len (px)'].sum()
avg_num_slices = luna_metadata['z_len (px)'].mean()

total_num_px = luna_metadata['volume (voxels)'].sum()
avg_num_px = luna_metadata['volume (voxels)'].mean()

avg_volume = luna_metadata['volume (m^3)'].mean() 
max_volume = luna_metadata['volume (m^3)'].max() 

In [None]:
# Print every LUNA metric on its own line as "<label><value>".
# The single-argument parenthesised print emits the same text under Python 2 and Python 3.
for label, value in [
    ('num_patients = ', num_patients),
    ('total_num_slices = ', total_num_slices),
    ('avg_num_slices = ', avg_num_slices),
    ('total_num_px = ', total_num_px),
    ('avg_num_pix = ', avg_num_px),
    ('avg_volume = ', avg_volume),
    ('max_volume = ', max_volume),
]:
    print(label + str(value))