In [2]:
import pandas as pd
import numpy as np
import pydicom
import os
import matplotlib.pyplot as plt
import cv2
import math
%matplotlib inline

In [3]:
# Target size of every processed scan. Both values are deliberately small so the
# data can be processed on a local machine instead of in the cloud.
IMG_PXL_SIZE = 50  # each slice is resized to IMG_PXL_SIZE x IMG_PXL_SIZE pixels
HM_SLICES = 20  # number of slices each scan is reduced to along the depth axis

In [4]:
# Data paths. NOTE: these are absolute, machine-specific locations; adjust them
# before running the notebook anywhere else.
# Label table; the first CSV column becomes the index used to look up a patient's label.
data_labels = pd.read_csv('D:/Documents/Major Project/Jupyter Notebooks/stage1_labels.csv',index_col=0)
# Directory that is listed below to obtain the patients to process.
data_dir = 'D:/Documents/Major Project/sample_images/'
patients = os.listdir(data_dir)
# Sample submission table (columns `id` and `cancer`, per the .info() output below).
test_patients = pd.read_csv('D:/Documents/Major Project/Jupyter Notebooks/stage1_sample_submission.csv')

In [5]:
# Summarise the sample-submission frame: row count, columns, dtypes and memory use.
test_patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 2 columns):
id        198 non-null object
cancer    198 non-null float64
dtypes: float64(1), object(1)
memory usage: 3.2+ KB


In [6]:
len(patients) # number of entries listed in data_dir (the sample set holds 20 patients)

20

In [8]:
# Function to break a list of slices into chunks of lists
def chunks(l, n):
    """Lazily split ``l`` into consecutive pieces of ``n`` items each.

    For a positive ``n`` every piece holds exactly ``n`` items, except the
    last one, which holds whatever is left over.
    """
    total = len(l)
    starts = range(0, total, n)
    yield from (l[begin:begin + n] for begin in starts)

In [9]:
def mean(l):
    """Return the arithmetic mean of the items in ``l`` (their sum over their count)."""
    total = sum(l)
    count = len(l)
    return total / count

## Function to limit depth

This function limits the depth of each 3-D CT scan to 20 slices. The scans contain a variable number of slices, so without this step they cannot be fed to a neural network, which requires inputs of a fixed size.

In [10]:
# Function to process data
def process_data(patient, data_labels, img_pxl_size=20, hm_slices=20, vizualize = False):
    """Load one patient's CT scan and reduce it to a fixed-size 3-D array.

    The DICOM files in ``data_dir + patient`` are read, ordered by the third
    component of ``ImagePositionPatient``, resized to
    ``img_pxl_size x img_pxl_size`` and then averaged in consecutive groups so
    that exactly ``hm_slices`` slices remain.

    Parameters
    ----------
    patient : str
        Name of the patient's directory inside ``data_dir``; also the index
        key used to look up the label.
    data_labels : pandas.DataFrame
        Label table indexed by patient, with a ``'cancer'`` column.
    img_pxl_size : int
        Side length, in pixels, every slice is resized to.
    hm_slices : int
        Number of slices in the returned volume.
    vizualize : bool
        When true, show the first 12 resized slices in a matplotlib figure.

    Returns
    -------
    (numpy.ndarray, label)
        The volume with ``hm_slices`` entries along the first axis, and the
        label as ``np.array([0, 1])`` when ``cancer == 1``,
        ``np.array([1, 0])`` when ``cancer == 0``, or the raw value otherwise.

    Raises
    ------
    KeyError
        If ``patient`` has no entry in ``data_labels``. This is raised before
        any image is read.
    ValueError
        If the patient's directory contains no files.
    """
    # DataFrame.get_value was removed in pandas 1.0; .at is the scalar
    # accessor and also raises KeyError for an unknown patient.
    label = data_labels.at[patient, 'cancer']

    path = os.path.join(data_dir, patient)
    # The file imports `pydicom`, not `dicom`; dcmread is its reader function.
    slices = [pydicom.dcmread(os.path.join(path, s)) for s in os.listdir(path)]
    if not slices:
        raise ValueError('No DICOM slices found for patient {}'.format(patient))
    slices.sort(key = lambda x: float(x.ImagePositionPatient[2]))

    # Honour the arguments instead of silently using the notebook-level constants.
    slices = [cv2.resize(np.array(each_slice.pixel_array), (img_pxl_size, img_pxl_size))
              for each_slice in slices]

    chunk_sizes = math.ceil(len(slices) / hm_slices)

    # Average each group of consecutive slices into one. np.mean accumulates
    # integer input in float64, so sums of integer pixel data cannot wrap
    # around the way a plain sum() in the pixel dtype can.
    new_slices = [np.mean(slice_chunk, axis=0) for slice_chunk in chunks(slices, chunk_sizes)]

    # Ceil-sized chunks never yield more than hm_slices groups, but they can
    # yield fewer (by any amount, not just one or two). Repeat the last
    # averaged slice until the requested depth is reached.
    while len(new_slices) < hm_slices:
        new_slices.append(new_slices[-1])

    if vizualize:
        fig = plt.figure()
        for num,each_slice in enumerate(slices[:12]):
            y = fig.add_subplot(4,5,num+1)
            y.imshow(each_slice)

        plt.show()

    # One-hot encode the binary label; any other value is passed through unchanged.
    if label == 1: label = np.array([0,1])
    elif label == 0: label = np.array([1,0])

    return np.array(new_slices), label

## Process the CT scans and save them as 3-dimensional arrays for the neural network

**This step converted approximately 150 gigabytes of CT scan data to just ~850 megabytes of NumPy arrays.**

In [9]:
# Process every patient and save the results as NumPy files

train_data = []  # one [volume, label] pair per labeled patient

test_data = []  # one [volume] entry per patient without a label

for num, patient in enumerate(patients):
    if num%50 == 0:
        print(num)

    if patient in data_labels.index:
        img_data, label = process_data(patient, data_labels, img_pxl_size=IMG_PXL_SIZE, hm_slices=HM_SLICES)
        train_data.append([img_data,label])
        print(img_data.shape, label)

    else:
        # process_data looks the label up before it reads any image, so for an
        # unlabeled patient it raises KeyError without producing a volume.
        # Catching that error and appending `img_data` would store the
        # PREVIOUS patient's scan (or fail with NameError on the first
        # iteration). Pass a one-row placeholder label table instead so the
        # scan itself is processed; the placeholder label is discarded.
        placeholder_labels = pd.DataFrame({'cancer': [-1]}, index=[patient])
        img_data, _ = process_data(patient, placeholder_labels, img_pxl_size=IMG_PXL_SIZE, hm_slices=HM_SLICES)
        test_data.append([img_data])
        print(img_data.shape , '\tThis is unlabeled data')

# Each training record pairs a volume with a length-2 label, so the list is
# ragged and recent NumPy versions refuse to convert it implicitly. Store it in
# an explicit (n_patients, 2) object array: column 0 is the volume, column 1
# the label. Loading it back requires np.load(..., allow_pickle=True).
train_array = np.empty((len(train_data), 2), dtype=object)
for row, (volume, one_hot) in enumerate(train_data):
    train_array[row, 0] = volume
    train_array[row, 1] = one_hot

np.save('traindata-{}-{}-{}.npy'.format(IMG_PXL_SIZE,IMG_PXL_SIZE,HM_SLICES), train_array)
np.save('testdata-{}-{}-{}.npy'.format(IMG_PXL_SIZE,IMG_PXL_SIZE,HM_SLICES), test_data)
print('Finished processing')

0
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled data
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled data
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [0 1]
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled data
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled data
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled data
(20, 50, 50) [0 1]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled data
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) [1 0]
(20, 50, 50) 	This is unlabeled dat