In [1]:
import dicom
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cv2
import math
import scipy.ndimage

from skimage import measure, morphology
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
% matplotlib inline

data_dir = os.getcwd() + '\\sample_images\\'
patients = os.listdir(data_dir)
labels_df = pd.read_csv('stage1_labels.csv',index_col = 0)

print(data_dir)
print(patients)
labels_df.head()

C:\Users\azkei\Desktop\lung_cancer_data\sample_images\
['00cba091fa4ad62cc3200a657aeb957e', '0a099f2549429d29b32f349e95fb2244', '0a0c32c9e08cc2ea76a71649de56be6d', '0a38e7597ca26f9374f8ea2770ba870d', '0acbebb8d463b4b9ca88cf38431aac69', '0bd0e3056cbf23a1cb7f0f0b18446068', '0c0de3749d4fe175b7a5098b060982a1', '0c37613214faddf8701ca41e6d43f56e', '0c59313f52304e25d5a7dcf9877633b1', '0c60f4b87afcb3e2dfa65abbbf3ef2f9', '0c98fcb55e3f36d0c2b6507f62f4c5f1', '0c9d8314f9c69840e25febabb1229fa4', '0ca943d821204ceb089510f836a367fd', '0d06d764d3c07572074d468b4cff954f', '0d19f1c627df49eb223771c28548350e', '0d2fcf787026fece4e57be167d079383', '0d941a3ad6c889ac451caf89c46cb92a', '0ddeb08e9c97227853422bd71a2a695e', '0de72529c30fe642bc60dcb75c87f6bd']


Unnamed: 0_level_0,cancer
id,Unnamed: 1_level_1
0015ceb851d7251b8f399e39779d1e7d,1
0030a160d58723ff36d73f41b170ec21,0
003f41c78e6acfa92430a057ac0b306e,0
006b96310a37b36cccb2ab48d10b49a3,1
008464bb8521d09a42985dd8add3d0d2,1


In [2]:
# Iterate through 5 patients in sample_image - checking out attributes
for patient in patients[:5]:
    label = labels_df.get_value(patient,'cancer')
    path = data_dir + patient
    # using dicom to read the dicom files using full patient path
    slices = [dicom.read_file(path+ '/' + s) for s in os.listdir(path)]
    # sorting dicom files using lambda function
    # x - referred to dicom files
    slices.sort(key = lambda x: int(x.ImagePositionPatient[2]))
    print(len(slices), slices[0].pixel_array.shape)

134 (512, 512)
128 (512, 512)
133 (512, 512)
110 (512, 512)
203 (512, 512)


<p>IMPORTANT NOTE: Not all images are the same size.......depth isnt the same and width, height are too large for CNN </p>

In [3]:
# No. of Instances in the sample_image
len(patients)

19

In [4]:
# Pre processing data
IMG_PX_SIZE = 50
HM_SLICES = 20

#def chunks(l, n):
    # Credit: Ned Batchelder
    # Link: http://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
 #   """Yield successive n-sized chunks from l."""
  #  for i in range(0, len(l), n):
   #     yield l[i:i + n]
        
def chunks(l, n): 
    for i in range(0, len(l), int(n)): 
        yield l[i:i + int(n)]

def mean(l):
    return sum(l)/len(l)

# Creating a 3D model of the images
def plot_3d(image, threshold=-300):
    
    # Position the scan upright, 
    # so the head of the patient would be at the top facing the camera
    p = image.transpose(2,1,0)
    
    verts, faces = measure.marching_cubes(p, threshold)

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')

    # Fancy indexing: `verts[faces]` to generate a collection of triangles
    mesh = Poly3DCollection(verts[faces], alpha=0.70)
    face_color = [0.45, 0.45, 0.75]
    mesh.set_facecolor(face_color)
    ax.add_collection3d(mesh)

    ax.set_xlim(0, p.shape[0])
    ax.set_ylim(0, p.shape[1])
    ax.set_zlim(0, p.shape[2])

    plt.show()


def process_data(patient, labels_df, img_px_size = 50, hm_slices=23, visualize = False):
    try:
        label = labels_df.get_value(patient, 'cancer')
        path = data_dir + patient
        slices = [dicom.read_file(path + '/' + s) for s in os.listdir(path)]
        slices.sort(key = lambda x: int(x.ImagePositionPatient[2]))
        new_slices = []

        slices = [cv2.resize(np.array(each_slice.pixel_array),(IMG_PX_SIZE,IMG_PX_SIZE)) for each_slice in slices]

        chunk_sizes = math.ceil(len(slices) / HM_SLICES)

        for slice_chunk in chunks(slices, chunk_sizes):
            slice_chunk = list(map(mean, zip(*slice_chunk)))
            new_slices.append(slice_chunk)

        if len(new_slices) == HM_SLICES-1:
            new_slices.append(new_slices[-1])

        if len(new_slices) == HM_SLICES-2:
            new_slices.append(new_slices[-1])
            new_slices.append(new_slices[-1])

        if len(new_slices) == HM_SLICES+2:
            new_val = list(map(mean, zip(*[new_slices[HM_SLICES-1],new_slices[HM_SLICES],])))
            del new_slices[HM_SLICES]
            new_slices[HM_SLICES-1] = new_val

        if len(new_slices) == HM_SLICES+1:
            new_val = list(map(mean, zip(*[new_slices[HM_SLICES-1],new_slices[HM_SLICES],])))
            del new_slices[HM_SLICES]
            new_slices[HM_SLICES-1] = new_val
        #print(len(slices), len(new_slices))
        
        if visualize:
            fig = plt.figure()
            for num, each_slice in enumerate(slices[:12]):
                y = fig.add_subplot(3,4,num+1)
                y.imshow(each_slice)
            plt.show()
            
        if label == 1:
            label = np.array([0,1])
        elif label == 1:
            label = np.array([0,1])
        
        return np.array(new_slices),label
    
    except Exception as e:
        # again, some patients are not labeled, but JIC we still want the error if something
        # else is wrong with our code
        print("blah",str(e))
        
        
much_data = []
for num, patient in enumerate(patients):
    if num%100 == 0:
        print(num)
    try:
        print("Num, Patient",num,patient)
        img_data, label = process_data(patient,labels_df,img_px_size=IMG_PX_SIZE, hm_slices=HM_SLICES)
        print("Image Data:",img_data, "Label:",label)
        much_data.append([img_data, label])
        
    except KeyError as e:
        print("This is unlabeled data")
        pass
    
np.save('muchdata--{}--{}--{}.npy'.format(IMG_PX_SIZE,IMG_PX_SIZE,HM_SLICES),much_data)

0
Num, Patient 0 00cba091fa4ad62cc3200a657aeb957e
Image Data: [[[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. 

Image Data: [[[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 ..., 
 [[-2000. -2000. -2000. ..., -200

Image Data: [[[  15.22222222   41.88888889   22.11111111 ...,   41.66666667
     23.33333333   29.88888889]
  [  20.88888889   29.22222222   20.22222222 ...,   18.55555556
     29.88888889   28.22222222]
  [  29.           38.           16.66666667 ...,   23.55555556
     27.77777778   30.55555556]
  ..., 
  [  97.55555556  132.22222222  119.         ...,  135.11111111
    109.77777778  135.88888889]
  [ 141.66666667  115.88888889   72.         ...,  111.44444444
     96.44444444  100.        ]
  [ 114.22222222  104.33333333  128.33333333 ...,  127.66666667
    107.88888889   99.        ]]

 [[  29.66666667   20.           21.88888889 ...,   17.55555556
     24.11111111   42.11111111]
  [  28.55555556   27.33333333   38.22222222 ...,   35.88888889
     19.22222222   25.66666667]
  [  17.77777778   24.44444444   23.77777778 ...,   42.11111111   19.
     19.33333333]
  ..., 
  [ 125.22222222  121.88888889  108.22222222 ...,  120.11111111
    120.66666667  113.66666667]
  [ 117.88888889  

Image Data: [[[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 ..., 
 [[-2000. -2000. -2000. ..., -200

Image Data: [[[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 [[-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  ..., 
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]
  [-2000. -2000. -2000. ..., -2000. -2000. -2000.]]

 ..., 
 [[-2000. -2000. -2000. ..., -200

<p> Each of this instance is a patient </p>

In [16]:
import tensorflow as tf
import numpy as np



save_dir = 'models/'

save_path = os.path.join(save_dir, 'best_validation')


IMG_SIZE_PX = 50
SLICE_COUNT = 20

n_classes = 2
batch_size = 10

x = tf.placeholder('float')
y = tf.placeholder('float')


keep_rate = 0.8
keep_prob = tf.placeholder(tf.float32)


def conv3d(x, W):
    return tf.nn.conv3d(x, W, strides=[1,1,1,1,1], padding='SAME')

def maxpool3d(x):
    #                        size of window         movement of window as you slide about
    return tf.nn.max_pool3d(x, ksize=[1,2,2,2,1], strides=[1,2,2,2,1], padding='SAME')


def train_neural_network(x):
    #                # 5 x 5 x 5 patches, 1 channel, 32 features to compute.
    weights = {'W_conv1':tf.Variable(tf.random_normal([3,3,3,1,32])),
               #       5 x 5 x 5 patches, 32 channels, 64 features to compute.
               'W_conv2':tf.Variable(tf.random_normal([3,3,3,32,64])),
               #                                  64 features
               'W_fc':tf.Variable(tf.random_normal([54080,1024])),
               'out':tf.Variable(tf.random_normal([1024, n_classes]))}

    biases = {'b_conv1':tf.Variable(tf.random_normal([32])),
               'b_conv2':tf.Variable(tf.random_normal([64])),
               'b_fc':tf.Variable(tf.random_normal([1024])),
               'out':tf.Variable(tf.random_normal([n_classes]))}

    #                            image X      image Y        image Z
    x = tf.reshape(x, shape=[-1, IMG_SIZE_PX, IMG_SIZE_PX, SLICE_COUNT, 1])

    conv1 = tf.nn.relu(conv3d(x, weights['W_conv1']) + biases['b_conv1'])
    conv1 = maxpool3d(conv1)
    

    conv2 = tf.nn.relu(conv3d(conv1, weights['W_conv2']) + biases['b_conv2'])
    conv2 = maxpool3d(conv2)

    fc = tf.reshape(conv2,[-1, 54080])
    fc = tf.nn.relu(tf.matmul(fc, weights['W_fc'])+biases['b_fc'])
    fc = tf.nn.dropout(fc, keep_rate)

    prediction = tf.matmul(fc, weights['out'])+biases['out']
    print ("Pred",prediction)
    much_data = np.load('muchdata--50--50--20.npy')
    train_data = much_data
    #validation_data = much_data[-2:]
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y) )
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
    hm_epochs = 5
    
    with tf.Session() as sess:
        sess.run(init)
        success_runs = 0
        total_runs = 0
        
        for epoch in range(hm_epochs):
            epoch_loss = 0
            for data in train_data:
                total_runs +=1
                try:
                    X = data[0]
                    Y = data[1]
                    _, c = sess.run([optimizer, cost], feed_dict={x: X, y: Y})
                    epoch_loss += c
                    success_runs += 1
                except Exception as e:
                    pass
            print('Epoch', epoch, 'completed out of',hm_epochs,'loss:',epoch_loss)

            correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
            accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        
        # Save all variables of the TensorFlow graph to file.
        saver.save(sess=sess, save_path=save_path)
        print('Done. Finishing accuracy:')
        #print('Accuracy:',accuracy.eval({x:[i[0] for i in validation_data], y:[i[1] for i in validation_data]}))
        
        #print('fitment percent:',successful_runs/total_runs)

train_neural_network(x)

Epoch 0 completed out of 10 loss: 0
Epoch 1 completed out of 10 loss: 0
Epoch 2 completed out of 10 loss: 0
Epoch 3 completed out of 10 loss: 0
Epoch 4 completed out of 10 loss: 0
Epoch 5 completed out of 10 loss: 0
Epoch 6 completed out of 10 loss: 0
Epoch 7 completed out of 10 loss: 0
Epoch 8 completed out of 10 loss: 0
Epoch 9 completed out of 10 loss: 0
Done. Finishing accuracy:


In [15]:
much_data = np.load('muchdata--50--50--20.npy')

instances = 0

for data in much_data:
    instances = instances + 1
    X = data[0]
    Y = data[1]
   # print(X,Y)
    #print("There are",instances)