In [43]:
# Module imports
import os
import glob
import numpy as np
import pandas as pd
import dicom
import matplotlib.pyplot as plt
from scipy import stats
import math
%matplotlib inline


In [46]:
# Constants
# Root folder of the competition data. The trailing slash is required because
# other cells build paths from these constants by plain string concatenation.
data_folder = '/kaggle/dev/data-science-bowl-2017-data/'
# Stage-1 scans live under the root; derive the path from data_folder so the
# two constants cannot drift apart when the data is moved.
train_data_folder = data_folder + 'stage1/'

In [47]:
# Validating if files exist

def verify_location(loc):
    """Check that ``loc`` is an existing directory or regular file.

    Parameters
    ----------
    loc : str
        Filesystem path to check.

    Returns
    -------
    str
        ``loc`` unchanged, so the call can be used inline where a path is
        expected.

    Raises
    ------
    FileNotFoundError
        If ``loc`` is neither a directory nor a file. This is a subclass of
        ``Exception``, so handlers written for the previous generic
        ``Exception`` keep working.
    """
    # Guard clause: fail early with a specific exception type.
    if not (os.path.isdir(loc) or os.path.isfile(loc)):
        raise FileNotFoundError('Failed to verify location: ' + loc)
    print('Found and verified location: ' + loc)
    return loc
    
# Fail fast: verify_location raises if the training folder is neither a
# directory nor a file. As the cell's last expression, the returned path is
# also echoed as the cell output.
verify_location(train_data_folder)


Found and verified location: /kaggle/dev/data-science-bowl-2017-data/stage1/


'/kaggle/dev/data-science-bowl-2017-data/stage1/'

In [48]:
# Extracting patient scan data
def folder_explorer(folder):
    """Count the entries inside every sub-directory of ``folder``.

    Parameters
    ----------
    folder : str
        Directory whose immediate sub-directories are inspected. It may be
        given with or without a trailing path separator.

    Returns
    -------
    dict
        Maps each sub-directory name to the number of entries
        (``len(os.listdir(...))``) it contains.

    Raises
    ------
    OSError
        If ``folder`` itself cannot be listed.
    """
    patient_info = {}
    for entry in os.listdir(folder):
        # os.path.join inserts the separator only when needed, so the result
        # no longer depends on ``folder`` ending with a slash.
        entry_path = os.path.join(folder, entry)
        # Stray files next to the sub-directories would make os.listdir raise
        # NotADirectoryError; they are not countable folders, so skip them.
        if not os.path.isdir(entry_path):
            continue
        patient_info[entry] = len(os.listdir(entry_path))
    return patient_info

# Train Data
# folder_explorer returns a {folder name: entry count} dict for the training
# folder; its items become the rows of a two-column table.
df_train_data_freq = folder_explorer(train_data_folder)
df_train_data = pd.DataFrame(
    list(df_train_data_freq.items()),
    columns=["id", "scans-per-patient"],
)
print(df_train_data)


                                    id  scans-per-patient
0     d9fb9617188fe99bdb464c126d2bd8c0                104
1     8a2de07f6e9dbb8c6e4bfad7e83b3f0a                123
2     e5cf847e616cc2fe94816ffa547d2614                153
3     0a0c32c9e08cc2ea76a71649de56be6d                133
4     605d3633c1625b4be151d38aad43de94                213
5     1098cb63ea33f752a850929234576bcb                158
6     d5c43054ba0f66d5017a8ddfde8c8c34                159
7     85ab88f093ca53a4fab5654e24c77ebe                188
8     f7a03adba817f2a2249b9dee0586f4be                139
9     bb4b43d0dc4d9d2b61150df6556f6490                201
10    fd0c2dfe0b0c58330675c3191cef0d5b                157
11    de881c07adc8d53e52391fac066ccb9f                137
12    19409b302d6c143d4f754146e91d4cfe                140
13    e6160ed0ff2eb214abd4df9a3c336c1d                411
14    9ca18e68b6b8d9c3112b4b69b7d6fad5                104
15    bedec8e1ad130a08faeec8ed81780d56                170
16    8b9a2837

In [73]:
def get_slice_location(dcm):
    """Return the value of element (0020,1041) of ``dcm`` converted to float.

    ``dcm`` is indexed with the (group, element) pair, and the ``value``
    attribute of the element found there is passed through ``float``.
    """
    slice_location_tag = (0x0020, 0x1041)
    element = dcm[slice_location_tag]
    return float(element.value)

# Returns a list of images for that patient_id, in ascending order of Slice Location
def load_patient(patient_id):
    """Read every ``*.dcm`` file of one patient and return the pixel arrays.

    Files are looked up under ``train_data_folder + patient_id + '/'``. In
    each pixel array the value -2000 is replaced by 0 in place (presumably a
    marker for pixels outside the scanned area; confirm against the data
    description).

    Returns a list of pixel arrays ordered by ascending slice location, as
    reported by ``get_slice_location``.

    NOTE(review): images are collected in a dict keyed by slice location, so
    two files reporting the same location overwrite each other and only one
    of them is returned.
    """
    pattern = train_data_folder + '{}/*.dcm'.format(patient_id)
    slices_by_location = {}
    for dcm_path in glob.glob(pattern):
        dataset = dicom.read_file(dcm_path)
        pixels = dataset.pixel_array
        pixels[pixels == -2000] = 0
        slices_by_location[get_slice_location(dataset)] = pixels

    # Iterating over the sorted keys yields the images in ascending location order.
    return [slices_by_location[location] for location in sorted(slices_by_location)]

# np_pat = np.array(pat)
# indices = range(0, len(pat), 10)

# print(np_pat[indices].shape[0])

# f, plots = plt.subplots(np_pat[indices].shape[0], 1, sharex='all', sharey='all', figsize=(10, 11))

# for i in range(np_pat[indices].shape[0]):
#     plots[i].axis('off')
#     plots[i].imshow(np_pat[i], cmap=plt.cm.bone)

In [81]:
def get_patient_data(patient_ids, step=10):
    """Load a down-sampled stack of slices for each requested patient.

    Parameters
    ----------
    patient_ids : iterable of str
        Patient ids, passed one by one to ``load_patient``.
    step : int, optional
        Keep every ``step``-th slice of each patient, starting with the
        first one. Defaults to 10, the stride that used to be hard-coded.

    Returns
    -------
    dict
        Maps each patient id to a numpy array built from the kept slices, in
        the order in which ``load_patient`` returned them.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError('step must be a positive integer, got {!r}'.format(step))

    patient_data = {}
    for patient_id in patient_ids:
        slices = load_patient(patient_id)
        # Down-sample the list first and stack only the kept slices; this
        # avoids building the full 3-D array just to discard most of it.
        patient_data[patient_id] = np.array(slices[::step])
    return patient_data

# Load the sampled slices for a hand-picked list of 30 patient ids. The call is
# the cell's last expression, so the returned dict (patient id -> array) is
# echoed in full as the cell output.
get_patient_data(['d9fb9617188fe99bdb464c126d2bd8c0',
'8a2de07f6e9dbb8c6e4bfad7e83b3f0a',
'e5cf847e616cc2fe94816ffa547d2614',
'0a0c32c9e08cc2ea76a71649de56be6d',
'605d3633c1625b4be151d38aad43de94',
'1098cb63ea33f752a850929234576bcb',
'd5c43054ba0f66d5017a8ddfde8c8c34',
'85ab88f093ca53a4fab5654e24c77ebe',
'f7a03adba817f2a2249b9dee0586f4be',
'bb4b43d0dc4d9d2b61150df6556f6490',
'fd0c2dfe0b0c58330675c3191cef0d5b',
'de881c07adc8d53e52391fac066ccb9f',
'19409b302d6c143d4f754146e91d4cfe',
'e6160ed0ff2eb214abd4df9a3c336c1d',
'9ca18e68b6b8d9c3112b4b69b7d6fad5',
'bedec8e1ad130a08faeec8ed81780d56',
'8b9a28375988de6ea0b143d48b4a8dc9',
'00edff4f51a893d80dae2d42a7f45ad1',
'43fecc8947e4fbb47968dc8ef7d8f4ec',
'494c42cb61c1e4a02504c16fe09a8129',
'184c61740244f4ce8fb985af9bb3d8e8',
'e3423505ef6b43f03c5d7bde52a5a78c',
'43933b4021d93dd64854f318656c7d1e',
'd2a17180c72ce7e5e9cb3870ba7991d2',
'8bb7dd5fbfa5ecb95552d9c587f2fea5',
'e1f3a01e73d706b7e9c30c0a17a4c0b5',
'839502f9ff68fd778b435255690f3061',
'b84c43bed6c51182d7536619b747343a',
'9e922147900b3984c9345bdda573e882',
'21d449f3ae00ea302e5aa15d7df65465'])


{'00edff4f51a893d80dae2d42a7f45ad1': array([[[-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         ..., 
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024]],
 
        [[-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         ..., 
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024]],
 
        [[-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         [-1024, -1024, -1024, ..., -1024, -1024, -1024],
         ..., 
         [-1024, -1024, -1024, ..., -1024, 