Purpose: To analyze the data structures at various stages of the pipeline

In [116]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import scipy.ndimage
import matplotlib.pyplot as plt
import glob
import re
from skimage import measure, morphology

In [97]:
# constants
# Every dataset path is derived from `data`, so relocating the dataset only
# requires editing that one line. Directory paths keep their trailing '/'
# because later cells build file names by plain string concatenation.

data = '/kaggle/dev/data-science-bowl-2017-data/'
stage1 = data + 'stage1/'                        # one sub-folder of .dcm files per patient
sample_images = data + 'sample_images/'          # one sub-folder per sample patient
labels = data + 'stage1_labels.csv'
stage1_processed = data + 'stage1_processed/'    # holds segment_lungs_fill_<id>.npy arrays
stage1_features = data + 'stage1_features_mx/'   # holds <id>.npy feature arrays
stage1_submission = data + 'stage1_sample_submission.csv'
# Lives outside the dataset root, so it stays a standalone absolute path.
naive_submission = '/kaggle/dev/jovan/data-science-bowl-2017/data-science-bowl-2017/submissions/naive_submission.csv'

### Raw data: stage1

In [98]:
# The entry names directly under sample_images are used as the patient IDs;
# later cells filter their output against this list.
sample_ids = os.listdir(sample_images)

for patient_id in sample_ids:
    scan_count = len(os.listdir(sample_images + patient_id))
    print("Patient '{}' has {} scans".format(patient_id, scan_count))

print('----')
# The totals below are taken from the stage1 directory, not from sample_images.
patient_total = len(os.listdir(stage1))
dcm_total = len(glob.glob(stage1 + '*/*.dcm'))
print('Total patients {} Total DCM files {}'.format(patient_total, dcm_total))

Patient '0c98fcb55e3f36d0c2b6507f62f4c5f1' has 180 scans
Patient '0d19f1c627df49eb223771c28548350e' has 183 scans
Patient '0c37613214faddf8701ca41e6d43f56e' has 164 scans
Patient '0d2fcf787026fece4e57be167d079383' has 126 scans
Patient '0a38e7597ca26f9374f8ea2770ba870d' has 110 scans
Patient '00cba091fa4ad62cc3200a657aeb957e' has 134 scans
Patient '0d941a3ad6c889ac451caf89c46cb92a' has 177 scans
Patient '0a0c32c9e08cc2ea76a71649de56be6d' has 133 scans
Patient '0c60f4b87afcb3e2dfa65abbbf3ef2f9' has 136 scans
Patient '0c9d8314f9c69840e25febabb1229fa4' has 221 scans
Patient '0ca943d821204ceb089510f836a367fd' has 147 scans
Patient '0b20184e0cd497028bdd155d9fb42dc9' has 196 scans
Patient '0c0de3749d4fe175b7a5098b060982a1' has 123 scans
Patient '0c59313f52304e25d5a7dcf9877633b1' has 244 scans
Patient '0bd0e3056cbf23a1cb7f0f0b18446068' has 280 scans
Patient '0de72529c30fe642bc60dcb75c87f6bd' has 113 scans
Patient '0ddeb08e9c97227853422bd71a2a695e' has 171 scans
Patient '0a099f2549429d29b32f34

### Processed data: stage1_processed

In [118]:
# Print the array shape of every segment_lungs_fill_<id>.npy file in
# stage1_processed whose <id> is one of the sample patients.
for path in glob.glob(stage1_processed + 'segment_lungs_fill_*'):
    # Anchored pattern: the ID is a run of hex characters and the '.' of the
    # extension is escaped, so only "<prefix><hex id>.npy" names match.
    m = re.match(r'segment_lungs_fill_([a-f0-9]+)\.npy$', os.path.basename(path))
    # The glob is broader than the regex; skip non-matching names instead of
    # raising AttributeError on m.group() of a failed match.
    if m is None or m.group(1) not in sample_ids:
        continue
    # `path` is already the full file path returned by glob.
    x = np.load(path)
    print('Patient {}: array shape: {}'.format(m.group(1), x.shape))

Patient 0d06d764d3c07572074d468b4cff954f: array shape: (304, 424, 424)
Patient 0a38e7597ca26f9374f8ea2770ba870d: array shape: (275, 320, 320)
Patient 0c0de3749d4fe175b7a5098b060982a1: array shape: (308, 355, 355)
Patient 0c60f4b87afcb3e2dfa65abbbf3ef2f9: array shape: (272, 330, 330)
Patient 00cba091fa4ad62cc3200a657aeb957e: array shape: (335, 306, 306)
Patient 0ddeb08e9c97227853422bd71a2a695e: array shape: (342, 392, 392)
Patient 0c59313f52304e25d5a7dcf9877633b1: array shape: (305, 390, 390)
Patient 0c9d8314f9c69840e25febabb1229fa4: array shape: (398, 309, 309)
Patient 0d2fcf787026fece4e57be167d079383: array shape: (315, 310, 310)
Patient 0a0c32c9e08cc2ea76a71649de56be6d: array shape: (332, 340, 340)
Patient 0bd0e3056cbf23a1cb7f0f0b18446068: array shape: (350, 340, 340)
Patient 0d941a3ad6c889ac451caf89c46cb92a: array shape: (354, 259, 259)
Patient 0de72529c30fe642bc60dcb75c87f6bd: array shape: (282, 308, 308)
Patient 0acbebb8d463b4b9ca88cf38431aac69: array shape: (365, 279, 279)
Patien

### Feature extraction: stage1_features_mx

In [127]:
# Print the array shape of every <id>.npy feature file in stage1_features
# whose <id> is one of the sample patients.
for path in glob.glob(stage1_features + '*.npy'):
    # Anchored pattern: the ID is a run of hex characters and the '.' of the
    # extension is escaped, so only "<hex id>.npy" names match.
    m = re.match(r'([a-f0-9]+)\.npy$', os.path.basename(path))
    # Skip any .npy file whose name is not a hex ID instead of raising
    # AttributeError on m.group() of a failed match.
    if m is None or m.group(1) not in sample_ids:
        continue
    # `path` is already the full file path returned by glob.
    x = np.load(path)
    print('Patient {}: array shape: {}'.format(m.group(1), x.shape))

Patient 0c60f4b87afcb3e2dfa65abbbf3ef2f9: array shape: (90, 2048)
Patient 0a0c32c9e08cc2ea76a71649de56be6d: array shape: (110, 2048)
Patient 0de72529c30fe642bc60dcb75c87f6bd: array shape: (93, 2048)
Patient 0bd0e3056cbf23a1cb7f0f0b18446068: array shape: (116, 2048)
Patient 0c59313f52304e25d5a7dcf9877633b1: array shape: (101, 2048)
Patient 0d941a3ad6c889ac451caf89c46cb92a: array shape: (117, 2048)
Patient 00cba091fa4ad62cc3200a657aeb957e: array shape: (111, 2048)
Patient 0d06d764d3c07572074d468b4cff954f: array shape: (101, 2048)
Patient 0c0de3749d4fe175b7a5098b060982a1: array shape: (102, 2048)
Patient 0a099f2549429d29b32f349e95fb2244: array shape: (106, 2048)
Patient 0d19f1c627df49eb223771c28548350e: array shape: (121, 2048)
Patient 0c37613214faddf8701ca41e6d43f56e: array shape: (109, 2048)
Patient 0ddeb08e9c97227853422bd71a2a695e: array shape: (113, 2048)
Patient 0c98fcb55e3f36d0c2b6507f62f4c5f1: array shape: (119, 2048)
Patient 0acbebb8d463b4b9ca88cf38431aac69: array shape: (121, 204

### Training: make_submit

In [129]:
# Build the training matrix: one row per sample patient, each row being the
# mean over axis 0 of that patient's feature array.
# The rows are collected in a single comprehension; assigning `x` inside a
# loop would keep only the last patient's row.
x = np.array([
    np.mean(np.load(stage1_features + s + ".npy"), axis=0)
    for s in sample_ids
])
print(x)

[[ 0.45521843  2.23148155  1.03768289 ...,  1.10248542  1.19762778
   3.69490266]
 [ 0.48257342  2.60997295  0.93810481 ...,  0.84844297  1.1022824
   3.64931464]
 [ 0.50405788  2.7404685   1.04102635 ...,  0.831429    1.31895638
   3.73132658]
 ..., 
 [ 0.53901482  2.38087225  1.0558393  ...,  0.87788266  1.07262886
   3.87326431]
 [ 0.28989959  1.26477551  0.20334616 ...,  0.30925664  0.82562715
   3.22417712]
 [ 0.36895481  2.8777771   1.10733223 ...,  1.09140313  1.24432278
   3.95053434]]
