In [50]:
import glob
import os
import re
import pandas as pd
import numpy as np
import SimpleITK as sitk
import matplotlib.pyplot as plt

import skimage.transform
import scipy.ndimage
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import binary_dilation, binary_opening
from skimage.filters import roberts, sobel
from skimage import measure, feature
from skimage.segmentation import clear_border
from skimage import data
from tqdm import tqdm

import scipy.misc
from mpl_toolkits.mplot3d.art3d import Poly3DCollection


# Directory that is globbed for per-patient scan arrays (files named PATIENT_SCANS + <uid> + '.npy').
DATA_PATH = '/kaggle/dev/data-science-bowl-2017-data/stage1_processed/'
# Destination for the raw chunk arrays: <uid>_X.npy (chunks) and <uid>_Y.npy (labels).
OUTPUT_FOLDER_ORIGINAL = '/kaggle_2/stage1_processed_chunks/'
# Destination for the same chunks after normalize() and zero_center() have been applied.
OUTPUT_FOLDER_NZ = '/kaggle_2/stage1_processed_chunks_nz/'
# Filename prefix of the input scans; the patient uid is whatever follows it.
PATIENT_SCANS = 'scan_segmented_lungs_fill_'
# Edge length of each cubic chunk cut from a scan.
CHUNK_SIZE = 64
# Number of columns in the all-zero label array Y saved next to each chunk array.
NUM_CLASSES = 7
# Despite the name this is a fraction, not a percentage: the stride between
# consecutive chunk origins is int(CHUNK_SIZE * (1 - OVERLAP_PERCENTAGE)), i.e. 25.
OVERLAP_PERCENTAGE = 0.6

In [51]:
def normalize(image, min_bound=-1000.0, max_bound=400.0):
    """Linearly rescale intensities from [min_bound, max_bound] to [0, 1].

    Values below ``min_bound`` map to 0 and values above ``max_bound`` map
    to 1. The input is never modified; a new floating-point result is
    returned.

    Parameters
    ----------
    image : numpy.ndarray or scalar
        Intensities to rescale. Integer input is promoted to float by the
        arithmetic below.
    min_bound : float, optional
        Intensity mapped to 0. Defaults to -1000.0, the value this notebook
        has always used.
    max_bound : float, optional
        Intensity mapped to 1. Defaults to 400.0.

    Returns
    -------
    Same container type as ``image`` with values clipped to [0, 1].

    Raises
    ------
    ValueError
        If ``max_bound`` is not strictly greater than ``min_bound`` (the
        rescale would divide by zero or invert the range).
    """
    if max_bound <= min_bound:
        raise ValueError(
            'max_bound ({}) must be greater than min_bound ({})'.format(max_bound, min_bound)
        )
    scaled = (image - min_bound) / (max_bound - min_bound)
    # np.clip replaces the two boolean-mask assignments and also works on
    # scalars, which do not support item assignment.
    return np.clip(scaled, 0.0, 1.0)

def zero_center(image):
    """Subtract the fixed mean offset (0.25) from every element.

    The subtraction produces a new object; ``image`` itself is left as it was.
    """
    mean_offset = 0.25
    return image - mean_offset

In [52]:
def _output_paths(patient_uid):
    """Return the four files written for one patient, in the order they are saved."""
    return [
        OUTPUT_FOLDER_ORIGINAL + patient_uid + '_X.npy',
        OUTPUT_FOLDER_ORIGINAL + patient_uid + '_Y.npy',
        OUTPUT_FOLDER_NZ + patient_uid + '_X.npy',
        OUTPUT_FOLDER_NZ + patient_uid + '_Y.npy',
    ]


def _save_npy_atomic(path, array):
    """Save `array` to `path` through a temp file so an interrupted write never leaves a truncated .npy."""
    tmp_path = path + '.tmp'
    # Passing an open handle stops np.save from appending '.npy' to the temp name.
    with open(tmp_path, 'wb') as handle:
        np.save(handle, array)
    os.replace(tmp_path, path)


def _chunk_scan(scans, chunk_size, step_size, pad_value=-1000):
    """Cut a 3D volume into overlapping cubes of edge `chunk_size`, padded with `pad_value` past the far edges.

    Chunks are written straight into one preallocated int16 array instead of
    being collected as float64 arrays in a list first, which cuts peak memory
    by several gigabytes for a typical scan.
    NOTE(review): assumes the scan values fit in int16 -- confirm against the
    preprocessing step that produced the input files.
    """
    if step_size < 1:
        raise ValueError('step_size must be >= 1, got {}'.format(step_size))

    # Same grid as before: dim // step + 1 origins per axis. When a dimension
    # is an exact multiple of step_size the last origin sits on the boundary
    # and yields an all-padding chunk; kept because downstream code may rely
    # on this exact chunk count and ordering.
    counts = [dim // step_size + 1 for dim in scans.shape]
    chunks = np.full(
        [counts[0] * counts[1] * counts[2], chunk_size, chunk_size, chunk_size],
        pad_value,
        dtype=np.int16,
    )

    index = 0
    for i in range(counts[0]):
        start_0 = i * step_size
        stop_0 = min(start_0 + chunk_size, scans.shape[0])
        for j in range(counts[1]):
            start_1 = j * step_size
            stop_1 = min(start_1 + chunk_size, scans.shape[1])
            for k in range(counts[2]):
                start_2 = k * step_size
                stop_2 = min(start_2 + chunk_size, scans.shape[2])

                piece = scans[start_0:stop_0, start_1:stop_1, start_2:stop_2]
                chunks[index, :piece.shape[0], :piece.shape[1], :piece.shape[2]] = piece
                index += 1
    return chunks


# A patient counts as completed only when all four outputs exist. Keying on
# the first file written (as before) made an interrupted run look finished
# even though later files were missing or incomplete.
completed_patients = []
for done_path in glob.glob(OUTPUT_FOLDER_ORIGINAL + '*_X.npy'):
    n = re.match(r'([a-f0-9].*)_X\.npy$', os.path.basename(done_path))
    if n is None:
        continue
    if all(os.path.exists(path) for path in _output_paths(n.group(1))):
        completed_patients.append(n.group(1))
completed_lookup = set(completed_patients)  # O(1) membership tests in the loop below

step_size = int((CHUNK_SIZE*(1-OVERLAP_PERCENTAGE)))
scan_name_pattern = re.compile(re.escape(PATIENT_SCANS) + r'([a-f0-9].*)\.npy$')

for scan_path in tqdm(glob.glob(DATA_PATH + PATIENT_SCANS + '*')):
    scan_match = scan_name_pattern.match(os.path.basename(scan_path))
    if scan_match is None:
        # The glob can pick up files that are not <prefix><uid>.npy; ignore them.
        continue
    patient_uid = scan_match.group(1)

    if patient_uid in completed_lookup:
        print('Skipping already processed patient {}'.format(patient_uid))
        continue

    # Load only after the skip check so completed patients cost no disk I/O.
    scans = np.load(scan_path)

    X = _chunk_scan(scans, CHUNK_SIZE, step_size)
    Y = np.zeros([X.shape[0], NUM_CLASSES], dtype=np.int16)

    original_X_path, original_Y_path, nz_X_path, nz_Y_path = _output_paths(patient_uid)
    _save_npy_atomic(original_X_path, X)
    _save_npy_atomic(original_Y_path, Y)

    print('processed patient:', patient_uid  , '_original shape:', scans.shape )
    print('_num_chunks:', X.shape[0], '_X.shape:', X.shape, '_Y.shape:', Y.shape)

    # Normalizing and Zero Centering
    X_nz = normalize(X)
    X_nz = zero_center(X_nz)
    _save_npy_atomic(nz_X_path, X_nz)
    _save_npy_atomic(nz_Y_path, Y)

    # Clearing memory
    del X,Y,X_nz



  0%|          | 0/1595 [00:00<?, ?it/s][A
  0%|          | 4/1595 [00:00<00:44, 35.89it/s][A

Skipping already processed patient 0a0c32c9e08cc2ea76a71649de56be6d
Skipping already processed patient 7577cbd6961b0cab27f88727dcd2d6d3
Skipping already processed patient 28352e12fe29361dfd9613ed2e729192
Skipping already processed patient 7852cb521d7029ca08133476054e7bec
Skipping already processed patient d5c43054ba0f66d5017a8ddfde8c8c34
Skipping already processed patient 174c5f7c33ca31443208ef873b9477e5
Skipping already processed patient d43c9dd1be361b9302c9343af09cc23e



  1%|          | 8/1595 [00:00<00:46, 34.37it/s][A

Skipping already processed patient bc43e8a2cb05a45e73dea8c7e02f2cc1





processed patient: e188bdeea72bb41d980dc2556dc8aafa _original shape: (324, 320, 320)
_num_chunks: 2197 _X.shape: (2197, 64, 64, 64) _Y.shape: (2197, 7)


  1%|          | 9/1595 [00:12<1:35:30,  3.61s/it]

processed patient: 49433c1588cc078b825a0eff1dc2e816 _original shape: (313, 300, 300)
_num_chunks: 2197 _X.shape: (2197, 64, 64, 64) _Y.shape: (2197, 7)


  1%|          | 10/1595 [00:26<3:01:23,  6.87s/it]

processed patient: 1acbe17dc8f9f59d2fd167b2aa6c650f _original shape: (322, 370, 370)
_num_chunks: 2925 _X.shape: (2925, 64, 64, 64) _Y.shape: (2925, 7)


  1%|          | 11/1595 [01:21<9:21:37, 21.27s/it]

processed patient: a19a122fe9a790576b57c6bd5cf9ff5c _original shape: (318, 326, 326)
_num_chunks: 2548 _X.shape: (2548, 64, 64, 64) _Y.shape: (2548, 7)


  1%|          | 12/1595 [02:07<12:34:06, 28.58s/it]

processed patient: 6fd582d25eeb2250c2b0996c4216deb9 _original shape: (312, 350, 350)
_num_chunks: 2925 _X.shape: (2925, 64, 64, 64) _Y.shape: (2925, 7)


KeyboardInterrupt: 