In [27]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dicom
import os
from scipy import ndimage as ndi
import matplotlib.pyplot as plt
import glob
import re
#import xmltodict
import pickle
import untangle
import uuid
from tqdm import tqdm
from decimal import Decimal
from skimage import measure, morphology
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import binary_dilation, binary_opening
from skimage.segmentation import clear_border
from skimage.filters import roberts, sobel

from mpl_toolkits.mplot3d.art3d import Poly3DCollection

# Root data directory; the pickled patient maps and weird_chunks.pkl are read from here.
DATA_PATH = '/kaggle_2/lidc_idri/data/'
# Not referenced by the cells visible in this notebook -- presumably the LIDC XML annotations; verify.
DATA_PATH_XML = '/kaggle_2/lidc_idri/data/tcia-lidc-xml/'
# Not referenced by the cells visible in this notebook -- presumably the raw DICOM tree; verify.
DATA_PATH_SCANS = '/kaggle_2/lidc_idri/data/LIDC/DOI/'
# Directory holding the per-patient "scan_<patient_id>.npy" volumes loaded by the chunking loop.
DATA_PATH_POST_PROCESSED_SCANS = '/kaggle_2/lidc_idri/data/scans_resampled_unsegmented/'
# Output directory for the per-patient "<patient_id>_X.npy" / "<patient_id>_Y.npy" files.
DATA_PATH_NODULES = '/kaggle_2/lidc_idri/data/nodules_chunked/'
# Edge length (in voxels of the resampled scan) of the cubic chunk cut around each nodule.
CHUNK_SIZE = 32

In [29]:
# Load the three pickled, patient-keyed maps from DATA_PATH.

# patient id -> scan info (the chunking loop reads its 'scans' entry)
with open(DATA_PATH + 'patient_scans_map.pkl', 'rb') as scans_map_file:
    patient_scans_map = pickle.load(scans_map_file)

# patient id -> nodules
with open(DATA_PATH + 'patient_nodules_map.pkl', 'rb') as nodules_map_file:
    patient_nodules_map = pickle.load(nodules_map_file)

# patient id -> deduplicated nodules (the map the chunking loop iterates over)
with open(DATA_PATH + 'patient_deduped_nodules_map.pkl', 'rb') as deduped_map_file:
    patient_deduped_nodules_map = pickle.load(deduped_map_file)

In [30]:
# Print the number of patients in each map, one count per line
# (scans, nodules, deduplicated nodules -- in that order).
for patient_map in (patient_scans_map, patient_nodules_map, patient_deduped_nodules_map):
    print(len(patient_map))

1018
998
998


In [31]:
# Load the DICOM slices found at the given file paths
def load_scan(paths):
    """Read DICOM slices, order them along z, and derive origin and spacing.

    Parameters
    ----------
    paths : iterable of str
        Paths of the DICOM files of one scan; each is read with
        ``dicom.read_file``.

    Returns
    -------
    slices : list
        The datasets sorted by ascending ``ImagePositionPatient[2]``. Every
        dataset has its ``SliceThickness`` overwritten with the distance
        computed between the first two sorted slices.
    origin : numpy.ndarray (float32)
        ``ImagePositionPatient`` of the first sorted slice, reversed.
    spacing : numpy.ndarray (float32)
        ``[slice_thickness] + PixelSpacing`` of the first sorted slice.

    Raises
    ------
    IndexError
        If fewer than two paths are given (the slice distance needs two slices).
    """
    slices = [dicom.read_file(path) for path in paths]
    # Sort on the full floating-point z position. Truncating to int gives
    # slices closer than 1 mm the same key, leaving them in input order.
    slices.sort(key=lambda s: float(s.ImagePositionPatient[2]))

    first, second = slices[0], slices[1]
    try:
        slice_thickness = np.abs(first.ImagePositionPatient[2] - second.ImagePositionPatient[2])
    except (AttributeError, TypeError):
        # Fall back to SliceLocation when the position cannot be used.
        slice_thickness = np.abs(first.SliceLocation - second.SliceLocation)

    for s in slices:
        s.SliceThickness = slice_thickness

    origin = np.array(list(reversed(first.ImagePositionPatient)), dtype=np.float32)

    # Determine current pixel spacing; list() so the concatenation does not
    # depend on PixelSpacing being a list subclass.
    spacing = np.array([slice_thickness] + list(first.PixelSpacing), dtype=np.float32)

    return slices, origin, spacing

def world_2_voxel(world_coordinates, origin, spacing):
    """Convert world coordinates into (fractional) voxel coordinates.

    The absolute offset of ``world_coordinates`` from ``origin`` is divided
    element-wise by ``spacing``; the result is not rounded.
    """
    offset_from_origin = np.abs(world_coordinates - origin)
    return offset_from_origin / spacing

In [32]:
# Cut a CHUNK_SIZE^3 cube out of each patient's resampled scan around every
# deduplicated nodule and save the cubes (X) and malignancy labels (Y) per patient.
# Nodules whose crop window is inverted after clamping to the scan bounds are
# recorded in `weird_chunks` and skipped.
weird_chunks = {}
weird_chunk_count = 0
RESIZE_SPACING = [1,1,1]
items = list(patient_deduped_nodules_map.items())
half_chunk = CHUNK_SIZE // 2
for patient_id, nodules in tqdm(items):
    # sorted() rather than .sort(): do not reorder the list stored in patient_scans_map.
    patient_scan_files = sorted(patient_scans_map[patient_id]['scans'])
    # Only `origin` is used below; the DICOM slices and native spacing are not.
    scan, origin, spacing = load_scan(patient_scan_files)
    scan_resampled = np.load(DATA_PATH_POST_PROCESSED_SCANS + "scan_%s.npy" % (patient_id))
    depth, height, width = scan_resampled.shape

    # Uninitialised buffers sized for the maximum number of chunks; only the
    # first `count` rows are ever filled and only those rows are saved.
    X = np.empty((len(nodules), CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE), dtype=np.float32)
    Y = np.empty((len(nodules), 1), dtype=np.float32)
    count = 0
    for nodule in nodules:
        coords = nodule['coords']
        coords_np = np.empty((len(coords), 4), dtype=np.float32)
        for row, coord in enumerate(coords):
            coords_np[row] = coord

        # Columns 0, 1, 2 are treated as Z, Y, X; the 4th column is unused here.
        minZ = np.amin(coords_np[:, 0], axis=0)
        maxZ = np.amax(coords_np[:, 0], axis=0)
        minY = np.amin(coords_np[:, 1], axis=0)
        maxY = np.amax(coords_np[:, 1], axis=0)
        minX = np.amin(coords_np[:, 2], axis=0)
        maxX = np.amax(coords_np[:, 2], axis=0)

        # Centre of the nodule's bounding box, truncated to int.
        centerZ = int((minZ + maxZ) / 2.0)
        centerY = int((minY + maxY) / 2.0)
        centerX = int((minX + maxX) / 2.0)

        imageCoord = np.array((centerZ, centerY, centerX))
        imageCoord = world_2_voxel(imageCoord, origin, RESIZE_SPACING)

        # NOTE(review): imageCoord is built as (Z, Y, X), yet index 2 drives the
        # Z window and index 0 the X window below. This looks like an axis swap
        # but is kept as in the original -- confirm against the coords schema.
        Z1 = int(imageCoord[2]) - half_chunk
        Z2 = int(imageCoord[2]) + half_chunk
        Y1 = int(imageCoord[1]) - half_chunk
        Y2 = int(imageCoord[1]) + half_chunk
        X1 = int(imageCoord[0]) - half_chunk
        X2 = int(imageCoord[0]) + half_chunk

        # Clamp the window to the volume.
        X1 = max(X1, 0)
        Y1 = max(Y1, 0)
        Z1 = max(Z1, 0)

        X2 = min(X2, width)
        Y2 = min(Y2, height)
        Z2 = min(Z2, depth)

        # After clamping the upper bounds cannot exceed the shape, so the only
        # remaining failure mode is a window that starts beyond the volume.
        if Z2 < Z1 or Y2 < Y1 or X2 < X1:
            print('Found weird chunk!')
            weird_chunks.setdefault(patient_id, []).append({'shape': scan_resampled.shape, 'center': [centerZ, centerY, centerX], 'chunk_coords': [Z1, Z2, Y1, Y2, X1, X2]})
            weird_chunk_count += 1
            continue

        # Pad with -1000.0 wherever the clamped window is smaller than the chunk.
        chunk = np.full((CHUNK_SIZE, CHUNK_SIZE, CHUNK_SIZE), -1000.0, np.float32)
        chunk[0:Z2-Z1, 0:Y2-Y1, 0:X2-X1] = scan_resampled[Z1:Z2,Y1:Y2,X1:X2]

        X[count,:,:,:] = chunk
        # Nodules without a 'malignancy' entry get the label 0.0.
        if 'malignancy' in nodule:
            Y[count,] = nodule['malignancy']
        else:
            Y[count,] = 0.0
        count = count + 1

    # Skipped (weird) nodules leave trailing rows unwritten; saving them would
    # persist uninitialised memory as training data, so keep only the filled rows.
    np.save(DATA_PATH_NODULES + patient_id + '_X.npy', X[:count])
    np.save(DATA_PATH_NODULES + patient_id + '_Y.npy', Y[:count])

  1%|          | 7/998 [02:53<7:33:03, 27.43s/it] 

Found weird chunk!
Found weird chunk!


  1%|▏         | 14/998 [06:02<7:10:58, 26.28s/it]

Found weird chunk!
Found weird chunk!


  2%|▏         | 21/998 [08:14<5:33:58, 20.51s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


  4%|▎         | 35/998 [14:55<7:45:20, 28.99s/it]

Found weird chunk!


  5%|▌         | 53/998 [21:12<5:58:41, 22.77s/it]

Found weird chunk!


  6%|▌         | 55/998 [21:53<5:43:41, 21.87s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


  7%|▋         | 73/998 [27:58<4:33:02, 17.71s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


  8%|▊         | 81/998 [30:26<5:03:01, 19.83s/it]

Found weird chunk!


  9%|▉         | 89/998 [33:32<5:39:47, 22.43s/it]

Found weird chunk!


 10%|█         | 102/998 [38:02<4:55:19, 19.78s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 12%|█▏        | 117/998 [43:35<4:49:27, 19.71s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 12%|█▏        | 118/998 [43:56<4:55:40, 20.16s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 12%|█▏        | 121/998 [44:54<4:54:57, 20.18s/it]

Found weird chunk!


 13%|█▎        | 126/998 [46:54<5:40:42, 23.44s/it]

Found weird chunk!
Found weird chunk!


 13%|█▎        | 132/998 [49:02<5:02:45, 20.98s/it]

Found weird chunk!


 14%|█▍        | 138/998 [51:04<4:49:35, 20.20s/it]

Found weird chunk!


 15%|█▍        | 145/998 [53:13<4:03:32, 17.13s/it]

Found weird chunk!


 15%|█▌        | 152/998 [55:47<5:25:48, 23.11s/it]

Found weird chunk!


 17%|█▋        | 172/998 [1:02:59<4:15:48, 18.58s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 18%|█▊        | 179/998 [1:06:07<5:53:17, 25.88s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 18%|█▊        | 183/998 [1:07:36<4:47:11, 21.14s/it]

Found weird chunk!


 20%|█▉        | 195/998 [1:11:32<4:51:30, 21.78s/it]

Found weird chunk!


 20%|██        | 200/998 [1:13:35<5:36:10, 25.28s/it]

Found weird chunk!
Found weird chunk!


 22%|██▏       | 224/998 [1:23:33<4:59:54, 23.25s/it]

Found weird chunk!


 24%|██▍       | 239/998 [1:29:38<4:38:47, 22.04s/it]

Found weird chunk!


 24%|██▍       | 243/998 [1:30:32<3:21:13, 15.99s/it]

Found weird chunk!
Found weird chunk!


 25%|██▌       | 253/998 [1:33:32<3:03:47, 14.80s/it]

Found weird chunk!


 26%|██▌       | 260/998 [1:35:25<3:20:55, 16.33s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 32%|███▏      | 318/998 [1:59:54<4:50:05, 25.60s/it]

Found weird chunk!


 32%|███▏      | 321/998 [2:00:59<4:09:01, 22.07s/it]

Found weird chunk!


 32%|███▏      | 322/998 [2:01:43<5:20:39, 28.46s/it]

Found weird chunk!


 39%|███▉      | 389/998 [2:28:00<4:26:34, 26.26s/it]

Found weird chunk!
Found weird chunk!


 39%|███▉      | 394/998 [2:29:53<3:39:32, 21.81s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 40%|███▉      | 396/998 [2:30:42<3:55:42, 23.49s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 41%|████      | 405/998 [2:35:16<6:05:15, 36.96s/it]

Found weird chunk!


 42%|████▏     | 416/998 [2:39:15<5:02:10, 31.15s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 45%|████▍     | 445/998 [2:51:30<3:28:29, 22.62s/it]

Found weird chunk!


 45%|████▍     | 446/998 [2:51:50<3:21:13, 21.87s/it]

Found weird chunk!


 46%|████▋     | 463/998 [2:58:34<3:43:00, 25.01s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 47%|████▋     | 471/998 [3:01:39<3:54:08, 26.66s/it]

Found weird chunk!


 48%|████▊     | 476/998 [3:03:19<3:07:40, 21.57s/it]

Found weird chunk!
Found weird chunk!


 48%|████▊     | 479/998 [3:04:49<3:59:32, 27.69s/it]

Found weird chunk!
Found weird chunk!


 49%|████▊     | 485/998 [3:06:42<2:55:49, 20.56s/it]

Found weird chunk!


 51%|█████     | 504/998 [3:12:37<2:13:52, 16.26s/it]

Found weird chunk!
Found weird chunk!


 51%|█████     | 510/998 [3:14:37<2:39:52, 19.66s/it]

Found weird chunk!


 51%|█████     | 511/998 [3:14:55<2:36:00, 19.22s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 52%|█████▏    | 514/998 [3:15:52<2:35:16, 19.25s/it]

Found weird chunk!
Found weird chunk!


 53%|█████▎    | 524/998 [3:19:54<3:20:57, 25.44s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 54%|█████▍    | 543/998 [3:25:56<2:23:04, 18.87s/it]

Found weird chunk!


 56%|█████▌    | 555/998 [3:30:19<2:54:50, 23.68s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 56%|█████▌    | 558/998 [3:31:01<2:11:07, 17.88s/it]

Found weird chunk!


 57%|█████▋    | 565/998 [3:33:13<2:24:02, 19.96s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 57%|█████▋    | 571/998 [3:35:24<2:33:46, 21.61s/it]

Found weird chunk!


 57%|█████▋    | 572/998 [3:35:49<2:41:20, 22.72s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 58%|█████▊    | 578/998 [3:38:34<3:23:11, 29.03s/it]

Found weird chunk!


 59%|█████▉    | 587/998 [3:41:03<1:53:55, 16.63s/it]

Found weird chunk!


 60%|█████▉    | 597/998 [3:44:21<1:48:37, 16.25s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 61%|██████    | 604/998 [3:48:57<2:36:15, 23.79s/it]

Found weird chunk!
Found weird chunk!


 62%|██████▏   | 622/998 [3:57:47<3:29:20, 33.40s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 63%|██████▎   | 627/998 [3:59:38<2:33:45, 24.87s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 64%|██████▍   | 639/998 [4:03:51<2:12:48, 22.20s/it]

Found weird chunk!


 66%|██████▌   | 659/998 [4:14:14<3:23:46, 36.07s/it]

Found weird chunk!


 67%|██████▋   | 671/998 [4:18:37<2:29:45, 27.48s/it]

Found weird chunk!


 67%|██████▋   | 673/998 [4:19:18<2:13:21, 24.62s/it]

Found weird chunk!


 68%|██████▊   | 674/998 [4:19:32<1:55:37, 21.41s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 68%|██████▊   | 679/998 [4:21:50<2:16:04, 25.59s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 69%|██████▉   | 687/998 [4:26:59<2:38:02, 30.49s/it]

Found weird chunk!


 70%|██████▉   | 694/998 [4:29:29<1:39:42, 19.68s/it]

Found weird chunk!
Found weird chunk!


 70%|██████▉   | 696/998 [4:30:05<1:37:48, 19.43s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 71%|███████   | 706/998 [4:33:53<2:01:42, 25.01s/it]

Found weird chunk!
Found weird chunk!


 72%|███████▏  | 718/998 [4:37:29<1:01:07, 13.10s/it]

Found weird chunk!


 73%|███████▎  | 724/998 [4:39:24<1:23:07, 18.20s/it]

Found weird chunk!
Found weird chunk!


 78%|███████▊  | 780/998 [4:59:40<1:14:12, 20.42s/it]

Found weird chunk!


 79%|███████▉  | 788/998 [5:03:25<1:51:53, 31.97s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


 80%|███████▉  | 794/998 [5:05:36<1:09:30, 20.45s/it]

Found weird chunk!
Found weird chunk!


 80%|███████▉  | 797/998 [5:06:39<1:08:04, 20.32s/it]

Found weird chunk!
Found weird chunk!


 80%|████████  | 801/998 [5:08:12<1:15:37, 23.03s/it]

Found weird chunk!
Found weird chunk!


 82%|████████▏ | 817/998 [5:15:58<1:40:35, 33.34s/it]

Found weird chunk!


 82%|████████▏ | 823/998 [5:18:13<1:05:10, 22.35s/it]

Found weird chunk!


 83%|████████▎ | 824/998 [5:19:15<1:39:03, 34.16s/it]

Found weird chunk!
Found weird chunk!


 86%|████████▌ | 856/998 [5:30:07<53:02, 22.41s/it]  

Found weird chunk!


 86%|████████▌ | 857/998 [5:30:37<58:01, 24.69s/it]

Found weird chunk!


 88%|████████▊ | 875/998 [5:37:21<38:47, 18.92s/it]  

Found weird chunk!
Found weird chunk!


 88%|████████▊ | 878/998 [5:38:51<51:23, 25.70s/it]

Found weird chunk!
Found weird chunk!


 88%|████████▊ | 882/998 [5:41:02<1:01:26, 31.78s/it]

Found weird chunk!


 91%|█████████ | 905/998 [5:50:58<35:13, 22.72s/it]  

Found weird chunk!
Found weird chunk!


 91%|█████████ | 906/998 [5:51:19<34:07, 22.26s/it]

Found weird chunk!
Found weird chunk!


 92%|█████████▏| 914/998 [5:54:22<31:35, 22.56s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 92%|█████████▏| 916/998 [5:55:26<35:32, 26.00s/it]

Found weird chunk!
Found weird chunk!


 93%|█████████▎| 926/998 [5:59:17<27:39, 23.05s/it]

Found weird chunk!
Found weird chunk!


 94%|█████████▎| 934/998 [6:02:17<25:42, 24.10s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!
Found weird chunk!


 95%|█████████▍| 946/998 [6:07:12<19:51, 22.91s/it]

Found weird chunk!


 95%|█████████▌| 951/998 [6:08:42<15:07, 19.30s/it]

Found weird chunk!
Found weird chunk!


 97%|█████████▋| 965/998 [6:14:13<12:26, 22.62s/it]

Found weird chunk!


 99%|█████████▉| 992/998 [6:22:56<01:51, 18.61s/it]

Found weird chunk!
Found weird chunk!
Found weird chunk!


100%|██████████| 998/998 [6:26:25<00:00, 30.62s/it]


In [33]:
# Do NOT run this cell: it would overwrite the existing weird_chunks.pkl file on disk.
# with open(DATA_PATH + 'weird_chunks.pkl', 'wb') as f:
#     pickle.dump(weird_chunks, f, pickle.HIGHEST_PROTOCOL)
    
# print('Saved weird_chunks map!')

Saved weird_chunks map!


In [34]:
# Load the weird-chunk records from disk, replacing any in-memory `weird_chunks`.
with open(DATA_PATH + 'weird_chunks.pkl', 'rb') as weird_chunks_file:
    weird_chunks = pickle.load(weird_chunks_file)

In [37]:
# Number of chunks recorded as weird, summed over all patients.
total_weird_chunks = sum(len(chunks) for chunks in weird_chunks.values())

# Number of deduplicated nodules, summed over all patients. Kept under its own
# name so it no longer overwrites the weird-chunk total computed above.
total_nodules = sum(len(nodules) for nodules in patient_deduped_nodules_map.values())
total_nodules

8419