In [1]:
import numpy as np 
import pandas as pd 
import os
import SimpleITK as sitk
from scipy import ndimage
from math import floor,ceil
from joblib import Parallel, delayed

In [2]:
def _resample_voxel(voxel, spacing, size, mode):
    """Resample a raw sub-volume, crop it to a cube, clip it and reorder axes.

    Parameters
    ----------
    voxel : ndarray
        Sub-volume cut from the scan, indexed z, y, x.
    spacing : ndarray
        Per-axis zoom factor (the scan spacing, z, y, x), so the result is on
        a 1mm x 1mm x 1mm grid.
    size : int
        Edge length of the cube kept after resampling.
    mode : str
        Boundary mode forwarded to ``scipy.ndimage.zoom``.

    Returns
    -------
    ndarray
        Array indexed x, y, z with values clipped to [-1000, 400].
    """
    # ndimage.zoom is the public name; the ndimage.interpolation namespace is deprecated
    resampled = ndimage.zoom(voxel, spacing, order=1, mode=mode)
    # NOTE(review): rounding inside zoom can leave an axis slightly shorter than
    # `size`; such a voxel would make np.stack fail downstream -- confirm on real data.
    resampled = resampled[0:size, 0:size, 0:size]
    resampled = np.clip(resampled, -1000, 400)
    return np.transpose(resampled, (2, 1, 0))  # X,Y,Z


def extract_voxels(DIRECTORY,PATIENT,DF,VIEWS=6,VOXEL_SIZE=64):
    """Cut nodule-centred and nodule-free cubes out of one patient's scan.

    The scan is read from ``'../data/' + DIRECTORY + '/' + PATIENT + '.mhd'``.

    Parameters
    ----------
    DIRECTORY : str
        Sub-directory of ``../data/`` holding the scan.
    PATIENT : str
        Scan file name without the ``.mhd`` extension; matched against
        ``DF['seriesuid']``.
    DF : pandas.DataFrame
        Annotations with columns ``seriesuid``, ``coordX``, ``coordY``,
        ``coordZ`` and ``diameter_mm``. It is not modified.
    VIEWS : int
        Number of randomly jittered cubes taken per nodule. The number of
        nodule-free cubes is ``max(VIEWS, VIEWS * number_of_nodules)``.
    VOXEL_SIZE : int
        Edge length of the returned cubes after resampling.

    Returns
    -------
    tuple
        ``(trues, trues_ix, falses, falses_ix)``. ``trues`` / ``falses`` are
        stacked cubes indexed x, y, z. ``trues_ix`` holds, per cube, the
        row position in ``DF`` of the largest-diameter nodule whose centre lies
        inside that cube (-1 if none does); ``falses_ix`` is all -1. When the
        patient has no annotation rows the first two items are ``None``.
    """
    itkimage = sitk.ReadImage('../data/'+DIRECTORY+'/'+PATIENT+'.mhd')
    img = sitk.GetArrayFromImage(itkimage)
    origin = np.array(list(reversed(itkimage.GetOrigin()))) # zyx
    spacing = np.array(list(reversed(itkimage.GetSpacing()))) # zyx

    # use only df rows pertaining to this patient; keep their row positions in
    # a local array instead of writing an 'ix' column into the caller's frame
    patient_mask = np.asarray(DF['seriesuid']==PATIENT)
    nodule_ix = np.flatnonzero(patient_mask)
    dfsub = DF[patient_mask]
    n_nodules = dfsub.shape[0]

    # nodule centres in array index units (zyx), computed once for all checks
    diameters = dfsub['diameter_mm'].values
    coords_zyx = dfsub[['coordZ','coordY','coordX']].values.astype(float)
    centers = np.round(np.absolute(coords_zyx-origin)/spacing)

    # lists holding voxels and the corresponding annotation indices
    trues = []
    falses = []
    trues_ix = []
    falses_ix = []

    # voxel shape with original image spacing
    voxel_shape = np.round(np.array([VOXEL_SIZE,VOXEL_SIZE,VOXEL_SIZE])/spacing)
    half_shape = np.round(voxel_shape/2)

    # range of cube centres that keeps the whole cube inside the image
    center_lo = [ceil(m) for m in (voxel_shape/2)]
    center_hi = [floor(m) for m in (img.shape-voxel_shape/2)]

    # rejection-sample random cubes until enough nodule-free ones are collected
    n_random = max(VIEWS,VIEWS*n_nodules)
    count = 0
    while count < n_random:
        random_ZYX = np.array([np.random.randint(lo+1,hi-1) for lo,hi in zip(center_lo,center_hi)])
        voxel_max = (random_ZYX + half_shape).astype(int)
        voxel_min = (random_ZYX - (voxel_shape-half_shape)).astype(int)

        # discard the cube if any nodule centre lies strictly inside it
        inside = ((centers<voxel_max) & (centers>voxel_min)).all(axis=1)
        if inside.any():
            continue

        voxel = img[voxel_min[0]:voxel_max[0],voxel_min[1]:voxel_max[1],voxel_min[2]:voxel_max[2]]
        count += 1
        falses_ix.append(-1)
        falses.append(_resample_voxel(voxel,spacing,VOXEL_SIZE,'nearest'))

    # for each nodule
    for k in range(n_nodules):

        # nodule radius in array index units along each axis
        nodule_rad = diameters[k]/spacing/2

        # nodule volume limits
        nodule_max = np.array([ceil(c) for c in centers[k]+nodule_rad])
        nodule_min = np.array([floor(c) for c in centers[k]-nodule_rad])
        nodule_shape = nodule_max-nodule_min

        # set max value of movement around nodule
        max_jitter = np.amin(np.stack((abs(nodule_max-img.shape),voxel_shape-nodule_shape)),axis=0)

        # set min value of movement around nodule
        min_jitter = nodule_min-max_jitter
        min_jitter[min_jitter > 0.] = 0.
        min_jitter = np.abs(min_jitter)

        # for the number of views, grab voxel around nodule
        for _ in range(VIEWS):
            # bounds are integral floats here; cast so randint receives ints
            jitter = np.array([np.random.randint(int(lo)+1,int(hi)-1) for lo,hi in zip(min_jitter,max_jitter)])
            voxel_max = nodule_max + jitter
            voxel_min = (nodule_min - (voxel_shape-nodule_shape-jitter)).astype(int)

            voxel = img[voxel_min[0]:voxel_max[0],voxel_min[1]:voxel_max[1],voxel_min[2]:voxel_max[2]]

            # the cube may contain several nodules: label it with the largest one
            inside = ((centers<voxel_max) & (centers>voxel_min)).all(axis=1)
            maxdiam_ix = -1
            maxdiam = 0
            for j in np.flatnonzero(inside):
                if diameters[j] > maxdiam:
                    maxdiam_ix = nodule_ix[j]
                    maxdiam = diameters[j]

            trues_ix.append(maxdiam_ix)
            trues.append(_resample_voxel(voxel,spacing,VOXEL_SIZE,'constant'))

    falses_array = np.stack(falses)
    falses_ix_array = np.array(falses_ix).astype('int32')
    if n_nodules>0:
        trues_array = np.stack(trues)
        trues_ix_array = np.array(trues_ix).astype('int32')
        return trues_array,trues_ix_array,falses_array,falses_ix_array
    else:
        return None,None,falses_array,falses_ix_array

In [None]:
def main():
    """Extract voxels for every ``subset*`` directory and save them as .npy files.

    Reads ``../data/CSVFILES/annotations_enhanced.csv``, runs
    ``extract_voxels`` for each ``.mhd`` scan found in each subset directory
    under ``../data/`` (four joblib workers), and writes per-subset arrays
    ``<subset>Xtrue.npy``, ``<subset>IXtrue.npy``, ``<subset>Xrandom.npy`` and
    ``<subset>IXrandom.npy`` into ``../LUNA_voxels/``.
    """
    path_raw = '../data/'
    path_save = '../LUNA_voxels/'

    # np.save does not create missing directories
    os.makedirs(path_save, exist_ok=True)

    df = pd.read_csv(path_raw+'CSVFILES/annotations_enhanced.csv')

    # only real directories: a stray file whose name contains 'subset'
    # (e.g. an archive) would make os.listdir fail below
    subsets = [d for d in os.listdir(path_raw)
               if 'subset' in d and os.path.isdir(os.path.join(path_raw, d))]

    for directory in subsets:
        patients = [f[:-len('.mhd')] for f in os.listdir(path_raw+directory) if f.endswith('.mhd')]
        print (directory,'contains',len(patients),'patients')
        if not patients:
            # nothing to concatenate or save for an empty subset
            continue

        results = Parallel(n_jobs=4,verbose=0)(
            delayed(extract_voxels)(directory,patient,df) for patient in patients)

        # each result is (true voxels, true ixs, random voxels, random ixs);
        # the first two are None for patients without annotated nodules
        true_parts = [r[0] for r in results if r[0] is not None]
        true_ix_parts = [r[1] for r in results if r[1] is not None]
        false_parts = [r[2] for r in results if r[2] is not None]
        false_ix_parts = [r[3] for r in results if r[3] is not None]

        # np.concatenate raises on an empty list, so only write the positive
        # arrays when at least one patient in the subset has a nodule
        if true_parts:
            np.save(path_save+directory+'Xtrue.npy', np.concatenate(true_parts))
            np.save(path_save+directory+'IXtrue.npy', np.concatenate(true_ix_parts))
        else:
            print (directory,'has no annotated nodules; Xtrue/IXtrue not written')

        np.save(path_save+directory+'Xrandom.npy', np.concatenate(false_parts))
        np.save(path_save+directory+'IXrandom.npy', np.concatenate(false_ix_parts))


In [None]:
# Run the full extraction only when this code is executed as the main program.
if __name__ == '__main__':
    main()

subset2 contains 89 patients




subset9 contains 88 patients




subset5 contains 89 patients




subset3 contains 89 patients




subset4 contains 89 patients




subset6 contains 89 patients




subset0 contains 89 patients


