In [1]:
import numpy as np 
import pandas as pd 
import os
from joblib import Parallel, delayed
import SimpleITK as sitk
from scipy import ndimage

In [2]:
def load_itk_image(filename):
    """Read an image file with SimpleITK and hand it back as NumPy data.

    Parameters
    ----------
    filename : str
        Path passed straight to ``sitk.ReadImage``.

    Returns
    -------
    tuple
        ``(voxels, origin, spacing)`` where ``voxels`` is the array produced
        by ``sitk.GetArrayFromImage`` and ``origin`` / ``spacing`` are the
        image's origin and spacing with their component order reversed
        (callers in this file index them as z, y, x).
    """
    image = sitk.ReadImage(filename)
    voxels = sitk.GetArrayFromImage(image)
    # Reverse the component order of the ITK metadata tuples.
    origin = np.array(image.GetOrigin()[::-1])
    spacing = np.array(image.GetSpacing()[::-1])
    return voxels, origin, spacing

In [3]:
def worldToVoxelCoord(worldCoord, origin, spacing):
    """Convert a world-space coordinate to (fractional) voxel indices.

    The offset from ``origin`` is taken as an absolute value and then divided
    element-wise by ``spacing``; the result is not rounded.

    Parameters
    ----------
    worldCoord : array-like
        World coordinate(s), in the same axis order as ``origin``.
    origin : numpy.ndarray
        World coordinate of the image origin.
    spacing : numpy.ndarray
        Size of one voxel along each axis.

    Returns
    -------
    numpy.ndarray
        ``|worldCoord - origin| / spacing``.
    """
    offset = worldCoord - origin
    return np.abs(offset) / spacing

In [4]:
def resize_voxel(x, desired_shape):
    """Resample ``x`` to exactly ``desired_shape`` using linear interpolation.

    Parameters
    ----------
    x : numpy.ndarray
        Array to resample; must have no zero-length axis.
    desired_shape : sequence of int
        Target shape, one entry per axis of ``x``. Any sequence (tuple, list,
        array) is accepted.

    Returns
    -------
    numpy.ndarray
        The resampled array, whose shape equals ``tuple(desired_shape)``.

    Raises
    ------
    ValueError
        If ``x`` has a zero-length axis (nothing to interpolate from).
    AssertionError
        If the resampled array does not come out at the requested shape.
    """
    # Normalise to a tuple so the shape check below also works for lists/arrays.
    target_shape = tuple(int(size) for size in desired_shape)
    if 0 in x.shape:
        raise ValueError('cannot resize an empty array of shape %s' % (x.shape,))

    # Compute the zoom ratio directly in float64 instead of inverting a
    # float32 ratio, so rounding cannot push an axis off by one.
    zoom_factors = (np.asarray(target_shape, dtype='float64')
                    / np.asarray(x.shape, dtype='float64'))

    # ``scipy.ndimage.interpolation`` is a deprecated namespace; ``zoom`` is
    # available directly on ``scipy.ndimage``.
    output = ndimage.zoom(x, zoom_factors, order=1)
    assert output.shape == target_shape, 'resize error'
    return output

In [5]:
def get_bounding_voxels_new(patient, df, n, VOXEL_SIZE,PATH):
    """Cut ``n`` randomly jittered cubes around the nodules of one scan.

    For each view a nodule of ``patient`` is picked at random, a box of
    ``VOXEL_SIZE`` mm per side is placed around it with a random offset,
    the box is clipped to the image, resampled to
    ``(VOXEL_SIZE, VOXEL_SIZE, VOXEL_SIZE)`` voxels, intensity-clipped to
    ``[-1000, 400]`` (presumably Hounsfield units) and transposed to X, Y, Z.

    Parameters
    ----------
    patient : str
        Series UID; ``PATH + patient + '.mhd'`` is the file that gets loaded.
    df : pandas.DataFrame
        Nodule table with columns ``seriesuid``, ``coordX``, ``coordY``,
        ``coordZ`` and ``diameter_mm``. It is not modified.
    n : int or float
        Number of views to generate (converted with ``int``).
    VOXEL_SIZE : int
        Box side in mm and output cube side in voxels.
    PATH : str
        Directory prefix for the ``.mhd`` file (concatenated as-is, so it
        must end with a path separator).

    Returns
    -------
    None or tuple
        ``None`` when ``df`` has no row for ``patient``; otherwise
        ``(voxels, indices)`` where ``voxels`` stacks the ``n`` cubes and
        ``indices`` (int32) holds, per cube, the positional row number in
        ``df`` of the largest-diameter nodule lying inside that cube.

    Raises
    ------
    AssertionError
        If the box spans 10 or fewer source pixels along an axis, or if no
        nodule lies inside a generated box.
    """
    # Positional row numbers are derived locally rather than written into
    # ``df`` as an 'ix' column, so the caller's frame is left untouched.
    is_patient = (df['seriesuid'] == patient).values
    if not is_patient.any():
        return None
    dfsub = df[is_patient]
    row_ix = np.flatnonzero(is_patient)

    # loading patient into array from mhd
    img, origin, spacing = load_itk_image(PATH + patient + '.mhd')

    # now pick out VOXEL_SIZE mm of pixels in each dimension.
    numZpix = np.round(float(VOXEL_SIZE) / spacing[0])
    assert numZpix > 10, 'too few z pixels'
    numYpix = np.round(float(VOXEL_SIZE) / spacing[1])
    assert numYpix > 10, 'too few y pixels'
    numXpix = np.round(float(VOXEL_SIZE) / spacing[2])
    assert numXpix > 10, 'too few x pixels'

    half_extent = np.array([numZpix, numYpix, numXpix]) / 2
    upper_bound = np.array(img.shape[:3])

    # Loop-invariant: fractional voxel position (z, y, x) and diameter of
    # every nodule of this patient, computed once instead of once per view.
    world_zyx = dfsub[['coordZ', 'coordY', 'coordX']].values.astype('float64')
    nodule_voxels = worldToVoxelCoord(world_zyx, origin, spacing)
    diameters = dfsub['diameter_mm'].values

    voxels = []
    indices = []
    for _ in range(int(n)):

        # choose a random nodule from this patient
        pick = np.random.choice(dfsub.shape[0])
        diameter_mm = diameters[pick]
        voxel_coords = np.round(nodule_voxels[pick])

        # fuzz: the larger the nodule relative to the box, the less room to
        # shift it. Clamp at 0 so a nodule wider than the box yields "no
        # jitter" instead of an invalid (low > high) randint range.
        slack = 1 - diameter_mm / VOXEL_SIZE
        max_z_fuzz = max(0, int(half_extent[0] * slack))
        max_y_fuzz = max(0, int(half_extent[1] * slack))
        max_x_fuzz = max(0, int(half_extent[2] * slack))
        zfuzz = np.random.randint(-max_z_fuzz, max_z_fuzz + 1)
        yfuzz = np.random.randint(-max_y_fuzz, max_y_fuzz + 1)
        xfuzz = np.random.randint(-max_x_fuzz, max_x_fuzz + 1)

        # Box corners in voxel space, clipped to the image extent.
        centre = voxel_coords + np.array([zfuzz, yfuzz, xfuzz])
        start = np.clip(centre - half_extent, 0, upper_bound)
        end = np.clip(centre + half_extent, 0, upper_bound)
        z_start, y_start, x_start = start
        z_end, y_end, x_end = end

        # now let's see if this voxel contains more than one nodule;
        # if so, the largest one determines the recorded row index.
        inside = np.all((nodule_voxels > start) & (nodule_voxels < end), axis=1)
        num_nodules = int(inside.sum())
        maxdiam_ix = -1
        maxdiam = 0
        for j in np.flatnonzero(inside):
            if diameters[j] > maxdiam:
                maxdiam_ix = row_ix[j]
                maxdiam = diameters[j]

        if num_nodules == 0:
            print ('no nodules in region!')
            print (x_start, x_end, y_start, y_end, z_start, z_end)
            print (voxel_coords)

        assert num_nodules > 0, 'no nodules in region'

        indices.append(maxdiam_ix)

        voxel = img[int(z_start):int(z_end), int(y_start):int(y_end), int(x_start):int(x_end)]

        voxel_norm = resize_voxel(voxel, (VOXEL_SIZE, VOXEL_SIZE, VOXEL_SIZE))

        voxel_norm = np.clip(voxel_norm, -1000, 400)
        voxel_norm = np.transpose(voxel_norm, (2,1,0)) #X,Y,Z
        voxels.append(voxel_norm)

    return np.stack(voxels), np.array(indices).astype('int32')

In [6]:
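# NOTE: superseded serial draft of get_Xpositive_new, kept for reference only.
# Its get_bounding_voxels_new call passes path_directory first, which does not
# match the current (patient, df, n, VOXEL_SIZE, PATH) signature.
# def get_Xpositive_new(VOXEL_SIZE):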
    
#     path_save = '../voxels/'
#     path_raw = '../data/'
    
#     df = pd.read_csv(path_raw+'annotations_enhanced.csv')
#     n_views = np.around(6*(64**3)/(VOXEL_SIZE**3))
    
#     for directory in [d for d in os.listdir(path_raw) if 'subset' in d]:
#         path_directory = path_raw+directory+'/'
#         luna_ids = [f.replace('.mhd','') for f in os.listdir(path_directory) if '.mhd' in f]
#         print (directory,'contains',len(luna_ids),'patients')
#         counter = 0
#         for patient in luna_ids:
#             results = get_bounding_voxels_new(path_directory,patient,df,n_views,VOXEL_SIZE)
#             counter +=1
#             if counter%20==0:
#                 print ('\t..completed',counter,'out of',len(luna_ids),'in this subset directory')
#             if results==None:
#                 continue
#             print ([r[0].shape for r in results if r is not None])
#             voxels = np.concatenate([r[0] for r in results if r is not None])
#             ixs = np.concatenate([r[1] for r in results if r is not None])
#             np.save(path_save+patient+'Xpositive.npy', voxels)
#             np.save(path_save+patient+'IXpositive.npy', ixs)

In [7]:
def _seeded_bounding_voxels(seed, patient, df, n, VOXEL_SIZE, PATH):
    """Seed NumPy's global RNG inside the worker, then extract the views."""
    np.random.seed(seed)
    return get_bounding_voxels_new(patient, df, n, VOXEL_SIZE, PATH)


def get_Xpositive_new(VOXEL_SIZE, path_raw='../data/', path_save='../voxels/', n_jobs=2):
    """Extract jittered nodule cubes for every subset directory and save them.

    For each directory under ``path_raw`` whose name contains ``'subset'``,
    every ``.mhd`` file is processed with ``get_bounding_voxels_new`` in
    parallel, and the combined results are written to
    ``<path_save><directory>Xpositive.npy`` (cubes) and
    ``<path_save><directory>IXpositive.npy`` (row indices).

    Parameters
    ----------
    VOXEL_SIZE : int
        Cube side passed through to ``get_bounding_voxels_new``. The number
        of views per patient is ``round(6 * 64**3 / VOXEL_SIZE**3)``.
    path_raw : str, optional
        Directory holding ``annotations_enhanced.csv`` and the subset folders.
    path_save : str, optional
        Output directory; created if it does not exist.
    n_jobs : int, optional
        Number of joblib workers.

    Notes
    -----
    Worker processes do not reliably share the parent's NumPy random state,
    so one seed per patient is drawn from the parent's global RNG and applied
    inside the worker. Combined with sorted directory and file listings, the
    output is then determined by the seed set before this function is called.
    """
    df = pd.read_csv(os.path.join(path_raw, 'annotations_enhanced.csv'))
    n_views = int(np.around(6 * (64 ** 3) / (VOXEL_SIZE ** 3)))

    # Fail early on a missing output directory rather than after hours of work.
    if not os.path.isdir(path_save):
        os.makedirs(path_save)

    # Only real directories: archives or stray files with 'subset' in their
    # name would otherwise crash os.listdir below.
    subset_dirs = sorted(
        d for d in os.listdir(path_raw)
        if 'subset' in d and os.path.isdir(os.path.join(path_raw, d))
    )

    for directory in subset_dirs:
        # Trailing separator is required: the path is used as a string prefix.
        path_directory = os.path.join(path_raw, directory, '')
        luna_ids = sorted(
            os.path.splitext(f)[0]
            for f in os.listdir(path_directory)
            if f.endswith('.mhd')
        )
        print (directory,'contains',len(luna_ids),'patients')

        seeds = np.random.randint(0, 2 ** 31 - 1, size=len(luna_ids))
        results = Parallel(n_jobs=n_jobs, verbose=0)(
            delayed(_seeded_bounding_voxels)(
                int(seed), patient, df, n_views, VOXEL_SIZE, path_directory)
            for seed, patient in zip(seeds, luna_ids)
        )

        # results is a list of (voxels, ixs), or None for unannotated patients
        found = [r for r in results if r is not None]
        if not found:
            # np.concatenate raises on an empty list; skip this subset instead.
            print (directory, 'has no annotated patients; nothing saved')
            continue

        voxels = np.concatenate([r[0] for r in found])
        ixs = np.concatenate([r[1] for r in found])
        np.save(os.path.join(path_save, directory + 'Xpositive.npy'), voxels)
        np.save(os.path.join(path_save, directory + 'IXpositive.npy'), ixs)

In [8]:
def main(stage,rs):
    """Seed NumPy's global RNG and run the positive-voxel extraction.

    Parameters
    ----------
    stage : int
        Forwarded to ``get_Xpositive_new`` as its ``VOXEL_SIZE`` argument.
    rs : int
        Seed passed to ``np.random.seed`` in the current process.
    """
    np.random.seed(rs)
    get_Xpositive_new(stage)
    # No exit() here: inside a notebook it shuts the kernel down after the
    # run (discarding state and breaking any later cell), and `exit` is a
    # `site` convenience that is not guaranteed to exist. Returning normally
    # ends a script run just the same.

In [9]:
# Run the extraction with 32-voxel cubes and random seed 42.
main(stage=32, rs=42)

subset2 contains 89 patients




subset9 contains 88 patients




subset5 contains 89 patients




subset3 contains 89 patients




subset4 contains 89 patients




subset6 contains 89 patients




subset0 contains 89 patients




subset8 contains 88 patients




subset1 contains 89 patients




subset7 contains 89 patients


