# Coding exercise for Wang Lab
### Implementation of nnU-Net for the segmentation of the pancreas and pancreatic lesions on CT scans, with classification of the pancreatic lesions
### by Leo Chen
### August/September 2024

In [1]:
### IMPORTS
import os
import glob
#import util

import numpy as np
import pandas as pd
import random
import math
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.autograd import Variable
import torch.nn.init as init

from collections import defaultdict
from collections import Counter

from datetime import datetime

import SimpleITK as sitk
#import nibabel as nib

import json
import shutil

In [3]:
# check if cuda is working
# Only the value of the last expression is echoed by the notebook (the `0`
# shown below is the current device index); the first two results are discarded.
torch.cuda.is_available()
torch.cuda.device_count()
torch.cuda.current_device()

0

In [11]:
### GLOBAL VARIABLES
# Module-level configuration read by the functions and cells below.

# using GPU?
gpu = False


# directories where the files are
# (each is expected to contain 'subtype0' / 'subtype1' / 'subtype2' sub-folders,
#  see getImageAndMaskFilePaths)
traindir = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\train'
valdir = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\validation'
testdir = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\test'

csvpath = r'C:\Users\Leo\OneDrive\Documents\GitHub\WangLabQuiz\csv files\trainval_metadata.csv'   # csv with the image dimensions, image and mask file paths

# NOTE(review): this path is under OneDrive\Documents, whereas the copy cells
# further down write to ...\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw -- confirm
# which location nnU-Net is actually configured to read.
nnUNet_raw_dir = r'C:\Users\Leo\OneDrive\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw'

In [12]:
### FUNCTIONS FOR SITK and IMAGE AUGMENTATION

def rotateImage(original, anglex, angley, anglez, interpolate='linear'):
    """Return the 3D image rotated about its physical center, resampled onto the grid of 'original'.

    Parameters
    1. original - original SimpleITK image
    2. anglex - roll in degrees / twisting the body like a rolling pin, turning in dance
    3. angley - yaw in degrees / rotating the body like a propeller blade, like break dancing
    4. anglez - pitch in degrees / tilt along the superior/inferior axis (i.e trendelenburg)
    5. interpolate - 'linear' or 'NN' (nearest neighbor; use this for label masks)

    Returns the resampled SimpleITK image, with its direction and origin
    metadata overwritten by the rotated axes / rotated origin computed below.

    Raises ValueError if *interpolate* is not 'linear' or 'NN'.
    """

    if interpolate == 'linear':
        interpolator = sitk.sitkLinear
    elif interpolate == 'NN':
        interpolator = sitk.sitkNearestNeighbor
    else:
        # previously fell through and failed later with an unbound local
        raise ValueError("interpolate must be 'linear' or 'NN', got %r" % (interpolate,))

    # degrees -> radians
    radx = anglex * math.pi / 180
    rady = angley * math.pi / 180
    radz = anglez * math.pi / 180

    origin = np.array(original.GetOrigin())

    # GetSize / TransformContinuousIndexToPhysicalPoint are methods of the
    # image, not functions of the sitk module.
    pixelcenter = np.array(original.GetSize()) / 2.
    physicalcenter = original.TransformContinuousIndexToPhysicalPoint(pixelcenter.tolist())

    transform = sitk.Euler3DTransform()
    transform.SetCenter(physicalcenter)
    # Angles are passed in (z, y, x) order, as in the original implementation.
    # NOTE(review): confirm this matches the argument order expected by
    # Euler3DTransform.SetRotation for the intended roll/yaw/pitch convention.
    transform.SetRotation(radz, rady, radx)

    # columns of unitvecs are built from the transposed direction matrix
    unitvecs = np.transpose(np.reshape(original.GetDirection(), (-1, 3)))
    matrix = np.reshape(transform.GetMatrix(), (-1, 3))
    inverse = np.linalg.inv(matrix)

    # the transform matrix is actually mapping backwards: post to pre
    # therefore the forward transformation is the inverse matrix
    transformedunitvecs = inverse @ unitvecs   # new i, j, k are columns
    newdirection = transformedunitvecs.flatten('F')    # flatten by column

    neworigin = (matrix @ (origin - np.array(physicalcenter))) + np.array(physicalcenter)

    rotatedImage = sitk.Resample(original, original, transform, interpolator)
    # plain Python lists keep the SimpleITK setters independent of numpy types
    rotatedImage.SetDirection(newdirection.tolist())
    rotatedImage.SetOrigin(neworigin.tolist())

    return rotatedImage

def flipImage(original):
    """Mirror a SimpleITK image along its first axis (left/right); the other two axes are untouched."""
    axes_to_flip = [True, False, False]   # one flag per image axis
    return sitk.Flip(original, axes_to_flip)

def flipslice(original):
    """Flips a numpy slice (2d image) by reversing the order of its first axis."""
    # same operation as np.flipud: entries along axis 0 are reversed
    return np.flip(original, axis=0)

def bbox_3D(img):
    """Finds the bounding box around the non-zero elements of a 3D image (numpy)

    The array axes are treated as (z, row, column), i.e. axis 0 / 1 / 2.

    returns [rmin, rmax, cmin, cmax, zmin, zmax] (r = row, c = column);
    every max is the index of the last non-zero element, i.e. inclusive.
    If no elements exist, then returns (-1, -1, -1, -1, -1, -1)"""
    try:
        z = np.any(img, axis=(1, 2))    # collapses rows and columns -> one flag per axis-0 index
        c = np.any(img, axis=(0, 1))    # collapses z and rows -> one flag per axis-2 index (c = column)
        r = np.any(img, axis=(0, 2))    # collapses z and columns -> one flag per axis-1 index (r = row)

        rmin, rmax = np.where(r)[0][[0, -1]]
        cmin, cmax = np.where(c)[0][[0, -1]]
        zmin, zmax = np.where(z)[0][[0, -1]]

        # row min/max, column min/max, z min/max
        return [rmin, rmax, cmin, cmax, zmin, zmax]
    except IndexError:
        # the [[0, -1]] lookup raises IndexError when nothing is set;
        # other errors are no longer silently swallowed
        return -1, -1, -1, -1, -1, -1


def bbox_2D(img):
    """Finds the bounding box around the non-zero elements of a 2D image (numpy)
    returns rmin, rmax, cmin, cmax (r = row, c = column); max values are inclusive
    If no elements exist, then returns (-1, -1, -1, -1)"""

    try:
        c = np.any(img, axis=0)    # collapses rows -> one flag per column
        r = np.any(img, axis=1)    # collapses columns -> one flag per row

        rmin, rmax = np.where(r)[0][[0, -1]]
        cmin, cmax = np.where(c)[0][[0, -1]]

        return rmin, rmax, cmin, cmax
    except IndexError:
        # the [[0, -1]] lookup raises IndexError when nothing is set;
        # other errors are no longer silently swallowed
        return -1, -1, -1, -1


def cropImage(image, threshold, xshift, yshift):
    """Crops SimpleITK image to remove pixels below a threshold (e.g. black space)
    Can also shift by *xshift and *yshift (random shifts in pixels) for augmentation

    The crop window is the bounding box of all voxels strictly greater than
    *threshold*; a shift that would move the window outside the image is
    replaced by 0. Origin, direction and spacing are carried over so the crop
    stays aligned with the source image.

    Raises ValueError if no voxel is above *threshold*."""
    # load image (numpy array is indexed [z, y, x])
    npy = sitk.GetArrayFromImage(image)

    # GET METADATA
    direction = image.GetDirection()
    spacing = image.GetSpacing()

    # CALCULATE BOUNDING BOX OF BODY (removes black space)
    mask = npy > threshold
    # bbox_3D returns row (axis 1 = y) bounds first, then column (axis 2 = x)
    # bounds; the previous unpacking applied the row bounds to the x axis.
    [ymin, ymax, xmin, xmax, zmin, zmax] = bbox_3D(mask)

    if zmin < 0:
        # bbox_3D signals an empty mask with -1 for every bound
        raise ValueError('cropImage: no voxels above threshold %r' % (threshold,))

    # check to make sure shifts do not extend outside boundaries of image
    # (xmax / ymax are inclusive indices, so they must stay below the axis length)
    if xmin + xshift < 0 or xmax + xshift >= npy.shape[2]:
        xshift = 0

    if ymin + yshift < 0 or ymax + yshift >= npy.shape[1]:
        yshift = 0

    # CROP IMAGE (+1 because the bounding-box maxima are inclusive)
    newnpy = npy[zmin:zmax + 1,
                 (ymin + yshift):(ymax + yshift + 1),
                 (xmin + xshift):(xmax + xshift + 1)]

    newimage = sitk.GetImageFromArray(newnpy)
    # index of the first kept voxel, in SimpleITK (x, y, z) order, as plain ints
    topleft = [int(xmin + xshift), int(ymin + yshift), int(zmin)]
    neworigin = image.TransformIndexToPhysicalPoint(topleft)

    newimage.SetOrigin(neworigin)
    newimage.SetDirection(direction)
    newimage.SetSpacing(spacing)

    return newimage


def squareImage(image):
    """Makes an SimpleITK image square by padding with zeros
    (square meaning width = height)

    The shorter in-plane axis is padded on both sides (the extra voxel goes to
    the right/bottom when the difference is odd). The origin is moved so the
    original voxels keep their physical position; direction and spacing are
    copied. The padded array keeps the dtype of the input array instead of
    being promoted to float64."""
    [numcols, numrows, numslices] = image.GetSize()
    npy = sitk.GetArrayFromImage(image)    # indexed [slice, row, column]

    if numcols < numrows:    #pad columns
        numzerostopad = numrows - numcols
        leftpad = numzerostopad // 2
        rightpad = numzerostopad - leftpad

        pad_width = ((0, 0), (0, 0), (leftpad, rightpad))
        topleft = [-leftpad, 0, 0]

    else:  #pad rows (no-op padding when the image is already square)
        numzerostopad = numcols - numrows
        toppad = numzerostopad // 2
        botpad = numzerostopad - toppad

        pad_width = ((0, 0), (toppad, botpad), (0, 0))
        topleft = [0, -toppad, 0]

    # np.pad preserves npy.dtype; concatenating with np.zeros(...) did not
    newnpy = np.pad(npy, pad_width, mode='constant', constant_values=0)

    # physical position of the new first voxel (negative index = outside the old image)
    neworigin = image.TransformIndexToPhysicalPoint(topleft)

    paddedimg = sitk.GetImageFromArray(newnpy)
    paddedimg.SetOrigin(neworigin)
    paddedimg.SetDirection(image.GetDirection())
    paddedimg.SetSpacing(image.GetSpacing())

    return paddedimg

def resampleImage(image, finalsize, interpolation='linear'):
    """Resample a SimpleITK image onto a *finalsize x *finalsize in-plane grid.

    The number of slices and the slice spacing are kept. The in-plane spacing
    is rescaled by (size[1] - 1) / (finalsize - 1), using the image height for
    both in-plane axes, so the new grid covers the same physical extent.
    *interpolation is 'linear' or 'NN' (nearest neighbor); any other value
    leaves the filter's interpolator at whatever it was constructed with."""

    in_size = image.GetSize()
    in_spacing = image.GetSpacing()
    depth = in_size[2]
    side = in_size[1]

    # blank target grid sharing origin and orientation with the input
    grid = sitk.GetImageFromArray(np.zeros([depth, finalsize, finalsize]))
    grid.SetOrigin(image.GetOrigin())
    grid.SetDirection(image.GetDirection())

    # in-plane spacing stretched/shrunk to the new size, slice spacing unchanged
    out_spacing = np.zeros(3)
    out_spacing[0:2] = (side - 1) * np.array(in_spacing[0:2]) / (finalsize - 1)
    out_spacing[2] = in_spacing[2]
    grid.SetSpacing(out_spacing)

    # resampling filter configured from the target grid
    resampler = sitk.ResampleImageFilter()
    resampler.SetReferenceImage(grid)
    if interpolation == 'linear':
        resampler.SetInterpolator(sitk.sitkLinear)
    elif interpolation == 'NN':
        resampler.SetInterpolator(sitk.sitkNearestNeighbor)

    return resampler.Execute(image)


def projectImage(reference, moving, interpolate = 'linear'):
    """Resample the SimpleITK image *moving onto the grid of *reference.

    interpolate* = linear or NN (nearest neighbor); any other value leaves the
    filter's interpolator at whatever it was constructed with."""

    resampler = sitk.ResampleImageFilter()
    resampler.SetReferenceImage(reference)   # output grid is taken from the reference

    if interpolate == 'linear':
        resampler.SetInterpolator(sitk.sitkLinear)
    elif interpolate == 'NN':
        resampler.SetInterpolator(sitk.sitkNearestNeighbor)

    return resampler.Execute(moving)


def resampleImageToVoxelSize(image, voxelx, voxely, voxelz, interpolation='linear'):
    """Resample the SimpleITK *image* to voxel spacing [*voxelx, *voxely, *voxelz] (mm).

    *voxelx and *voxely are the voxel width and height, *voxelz is (usually)
    the slice thickness. The output size per axis is
    round(original size * original spacing / new spacing), which keeps the
    physical extent: size (pixel) * spacing (mm / pixel) is the same before
    and after. *interpolation is 'linear' or 'NN' (nearest neighbor).
    """

    target_spacing = [voxelx, voxely, voxelz]

    # number of voxels needed per axis to cover the same physical length
    target_size = []
    for n_voxels, old_mm, new_mm in zip(image.GetSize(), image.GetSpacing(), target_spacing):
        target_size.append(int(round(n_voxels * old_mm / new_mm)))

    if interpolation == 'linear':
        interpolator = sitk.sitkLinear
    elif interpolation == 'NN':
        interpolator = sitk.sitkNearestNeighbor

    # identity transform; origin, direction and pixel type come from the input,
    # 0 is the fill value for samples outside the input
    return sitk.Resample(image, target_size, sitk.Transform(), interpolator,
                         image.GetOrigin(), target_spacing, image.GetDirection(), 0,
                         image.GetPixelID())


def windowImage(image, window_width, window_center, output_min=0, output_max=255):
    """Normalizes SimpleITK *image* (CT scan) based on window specification
    (example, abdominal soft tissue window is W = 400, C = 50, i.e. -150 to 250)

    The window [center - width/2, center + width/2] is passed to
    sitk.IntensityWindowing together with the output range
    [*output_min, *output_max]; intensities outside the window are expected
    to be clamped to the ends of the output range by that filter.

    *output_min / *output_max default to 0 / 255 and are now honoured
    (they were previously overwritten with 0 and 255 inside the function).
    """

    window_min = window_center - window_width / 2
    window_max = window_center + window_width / 2

    windowed_image = sitk.IntensityWindowing(image, window_min, window_max, output_min, output_max)

    return windowed_image
    


# Data pre-processing


## Functions

In [13]:
### FUNCTIONS FOR READING/LOADING THE DATA

def getImageAndMaskFilePaths(train_or_val, subtype):
    '''
    returns [image_niftis_gz, mask_niftis_gz]: two lists of file paths, the
    image list sorted alphabetically and the mask list in matching order

    input: training or validation images ('train' or 'val'), and the subtype {0, 1, or 2}

    Image files are the entries of <root>/subtype<subtype> whose name contains
    '0000'; each mask path is derived from its image path by cutting at
    '_0000' and appending '.nii' (the mask files themselves are not checked
    for existence).

    raises ValueError if train_or_val is not 'train' or 'val'
    '''

    if train_or_val == 'train':
        rootdir = traindir
    elif train_or_val == 'val':
        rootdir = valdir
    else:
        # previously fell through and failed later with an unbound local
        raise ValueError("train_or_val must be 'train' or 'val', got %r" % (train_or_val,))

    subtypedir = os.path.join(rootdir, 'subtype' + str(subtype))

    ## this gets all of the file paths in the subtype folder for the image and mask files

    # example file path for a training image of subtype 0:
    #   '/kaggle/input/pancreas/train/subtype0/quiz_0_041_0000.nii'

    image_niftis_gz = sorted(os.path.join(subtypedir, file_name)
                             for file_name in os.listdir(subtypedir)
                             if '0000' in file_name)

    mask_niftis_gz = [image_path.partition("_0000")[0] + '.nii' for image_path in image_niftis_gz]

    ## however the actual CT / segmentation is within the zipped files:
    #   '/kaggle/input/pancreas/train/subtype0/quiz_0_041_0000.nii/PANORAMA_101960_00001_0000.nii'

    return [image_niftis_gz, mask_niftis_gz]
   
    
    
def getImageFileDetails(imageniftis, maskniftis, train_or_val):
    '''Creates a dataframe with the following:
        study ID // subtype // CT width/height/depth // pancreas width/height/depth // lesion width/height/depth  // pancreas xmin/xmax / ymin/ymax / zmin/zmax
          // CT spacing x/y/z // CT direction x/y/z // image and mask paths // image and mask .npy names

       Input is a list of image nifti file paths, mask nifti file paths (panorama), and {'train' or 'val'}

       Notes:
         - widths/heights/depths of the segmentations are (max index - min index);
           they are 0 when the label is absent (bbox_3D then returns -1 for every bound).
         - the three 'CT direction' columns hold the string form of entries
           0-2, 3-5 and 6-8 of the image's 9-element direction tuple.
         - relies on a module-level `numpydir` that must be defined before calling.
    '''

    ### THIS IS NOT USED TO SPEED UP TRAINING ###
    ### (WAS INITIALLY USED IN 2D and 3D MODELS FROM SCRATCH)

    maindf = pd.DataFrame({'ID': pd.Series(dtype='string'),
                   'train/val': pd.Series(dtype='string'),
                   'subtype': pd.Series(dtype='int'),
                   'CT width': pd.Series(dtype='int'),
                   'CT height': pd.Series(dtype='int'),
                   'CT depth': pd.Series(dtype='int'),
                   'panc width': pd.Series(dtype='int'),
                   'panc height': pd.Series(dtype='int'),
                   'panc depth': pd.Series(dtype='int'),
                   'lesion width': pd.Series(dtype='int'),
                   'lesion height': pd.Series(dtype='int'),
                   'lesion depth': pd.Series(dtype='int'),
                   'panc xmin': pd.Series(dtype='int'),
                   'panc xmax': pd.Series(dtype='int'),
                   'panc ymin': pd.Series(dtype='int'),
                   'panc ymax': pd.Series(dtype='int'),
                   'panc zmin': pd.Series(dtype='int'),
                   'panc zmax': pd.Series(dtype='int'),
                   'CT spacing x': pd.Series(dtype='float'),
                   'CT spacing y': pd.Series(dtype='float'),
                   'CT spacing z': pd.Series(dtype='float'),
                   'CT direction x': pd.Series(dtype='float'),
                   'CT direction y': pd.Series(dtype='float'),
                   'CT direction z': pd.Series(dtype='float'),
                   'image path': pd.Series(dtype='string'),
                   'mask path': pd.Series(dtype='string')
                  })

    for i, imagepath in enumerate(imageniftis):
        maskpath = maskniftis[i]

        ## gets the subtype, ID for the current image
        # (format of the file name is: /kaggle/input/pancreas/train/subtype0/quiz_0_041_0000.nii/PANORAMA_101960_00001_0000.nii)
        # Split on both '/' and '\' and keep the last component that starts with
        # 'quiz_', so parent folders containing 'quiz' or '_' cannot be picked up.
        components = imagepath.replace('\\', '/').split('/')
        subfolder = [part for part in components if part.startswith('quiz_')][-1]

        subtype = subfolder.split('_')[1]
        ID = int(subfolder.split('_')[2])

        ## gets the dimensions of CT image in pixels
        img = sitk.ReadImage(imagepath)

        CTwidth = img.GetWidth()
        CTheight = img.GetHeight()
        CTdepth = img.GetDepth()

        ## gets the dimensions of the pancreas and lesion segmentations
        mask = sitk.ReadImage(maskpath)
        mask_vol = sitk.GetArrayFromImage(mask)    # indexed [z, y, x]

        # bbox_3D returns row (axis 1 = y) bounds first, then column (axis 2 = x) bounds
        [ymin1, ymax1, xmin1, xmax1, zmin1, zmax1] = bbox_3D(np.int64(mask_vol) == 1)   # 1 = pancreas
        [ymin2, ymax2, xmin2, xmax2, zmin2, zmax2] = bbox_3D(np.int64(mask_vol) == 2)   # 2 = lesion

        width1 = xmax1 - xmin1
        height1 = ymax1 - ymin1
        depth1 = zmax1 - zmin1

        width2 = xmax2 - xmin2
        height2 = ymax2 - ymin2
        depth2 = zmax2 - zmin2

        ## gets the spacing (mm) and 'direction' entries of the image
        spacing = img.GetSpacing()
        direction = img.GetDirection()
        # three entries per group (the previous [0:2], [3:5], [6:8] slices dropped the third)
        directionx = str(direction[0:3])
        directiony = str(direction[3:6])
        directionz = str(direction[6:9])

        ## eventual numpy file names
        image_filename =  os.path.join(numpydir, str(train_or_val) + '_' + "{:03d}".format(ID) + '_image.npy')
        mask_filename = os.path.join(numpydir, str(train_or_val) + '_' + "{:03d}".format(ID) + '_mask.npy')

        # one-row frame for this study, appended to the running table
        df = pd.DataFrame({'train/val':train_or_val, 'ID':ID, 'subtype':subtype, 'CT width':CTwidth, 'CT height':CTheight, 'CT depth':CTdepth,
                           'panc width':width1, 'panc height':height1, 'panc depth':depth1,
                           'lesion width':width2, 'lesion height':height2, 'lesion depth':depth2,
                           'panc xmin':xmin1, 'panc xmax':xmax1, 'panc ymin': ymin1, 'panc ymax':ymax1, 'panc zmin': zmin1, 'panc zmax':zmax1,
                           'CT spacing x':spacing[0], 'CT spacing y':spacing[1], 'CT spacing z':spacing[2],
                           'CT direction x':directionx, 'CT direction y':directiony, 'CT direction z':directionz,
                           'image path': imagepath, 'mask path': maskpath, 'image npy': image_filename, 'mask npy':mask_filename}, index = [0])

        maindf = pd.concat([maindf, df])

    return maindf

def getImageFileDataFrame(imageniftis_gz, maskniftis_gz, train_or_val):
    '''Creates a dataframe with the following:
        study ID // train/val // subtype // image gz path // mask gz path

       Input is a list of image nifti file paths, mask nifti file paths (same order), and {'train' or 'val'}

       Subtype and ID are parsed from the path component that starts with
       'quiz_' (e.g. 'quiz_0_041_0000.nii' -> subtype '0', ID '041'); both are
       kept as strings. Raises IndexError if no such component exists.
    '''

    maindf = pd.DataFrame({'ID': pd.Series(dtype='string'),
                           'train/val': pd.Series(dtype='string'),
                           'subtype': pd.Series(dtype='string'),
                           'image gz path': pd.Series(dtype='string'),
                           'mask gz path': pd.Series(dtype='string'),
                          })

    for i, imagepath_gz in enumerate(imageniftis_gz):
        maskpath_gz = maskniftis_gz[i]

        ## gets the subtype, ID for the current image
        # (format of the file name is: /kaggle/input/pancreas/train/subtype0/quiz_0_041_0000.nii/PANORAMA_101960_00001_0000.nii)
        # Split on both '/' and '\' and keep the last component that starts with
        # 'quiz_', so parent folders containing 'quiz' or '_' cannot be picked up.
        components = imagepath_gz.replace('\\', '/').split('/')
        subfolder = [part for part in components if part.startswith('quiz_')][-1]

        fields = subfolder.split('_')
        subtype = fields[1]
        ID = fields[2]

        # one-row frame for this study, appended to the running table
        df = pd.DataFrame({'train/val':train_or_val, 'ID':ID, 'subtype':subtype,
                           'image gz path': imagepath_gz, 'mask gz path': maskpath_gz
                           }, index = [0])

        maindf = pd.concat([maindf, df])

    return maindf

In [6]:
# Walks every split ('train', 'val') and every subtype folder (0-2), collects the
# per-folder metadata tables and stacks them into one dataframe

maindf = None

for train_or_val in ['train', 'val']:
    for subtype in range(3):
        images_gz, masks_gz = getImageAndMaskFilePaths(train_or_val, subtype)
        subtypedf = getImageFileDataFrame(images_gz, masks_gz, train_or_val)

        # first table starts the stack, later ones are appended below it
        maindf = subtypedf if maindf is None else pd.concat([maindf, subtypedf])


# saving the metadata to csv file (row index is not written)
maindf.to_csv(csvpath, index = False)

print('done')

done


In [14]:
# read the metadata table back (every column as string) and split it by cohort
maindf = pd.read_csv(csvpath, dtype='string')

traindf = maindf.loc[maindf['train/val'] == 'train']
valdf = maindf.loc[maindf['train/val'] == 'val']


In [15]:
# quick visual check of the training rows selected above
print (traindf)

      ID train/val subtype                                      image gz path  \
0    041     train       0  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
1    060     train       0  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
2    066     train       0  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
3    070     train       0  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
4    077     train       0  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
..   ...       ...     ...                                                ...   
247  497     train       2  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
248  501     train       2  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
249  508     train       2  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
250  527     train       2  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   
251  535     train       2  C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\tr...   

                           

## Copying data to folders based on nnUnet data organization

### nnUnet data folder structure
nnUNet_raw/ <br>
1. Dataset001_BrainTumour <br>
    - dataset.json <br>
    - imagesTr <br>
    - imagesTs  # optional  <br>
    - labelsTr  <br>
<br>
- imagesTr contains the images belonging to the training cases. nnU-Net will perform pipeline configuration, training with cross-validation, as well as finding postprocessing and the best ensemble using this data.
- imagesTs (optional) contains the images that belong to the test cases. nnU-Net does not use them! This could just be a convenient location for you to store these images. Remnant of the Medical Segmentation Decathlon folder structure.
- labelsTr contains the images with the ground truth segmentation maps for the training cases.
- dataset.json contains metadata of the dataset.

1. nnUNet_raw/Dataset002_Heart/
    1. ├── dataset.json
    2. ├── imagesTr
        1. ├── la_003_0000.nii.gz
        2. ├── la_004_0000.nii.gz
        3. ├── ...
    3. ├── imagesTs
        1. ├── la_001_0000.nii.gz
        2. ├── la_002_0000.nii.gz
        3. ├── ...
    4. └── labelsTr
        1. ├── la_003.nii.gz
        2. ├── la_004.nii.gz
        3. ├── ...

In [16]:
def copyRawFiles(df, images_folder, labels_folder):
    """Write every image/mask pair listed in *df* into nnU-Net style folders.

    *df* needs the columns 'ID', 'image gz path' and 'mask gz path'.
    Images are written unchanged as quiz_[ID]_0000.nii.gz into *images_folder*;
    masks are read as 16-bit integers, resampled onto the image grid with
    nearest-neighbor interpolation and written as quiz_[ID].nii.gz into
    *labels_folder*. Prints a progress count whenever (row label + 1) is a
    multiple of 10.
    """
    for label, record in df.iterrows():
        if (label + 1) % 10 == 0:
            print(label + 1)

        case_id = record['ID']
        src_image = record['image gz path']
        src_mask = record['mask gz path']

        ct = sitk.ReadImage(src_image)
        seg = sitk.ReadImage(src_mask, sitk.sitkInt16)    # integer labels: get rid of rounding error for segmentations
        seg_on_ct_grid = projectImage(ct, seg, interpolate='NN')   # ensures that the voxel spacing is equal

        # nnU-Net naming: the image carries the channel suffix _0000, the label does not
        dst_image = os.path.join(images_folder, 'quiz_' + case_id + '_0000.nii.gz')
        dst_mask = os.path.join(labels_folder, 'quiz_' + case_id + '.nii.gz')

        sitk.WriteImage(ct, dst_image)
        sitk.WriteImage(seg_on_ct_grid, dst_mask)

def copyRawFilesWithLesionClass(df, images_folder, labels_folder):
    """Write every image/mask pair listed in *df* into nnU-Net style folders,
    keeping the source file names (which carry the lesion subtype).

    *df* needs the columns 'image gz path' and 'mask gz path'.
    The image is written unchanged under its original file name into
    *images_folder*. The mask is read as 16-bit integers, resampled onto the
    image grid with nearest-neighbor interpolation and written into
    *labels_folder* under its original file name with '.gz' appended.
    Prints a progress count whenever (row label + 1) is a multiple of 10.
    """
    for i, row in df.iterrows():
        if i % 10 == 9:
            print(i+1)

        old_image_path = row['image gz path']
        old_mask_path = row['mask gz path']

        image_img = sitk.ReadImage(old_image_path)
        mask_img = sitk.ReadImage(old_mask_path, sitk.sitkInt16)    # get rid of rounding error for segmentations
        new_mask_img = projectImage(image_img, mask_img, interpolate='NN')   # ensures that the voxel spacing is equal

        ### file name scheme: the source names are kept
        ### new_image_name = <source image file name>
        ### new_mask_name = <source mask file name> + '.gz'
        # take the last path component whichever separator ('\' or '/') was used,
        # instead of splitting on backslashes only
        new_image_name = old_image_path.replace('\\', '/').split('/')[-1]
        new_mask_name = old_mask_path.replace('\\', '/').split('/')[-1] + '.gz'

        new_image_path = os.path.join(images_folder, new_image_name)
        new_mask_path = os.path.join(labels_folder, new_mask_name)

        sitk.WriteImage(image_img, new_image_path)
        sitk.WriteImage(new_mask_img, new_mask_path)

In [14]:
# initial copy -- got rid of lesion in the file name
# (builds the Dataset001_Pancreas folder tree, then copies train -> *Tr and val -> *Ts)

raw_folder = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw'
data_folder = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw\Dataset001_Pancreas'

imagesTr_folder, labelsTr_folder, imagesTs_folder, labelsTs_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ('imagesTr', 'labelsTr', 'imagesTs', 'labelsTs')
)

# create whatever part of the tree is still missing
for folder in [raw_folder, data_folder, imagesTr_folder, imagesTs_folder, labelsTr_folder, labelsTs_folder]:
    if os.path.exists(folder):
        continue
    os.makedirs(folder)

print('copying training files...')
copyRawFiles(traindf, imagesTr_folder, labelsTr_folder)
print(' -- done')

print('copying val files')
copyRawFiles(valdf, imagesTs_folder, labelsTs_folder)
print(' -- done')

copying training files...
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
 -- done
copying val files


NameError: name 'images_Ts_folder' is not defined

In [17]:
# 2nd copy with classification -- kept lesion labels
# (builds the Dataset002_PancreasLesion folder tree, then copies train -> *Tr and val -> *Ts)

raw_folder = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw'
data_folder = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw\Dataset002_PancreasLesion'

imagesTr_folder, labelsTr_folder, imagesTs_folder, labelsTs_folder = (
    os.path.join(data_folder, subfolder)
    for subfolder in ('imagesTr', 'labelsTr', 'imagesTs', 'labelsTs')
)

# create whatever part of the tree is still missing
for folder in [raw_folder, data_folder, imagesTr_folder, imagesTs_folder, labelsTr_folder, labelsTs_folder]:
    if os.path.exists(folder):
        continue
    os.makedirs(folder)

print('copying training files...')
copyRawFilesWithLesionClass(traindf, imagesTr_folder, labelsTr_folder)
print('  -- done')

print('copying val files')
copyRawFilesWithLesionClass(valdf, imagesTs_folder, labelsTs_folder)
print(' -- done')


copying training files...
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
  -- done
copying val files
260
270
280
 -- done


### dataset.json

In [18]:
# Same locations as in the copy cell for Dataset002_PancreasLesion; redefined so
# the json cells below can run without re-running the copy.
raw_folder = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw'
data_folder = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUnet_raw\Dataset002_PancreasLesion'


In [19]:
# nnU-Net dataset description, written next to the imagesTr / labelsTr folders
dataset_json_path = os.path.join(data_folder, 'dataset.json')

dictionary = {
    # formerly modalities
    "channel_names": {"0": "CT"},
    # THIS IS DIFFERENT NOW!
    "labels": {"background": 0, "pancreas": 1, "lesion": 2},
    "numTraining": 252,
    "file_ending": ".nii.gz",
    # optional! If not provided nnU-Net will automatically determine the ReaderWriter
    "overwrite_image_reader_writer": "SimpleITKIO",
}

# serialise straight into the file (4-space indentation)
with open(dataset_json_path, "w") as outfile:
    json.dump(dictionary, outfile, indent=4)

### lesion_class.json

In [10]:

## Builds the {study ID: lesion class label} lookup and stores it as lesion_class.json

# load main dataframe with all of information (all columns read as strings)
maindf = pd.read_csv(csvpath, dtype='string')

# pair each ID with the subtype from the same row
lesion_dict = dict(zip(maindf['ID'], maindf['subtype']))

lesion_json_path = os.path.join(data_folder, 'lesion_class.json')

with open(lesion_json_path, "w") as outfile:
    json.dump(lesion_dict, outfile, indent=4)


In [7]:
# show the ID -> subtype mapping that was written to lesion_class.json
print(lesion_dict)

{'041': '0', '060': '0', '066': '0', '070': '0', '077': '0', '117': '0', '126': '0', '139': '0', '145': '0', '150': '0', '159': '0', '160': '0', '163': '0', '199': '0', '219': '0', '220': '0', '222': '0', '224': '0', '235': '0', '285': '0', '288': '0', '292': '0', '298': '0', '302': '0', '306': '0', '310': '0', '313': '0', '318': '0', '323': '0', '324': '0', '327': '0', '329': '0', '360': '0', '370': '0', '374': '0', '386': '0', '390': '0', '392': '0', '399': '0', '400': '0', '401': '0', '430': '0', '441': '0', '443': '0', '446': '0', '448': '0', '449': '0', '457': '0', '458': '0', '459': '0', '475': '0', '498': '0', '502': '0', '509': '0', '511': '0', '523': '0', '526': '0', '533': '0', '539': '0', '540': '0', '542': '0', '544': '0', '003': '1', '006': '1', '007': '1', '010': '1', '016': '1', '018': '1', '025': '1', '027': '1', '029': '1', '030': '1', '031': '1', '034': '1', '036': '1', '059': '1', '072': '1', '075': '1', '079': '1', '086': '1', '087': '1', '096': '1', '102': '1', '11

# Pre-processing nnUnet

## Pre-processing

!nnUNetv2_plan_and_preprocess -d DATASET_ID --verify_dataset_integrity
<br>
<br>
!nnUNet_raw="/kaggle/working/nnUnet_raw" nnUNet_preprocessed="/kaggle/working/nnUNet_preprocessed" nnUNet_results="/kaggle/working/nnUNet_results" nnUNetv2_plan_and_preprocess -d 001 --verify_dataset_integrity  -pl nnUNetPlannerResEncM
<br>
(for ResNet encoder)

In [20]:
!nnUNetv2_plan_and_preprocess -d 002 --verify_dataset_integrity
#!nnUNet_raw="/kaggle/working/nnUnet_raw" nnUNet_preprocessed="/kaggle/working/nnUNet_preprocessed" nnUNet_results="/kaggle/working/nnUNet_results" nnUNetv2_plan_and_preprocess -d 001 --verify_dataset_integrity

Fingerprint extraction...
Dataset002_PancreasLesion
Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> reader/writer

####################
verify_dataset_integrity Done. 
If you didn't see any error messages then your dataset is most likely OK!
####################

Using <class 'nnunetv2.imageio.simpleitk_reader_writer.SimpleITKIO'> reader/writer
Experiment planning...

############################
INFO: You are using the old nnU-Net default planner. We have updated our recommendations. Please consider using those instead! Read more here: https://github.com/MIC-DKFZ/nnUNet/blob/master/documentation/resenc_presets.md
############################

Dropping 3d_lowres config because the image size difference to 3d_fullres is too small. 3d_fullres: [ 59. 118. 181.], 3d_lowres: [59, 118, 181]
2D U-Net configuration:
{'data_identifier': 'nnUNetPlans_2d', 'preprocessor_name': 'DefaultPreprocessor', 'batch_size': 132, 'patch_size': (128, 192), 'median_image_size_in_voxels': a


  0%|          | 0/252 [00:00<?, ?it/s]
  0%|          | 1/252 [00:02<10:05,  2.41s/it]
  1%|1         | 3/252 [00:02<02:45,  1.50it/s]
  4%|4         | 11/252 [00:02<00:33,  7.29it/s]
  6%|6         | 16/252 [00:02<00:20, 11.33it/s]
  8%|8         | 21/252 [00:02<00:14, 15.87it/s]
 10%|#         | 26/252 [00:03<00:12, 18.08it/s]
 12%|#1        | 30/252 [00:03<00:10, 21.29it/s]
 13%|#3        | 34/252 [00:03<00:10, 20.83it/s]
 15%|#5        | 38/252 [00:03<00:08, 24.17it/s]
 17%|#7        | 43/252 [00:03<00:07, 29.33it/s]
 19%|#8        | 47/252 [00:03<00:06, 31.61it/s]
 20%|##        | 51/252 [00:03<00:05, 33.58it/s]
 22%|##1       | 55/252 [00:03<00:05, 35.14it/s]
 24%|##4       | 61/252 [00:03<00:04, 41.55it/s]
 26%|##6       | 66/252 [00:04<00:04, 43.81it/s]
 28%|##8       | 71/252 [00:04<00:03, 45.39it/s]
 30%|###       | 76/252 [00:04<00:03, 46.44it/s]
 32%|###2      | 81/252 [00:04<00:03, 47.23it/s]
 34%|###4      | 86/252 [00:04<00:03, 47.84it/s]
 36%|###6      | 91/252 [00:04

## Training

can use ' -p nnUNetResEncUNetMPlans ' (for ResNet encoder)

In [None]:
'''
usage: nnUNetv2_train [-h] [-tr TR] [-p P]
                      [-pretrained_weights PRETRAINED_WEIGHTS]
                      [-num_gpus NUM_GPUS] [--use_compressed] [--npz] [--c]
                      [--val] [--val_best] [--disable_checkpointing]
                      [-device DEVICE]
                      dataset_name_or_id configuration fold

positional arguments:
  dataset_name_or_id    Dataset name or ID to train with
  configuration         Configuration that should be trained
  fold                  Fold of the 5-fold cross-validation. Should be an int
                        between 0 and 4.

options:
  -h, --help            show this help message and exit
  -tr TR                [OPTIONAL] Use this flag to specify a custom trainer.
                        Default: nnUNetTrainer
  -p P                  [OPTIONAL] Use this flag to specify a custom plans
                        identifier. Default: nnUNetPlans
  -pretrained_weights PRETRAINED_WEIGHTS
                        [OPTIONAL] path to nnU-Net checkpoint file to be used
                        as pretrained model. Will only be used when actually
                        training. Beta. Use with caution.
  -num_gpus NUM_GPUS    Specify the number of GPUs to use for training
  --use_compressed      [OPTIONAL] If you set this flag the training cases
                        will not be decompressed. Reading compressed data is
                        much more CPU and (potentially) RAM intensive and
                        should only be used if you know what you are doing
  --npz                 [OPTIONAL] Save softmax predictions from final
                        validation as npz files (in addition to predicted
                        segmentations). Needed for finding the best ensemble.
  --c                   [OPTIONAL] Continue training from latest checkpoint
  --val                 [OPTIONAL] Set this flag to only run the validation.
                        Requires training to have finished.
  --val_best            [OPTIONAL] If set, the validation will be performed
                        with the checkpoint_best instead of checkpoint_final.
                        NOT COMPATIBLE with --disable_checkpointing! WARNING:
                        This will use the same 'validation' folder as the
                        regular validation with no way of distinguishing the
                        two!
  --disable_checkpointing
                        [OPTIONAL] Set this flag to disable checkpointing.
                        Ideal for testing things out and you dont want to
                        flood your hard drive with checkpoints.
  -device DEVICE        Use this to set the device the training should run
                        with. Available options are 'cuda' (GPU), 'cpu' (CPU)
                        and 'mps' (Apple M1/M2). Do NOT use this to set which
                        GPU ID! Use CUDA_VISIBLE_DEVICES=X nnUNetv2_train
                        [...] instead!
'''

## Modifying 'splits_final.json' to include the lesion labels


In [64]:
# splits are read from Dataset001 and (after renaming below) written for Dataset002
old_splits_path = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUNet_preprocessed\Dataset001_Pancreas\splits_final.json'
new_splits_path = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUNet_preprocessed\Dataset002_PancreasLesion\splits_final.json'

with open(old_splits_path) as f:
    splits_json_dict = json.load(f)   # printed below: a list with one {'train': [...], ...} dict per fold



In [71]:
# inspect the loaded cross-validation splits
print(splits_json_dict)

[{'train': ['quiz_2_002', 'quiz_1_003', 'quiz_2_004', 'quiz_2_005', 'quiz_1_006', 'quiz_1_007', 'quiz_1_010', 'quiz_2_015', 'quiz_1_016', 'quiz_1_025', 'quiz_2_026', 'quiz_1_027', 'quiz_2_028', 'quiz_1_029', 'quiz_1_031', 'quiz_1_036', 'quiz_0_041', 'quiz_2_044', 'quiz_2_051', 'quiz_2_054', 'quiz_2_055', 'quiz_2_058', 'quiz_0_060', 'quiz_0_066', 'quiz_2_071', 'quiz_1_072', 'quiz_0_077', 'quiz_1_079', 'quiz_1_086', 'quiz_1_087', 'quiz_1_096', 'quiz_2_099', 'quiz_2_109', 'quiz_2_111', 'quiz_2_112', 'quiz_1_115', 'quiz_1_116', 'quiz_0_117', 'quiz_1_119', 'quiz_1_120', 'quiz_1_122', 'quiz_1_124', 'quiz_1_128', 'quiz_1_131', 'quiz_1_133', 'quiz_2_134', 'quiz_0_139', 'quiz_1_142', 'quiz_0_145', 'quiz_1_147', 'quiz_0_150', 'quiz_0_159', 'quiz_0_160', 'quiz_0_163', 'quiz_1_177', 'quiz_2_178', 'quiz_1_180', 'quiz_1_181', 'quiz_1_182', 'quiz_1_185', 'quiz_1_188', 'quiz_1_190', 'quiz_1_193', 'quiz_1_196', 'quiz_2_198', 'quiz_0_199', 'quiz_2_200', 'quiz_2_202', 'quiz_1_204', 'quiz_1_205', 'quiz_1_

In [69]:
def get_subtype_using_ID(df, ID):
    """Return the lesion subtype recorded for a single case.

    Args:
        df: DataFrame with an 'ID' column and a 'subtype' column.
        ID: Case identifier to look up; compared to the 'ID' column with ``==``.

    Returns:
        The 'subtype' value of the one row whose 'ID' equals ``ID``.

    Raises:
        ValueError: If no row, or more than one row, matches ``ID``.
    """
    # Select rows and column in a single .loc call (no chained indexing)
    matches = df.loc[df['ID'] == ID, 'subtype']

    # .item() would also raise ValueError here, but without saying which ID failed
    if len(matches) != 1:
        raise ValueError(
            f"expected exactly one row with ID {ID!r}, found {len(matches)}"
        )

    return matches.item()

# Rewrite every case ID in every fold so that its middle field carries the
# lesion subtype looked up in `maindf`, i.e. 'quiz_<subtype>_<number>'.
for fold in splits_json_dict:
    for case_ids in fold.values():
        for i, quiz_ID in enumerate(case_ids):
            # the case number is the last '_'-separated field of the old ID
            ID = quiz_ID.split('_')[-1]

            ## gets the new ID with the lesion
            subtype = get_subtype_using_ID(maindf, ID)

            # replace the entry in place; the list length never changes
            case_ids[i] = f'quiz_{subtype!s}_{ID}'
        

In [72]:
# Serialise the updated splits and save them as the splits file of the lesion dataset.
new_splits_json_object = json.dumps(splits_json_dict, indent=4)

# The encoding is given explicitly so the written file does not depend on the
# platform's default text encoding.
with open(new_splits_path, "w", encoding="utf-8") as outfile:
    outfile.write(new_splits_json_object)

0


NameError: name 'nnUNetDatasetClassify' is not defined

In [87]:
# Show the mapping held by the dataset object: case ID -> paths of its data (.npz) and properties (.pkl) files
print(ds.dataset)

{'quiz_0_041': {'data_file': 'C:\\Users\\Leo\\Documents\\UHN-MedImg3D-ML-quiz\\nnUNet_preprocessed\\Dataset002_PancreasLesion\\nnUNetPlans_3d_fullres\\quiz_0_041.npz', 'properties_file': 'C:\\Users\\Leo\\Documents\\UHN-MedImg3D-ML-quiz\\nnUNet_preprocessed\\Dataset002_PancreasLesion\\nnUNetPlans_3d_fullres\\quiz_0_041.pkl'}, 'quiz_0_060': {'data_file': 'C:\\Users\\Leo\\Documents\\UHN-MedImg3D-ML-quiz\\nnUNet_preprocessed\\Dataset002_PancreasLesion\\nnUNetPlans_3d_fullres\\quiz_0_060.npz', 'properties_file': 'C:\\Users\\Leo\\Documents\\UHN-MedImg3D-ML-quiz\\nnUNet_preprocessed\\Dataset002_PancreasLesion\\nnUNetPlans_3d_fullres\\quiz_0_060.pkl'}, 'quiz_0_066': {'data_file': 'C:\\Users\\Leo\\Documents\\UHN-MedImg3D-ML-quiz\\nnUNet_preprocessed\\Dataset002_PancreasLesion\\nnUNetPlans_3d_fullres\\quiz_0_066.npz', 'properties_file': 'C:\\Users\\Leo\\Documents\\UHN-MedImg3D-ML-quiz\\nnUNet_preprocessed\\Dataset002_PancreasLesion\\nnUNetPlans_3d_fullres\\quiz_0_066.pkl'}, 'quiz_0_070': {'data_

In [17]:
from nnunetv2.training.dataloading.data_loader_3d_classify import nnUNetDataLoader3Dclassify
from nnunetv2.utilities.plans_handling.plans_handler import PlansManager, ConfigurationManager
from nnunetv2.training.dataloading.nnunet_dataset_classify import nnUNetDatasetClassify
from nnunetv2.utilities.label_handling.label_handling import LabelManager
from batchgenerators.utilities.file_and_folder_operations import load_json, join
from nnunetv2.training.dataloading.base_data_loader_classify import nnUNetDataLoaderBaseClassify
from nnunetv2.training.dataloading.nnunet_dataset_classify import nnUNetDatasetClassify

import numpy as np

In [43]:


# Root folder of the preprocessed pancreas-lesion dataset. Every other path in
# this cell is derived from it, so the location is written only once.
nnUNet_preprocessed = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUNet_preprocessed\Dataset002_PancreasLesion'

# Folder holding the preprocessed 3d_fullres cases
folder = join(nnUNet_preprocessed, 'nnUNetPlans_3d_fullres')
ds = nnUNetDatasetClassify(folder, None, 0)

# Dataset description and experiment plans, both stored in the dataset root
dataset_json = load_json(join(nnUNet_preprocessed, 'dataset.json'))
plans = load_json(join(nnUNet_preprocessed, 'nnUNetPlans.json'))

plans_manager = PlansManager(plans)
label_manager = plans_manager.get_label_manager(dataset_json)

# The same size is used for patch_size and final_patch_size; a copy is passed
# for the second argument so the two arguments stay independent list objects.
patch_size = [64, 128, 192]

dl = nnUNetDataLoaderBaseClassify(data=ds,
                                  batch_size=3,
                                  patch_size=patch_size,
                                  final_patch_size=list(patch_size),
                                  label_manager=label_manager
                                  )

In [3]:
# Confirm that the base loader object was constructed
print(dl)

<nnunetv2.training.dataloading.base_data_loader_classify.nnUNetDataLoaderBaseClassify object at 0x000002AFAAC2AA80>


In [4]:


# Build the 3D loader on the dataset `ds`, with a batch of 3 cases and
# identical patch_size / final_patch_size.
dl3D = nnUNetDataLoader3Dclassify(data=ds,
                                  batch_size=3,
                                  patch_size=[64, 128, 192],
                                  final_patch_size= [64, 128, 192],
                                  label_manager=label_manager
                                  )

In [44]:
# Walk through the first steps of batch generation by hand, for a single case,
# so the intermediate values can be inspected in the following cells.
selected_keys = dl.get_indices()
# preallocate memory for data and seg
data_all = np.zeros(dl.data_shape, dtype=np.float32)
seg_all = np.zeros(dl.seg_shape, dtype=np.int16)
#lesion_all = np.zeros((len(selected_keys), 3))          # probability classes, one hot
lesion_all = np.zeros(len(selected_keys))

case_properties = []

for j, i in enumerate(selected_keys):     # 'i' is the key (quiz_2_413) and 'j' is the index 0-2
    # oversampling foreground will improve stability of model training, especially if many patches are empty
    # (Lung for example)
    #force_fg = self.get_do_oversample(j)
    force_fg = True

    data, seg, properties, lesion = dl._data.load_case(i)
    case_properties.append(properties)

    # If we are doing the cascade then the segmentation from the previous stage will already have been loaded by
    # self._data.load_case(i) (see nnUNetDataset.load_case)
    shape = data.shape[1:]
    dim = len(shape)
    bbox_lbs, bbox_ubs = dl.get_bbox(shape, force_fg, properties['class_locations'])

    # Only the first selected case is inspected. Stopping here keeps `i` and `j`
    # pointing at the case that was actually loaded; letting the loop run on
    # would leave them at the last key while `data`, `properties` and the
    # bounding box still belong to the first one.
    break

In [47]:
# Show the current case key and the bounding box computed in the previous cell.
# NOTE(review): `i` is whatever the loop in the previous cell left it as; confirm
# that it is the same case the bounding box was computed for.
print(i)
print(bbox_lbs, bbox_ubs)

# Location tables stored under labels 1 and 2 of the case properties
# (presumably 1 = pancreas and 2 = lesion, going by the variable names; TODO confirm)
pancreas_locs = properties['class_locations'][1]
lesion_locs = properties['class_locations'][2]

quiz_1_034
[21, -8, -12] [85, 120, 180]


In [49]:
# First entry of the label-1 location table
print(pancreas_locs[0])

[ 0 37 51 13]


In [48]:
# Compare the shapes of the two location tables with the image and segmentation arrays
print(pancreas_locs.shape)
print(lesion_locs.shape)
print(data.shape)
print(seg.shape)

(10000, 4)
(10000, 4)
(1, 75, 113, 169)
(1, 75, 113, 169)


In [5]:
# Ask the 3D loader for one training batch.
# NOTE(review): the name `dict` shadows the Python builtin for the rest of the
# session; a different name (e.g. `batch`) would be safer, but the following
# cells read this variable, so they would have to be renamed together.
dict = dl3D.generate_train_batch()

In [6]:
# List the entries of the generated batch. Only the keys are printed, so the
# mapping is iterated directly instead of unpacking unused values from .items().
for key in dict:
    print(key)

data
target
keys
lesion_class


In [10]:
# Pull the four entries of the generated batch out into their own variables
data, target = dict['data'], dict['target']
keys, lesion_class = dict['keys'], dict['lesion_class']

In [8]:
# Case IDs in the batch and the lesion class value returned for each of them
print(keys)
print(lesion_class)

['quiz_1_516' 'quiz_1_025' 'quiz_0_313']
[1. 1. 0.]


In [14]:
# Shapes of the batched image and target arrays
print(data.shape)
print(target.shape)

(3, 1, 64, 128, 192)
(3, 1, 64, 128, 192)
1.0


In [12]:
# Take index 0 along the first two axes of the batch arrays, leaving one volume each
img, seg = data[0, 0, :], target[0, 0, :]

In [3]:
# Properties file of one preprocessed case
pickle_path = r'C:\Users\Leo\Documents\UHN-MedImg3D-ML-quiz\nnUNet_preprocessed\Dataset002_PancreasLesion\nnUNetPlans_3d_fullres\quiz_0_041.pkl'

# Load the pickled object with pandas.
# NOTE(review): unpickling runs code from the file, so only do this with files
# you produced yourself. The variable name `pickle` would also hide the stdlib
# module of the same name if that were imported in this session.
pickle = pd.read_pickle(pickle_path)

In [4]:
# Show the full content of the loaded properties object
print(pickle)

{'sitk_stuff': {'spacing': (0.7049999833106995, 0.7049999833106995, 0.801025390625), 'origin': (-164.0625, -180.4687042236328, 1647.5), 'direction': (1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)}, 'spacing': [0.801025390625, 0.7049999833106995, 0.7049999833106995], 'shape_before_cropping': (106, 116, 161), 'bbox_used_for_cropping': [[0, 106], [0, 116], [0, 161]], 'shape_after_cropping_and_before_resampling': (106, 116, 161), 'class_locations': {1: array([[ 0, 33, 45, 55],
       [ 0, 29, 25, 22],
       [ 0, 24, 25, 38],
       ...,
       [ 0, 27, 55, 34],
       [ 0,  8, 38, 70],
       [ 0, 35, 44, 28]], dtype=int64), 2: array([[ 0, 30, 39, 51],
       [ 0, 34, 43, 31],
       [ 0, 34, 43, 28],
       ...,
       [ 0, 30, 41, 37],
       [ 0, 29, 36, 27],
       [ 0, 28, 53, 38]], dtype=int64)}}


In [11]:
# Keep only the 'class_locations' entry of the properties (label -> array of locations)
class_locations = pickle['class_locations']


In [12]:
# Show the location arrays stored for each label
print(class_locations)


{1: array([[ 0, 33, 45, 55],
       [ 0, 29, 25, 22],
       [ 0, 24, 25, 38],
       ...,
       [ 0, 27, 55, 34],
       [ 0,  8, 38, 70],
       [ 0, 35, 44, 28]], dtype=int64), 2: array([[ 0, 30, 39, 51],
       [ 0, 34, 43, 31],
       [ 0, 34, 43, 28],
       ...,
       [ 0, 30, 41, 37],
       [ 0, 29, 36, 27],
       [ 0, 28, 53, 38]], dtype=int64)}


In [13]:
# Labels that have at least one stored location. len() is used rather than
# truthiness because the values are arrays.
eligible_classes_or_regions = [
    label for label, locations in class_locations.items() if len(locations) > 0
]

print(eligible_classes_or_regions)

[1, 2]
