1. Dataset and DataLoader

2. Visualize one of outputs from data loader (greyscale)

3. Unet model

4. show training loop (but no train)

5. load saved model (best one from our trained models)

6. show test loop and run it

7. show the result and visualization (greyscale)

In [1]:
import os
import numpy as np
import nibabel
import sys
from glob import glob
import argparse
import json
import random
import matplotlib.pyplot as plt
import PIL

import torch
from torch.utils.data import Dataset
from torchvision import transforms
import torchvision.transforms.functional as TF

# 1. Convert NIfTI to Numpy

First, we need to convert the original data to 2D slices because our model works on 2D images. We define the *convert_to_npy()* function to convert the 3D NIfTI image files into 2D slices saved as NumPy files. This function also saves the image number, the slice number, the existence of cancer, and the subset of every slice in JSON format. The *convert_to_npy()* function is at dataset.py. The JSON file is used to load a dataset.

Second, we need to split the slices into a train set and a test set. The *create_data_subsets()* function performs that task. You can choose whether to split the data by examples (patients) or by slices, and you can set the split ratio between the two sets. The *create_data_subsets()* function is at dataset.py.

In [4]:
def convert_to_npy(data_path, path):
    """Convert 3D NIfTI volumes into 2D axial ``.npy`` slices and index them.

    Reads every volume in ``<data_path>/imagesTr`` and its label volume in
    ``<data_path>/labelsTr`` (paired by sorted file name), cuts them along
    the third axis and writes one ``.npy`` file per slice. A JSON index
    describing every slice is written to ``<path>data_index.json``.

    Args:
        data_path: Root folder of the dataset, containing ``imagesTr`` and
            ``labelsTr``.
        path: Output prefix. It is concatenated directly with the output
            names, so it must end with a path separator (e.g. ``'./data/'``).

    Side effects:
        Creates ``<path>npy_images/`` and ``<path>npy_labels/`` and fills them
        with ``image_<img>_<slice>.npy`` / ``label_<img>_<slice>.npy`` files.
    """
    # load a path of image files and sort them
    train_data_path = os.path.join(data_path, 'imagesTr')
    label_data_path = os.path.join(data_path, 'labelsTr')
    # Skip hidden entries (names starting with '.'): they are not volumes and
    # would break both the loader and the image/label pairing below.
    # NOTE(review): presumably these are OS metadata files shipped with the
    # archive -- confirm against the actual dataset folder.
    images = sorted(f for f in os.listdir(train_data_path) if not f.startswith('.'))
    labels = sorted(f for f in os.listdir(label_data_path) if not f.startswith('.'))

    # set the locations of the converted image and label files
    image_saved_path = path + 'npy_images/'
    label_saved_path = path + 'npy_labels/'

    # create the output directories; an already existing directory is fine,
    # and missing parents (e.g. `path` itself) are created as well
    for saved_path in (image_saved_path, label_saved_path):
        try:
            os.makedirs(saved_path, exist_ok=True)
        except OSError:
            print("Creation of the directory %s failed" % saved_path)
        else:
            print("Successfully created the directory %s " % saved_path)

    data_index = {}

    for img, label in zip(images, labels):
        # The image number is made of all digits found in the file name.
        image_number = ''.join(filter(str.isdigit, img)).zfill(3)

        # Load 3D CT images (read each volume once, not once per slice)
        image_volume = nibabel.load(os.path.join(train_data_path, img)).get_fdata()
        label_volume = nibabel.load(os.path.join(label_data_path, label)).get_fdata()

        for k in range(label_volume.shape[2]):
            # axial cuts are made along the z axis (slice)
            # all values in the nifti files were checked to be integers,
            # ranging from -1024 to approx 3000
            image_2d = np.array(image_volume[:, :, k], dtype='int16')
            # only contains 1s and 0s
            label_2d = np.array(label_volume[:, :, k], dtype='uint8')
            slice_number = str(k).zfill(3)
            slice_index = image_number + '_' + slice_number

            data_index[slice_index] = {
                'image': int(image_number),
                'slice': int(slice_number),
                # Any non-zero label pixel marks the slice as containing
                # cancer; this also covers a slice labelled entirely as tumor.
                'cancer': bool(label_2d.any()),
                'subset': None
            }

            np.save(image_saved_path + 'image_{}_{}.npy'.format(image_number, slice_number), image_2d)
            np.save(label_saved_path + 'label_{}_{}.npy'.format(image_number, slice_number), label_2d)

        print(f'Saved slices of image {image_number}')

    with open(path + "data_index.json", "w") as json_file:
        json.dump(data_index, json_file)

def create_data_subsets(path, split_on, split):
    """Assign every slice of the data index to the 'train' or 'test' subset.

    Reads ``<path>data_index.json``, fills in the ``'subset'`` field of every
    entry and writes the result to ``<path>data_index_subsets.json``.

    Args:
        path: Prefix of the index files. It is concatenated directly with the
            file names, so it must end with a path separator.
        split_on: ``"examples"`` keeps all slices of one image in the same
            subset; ``"slices"`` splits individual slices while keeping the
            share of cancer slices in the test set close to the overall
            share. Any other value leaves the ``'subset'`` fields untouched.
        split: Fraction (0..1) of the images/slices assigned to the test set.
    """
    with open(path + "data_index.json") as json_file:
        data_index = json.load(json_file)

    test_slices = None

    if split_on == "examples":
        # random.sample() needs a sequence (sets are rejected on recent
        # Python versions), so draw from a sorted list of the unique images.
        unique_images = sorted({v['image'] for v in data_index.values()})
        test_length = int(len(unique_images) * split)
        test_images = set(random.sample(unique_images, test_length))
        test_slices = {k for k, v in data_index.items() if v['image'] in test_images}

    elif split_on == "slices":
        cancer_slice_index = [k for k, v in data_index.items() if v['cancer'] is True]
        non_cancer_slice_index = [k for k, v in data_index.items() if v['cancer'] is False]
        slice_length = len(cancer_slice_index) + len(non_cancer_slice_index)

        # Guard against an empty index instead of dividing by zero.
        proportion_cancer = len(cancer_slice_index) / slice_length if slice_length else 0
        test_length = int(slice_length * split)
        test_cancer_slices_length = int(test_length * proportion_cancer)
        test_non_cancer_slices_length = test_length - test_cancer_slices_length

        test_slices = {
            *random.sample(cancer_slice_index, test_cancer_slices_length),
            *random.sample(non_cancer_slice_index, test_non_cancer_slices_length),
        }

    if test_slices is not None:
        # test_slices is a set, so each membership test is O(1).
        for k, entry in data_index.items():
            entry['subset'] = 'test' if k in test_slices else 'train'

    with open(path + "data_index_subsets.json", "w") as json_file:
        json.dump(data_index, json_file)

In [3]:
# Root folder of the raw dataset; convert_to_npy() reads its 'imagesTr' and
# 'labelsTr' sub-folders.
data_path = './Task10_Colon'
# Output prefix. Both functions below concatenate it directly with file and
# folder names, so the trailing '/' is required.
path = './data/'
# Split individual slices ("slices") rather than whole images ("examples").
split_on = "slices"
# Fraction of the data assigned to the test subset.
split = 0.1

# Write the 2D .npy slices and '<path>data_index.json' ...
convert_to_npy(data_path, path)
# ... then label every slice as 'train' or 'test' in
# '<path>data_index_subsets.json'.
create_data_subsets(path, split_on, split)

# 2. Dataset and Data Loader

**1) Dataset**

We create a custom dataset, ColonDataset. It uses the JSON file created by the *create_data_subsets()* function. The *ColonDataset()* class is at **data_loading.py**.  

We can set a sampling method; otherwise, the original dataset is used. The three sampling methods are undersample, oversample and only_tumor. The sampling methods and the unsampled default are implemented by *get_undersample_files()*, *get_oversample_files()*, *get_only_tumor_files()*, and *get_original_dataset()* at **data_loading.py**. 

When the data set is for train, it transforms the data set by resize, random crop, horizontal flipping, vertical flipping, and normalization. Otherwise, it transforms the data set by resize and normalization.

After creating ColonDataset; we split the dataset into train and valid set. This logic is implemented in the *load_datasets()* function. The load_datasets() function is at *train.py*.

**2) Data Loader**

We use *DataLoader* from **pytorch** (torch.utils.data.DataLoader). In the *load_dataloader()* function, we create two data loaders, one for the train set and one for the validation set, and put both of them in a dictionary under the keys 'train' and 'val'. This dictionary is used in the training loop later.

In [5]:
def get_undersample_files(json_dir):
    """Return a class-balanced list of training files by undersampling.

    All training slices with cancer are kept. From the training slices
    without cancer, the same number is drawn at random *without replacement*,
    so no slice of the majority class appears twice. If there are fewer
    cancer-free slices than cancer slices, all cancer-free slices are used.

    Args:
        json_dir: Path to the JSON data index with 'cancer' and 'subset'
            fields per slice.

    Returns:
        Tuple ``(image_files, label_files)`` of file names; the randomly drawn
        cancer-free slices come first, followed by the cancer slices.
    """
    with open(json_dir) as json_file:
        data_index = json.load(json_file)

    index_with_cancer = [k for k, v in data_index.items()
                         if v['subset'] == 'train' and v['cancer']]
    index_no_cancer = [k for k, v in data_index.items()
                       if v['subset'] == 'train' and not v['cancer']]

    # randomly draw indices from set of indices of slices without cancer
    # same number of slices with and without cancer tissue (capped at the
    # number of available cancer-free slices)
    draw_count = min(len(index_with_cancer), len(index_no_cancer))
    rand_index_no_cancer = random.sample(index_no_cancer, draw_count)

    # file names of images and labels, cancer-free slices first
    slice_indices = rand_index_no_cancer + index_with_cancer
    image_files = [f'image_{slice_index}.npy' for slice_index in slice_indices]
    label_files = [f'label_{slice_index}.npy' for slice_index in slice_indices]

    return image_files, label_files

# oversample
def get_oversample_files(json_dir):
    """Return a class-balanced list of training files by oversampling.

    Training slices with cancer are drawn at random (with replacement) until
    there are as many of them as training slices without cancer.

    Args:
        json_dir: Path to the JSON data index with 'cancer' and 'subset'
            fields per slice.

    Returns:
        Tuple ``(image_files, label_files)`` of file names; the randomly drawn
        cancer slices come first, followed by every cancer-free slice.
    """
    with open(json_dir) as handle:
        records = json.load(handle)

    # Partition the training slices by whether they contain cancer tissue.
    tumor_keys = []
    for key, record in records.items():
        has_cancer = record['cancer'] == True
        in_train = record['subset'] == 'train'
        if has_cancer & in_train:
            tumor_keys.append(key)

    healthy_keys = []
    for key, record in records.items():
        no_cancer = record['cancer'] == False
        in_train = record['subset'] == 'train'
        if no_cancer & in_train:
            healthy_keys.append(key)

    # Draw (with replacement) as many cancer slices as there are cancer-free
    # slices, so both classes end up with the same number of entries.
    drawn_tumor_keys = random.choices(tumor_keys, k=len(healthy_keys))

    # Drawn cancer slices first, then the complete cancer-free set.
    ordered_keys = drawn_tumor_keys + healthy_keys
    image_files = [f'image_{key}.npy' for key in ordered_keys]
    label_files = [f'label_{key}.npy' for key in ordered_keys]

    return (image_files, label_files)

# get_only_tumor_files() returns the files of all training slices that
# contain cancer tissue; slices without cancer tissue are left out.
def get_only_tumor_files(json_dir):
    """Return the image/label file names of training slices with cancer.

    Args:
        json_dir: Path to the JSON data index with 'cancer' and 'subset'
            fields per slice.

    Returns:
        Tuple ``(image_files, label_files)`` in the order of the index.
    """
    with open(json_dir) as handle:
        records = json.load(handle)

    image_files = []
    label_files = []
    for key, record in records.items():
        has_cancer = record['cancer'] == True
        in_train = record['subset'] == 'train'
        if not (has_cancer & in_train):
            continue
        # file names are derived from the slice key of the index
        image_files.append(f'image_{key}.npy')
        label_files.append(f'label_{key}.npy')

    return (image_files, label_files)

# get_original_dataset() returns a dataset without any sampling method
def get_original_dataset(json_dir, test):
    """Return the file names of one subset without any sampling applied.

    Args:
        json_dir: Path to the JSON data index with a 'subset' field per slice.
        test: Only the value ``True`` selects the 'test' subset; every other
            value (including ``None``) selects the 'train' subset.

    Returns:
        Tuple ``(image_files, label_files)`` in the order of the index.
    """
    with open(json_dir) as handle:
        records = json.load(handle)

    wanted_subset = 'test' if test is True else 'train'

    image_files = []
    label_files = []
    for key, record in records.items():
        if record['subset'] != wanted_subset:
            continue
        # file names are derived from the slice key of the index
        image_files.append(f'image_{key}.npy')
        label_files.append(f'label_{key}.npy')

    return (image_files, label_files)

# dataset class for primary colon cancer dataset
class ColonDataset(Dataset):
    """Colon Cancer dataset of 2D slices stored as ``.npy`` files.

    Each item is an ``[image, label]`` pair loaded from ``image_dir`` and
    ``label_dir``. Which slices belong to the dataset is decided once, at
    construction time, from the JSON data index.
    """

    # Number of pixels added to `image_size` before the random crop that is
    # applied to training samples.
    CROP_MARGIN = 44
    # Mean / standard deviation used to normalize the image tensor.
    INTENSITY_MEAN = -531.28
    INTENSITY_STD = 499.68

    def __init__(self, image_dir, label_dir, json_dir, image_size, torch_transform, balance_dataset=None, test=None):
        """
        Args:
            image_dir: Path to image folder.
            label_dir: Path to label folder.
            json_dir: Path to the JSON data index, which tells for every slice
                whether it contains annotated cancer pixels and which subset
                ('train' or 'test') it belongs to.
            image_size: Side length (pixels) of the square output images.
            torch_transform: If truthy, `transform()` is applied to every
                item; otherwise the raw arrays are returned.
            balance_dataset (optional): options to create a dataset with balanced numbers of slices
                containing cancer tissue or not containing cancer
                'oversample': uniformly draws samples from minority class to reach equal size
                'undersample': uniformly draws samples from majority class to reach equal size
                'only_tumor': only includes slices with cancer tissue
                None: no balance method is applied
            test (optional): None builds a training dataset (with random
                augmentation); True builds the test dataset, for which
                `balance_dataset` is ignored.

        Raises:
            ValueError: if no file list can be selected for the given
                combination of `balance_dataset` and `test`.
        """
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.json_dir = json_dir
        self.image_size = image_size
        self.test = test
        self.balance_dataset = balance_dataset
        self.torch_transform = torch_transform
        self.image_files, self.label_files = self._select_files()

    def _select_files(self):
        """Pick the image/label file lists for the configured sampling mode."""
        samplers = {
            'undersample': get_undersample_files,
            'oversample': get_oversample_files,
            'only_tumor': get_only_tumor_files,
        }
        # Balancing is only applied to training data.
        if self.test is None and self.balance_dataset in samplers:
            return samplers[self.balance_dataset](self.json_dir)
        if (self.balance_dataset is None) or (self.test is True):
            return get_original_dataset(self.json_dir, self.test)
        # Fail early instead of leaving the file lists undefined.
        raise ValueError(
            f"Unsupported combination balance_dataset={self.balance_dataset!r}, "
            f"test={self.test!r}; balance_dataset must be one of "
            f"{sorted(samplers)} or None"
        )

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load one slice and return it as ``[image, label]``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image_path = os.path.join(self.image_dir,
                                  self.image_files[idx])
        label_path = os.path.join(self.label_dir,
                                  self.label_files[idx])

        image = np.load(image_path)
        label = np.load(label_path)
        # Without transforms the raw numpy arrays are returned as loaded.
        if self.torch_transform:
            image, label = self.transform(image, label)

        return [image, label]

    def transform(self, image, label):
        """Resize, (for training data) augment, and convert a pair to tensors.

        Training samples (`test` is None) are resized to
        `image_size + CROP_MARGIN`, randomly cropped back to `image_size` and
        randomly flipped; image and label always receive the same crop and
        flips. Other samples are only resized to `image_size`.

        Returns:
            Tuple ``(image, label)`` of float tensors with a leading channel
            dimension of size 1; the image is normalized.
        """
        training = self.test is None

        # to PIL
        image = PIL.Image.fromarray(image)
        label = PIL.Image.fromarray(label)

        # Resize
        # NOTE(review): the label is resized with the default interpolation
        # of TF.resize; consider nearest-neighbour for masks -- confirm
        # against the torchvision version in use.
        side = self.image_size + self.CROP_MARGIN if training else self.image_size
        image = TF.resize(image, size=(side, side))
        label = TF.resize(label, size=(side, side))

        if training:
            # Random crop
            i, j, h, w = transforms.RandomCrop.get_params(
                image, output_size=(self.image_size, self.image_size))
            image = TF.crop(image, i, j, h, w)
            label = TF.crop(label, i, j, h, w)

            # Random horizontal flipping
            if random.random() > 0.5:
                image = TF.hflip(image)
                label = TF.hflip(label)

            # Random vertical flipping
            if random.random() > 0.5:
                image = TF.vflip(image)
                label = TF.vflip(label)

        # Transform to tensor; np.array() is used so that torch.from_numpy()
        # receives a fresh array (a UserWarning about non-writeable NumPy
        # arrays was observed here with to_tensor).
        image = torch.from_numpy(np.array(image))
        image = image.unsqueeze(0).type(torch.FloatTensor)
        label = torch.from_numpy(np.expand_dims(np.array(label), 0)).type(torch.FloatTensor)

        # Normalize
        image = TF.normalize(image, mean=(self.INTENSITY_MEAN,), std=(self.INTENSITY_STD,))

        return image, label

In [None]:
def load_datasets(split_ratio, img_path, label_path, json_file, image_size, transforms, dataset_type):
    """Load the dataset (ColonDataset) and split it into train and validation set.

    Args:
        split_ratio: Fraction (0..1) of the dataset used for training; the
            remainder becomes the validation set.
        img_path: Path to the folder with the image slices.
        label_path: Path to the folder with the label slices.
        json_file: Path to the JSON data index with the subset assignment.
        image_size: Side length (pixels) of the square output images.
        transforms: Passed to ColonDataset as `torch_transform`; if truthy,
            the dataset transforms are applied to every item.
        dataset_type: Passed to ColonDataset as `balance_dataset`
            ('undersample', 'oversample', 'only_tumor' or None).

    Return:
        train_dataset: a dataset for training
        val_dataset: a dataset for validation
    """
    dataset = ColonDataset(
        image_dir=img_path,
        label_dir=label_path,
        json_dir=json_file,
        image_size=image_size,
        torch_transform=transforms,
        balance_dataset=dataset_type
    )
    # determine train and validation set size and split randomly
    train_size = int(split_ratio * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

    return train_dataset, val_dataset

def load_dataloader(batch_size, train, valid):
    """Build the data loaders for the train and validation datasets.

    Args:
        batch_size: number of samples per batch, used for both loaders
        train: train dataset (ColonDataset)
        valid: validation dataset (ColonDataset)

    Return:
        dataloader: a dictionary with the data loaders stored under the keys
            'train' and 'val'
    """
    # Only `Dataset` is imported by name from torch.utils.data, so the loader
    # class is referenced through the already imported `torch` package.
    loader_cls = torch.utils.data.DataLoader
    dataloader = {
        'train': loader_cls(train, shuffle=True, batch_size=batch_size, num_workers=4),
        'val': loader_cls(valid, shuffle=True, batch_size=batch_size, num_workers=4)
    }
    return dataloader

In [None]:
# Fraction of the (training) dataset used for training; the rest is used for
# validation.
split_ratio = 0.9
batch_size = 12
img_path = "./data/npy_images"
label_path = "./data/npy_labels"
json_file = "./data/data_index_subsets.json"
# Deliberately not called `transforms`: that name is the torchvision module
# imported at the top of the file and used inside ColonDataset.transform().
use_transforms = True
# One of "undersample", "oversample", "only_tumor" or None.
dataset_type = "oversample"
image_size = 256

train, valid = load_datasets(split_ratio, img_path, label_path, json_file, image_size, use_transforms, dataset_type)
# Dictionary with the 'train' and 'val' data loaders.
train_dataloader = load_dataloader(batch_size, train, valid)

# 3. Visualize images from the first batch of the data loader