# Detecting Cancer in Gigapixel Medical Images
## Applied Deep Learning (Spring 2018) 
### Akarsh Zingade, Kiran Ramesh, Arjun D'Cunha

### YouTube [demo](https://www.youtube.com/watch?v=royB3p2m9pM). GitHub [repo](https://github.com/kira-95/adl_cancer_detection).

Note: The 22 slides and tumor masks prepared by Prof. Joshua Gordon can be found [here](https://drive.google.com/drive/folders/1rwWL8zU9v0M27BtQKI52bF6bVLW82RL5?usp=sharing). The superset of this dataset can be found at [CAMELYON16](https://camelyon17.grand-challenge.org/Data/).


### Summary

We base our approach on the work by Google AI's [Liu et al. (2017)](https://arxiv.org/abs/1703.02442), "Detecting Cancer Metastases on Gigapixel Pathology Images". We use an ImageNet-pretrained architecture and apply transfer learning to detect cancer cells in the images. We train the model with a sliding-window approach, using the patches extracted by the sliding window as training examples. Once the model is trained, we create a heatmap of its predictions on medical slides that were not used during training.


### Flow of the Notebook.

1. Load the train and test slides.
2. Extract patches for train and test slides.
3. Split the train patches into train and validation sets.
4. Save the train, validation and test patches.


#### Train Slides: 031, 064, 075, 084, 091, 094, 096, 101
#### Test Slides:  016, 078, 110

In [None]:
# from google.colab import drive
# try:
#   drive.mount('/content/gdrive')
# except: 
#   print ("already mounted")

In [None]:
# Slide level to be used for training the model.
lev = 4

# Side length of the sliding window used to cut patches.
patch_size = 299

# Side length of the centre region used to label a patch as tumorous or as healthy.
patch_centre = 128


In [None]:
# # Install the OpenSlide C library and Python bindings
# !apt-get install openslide-tools
# !pip install openslide-python


### Import the relevant modules. 
#### Import the Garbage Collection module to free objects

In [None]:
from openslide import open_slide, __library_version__ as openslide_lib_version, __version__ as openslide_version

import numpy as np
import random, os, glob, time
from skimage.color import rgb2gray
import gc

In [None]:

def read_slide(slide, x, y, level, width, height, as_float=False):
    """ Read a region from the slide

    slide: Slide object exposing ``read_region``
    x, y: Location handed to ``read_region`` as the region origin
    level: Level of the slide to read from
    width, height: Size of the region to read
    as_float: If True the array is converted to float32

    Return a numpy RGB array of shape (height, width, 3)
    """

    # Converting to 'RGB' drops the alpha channel.
    region = slide.read_region((x, y), level, (width, height)).convert('RGB')
    pixels = np.asarray(region, dtype=np.float32) if as_float else np.asarray(region)
    assert pixels.shape == (height, width, 3)
    return pixels

def find_tissue_pixels(image, intensity=0.8):
    """ Return tissue pixels for an image

    A pixel counts as tissue when its grayscale value (as computed by
    ``rgb2gray``) is less than or equal to ``intensity``.

    Return an iterator over (row, column) index pairs.
    """

    gray = rgb2gray(image)
    assert gray.shape == (image.shape[0], image.shape[1])
    rows, cols = np.where(gray <= intensity)
    return zip(rows, cols)
  
def apply_mask(im, mask, color=(1,0,0)):
    """ Return the mask as an image

    im: Image whose shape the output copies
    mask: Iterable of (row, column) index pairs to paint
    color: Value written at every masked position

    Return a zero-filled array shaped like ``im`` with ``color`` at the
    masked positions.
    """

    canvas = np.zeros(im.shape)
    for row, col in mask:
        canvas[row, col] = color
    return canvas

In [None]:
def get_patches(slide, tumor_mask, lev, x0, y0, patch_size):
    """
    Return the patch of given size for a given coordinate

    slide: Input slide
    tumor_mask: Input mask for the slide
    lev: Level of the slide
    x0: x-coordinate value for the patch
    y0: y-coordinate value for the patch
    patch_size: Width and height of the patch to read

    Return the RGB patch, the first channel of the mask patch and the
    tissue mask image of the RGB patch.
    """
    # The read origin is (x0, y0) shifted back by half a patch, scaled by 2**lev.
    half_span = (patch_size // 2) * (2 ** lev)
    left = x0 - half_span
    top = y0 - half_span

    # read RGB patch
    patch_image = read_slide(slide, x=left, y=top, level=lev,
                             width=patch_size, height=patch_size)

    # read tumor mask, keeping only its first channel
    patch_mask = read_slide(tumor_mask, x=left, y=top, level=lev,
                            width=patch_size, height=patch_size)[:, :, 0]

    # make tissue mask
    patch_tissue = apply_mask(patch_image, find_tissue_pixels(patch_image))

    return patch_image, patch_mask, patch_tissue

In [None]:
def check_patch_centre(patch_mask, patch_centre):
    """
    Check if there are any tumour in the centre of the patch.

    patch_mask: Mask of the patch
    patch_centre: Side length of the centre window to observe.

    The window offset is derived from the first dimension of the mask and
    applied to both axes. Return True when the mask values inside the
    window sum to more than zero.
    """
    margin = int((patch_mask.shape[0] - patch_centre) / 2)
    centre = slice(margin, margin + patch_centre)

    return np.sum(patch_mask[centre, centre]) > 0

In [None]:
def _load_level_images(slide_path, tumor_mask_path, lvl):
    """Read the whole slide and the first channel of its tumor mask at level ``lvl``.

    Both slide handles are closed before returning, also when reading fails.
    """
    slide = open_slide(slide_path)
    try:
        print ("Read WSI from %s with width: %d, height: %d" % (slide_path, slide.level_dimensions[0][0], slide.level_dimensions[0][1]))

        tumor_mask = open_slide(tumor_mask_path)
        try:
            print ("Read tumor mask from %s" % (tumor_mask_path))

            # The mask is read with the slide's dimensions so both arrays line up.
            level_width, level_height = slide.level_dimensions[lvl]
            slide_image = read_slide(slide, x=0, y=0, level=lvl,
                                     width=level_width, height=level_height)
            tumor_mask_image = read_slide(tumor_mask, x=0, y=0, level=lvl,
                                          width=level_width, height=level_height)
        finally:
            tumor_mask.close()
    finally:
        slide.close()

    return slide_image, tumor_mask_image[:, :, 0]


def generate_images(slide_path, tumor_mask_path, lvl, window_size, num_pos_imgs, num_neg_imgs,
                    stride=80, patch_centre=128):
    """
    Generate the patches for the training and test slides

    slide_path: Path to the slide.
    tumor_mask_path: Path to the tumor mask slide
    lvl: The level at which the patches are extracted
    window_size: Sliding Window size
    num_pos_imgs: Maximum number of Tumorous patches to return
    num_neg_imgs: Maximum number of healthy patches to return
    stride: Step of the sliding window in both directions
    patch_centre: Side length of the centre window checked for tumor pixels

    A window is labelled 1 when more than 50% of it is tissue and its centre
    contains tumor. Any other window with more than 50% tissue is labelled 0,
    and the remaining windows are kept as label 0 with probability of about 0.1.

    Return a shuffled list of patches and the matching list of labels.
    """
    slide_image, tumor_mask_image = _load_level_images(slide_path, tumor_mask_path, lvl)
    level_height, level_width = tumor_mask_image.shape

    patch_images = []
    patch_labels = []

    # Progress counter; windows labelled tumorous skip it via ``continue``.
    count = 0

    for i in range(0, level_height - window_size - stride, stride):
        for j in range(0, level_width - window_size - stride, stride):

            patch = slide_image[i:i+window_size, j:j+window_size]
            tumor_mask_patch = tumor_mask_image[i:i+window_size, j:j+window_size]

            tissue_pixels = list(find_tissue_pixels(patch))
            # Use rows * columns so the percentage stays right for non-square windows.
            percent_tissue = len(tissue_pixels) / float(patch.shape[0] * patch.shape[1]) * 100
            mostly_tissue = percent_tissue > 50

            if mostly_tissue and check_patch_centre(tumor_mask_patch, patch_centre):
                patch_images.append(patch)
                patch_labels.append(1)
                continue

            # The random draw only happens for windows that are not mostly tissue.
            if mostly_tissue or np.random.uniform() > 0.9:
                patch_images.append(patch)
                patch_labels.append(0)

            count += 1
            if count % 2000 == 0:
                print(count)

    tumor_idxs = [idx for idx, label in enumerate(patch_labels) if int(label) == 1]
    normal_idxs = [idx for idx, label in enumerate(patch_labels) if int(label) == 0]
    np.random.shuffle(tumor_idxs)
    np.random.shuffle(normal_idxs)
    idxs = tumor_idxs[:num_pos_imgs] + normal_idxs[:num_neg_imgs]
    np.random.shuffle(idxs)

    # Copy only the selected patches: the candidates are views into the full
    # level image and would otherwise keep that whole array alive.
    selected_images = [patch_images[idx].copy() for idx in idxs]
    selected_labels = [patch_labels[idx] for idx in idxs]

    return selected_images, selected_labels

## Extract patches for train slides. 

In [None]:
X = []
y = []

# Train slides as listed in the notebook header. Slide '016' belongs to the
# test set (see TEST_SLIDES) and must not appear here, otherwise test data
# leaks into training.
TRAIN_SLIDES = ['031', '064', '075', '084', '091', '094', '096', '101']
SLIDES_DIR = './slides/'

for num in TRAIN_SLIDES:
    print (num)
    slide_path = os.path.join(SLIDES_DIR, 'tumor_' + num + '.tif')
    tumor_mask_path = os.path.join(SLIDES_DIR, 'tumor_' + num + '_mask.tif')
    # Use the notebook-level level/window settings; keep at most 400 tumorous
    # and 100 healthy patches per slide.
    patch_images, patch_labels = generate_images(slide_path, tumor_mask_path, lev, patch_size,
                                                 num_pos_imgs=400, num_neg_imgs=100)
    X.extend(patch_images)
    y.extend(patch_labels)

## Extract patches for test slides. 

In [None]:
X_test = []
y_test = []

# Slides held out for testing.
TEST_SLIDES = ['078', '016', '110']
SLIDES_DIR = './slides/'

for num in TEST_SLIDES:
    print (num)
    slide_path = os.path.join(SLIDES_DIR, 'tumor_' + num + '.tif')
    tumor_mask_path = os.path.join(SLIDES_DIR, 'tumor_' + num + '_mask.tif')
    # Use the notebook-level level/window settings instead of repeating the
    # literals; keep at most 400 tumorous and 100 healthy patches per slide.
    patch_images, patch_labels = generate_images(slide_path, tumor_mask_path, lev, patch_size,
                                                 num_pos_imgs=400, num_neg_imgs=100)
    X_test.extend(patch_images)
    y_test.extend(patch_labels)

In [None]:
# Hold out 20% of the train patches as a validation set, stratified on the labels.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

In [None]:
# Drop the names of the combined patch/label lists, then run a collection pass.
del X
del y
gc.collect()

In [None]:
# Get the count of images for each class across train, val and test dataset

# Labels may be plain class ids (1-D) or one-hot rows (2-D). One-hot rows are
# reduced to class ids with argmax; the dimensionality is checked explicitly so
# that no exception has to be swallowed to tell the two cases apart.
for labels in (y_train, y_val, y_test):
    label_array = np.asarray(labels)
    if label_array.ndim > 1:
        label_array = np.argmax(label_array, axis=1)
    unique, counts = np.unique(label_array, return_counts=True)
    print (dict(zip(unique, counts)))


### Save the Train, Validation and Test Images

In [None]:
# Bundle every split under one mapping so it can be stored in a single file.
dataset = dict(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test,
)

In [None]:
# np.save appends '.npy' to the file name. A dict is stored as a pickled
# object array, so reading it back presumably needs
# np.load(..., allow_pickle=True).item() - verify against the loading notebook.
np.save(file='./dataset_final', arr=dataset)