In [0]:
 # Install the OpenSlide C library and Python bindings
!apt-get install openslide-tools
!pip install openslide-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 7 not upgraded.
Need to get 92.5 kB of archives.
After this operation, 268 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopenslide0 amd64 3.4.1+dfsg-2 [79.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 openslide-tools amd64 3.4.1+dfsg-2 [12.7 kB]
Fetched 92.5 kB in 0s (1,128 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 130824 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
import random
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
from PIL import Image
from skimage.color import rgb2gray

In [0]:
from google.colab import drive
import os

drive.mount('/content/gdrive')
os.chdir('gdrive/My Drive/Colab Notebooks')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


#Processing Data





## Splitting Slides into Train/Validation/Testing

In [0]:
from os import listdir
from os.path import isfile, join
import os.path

images = []
mypath = "./slides"

onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
for fn in onlyfiles:
  if (os.path.splitext(fn)[0] not in images) and ('mask' not in os.path.splitext(fn)[0]):
    images.append(os.path.splitext(fn)[0])

# remove unusual tumor images
images.remove("tumor_038")
images.remove("tumor_038 (1)")
images.remove("tumor_099")
print(images)


['tumor_091', 'tumor_001', 'tumor_002', 'tumor_005', 'tumor_012', 'tumor_016', 'tumor_059', 'tumor_075', 'tumor_023', 'tumor_057', 'tumor_035', 'tumor_019', 'tumor_081', 'tumor_084', 'tumor_094', 'tumor_110', 'tumor_096', 'tumor_101', 'tumor_031', 'tumor_064', 'tumor_078']


In [0]:
print("Number of slides: %d" %(len(images)))

Number of slides: 21


In [0]:
import random 

random.seed(1)
random.shuffle(images)

train_data = images[:15]
val_data = images[15:18]
test_data = images[18:21]
print("#train: %d, #validation: %d, #test: %d" %(len(train_data), len(val_data), len(test_data)))

#train: 15, #validation: 3, #test: 3


In [0]:
# create directories

import os 
# !rm -rf data1/
!rm -rf data/
def create_level_dir(dir_name):
  z1_dir = os.path.join(dir_name, 'z5')
  z2_dir = os.path.join(dir_name, 'z4')
  z3_dir = os.path.join(dir_name, 'z3')
  z4_dir = os.path.join(dir_name, 'z2')
  z5_dir = os.path.join(dir_name, 'z1')
  
  os.mkdir(z1_dir)
  os.mkdir(z2_dir)
  os.mkdir(z3_dir)
  os.mkdir(z4_dir)
  os.mkdir(z5_dir)
  return z1_dir, z2_dir, z3_dir, z4_dir, z5_dir
  
def label_directory(dir_name):
  z1_dir, z2_dir, z3_dir, z4_dir, z5_dir = create_level_dir(dir_name)
  os.mkdir(os.path.join(z1_dir, 'tumor'))
  os.mkdir(os.path.join(z1_dir, 'no_tumor'))
  os.mkdir(os.path.join(z2_dir, 'tumor'))
  os.mkdir(os.path.join(z2_dir, 'no_tumor'))
  os.mkdir(os.path.join(z3_dir, 'tumor'))
  os.mkdir(os.path.join(z3_dir, 'no_tumor'))
  os.mkdir(os.path.join(z4_dir, 'tumor'))
  os.mkdir(os.path.join(z4_dir, 'no_tumor'))
  os.mkdir(os.path.join(z5_dir, 'tumor'))
  os.mkdir(os.path.join(z5_dir, 'no_tumor'))
  
# base_dir = os.getcwd()
# img_num = slide_path.split('_')[1].strip(".tif")

base_dir = 'data/'
train_dir = os.path.join(base_dir, 'train')
val_dir =  os.path.join(base_dir, 'val') 
test_dir = os.path.join(base_dir, 'test') 

if not os.path.exists(base_dir):
  os.mkdir(base_dir)
  
if not os.path.exists(train_dir):
  os.mkdir(train_dir)
  label_directory(train_dir)

if not os.path.exists(val_dir):
  os.mkdir(val_dir)
  label_directory(val_dir)

if not os.path.exists(test_dir):
  os.mkdir(test_dir)
  create_level_dir(test_dir)
  
print('train_dir: %s, test_dir: %s, val_dir: %s ' %(train_dir, test_dir, val_dir))

train_dir: data/train, test_dir: data/test, val_dir: data/val 


## Classifying the Data by Labels

In [0]:
def read_slide(slide, x, y, level, width, height, as_float=False, mask=False):
    im = slide.read_region((x,y), level, (width, height))
    im = im.convert('RGB') # drop the alpha channel
    if as_float:
        im = np.asarray(im, dtype=np.float32)
    else:
        im = np.asarray(im)
    assert im.shape == (height, width, 3)
    return (im if not mask else im[:, :, 0])

# overlaying a mask over image
def apply_mask(im, mask):
    masked =np.zeros_like(im)
    for x,y in mask: 
      masked[x][y] = 1
    return masked
  
# search for grey regions
def find_tissue_pixels(image, intensity=0.8):
    im_gray = rgb2gray(image)
    assert im_gray.shape == (image.shape[0], image.shape[1])
    indices = np.where(im_gray <= intensity)
    return zip(indices[0], indices[1])

In [0]:
import math
import cv2
def preprocess(im_name, level, is_train = False, is_val = False, is_test = False):
  # Process images of lower zoom level, which have memory issues if try to process whole slice
  slide_path = os.path.join(mypath, im_name)+ ".tif"
  mask_path = os.path.join(mypath, im_name) + "_mask.tif"
  slide = open_slide(slide_path)
  tumor_mask = open_slide(mask_path)
  x_dim = (slide.level_dimensions[level][0])
  y_dim = (slide.level_dimensions[level][1])
  
  x_num= int(math.ceil(x_dim/ 299))
  y_num = int(math.ceil(y_dim/299))
  idx = 0
  for i in range(x_num):
    lower_x = i*299 * (2**level)
    for j in range(y_num):
      lower_y = j * 299 *(2**level)
      img_window = read_slide(slide, 
                     x=lower_x, 
                     y=lower_y, 
                     level=level, 
                     width=299, 
                     height=299)
      mask_window = read_slide(tumor_mask, 
                     x=lower_x, 
                     y=lower_y, 
                     level=level, 
                     width=299, 
                     height=299,
                     mask = True)

      tissue_pixels = find_tissue_pixels(img_window)
      percent_tissue = len(tissue_pixels) / float(img_window.shape[0] * img_window.shape[0])
      tissue_region = apply_mask(img_window, tissue_pixels)
      tissue_threshold = .3

      if is_test:
        if percent_tissue> tissue_threshold:
          try:
            img_fn = 'level_'+str(level) +'_' + str(idx)
            img_output_fn = "data/test/z"+ str(level)+"/"+ img_fn + ".jpg" 
            cv2.imwrite(img_output_fn, img_window)
            print(img_output_fn)
          except:
            pass
      else:
        if is_train:
          dir_route = 'train'
        if is_val:
          dir_route = 'val'
        if percent_tissue> tissue_threshold:
          img_fn = 'level_'+str(level) +'_' + str(idx)
          if np.max(mask_window) > 0:
            img_output_fn = "data/" + dir_route + "/z" + str(level)+"/tumor/" + img_fn + ".jpg" 
          else:
            img_output_fn= "data/"+ dir_route + "/z" + str(level)+"/no_tumor/" + img_fn + ".jpg"
          print(img_output_fn)
          cv2.imwrite(img_output_fn, img_window)
      idx = idx+1

In [0]:
levels = [5, 4, 3, 2, 1]
# levels = [5, 4, 3, 2]

for level in levels:
  for val_fn in train_data:
    preprocess(val_fn, level, is_train = True)
  print("Done with zoom %d" %level)
  
for level in levels:
  for val_fn in val_data:
    preprocess(val_fn, level, is_val = True)
  print("Done with zoom %d" %level)
  
for level in levels:
  for val_fn in test_data:
    preprocess(val_fn, level, is_test = True)
  print("Done with zoom %d" %level)

## Count of Train/Test/Val Images

In [0]:
"""
I had to run zoom level 1 on local because colab crashed. 
Thus, the number of images in tumor/no_tumor for zoom level 1 is commented out.
"""
!find data/train/z5/tumor -type f | wc -l
!find data/train/z5/no_tumor -type f | wc -l
!find data/train/z4/tumor -type f | wc -l
!find data/train/z4/no_tumor -type f | wc -l
!find data/train/z3/tumor -type f | wc -l
!find data/train/z3/no_tumor -type f | wc -l
!find data/train/z2/tumor -type f | wc -l
!find data/train/z2/no_tumor -type f | wc -l
# !find data/train/z1/tumor -type f | wc -l
# !find data/train/z1/no_tumor -type f | wc -l

!find data/val/z5/tumor -type f | wc -l
!find data/val/z5/no_tumor -type f | wc -l
!find data/val/z4/tumor -type f | wc -l
!find data/val/z4/no_tumor -type f | wc -l
!find data/val/z3/tumor -type f | wc -l
!find data/val/z3/no_tumor -type f | wc -l
!find data/val/z2/tumor -type f | wc -l
!find data/val/z2/no_tumor -type f | wc -l
# !find data/val/z1/tumor -type f | wc -l
# !find data/val/z1/no_tumor -type f | wc -l

!find data/test/z5 -type f | wc -l
!find data/test/z4 -type f | wc -l
!find data/test/z3 -type f | wc -l
!find data/test/z2 -type f | wc -l
# !find data/test/z1 -type f | wc -l

74
90
227
401
703
1605
2376
6770
9
37
19
141
40
541
104
2121
60
218
828
3149


# Data Collection for Multi-zoom Level Training

I ran the following code on local because there was a problem when reading the slides on google drive (because a lot of people were using them)

In [0]:
import random 

random.seed(1)
random.shuffle(images)

train_data = images[:17]
val_data = images[17:]
print("#train: %d, #validation: %d" %(len(train_data), len(val_data)))

In [0]:
# create directories

import os 
!rm -rf data_zoom/

def create_level_dir(dir_name):
  z0_dir = os.path.join(dir_name, 'z3')
  z1_dir = os.path.join(dir_name, 'z4')
  os.mkdir(z0_dir)
  os.mkdir(z1_dir)
  return z0_dir, z1_dir
  
def label_directory(dir_name):
  z0_dir, z1_dir = create_level_dir(dir_name)
  os.mkdir(os.path.join(z0_dir, 'tumor'))
  os.mkdir(os.path.join(z0_dir, 'no_tumor'))
  os.mkdir(os.path.join(z1_dir, 'tumor'))
  os.mkdir(os.path.join(z1_dir, 'no_tumor'))
  

base_dir = 'data_zoom'
train_dir = os.path.join(base_dir, 'train')
val_dir =  os.path.join(base_dir, 'val') 

if not os.path.exists(base_dir):
  os.mkdir(base_dir)
  
if not os.path.exists(train_dir):
  os.mkdir(train_dir)
  label_directory(train_dir)

if not os.path.exists(val_dir):
  os.mkdir(val_dir)
  label_directory(val_dir)

print('train_dir: %s, val_dir: %s ' %(train_dir, val_dir))

In [0]:
def read_slide(slide, x, y, level, width, height, as_float=False, mask=False):
    im = slide.read_region((x,y), level, (width, height))
    im = im.convert('RGB') # drop the alpha channel
    if as_float:
        im = np.asarray(im, dtype=np.float32)
    else:
        im = np.asarray(im)
    assert im.shape == (height, width, 3)
    return (im if not mask else im[:, :, 0])

# overlaying a mask over image
def apply_mask(im, mask):
    masked =np.zeros_like(im)
    for x,y in mask: 
      masked[x][y] = 1
    return masked
  
# search for grey regions
def find_tissue_pixels(image, intensity=0.8):
    im_gray = rgb2gray(image)
    assert im_gray.shape == (image.shape[0], image.shape[1])
    indices = np.where(im_gray <= intensity)
    return list(zip(indices[0], indices[1]))

In [0]:
def get_patch_centered_at(im_slide, x, y, level):
  # returns a 299 x 299 patch that is 
  # centered at (x, y) for specified zoom level
  
  downsamples = int(im_slide.level_downsamples[level])
  x, y = int(x / downsamples), int(y / downsamples)
  patch = np.zeros((299, 299, 3), dtype = int)
  low_x, hi_x , low_y, hi_y = x - 150, x + 149, y - 150, y + 149
  lowb_x, hib_x = max(0, low_x), min(im_slide.level_dimensions[level][0], hi_x)
  lowb_y, hib_y = max(0, low_y), min(im_slide.level_dimensions[level][1], hi_y)
  width = hib_x - lowb_x
  height = hib_y - lowb_y
  start_x, end_x = lowb_x - low_x, hib_x - low_x
  start_y, end_y = lowb_y - low_y, hib_y - low_y
  patch[start_x: end_x, start_y: end_y] = read_slide(im_slide, lowb_x * downsamples, lowb_y * downsamples, level, width=height, height=width)
  return patch

In [0]:
import math
import cv2
base_dir = 'data_zoom'

def preprocess(im_name, level, is_train = False, is_val = False, is_test = False):
  # Process images of lower zoom level, which have memory issues if try to process whole slice
  try:
    slide_path = os.path.join(mypath, im_name)+ ".tif"
    mask_path = os.path.join(mypath, im_name) + "_mask.tif"
    slide = open_slide(slide_path)
    tumor_mask = open_slide(mask_path)
    x_dim = (slide.level_dimensions[level][0]) #15360
    y_dim = (slide.level_dimensions[level][1]) #13440
  except OSError:
    print("could not load %s" %(im_name))
  
  x_num= int(math.ceil(x_dim/ 299))
  y_num = int(math.ceil(y_dim/299))
  idx = 0
  print("loaded slide {}".format(im_name))
  for i in range(x_num):
    center_x = (i*299 + 150) * (2**level)
    for j in range(y_num):
      
      try:
        center_y = (j * 299 + 150) *(2**level)

        image0 = get_patch_centered_at(slide, center_x, center_y, level)
        image1 = get_patch_centered_at(slide, center_x, center_y, level + 1)
        mask = get_patch_centered_at(tumor_mask, center_x, center_y,  level)
        
        tissue_pixels = find_tissue_pixels(image1)
        percent_tissue = len(tissue_pixels) / float(image0.shape[0] * image0.shape[0])
        tissue_threshold = 0.5

        if percent_tissue> tissue_threshold:
          img_fn = "{0}_{1}".format(im_name, idx)
          dir_route = 'train' if is_train else 'val'
          label = 'tumor' if np.max(mask) > 0 else 'no_tumor'
          img_output_fn1 = "%s/%s/z%s/%s/%s.jpg" %(base_dir, dir_route, str(level), label, img_fn)
          img_output_fn2 = "%s/%s/z%s/%s/%s.jpg" %(base_dir, dir_route, str(level + 1), label, img_fn)
          if (i % 10 == 0 and j % 10 == 0):
            print("writing image to {}".format(img_output_fn1))
          cv2.imwrite(img_output_fn1, image0)
          cv2.imwrite(img_output_fn2, image1)
        idx = idx+1
      
      except ValueError:
        print("error on window")

In [0]:
for val_fn in train_data:
  print(val_fn)
  preprocess(val_fn, 3, is_train = True)
  print("Done with one")


for val_fn in val_data:
  preprocess(val_fn, 3, is_val = True)
  print("Done with one")

In [0]:
!find data_zoom/train/z4/tumor -type f | wc -l
!find data_zoom/train/z4/no_tumor -type f | wc -l
!find data_zoom/train/z3/tumor -type f | wc -l
!find data_zoom/train/z3/no_tumor -type f | wc -l

!find data_zoom/val/z4/tumor -type f | wc -l
!find data_zoom/val/z4/no_tumor -type f | wc -l
!find data_zoom/val/z3/tumor -type f | wc -l
!find data_zoom/val/z3/no_tumor -type f | wc -l

The following was the output for above code when I ran on local


```
    876
   47739
     874
   47739
     181
    9813
     181
    9813
  
```


 


# Code For Undersampling Training Data

In [0]:
LEVELS = [2, 3, 4, 5]

In [0]:
for level in LEVELS:
    data = 'data'
    train_dir = '{}/train/z{}'
    val_dir = '{}/val/z{}'
    train_tumor_dir = os.path.join(train_dir, 'tumor')
    train_no_tumor_dir = os.path.join(train_dir, 'no_tumor')

    num_tumor = len(os.listdir(train_tumor_dir.format(data, level)))
    num_no_tumor = len(os.listdir(train_no_tumor_dir.format(data, level)))
    print('\nWorking with level: {}\n'.format(level))
    print('Unbalanced data:')
    print('num tumor: ', num_tumor)
    print('num no tumor: ', num_no_tumor)

    no_tumor_undersampled = set(random.sample(os.listdir(train_no_tumor_dir.format(data, level)), num_tumor))

    all_files = set(os.listdir(train_no_tumor_dir.format(data, level)))
    to_be_removed = all_files - no_tumor_undersampled
    for file in to_be_removed:
        os.remove(os.path.join(train_no_tumor_dir.format(data, level), file))

    print('\nAfter undersampling:')
    print('num tumor: ', len(os.listdir(train_tumor_dir.format(data, level))))
    print('num no tumor: ', len(os.listdir(train_no_tumor_dir.format(data, level))))


Working with level: 2

Unbalanced data:
('num tumor: ', 476)
('num no tumor: ', 477)

After undersampling:
('num tumor: ', 476)
('num no tumor: ', 477)

Working with level: 3

Unbalanced data:
('num tumor: ', 585)
('num no tumor: ', 585)

After undersampling:
('num tumor: ', 585)
('num no tumor: ', 585)

Working with level: 4

Unbalanced data:
('num tumor: ', 202)
('num no tumor: ', 202)

After undersampling:
('num tumor: ', 202)
('num no tumor: ', 202)

Working with level: 5

Unbalanced data:
('num tumor: ', 67)
('num no tumor: ', 67)

After undersampling:
('num tumor: ', 67)
('num no tumor: ', 67)
