<a href="https://colab.research.google.com/github/iamsusiep/tumor_challenge/blob/master/data_extract.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
 # Install the OpenSlide C library and Python bindings
!apt-get install openslide-tools
!pip install openslide-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openslide-tools is already the newest version (3.4.1+dfsg-2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 11 not upgraded.


In [0]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from openslide import open_slide, __library_version__ as openslide_version
import os
from PIL import Image
from skimage.color import rgb2gray

In [0]:
from google.colab import drive
import os

drive.mount('/content/gdrive')
os.chdir('gdrive/My Drive/Colab Notebooks')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


OSError: ignored

#Processing Data





## Splitting Slides into Train/Validation/Testing

In [0]:
from os import listdir
from os.path import isfile, join
import os.path

images = []
mypath = "./slides"

onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
for fn in onlyfiles:
  if (os.path.splitext(fn)[0] not in images) and ('mask' not in os.path.splitext(fn)[0]):
    images.append(os.path.splitext(fn)[0])

# remove unusual tumor images
images.remove("tumor_038")
images.remove("tumor_038 (1)")
images.remove("tumor_099")
print(images)


['tumor_091', 'tumor_001', 'tumor_002', 'tumor_005', 'tumor_012', 'tumor_016', 'tumor_059', 'tumor_075', 'tumor_023', 'tumor_057', 'tumor_035', 'tumor_019', 'tumor_081', 'tumor_084', 'tumor_094', 'tumor_110', 'tumor_096', 'tumor_101', 'tumor_031', 'tumor_064', 'tumor_078']


In [0]:
print("Number of slides: %d" %(len(images)))

Number of slides: 21


In [0]:
import random 

random.seed(1)
random.shuffle(images)

train_data = images[:15]
val_data = images[15:18]
test_data = images[18:21]
print("#train: %d, #validation: %d, #test: %d" %(len(train_data), len(val_data), len(test_data)))

#train: 15, #validation: 3, #test: 3


In [0]:
# create directories

import os 
# !rm -rf data1/
!rm -rf data/
def create_level_dir(dir_name):
  z1_dir = os.path.join(dir_name, 'z5')
  z2_dir = os.path.join(dir_name, 'z4')
  z3_dir = os.path.join(dir_name, 'z3')
  z4_dir = os.path.join(dir_name, 'z2')
  os.mkdir(z1_dir)
  os.mkdir(z2_dir)
  os.mkdir(z3_dir)
  os.mkdir(z4_dir)
  return z1_dir, z2_dir, z3_dir, z4_dir
  
def label_directory(dir_name):
  z1_dir, z2_dir, z3_dir, z4_dir = create_level_dir(dir_name)
  os.mkdir(os.path.join(z1_dir, 'tumor'))
  os.mkdir(os.path.join(z1_dir, 'no_tumor'))
  os.mkdir(os.path.join(z2_dir, 'tumor'))
  os.mkdir(os.path.join(z2_dir, 'no_tumor'))
  os.mkdir(os.path.join(z3_dir, 'tumor'))
  os.mkdir(os.path.join(z3_dir, 'no_tumor'))
  os.mkdir(os.path.join(z4_dir, 'tumor'))
  os.mkdir(os.path.join(z4_dir, 'no_tumor'))
  
# base_dir = os.getcwd()
# img_num = slide_path.split('_')[1].strip(".tif")

base_dir = 'data/'
train_dir = os.path.join(base_dir, 'train')
val_dir =  os.path.join(base_dir, 'val') 
test_dir = os.path.join(base_dir, 'test') 

if not os.path.exists(base_dir):
  os.mkdir(base_dir)
  
if not os.path.exists(train_dir):
  os.mkdir(train_dir)
  label_directory(train_dir)

if not os.path.exists(val_dir):
  os.mkdir(val_dir)
  label_directory(val_dir)

if not os.path.exists(test_dir):
  os.mkdir(test_dir)
  create_level_dir(test_dir)
  
print('train_dir: %s, test_dir: %s, val_dir: %s ' %(train_dir, test_dir, val_dir))

train_dir: data/train, test_dir: data/test, val_dir: data/val 


## Classifying the Data by Labels

In [0]:
def read_slide(slide, x, y, level, width, height, as_float=False, mask=False):
    im = slide.read_region((x,y), level, (width, height))
    im = im.convert('RGB') # drop the alpha channel
    if as_float:
        im = np.asarray(im, dtype=np.float32)
    else:
        im = np.asarray(im)
    assert im.shape == (height, width, 3)
    return (im if not mask else im[:, :, 0])

# overlaying a mask over image
def apply_mask(im, mask):
    masked =np.zeros_like(im)
    for x,y in mask: 
      masked[x][y] = 1
    return masked
  
# search for grey regions
def find_tissue_pixels(image, intensity=0.8):
    im_gray = rgb2gray(image)
    assert im_gray.shape == (image.shape[0], image.shape[1])
    indices = np.where(im_gray <= intensity)
    return zip(indices[0], indices[1])

In [0]:
# mypath = '/Users/sujipark/documents/slides'
"""
12224
27648
and 
24448
55296
"""
def get_images_masks(im_name, level_num, x_ = 0, y_ = 0, partition = False):
  slide_path = os.path.join(mypath, im_name)+ ".tif"
  mask_path = os.path.join(mypath, im_name) + "_mask.tif"
  slide = open_slide(slide_path)
  if partition is True:
    slide_image = read_slide(slide, 
                         x=x_, 
                         y=y_, 
                         level=level_num, 
                         width=299, 
                         height=299)
    tumor_mask = open_slide(mask_path)
    mask_image = read_slide(tumor_mask, 
                         x=x_, 
                         y=y_, 
                         level=level_num, 
                         width=299, 
                         height=299,
                         mask = True)
  else:
    slide_image = read_slide(slide, 
                         x=x_, 
                         y=y_, 
                         level=level_num, 
                         width=int(slide.level_dimensions[level_num][0]), 
                         height=int(slide.level_dimensions[level_num][1]))
    tumor_mask = open_slide(mask_path)
    mask_image = read_slide(tumor_mask, 
                         x=x_, 
                         y=y_, 
                         level=level_num, 
                         width=int(slide.level_dimensions[level_num][0]), 
                         height=int(slide.level_dimensions[level_num][1]),
                         mask = True)
  tissue_pixels = list(find_tissue_pixels(slide_image))
  
  percent_tissue = len(tissue_pixels) / float(slide_image.shape[0] * slide_image.shape[0]) * 100
#   print ("%d tissue_pixels pixels  (%.1f percent of the image)" % (len(tissue_pixels), percent_tissue)) 
  tissue_regions = apply_mask(slide_image, tissue_pixels)
#   plt.imshow(tissue_regions)
  return slide_image, mask_image, tissue_regions

In [0]:
import math
import cv2
def preprocess(im_name, level, is_train = False, is_val = False, is_test = False):
  # Process images of lower zoom level, which have memory issues if try to process whole slice
  slide_path = os.path.join(mypath, im_name)+ ".tif"
  mask_path = os.path.join(mypath, im_name) + "_mask.tif"
  slide = open_slide(slide_path)
  tumor_mask = open_slide(mask_path)
  x_dim = (slide.level_dimensions[level][0]) #15360
  y_dim = (slide.level_dimensions[level][1]) #13440
  
  x_num= int(math.ceil(x_dim/ 299))
  y_num = int(math.ceil(y_dim/299))
#   print('x_dim, y_dim, x_num, y_num: ', x_dim, y_dim, x_num, y_num)
  idx = 0
  for i in range(x_num):
    lower_x = i*299 * (2**level)
    for j in range(y_num):
      lower_y = j * 299 *(2**level)
#       print('lower_x, lower_y:', (lower_x, lower_y))
      
      img_window = read_slide(slide, 
                     x=lower_x, 
                     y=lower_y, 
                     level=level, 
                     width=299, 
                     height=299)
      mask_window = read_slide(tumor_mask, 
                     x=lower_x, 
                     y=lower_y, 
                     level=level, 
                     width=299, 
                     height=299,
                     mask = True)

      tissue_pixels = find_tissue_pixels(img_window)
      percent_tissue = len(tissue_pixels) / float(img_window.shape[0] * img_window.shape[0])
#       print(percent_tissue)
#       print(type(tissue_pixels))
      tissue_region = apply_mask(img_window, tissue_pixels)
      tissue_threshold = .3

      if is_test:
        if percent_tissue> tissue_threshold:
          try:
            img_fn = 'level_'+str(level) +'_' + str(idx)
            img_output_fn = "data/test/z"+ str(level)+"/"+ img_fn + ".jpg" 
            cv2.imwrite(img_output_fn, img_window)
            print(img_output_fn)
          except:
            pass
      else:
        if is_train:
          dir_route = 'train'
        if is_val:
          dir_route = 'val'
#         print('percent_tissue:', percent_tissue)
        if percent_tissue> tissue_threshold:
          img_fn = 'level_'+str(level) +'_' + str(idx)
          if np.max(mask_window) > 0:
            img_output_fn = "data/" + dir_route + "/z" + str(level)+"/tumor/" + img_fn + ".jpg" 
          else:
            img_output_fn= "data/"+ dir_route + "/z" + str(level)+"/no_tumor/" + img_fn + ".jpg"
          print(img_output_fn)
          cv2.imwrite(img_output_fn, img_window)
      idx = idx+1

In [0]:
levels = [5, 4, 3, 2]

for level in levels:
  for val_fn in train_data:
    preprocess(val_fn, level, is_train = True)
  print("Done with zoom %d" %level)
  
for level in levels:
  for val_fn in val_data:
    preprocess(val_fn, level, is_val = True)
  print("Done with zoom %d" %level)
  
for level in levels:
  for val_fn in test_data:
    preprocess(val_fn, level, is_test = True)
  print("Done with zoom %d" %level)

## Count of Train/Test/Val Images

In [0]:
!find data/train/z5/tumor -type f | wc -l
!find data/train/z5/no_tumor -type f | wc -l
!find data/train/z4/tumor -type f | wc -l
!find data/train/z4/no_tumor -type f | wc -l
!find data/train/z3/tumor -type f | wc -l
!find data/train/z3/no_tumor -type f | wc -l
!find data/train/z2/tumor -type f | wc -l
!find data/train/z2/no_tumor -type f | wc -l

!find data/val/z5/tumor -type f | wc -l
!find data/val/z5/no_tumor -type f | wc -l
!find data/val/z4/tumor -type f | wc -l
!find data/val/z4/no_tumor -type f | wc -l
!find data/val/z3/tumor -type f | wc -l
!find data/val/z3/no_tumor -type f | wc -l
!find data/val/z2/tumor -type f | wc -l
!find data/val/z2/no_tumor -type f | wc -l

!find data/test/z5 -type f | wc -l
!find data/test/z4 -type f | wc -l
!find data/test/z3 -type f | wc -l
!find data/test/z2 -type f | wc -l

74
90
227
401
703
1605
2376
6770
9
37
19
141
40
541
104
2121
60
218
828
3149
