In [1]:
# Install the OpenSlide command-line tools (pulls in libopenslide0) and the
# `openslide-python` package that provides the `openslide` module imported below.
!apt-get install openslide-tools

!pip install openslide-python

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-430
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libopenslide0
Suggested packages:
  libtiff-tools
The following NEW packages will be installed:
  libopenslide0 openslide-tools
0 upgraded, 2 newly installed, 0 to remove and 7 not upgraded.
Need to get 92.5 kB of archives.
After this operation, 268 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopenslide0 amd64 3.4.1+dfsg-2 [79.8 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 openslide-tools amd64 3.4.1+dfsg-2 [12.7 kB]
Fetched 92.5 kB in 1s (174 kB/s)
Selecting previously unselected package libopenslide0.
(Reading database ... 135004 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-2_

In [0]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from openslide import open_slide, __library_version__ as openslide_version

In [0]:
import os
from PIL import Image
from skimage.color import rgb2gray
from random import randint

In [0]:
from google.colab import drive

In [5]:
# Mount Google Drive at /gdrive (interactive: asks for an authorization code).
drive.mount('/gdrive')
# Folder that is scanned for slide files and their "*mask.tif" companions.
drive_root = '/gdrive/My Drive/slides/'
# Folder the extracted patch/mask PNGs are written to and later read back from.
train_path = '/gdrive/My Drive/train_images/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /gdrive


In [0]:
def finding_tissue_by_pixel(image, intensity=0.7):
    """Locate the pixels of an RGB image that are dark enough to count as tissue.

    The image is converted to grayscale with ``rgb2gray`` and every pixel whose
    gray value is less than or equal to ``intensity`` is selected.

    Args:
        image: array whose first two axes are rows and columns.
        intensity: inclusive upper bound on the gray value of a selected pixel.

    Returns:
        A ``zip`` iterator yielding ``(row, column)`` index pairs of the
        selected pixels.

    Raises:
        AssertionError: if the grayscale image does not have the same
            (rows, columns) shape as the input.
    """
    gray = rgb2gray(image)
    expected_shape = (image.shape[0], image.shape[1])
    assert gray.shape == expected_shape

    rows, cols = np.where(gray <= intensity)
    return zip(rows, cols)

In [0]:
def reading_slides(slide, x, y, level, width, height, as_float=False):
    """Read a rectangular region of a slide and return it as an RGB array.

    Args:
        slide: object exposing ``read_region(location, level, size)`` whose
            result supports ``convert('RGB')``.
        x, y: location passed to ``read_region`` as ``(x, y)``.
        level: level passed to ``read_region``.
        width, height: region size passed to ``read_region`` as
            ``(width, height)``.
        as_float: when true the array is created with dtype ``np.float32``;
            otherwise the dtype is whatever ``np.asarray`` infers.

    Returns:
        The region as a NumPy array.
    """
    region = slide.read_region((x, y), level, (width, height)).convert('RGB')

    if not as_float:
        return np.asarray(region)
    return np.asarray(region, dtype=np.float32)

In [0]:
# Build (slide file, mask file) name pairs: every file ending in "mask.tif"
# is matched with the file named by the text before "_mask" plus ".tif".
img_mask = [
    (mask_name.split('_mask')[0] + '.tif', mask_name)
    for mask_name in os.listdir(drive_root)
    if mask_name.endswith('mask.tif')
]

In [0]:
# Running totals, all starting at zero: images saved overall, images saved for
# the slide currently being processed, and masks found to contain tumor.
image_number = slide_number = tumor_images = 0

In [0]:
# For every (slide, mask) pair: load the level-4 image of both, then keep
# drawing random 480x480 windows until 1000 windows with more than 60% tissue
# have been saved as PNG patch/mask pairs in `train_path`.
for image_file, mask_file in img_mask:

  mask_path = os.path.join(drive_root, mask_file)
  slide_path = os.path.join(drive_root, image_file)

  slide = open_slide(slide_path)
  t_mask = open_slide(mask_path)
  try:
    w, h = slide.level_dimensions[4]

    # Read the whole level-4 image once; patches are sliced from these arrays.
    slide_image = reading_slides(slide, x = 0, y = 0, level = 4, width = w, height = h)
    mask_image = reading_slides(t_mask, x = 0, y = 0, level = 4, width = w, height = h)
  finally:
    # The pixels are held in NumPy arrays now, so release the slide handles
    # instead of leaking one pair per iteration.
    slide.close()
    t_mask.close()

  # Upper bounds for the window's top-left corner: a 480-pixel window starting
  # at most 500 pixels before the edge always fits inside the image.
  x_range, y_range = w - 500, h - 500

  # Restart the per-slide quota. Without the reset the counter stays at 1000
  # after the first slide and every later slide is skipped.
  slide_number = 0

  # NOTE(review): this loop only ends once 1000 qualifying windows are found;
  # a slide with very little tissue would keep it running indefinitely.
  while slide_number < 1000:
    x, y = randint(0, x_range), randint(0, y_range)
    new_img = slide_image[y:y+480, x:x+480]
    new_mask = mask_image[y:y+480, x:x+480]

    assert new_img.shape == (480,480,3)
    assert new_mask.shape == (480,480,3)

    tissue_p = list(finding_tissue_by_pixel(new_img))
    # Tissue share in percent (0-100): tissue pixels over height * width.
    total_pixels = new_img.shape[0] * new_img.shape[1]
    percent_tissue = 100.0 * len(tissue_p) / total_pixels

    if percent_tissue > 60.00:
      image_number += 1
      slide_number += 1
      plt.imsave(os.path.join(train_path,"train_{}.png".format(image_number)),new_img)
      plt.imsave(os.path.join(train_path,"train_mask_{}.png".format(image_number)),new_mask)
      print("Image {} has been stored".format(image_number))

In [0]:
# Gather the names of all files in the training folder that contain "mask"
# and check that the expected number of masks is present.
masks = []
for filename in os.listdir(train_path):
  if "mask" in filename:
    masks.append(filename)
assert len(masks) == 21000

In [0]:
# Reduce every saved mask to its first channel, overwrite the file with that
# single-channel array, and count the masks that contain a non-zero pixel.
# NOTE(review): plt.imsave presumably colour-maps 2-D arrays, so re-running
# this cell would read back colour-mapped files - confirm before re-running.
for m in masks:

  mask_image = np.array(Image.open(os.path.join(train_path, m)).convert('RGB'))
  mask_image = mask_image[:,:,0]

  # Use the array loaded above; `mask_img` is not defined at this point.
  plt.imsave(os.path.join(train_path, m), mask_image)
  if np.max(mask_image) != 0:
    tumor_images += 1

In [0]:
# Share of mask files that contain tumor, in percent and rounded to two
# decimals. Uses the `tumor_images` counter and the actual number of masks.
print("Images with cancer percentage: {}%".format(round(100.0 * tumor_images / len(masks), 2)))

In [0]:
# Names of all files currently in the training folder.
files = os.listdir(train_path)
# Filled by the next cell with (image path, mask path) tuples.
pairs = []

In [0]:
# Match every mask file with its image: a name with "mask" at characters 6-9
# is a mask, and removing "mask_" from it gives the image's file name.
for filename in files:
  if filename[6:10] != 'mask':
    continue
  name_parts = filename.split("mask_")
  img_file = name_parts[0] + name_parts[1]
  image_path = os.path.join(train_path, img_file)
  mask_path_png = os.path.join(train_path, filename)
  pairs.append((image_path, mask_path_png))

In [0]:
# Load the first (image, mask) pair: the image as RGB, the mask as 8-bit
# grayscale ('L'), ready for the overlay plot in the next cell.
img_file, mask_img_file = pairs[0]
img = Image.open(img_file).convert('RGB')
mask_img = Image.open(mask_img_file).convert('L')

In [0]:
# Draw the patch and lay the mask over it semi-transparently.
# 'Greens' is a registered colormap; plain 'green' is only a colour name and
# is rejected by imshow with a ValueError.
plt.figure(figsize=(5,5), dpi=100)
plt.imshow(img)
plt.imshow(mask_img, cmap='Greens', alpha=0.3)
plt.show()

In [0]:
# Show patch 6199 with its mask overlaid.
img_file, mask_img_file = os.path.join(train_path, 'train_6199.png'), os.path.join(train_path, 'train_mask_6199.png')
img = Image.open(img_file).convert('RGB')
mask_img = Image.open(mask_img_file).convert('L')

# 'Blues' is a registered colormap; plain 'blue' is only a colour name and
# is rejected by imshow with a ValueError.
plt.figure(figsize=(5,5), dpi=100)
plt.imshow(img)
plt.imshow(mask_img, cmap='Blues', alpha=0.6)
plt.show()

In [0]:
# Show patch 1698 with its mask overlaid using the 'jet' colormap.
img_file = os.path.join(train_path, 'train_1698.png')
mask_img_file = os.path.join(train_path, 'train_mask_1698.png')
img = Image.open(img_file).convert('RGB')
mask_img = Image.open(mask_img_file).convert('L')

fig, ax = plt.subplots(figsize=(5,5), dpi=100)
ax.imshow(img)
ax.imshow(mask_img, cmap='jet', alpha=0.5)
plt.show()