In [1]:
## This notebook loads the images saved in the "images" folder into a SlideManager.
## Each image is split into 512 x 512 patches, which are saved to a new directory called "patches".
## Patches saved in "patches//0" are benign, whereas patches saved in "patches//1" are neoplastic.

In [2]:
import random
import datetime
import numpy as np
import h5py
from skimage.filters import threshold_otsu
from skimage import io
from matplotlib import pyplot as plt
import tifffile
import os
import sys
import shutil
from skimage.draw import polygon as ski_polygon
import openslide
from preprocessing.datamodel import SlideManager
from preprocessing.processing import split_negative_slide, split_positive_slide, create_tumor_mask, rgb2gray
from preprocessing.util import TileMap

%matplotlib inline

In [3]:
# Widen the notebook's ".container" element to 90% of the browser window.
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [4]:
def create_dir(path):
    """Create the directory ``path`` if nothing exists at that location yet.

    Missing parent directories are created as well, so nested output folders
    such as ``patches//0//`` can be created in a single call. Calling this
    for a path that already exists is a no-op.

    Parameters
    ----------
    path : str
        Directory to create.
    """
    if not os.path.exists(path):
        # exist_ok=True keeps this safe if the directory appears between the
        # existence check above and the creation below.
        os.makedirs(path, exist_ok=True)

In [5]:
# Root data directory handed to the slide manager ('' = current working directory)
DIR = ''
mgr = SlideManager(cam16_dir=DIR)

# Fetch both slide collections from the manager: annotated and normal
slides_met = mgr.met_slides
slides_negative = mgr.negative_slides

# Report how many slides of each kind are available
N_met = len(slides_met)
N_negative = len(slides_negative)
print('Number of annotated slides:', N_met)
print('Number of normal slides:', N_negative)

Number of annotated slides: 2
Number of normal slides: 2


In [6]:
## Patch-extraction parameters shared by the three extraction cells below.
level = 0 ## Level of downsampling (0 = no downsampling)
tile_size = 512 # Size in px of patches
# Passed as `poi_threshold` to split_negative_slide / split_positive_slide.
# Presumably the minimum fraction (0.20 = 20%) of tissue / tumor pixels a tile
# must contain to be kept -- TODO confirm against preprocessing.processing.
poi = 0.20
overlap = tile_size // 2 # Overlap between patches: half a tile (256 px for tile_size = 512)

## Extract and save negative patches from negative slides

In [7]:
print('Number of negative slides = ' + str(N_negative))

# Make sure the output folders for class-0 patches exist
create_dir(DIR + 'patches//')
create_dir(DIR + 'patches//0//')

for i in range(N_negative):
    negative_slide = mgr.negative_slides[i]  # i-th normal slide

    # Otsu threshold computed on a grayscale version of the level-4 slide image
    arr = np.asarray(negative_slide.get_full_slide(level=4))
    arr_gray = rgb2gray(arr)
    threshold = threshold_otsu(arr_gray)

    # Iterator yielding (tile, bounds) pairs; `poi` (set in the parameter cell)
    # is handed over as the iterator's poi_threshold
    tile_iter = split_negative_slide(negative_slide,
                                     level=level,
                                     otsu_threshold=threshold,
                                     tile_size=tile_size,
                                     overlap=overlap,
                                     poi_threshold=poi)

    # Tile map for this slide; every written tile is registered on it
    tm = TileMap(negative_slide)

    cur = 0  # number of tiles written for this slide
    for tile, bounds in tile_iter:
        origin = bounds[0]  # (x, y) of the tile, used in the file name
        tile_name = str(negative_slide.name) + '_x' + str(origin[0]) + ',y' + str(origin[1]) + '_class0.tif'
        tifffile.imwrite(DIR + 'patches//0//' + tile_name, tile, photometric='rgb')
        tm.add_tile(bounds)
        cur += 1

    print('\n{} tiles written'.format(cur) + ' for ' + str(negative_slide.name))

Number of negative slides = 2

121 tiles written for case00_img01

108 tiles written for case01_img03


## Extract and save positive patches from positive slides

In [8]:
print('Number of annotated slides = ' + str(N_met))

# Output folder for class-1 patches
create_dir(DIR + 'patches//1//')

for i in range(N_met):
    positive_slide = mgr.met_slides[i]  # i-th annotated slide

    # Iterator yielding (tile, bounds) pairs; `poi` (set in the parameter cell)
    # is handed over as the iterator's poi_threshold
    tile_iter = split_positive_slide(positive_slide,
                                     level=level,
                                     tile_size=tile_size,
                                     overlap=overlap,
                                     poi_threshold=poi)

    # Tile map for this slide; every written tile is registered on it
    tm = TileMap(positive_slide)
    # Bare attribute access kept as in the original; nothing is displayed because
    # it is not the cell's last expression.
    # NOTE(review): presumably a leftover -- confirm TileMap.image has no side effects before removing.
    tm.image

    # Write each tile to its own TIF file
    cur = 0  # number of tiles written for this slide
    for tile, bounds in tile_iter:
        origin = bounds[0]  # (x, y) of the tile, used in the file name
        tile_name = str(positive_slide.name) + '_x' + str(origin[0]) + ',y' + str(origin[1]) + '_class1.tif'
        tifffile.imwrite(DIR + 'patches//1//' + tile_name, tile, photometric='rgb')
        tm.add_tile(bounds)
        cur += 1

    print('\n{} tiles written'.format(cur) + ' for ' + str(positive_slide.name))

Number of annotated slides = 2

111 tiles written for case00_img00

70 tiles written for case01_img02


## Extract and save negative patches from positive slides

In [9]:
def _to_level_pixels(coords, downsample, offset=0):
    """Convert full-resolution slide coordinates to integer pixel indices.

    ``offset`` is subtracted, the result is divided by ``downsample``, then 0.5
    is added and the value is truncated to int32 (i.e. rounded to the nearest
    pixel for non-negative coordinates).
    """
    scaled = np.array(coords, dtype=np.float32)
    scaled -= offset
    scaled /= downsample
    return np.array(scaled + 0.5, dtype=np.int32)


for i in range(N_met):
    positive_slide = mgr.met_slides[i]  # i-th annotated slide
    size = positive_slide.level_dimensions[level]
    downsample = positive_slide.level_downsamples[level]
    start_pos = (0, 0)

    ## Rasterise every annotated region of the slide into a binary mask
    met_mask = np.zeros((size[1], size[0]), dtype=np.uint8)
    rr_parts, cc_parts = [], []
    for annotation in positive_slide.annotations:
        c_values, r_values = list(zip(*annotation.polygon))
        r = _to_level_pixels(r_values, downsample)
        c = _to_level_pixels(c_values, downsample)
        rr_temp, cc_temp = ski_polygon(r, c, shape=met_mask.shape)
        rr_parts.append(rr_temp)
        cc_parts.append(cc_temp)

    # rr_met / cc_met hold the pixel coordinates of ALL annotated regions in the slide.
    # A slide without annotations yields empty index arrays and therefore an all-zero mask
    # (previously this case raised a NameError).
    if rr_parts:
        rr_met = np.concatenate(rr_parts)
        cc_met = np.concatenate(cc_parts)
    else:
        rr_met = np.empty(0, dtype=np.intp)
        cc_met = np.empty(0, dtype=np.intp)
    met_mask[rr_met, cc_met] = 1

    # Mask of the regions covered by the negative patches that get written below
    negative_mask = np.zeros((size[1], size[0]), dtype=np.uint8)

    ## Use Otsu threshold to get tissue mask
    # NOTE(review): this loads the slide at level=0, whereas the negative-slide cell computes
    # its threshold from level=4 -- confirm the full-resolution load is intended.
    arr = np.asarray(positive_slide.get_full_slide(level=0))
    arr_gray = rgb2gray(arr)
    threshold = threshold_otsu(arr_gray)

    # Tile map for this slide; only the tiles that are written get registered on it
    tm = TileMap(positive_slide)

    # Create a new and unconsumed tile iterator
    tile_iter = split_negative_slide(positive_slide, level=level,
                                     otsu_threshold=threshold,
                                     tile_size=tile_size,
                                     overlap=overlap,
                                     poi_threshold=poi)

    cur = 0  # number of tiles written for this slide
    for tile, bounds in tile_iter:  # bounds is (X,Y),(width,height), coordinates on full-resolution slide
        X = bounds[0][0]  # X coordinate of top left corner on full-resolution slide
        Y = bounds[0][1]  # Y coordinate of top left corner on full-resolution slide
        width = bounds[1][0]  # width of patch
        height = bounds[1][1]  # height of patch

        # Rectangle covered by the patch, expressed in mask pixel coordinates
        r = _to_level_pixels((Y, Y, Y + height, Y + height), downsample, start_pos[1])
        c = _to_level_pixels((X, X + width, X + width, X), downsample, start_pos[0])
        rr_tissue, cc_tissue = ski_polygon(r, c, shape=negative_mask.shape)

        if met_mask[rr_tissue, cc_tissue].any():
            continue  # patch overlaps an annotated region, so it is not written as class 0

        tm.add_tile(bounds)
        negative_mask[rr_tissue, cc_tissue] = 1
        tifffile.imwrite(DIR + 'patches//0//' + str(positive_slide.name) + '_x' + \
                         str(bounds[0][0]) + ',y' + str(bounds[0][1]) + '_class0.tif', tile, photometric = 'rgb')
        cur += 1

    print('\n{} tiles written'.format(cur) + ' for ' + str(positive_slide.name))


13 tiles written for case00_img00

2 tiles written for case01_img02


## Normalize each channel of all patches to 0 - 255

In [10]:
# Rescale the class-0 patches in place, reading from the same DIR-relative folder
# the extraction cells wrote to.
path = DIR + 'patches//0//'
for file in os.listdir(path):
    if not file.lower().endswith(('.tif', '.tiff')):
        continue  # ignore anything in the folder that is not a TIFF patch

    im = io.imread(path + file)

    ## Scale ch 0 and ch 1 so that each channel's maximum becomes 255 (ch 2 is left untouched)
    for ch in (0, 1):
        ch_max = im[:, :, ch].max()
        if ch_max == 0:
            continue  # all-zero channel: nothing to scale, and dividing by 0 would yield NaN
        im[:, :, ch] = im[:, :, ch].astype('float64') * 255 / ch_max.astype('float64')

    tifffile.imwrite(path + file, im, photometric = 'rgb')

In [11]:
# Rescale the class-1 patches in place, reading from the same DIR-relative folder
# the extraction cell wrote to.
path = DIR + 'patches//1//'
for file in os.listdir(path):
    if not file.lower().endswith(('.tif', '.tiff')):
        continue  # ignore anything in the folder that is not a TIFF patch

    im = io.imread(path + file)

    ## Scale ch 0 and ch 1 so that each channel's maximum becomes 255 (ch 2 is left untouched)
    for ch in (0, 1):
        ch_max = im[:, :, ch].max()
        if ch_max == 0:
            continue  # all-zero channel: nothing to scale, and dividing by 0 would yield NaN
        im[:, :, ch] = im[:, :, ch].astype('float64') * 255 / ch_max.astype('float64')

    tifffile.imwrite(path + file, im, photometric = 'rgb')