In [1]:
import math
import os
import numpy as np
import openslide
import shutil
from pathlib import Path
from matplotlib import pyplot as plt
from PIL import Image
from openslide import OpenSlideError
import pandas as pd
# from scipy.ndimage.morphology import binary_fill_holes
# from skimage.color import rgb2gray
# from skimage.feature import canny
# from skimage.morphology import binary_closing, binary_dilation, disk

In [2]:
def open_one_slide(filename):
    """Open a whole-slide image, returning ``None`` when it cannot be opened.

    Args:
        filename: Path of the slide file, passed to ``openslide.open_slide``.

    Returns:
        Whatever ``openslide.open_slide`` returns, or ``None`` if that call
        raised ``OpenSlideError`` or ``FileNotFoundError``. Any other
        exception propagates to the caller.
    """
    try:
        return openslide.open_slide(filename)
    except (OpenSlideError, FileNotFoundError):
        # Unreadable or missing slide: report failure as None.
        return None

def create_tile_generator(slide, tile_size, overlap):
    """Build a Deep Zoom tile generator for a slide.

    Args:
        slide: Slide object handed to ``DeepZoomGenerator``.
        tile_size: Forwarded as the generator's ``tile_size``.
        overlap: Forwarded as the generator's ``overlap``.

    Returns:
        A ``DeepZoomGenerator`` created with ``limit_bounds=True``.
    """
    # The notebook's import cell never brings DeepZoomGenerator into scope,
    # so the bare name raised NameError. Import it here, from the openslide
    # package the notebook already depends on.
    from openslide.deepzoom import DeepZoomGenerator

    return DeepZoomGenerator(slide, tile_size=tile_size, overlap=overlap, limit_bounds=True)


# Determine 20x Magnification Zoom Level
def get_20x_zoom_level(slide, generator):
    """Return the generator zoom level to use for roughly 20x magnification.

    The slide's objective power is read from
    ``slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER]``. The result
    is the generator's highest level minus ``floor((mag / 20) / 2)``, which
    assumes a 2x downsampling factor between generator levels.

    Args:
        slide: Object exposing a ``properties`` mapping.
        generator: Object exposing ``level_count``.

    Returns:
        The chosen zoom level. If the objective power is missing or cannot be
        parsed with ``int``, the highest-resolution level is returned.
    """
    top_level = generator.level_count - 1  # levels are indexed from 0
    try:
        objective_power = int(slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER])
    except (ValueError, KeyError):
        # Unknown magnification: fall back to the highest resolution.
        return top_level
    # (mag / 20) is the downsampling factor from the slide's magnification to
    # 20x; halving it gives the number of levels to step down from the top.
    levels_down = math.floor((objective_power / 20) / 2)
    return top_level - levels_down

def get_all_patient_slides(path):
    """Collect every file found one directory level below ``path``.

    Args:
        path: A ``pathlib.Path`` whose immediate sub-directories are scanned.
            Entries of ``path`` that are not directories are ignored.

    Returns:
        A ``(slide_files, patient_ids)`` tuple of two parallel lists.
        ``slide_files`` holds each file path as a string. ``patient_ids``
        holds the first 15 characters of the file name up to its first
        ``'.'``. Order follows ``Path.iterdir()``.
    """
    slide_files = []
    patient_ids = []
    for sub_dir in path.iterdir():
        if not sub_dir.is_dir():
            continue
        for slide_path in sub_dir.iterdir():
            slide_files.append(str(slide_path))
            # Path.name is the final path component on every platform;
            # splitting the string on '/' only worked with POSIX separators.
            patient_ids.append(slide_path.name.split('.')[0][:15])
    return slide_files, patient_ids

In [3]:
# Different function for tile extraction
def get_tiles_from_generator(generator, cols, rows, zoom=None, tissue_threshold=None, max_tiles=100):
    """Display tiles whose detected-tissue fraction is at or below a threshold.

    Each tile goes through grayscale conversion, complement, Canny edge
    detection, binary closing, binary dilation and hole filling. The mean of
    the resulting mask is used as the tile's tissue fraction.

    Args:
        generator: Tile generator exposing ``get_tile(level, (col, row))`` and
            ``level_count``.
        cols: Number of tile columns to scan.
        rows: Number of tile rows to scan.
        zoom: Generator level to read tiles from. Defaults to the highest
            level, ``generator.level_count - 1``.
        tissue_threshold: Tiles with a mask mean ``<=`` this value are shown.
            Defaults to the notebook-level ``threshold`` setting.
        max_tiles: Stop after this many tiles have been shown. ``None``
            disables the cap.

    Returns:
        None. Matching tiles are rendered with matplotlib.
    """
    # Function-scope imports: the import cell has these commented out, so the
    # bare names were undefined. Keeping them local means the rest of the
    # notebook still runs when scikit-image is not installed.
    from scipy.ndimage import binary_fill_holes
    from skimage.color import rgb2gray
    from skimage.feature import canny
    from skimage.morphology import binary_closing, binary_dilation, disk

    if zoom is None:
        # `zoom` used to be read from an undefined global.
        zoom = generator.level_count - 1
    if tissue_threshold is None:
        tissue_threshold = threshold
    footprint = disk(10)  # loop-invariant structuring element

    # The counter was never initialised before, which raised
    # UnboundLocalError on the very first tile.
    shown = 0
    for col in range(cols):
        for row in range(rows):
            if max_tiles is not None and shown >= max_tiles:
                # `break` only left the inner loop; return stops all work.
                return
            t = generator.get_tile(zoom, (col, row))
            tile = rgb2gray(np.asarray(t))
            # Complement, from 1 (dense tissue) to 0 (plain background).
            tile = 1 - tile
            tile = canny(tile)
            tile = binary_closing(tile, footprint)
            tile = binary_dilation(tile, footprint)
            tile = binary_fill_holes(tile)
            percentage = tile.mean()
            # NOTE(review): tiles at or BELOW the threshold are displayed,
            # i.e. those with the least detected tissue. Confirm that this
            # direction is intended.
            if percentage <= tissue_threshold:
                print()
                plt.imshow(t)
                plt.show()
                shown += 1

def get_tile_from_original_slide(slide, patch_size, target_mag, save_dir, patient_id, tissue_threshold=None):
    """Cut a slide into square patches and save the ones below a brightness cut-off.

    Level 0 of the slide is walked on a non-overlapping grid. Each grid cell
    is ``patch_size * magnification / target_mag`` pixels wide and is resized
    to ``patch_size`` x ``patch_size``. A patch is saved as
    ``<patient_id>_<index>.png`` when its mean value after conversion to PIL
    mode ``'1'`` is below the cut-off.

    Args:
        slide: Slide exposing ``properties``, ``level_dimensions`` and
            ``read_region``.
        patch_size: Side length in pixels of the saved patches.
        target_mag: Magnification the patches are rescaled to.
        save_dir: ``pathlib.Path`` of the output root. Patches go into
            ``save_dir / patient_id``.
        patient_id: Name of the output sub-directory and file name prefix.
        tissue_threshold: Cut-off in [0, 1]. Defaults to the notebook-level
            ``threshold`` setting.

    Returns:
        None. Returns early, writing nothing, when the slide has no
        ``'aperio.AppMag'`` property or the patient directory already exists.
    """
    print("Processing tile extraction for patient ", patient_id)
    if 'aperio.AppMag' not in slide.properties:
        print("No 'aperio.AppMag' property, skipping")
        return
    magnification = float(slide.properties['aperio.AppMag'])
    extract_patch_size = int(patch_size * magnification / target_mag)

    # Trim the slide so only whole patches are visited.
    w, h = slide.level_dimensions[0]
    w = w // extract_patch_size * extract_patch_size
    h = h // extract_patch_size * extract_patch_size

    patient_dir = save_dir / patient_id
    if patient_dir.exists():
        # NOTE: the directory is created before any patch is written, so a
        # run that was interrupted part-way is also skipped here.
        print("Output directory already exists, skipping")
        return
    # parents=True: do not fail when the output root is missing.
    patient_dir.mkdir(parents=True)

    # Resolved only now so the early exits above never touch the global.
    cutoff = threshold if tissue_threshold is None else tissue_threshold

    count = 0  # index of the grid cell; gaps in saved names are skipped cells
    for i in range(0, w, extract_patch_size):
        for j in range(0, h, extract_patch_size):
            patch = slide.read_region((i, j), level=0, size=(extract_patch_size, extract_patch_size))
            patch = patch.resize((patch_size, patch_size))
            # Mode '1' is PIL's 1-bit black/white mode, so the mean is the
            # fraction of white pixels in the patch.
            patch_gray = patch.convert('1')
            ave_pixel_val = np.array(patch_gray).mean()
            if ave_pixel_val < cutoff:
                img_name = patient_id + '_' + str(count).zfill(4) + '.png'
                patch.save(str(patient_dir / img_name))
            count += 1
    print("Finished processing -------")


In [4]:
# some globals, can be changed to customize
target_mag = 20              # magnification the extracted patches are rescaled to
threshold = 220/255          # cut-off compared against a patch's mean value when filtering tiles
patch_size = 512             # side length (pixels) of the saved patches
num_tiles_per_person = 200   # not referenced by the cells visible in this notebook
data_dir = Path('../SVS_Raw/')
folders = [x for x in data_dir.iterdir() if x.is_dir()]
print(len(folders))

# Parallel lists: all_slides[i] is the slide path for all_patients[i].
all_slides = []
all_patients = []
for folder in folders:
    # Only the first .svs in a folder is used. A folder without one used to
    # raise IndexError; it is now reported and skipped.
    curr_slide = next(folder.glob('*.svs'), None)
    if curr_slide is None:
        print(f"No .svs file in {folder}, skipping")
        continue
    slide_name = str(curr_slide)
    print(slide_name)
    # The patient ID is the first 15 characters of the file name. Path.name
    # avoids depending on '/' as the path separator.
    curr_patient = curr_slide.name[:15]
    print(curr_patient)
    all_patients.append(curr_patient)
    all_slides.append(slide_name)


54
../SVS_Raw/122600cd-5205-43ff-b2fa-a22df695c1d3/TCGA-XF-A8HH-01Z-00-DX1.22E9DD6D-F231-422F-A349-9FF60B65D50E.svs
TCGA-XF-A8HH-01
../SVS_Raw/06bf471f-a488-4506-bc45-96e7e466bb85/TCGA-4Z-AA89-01Z-00-DX1.6DD89FC0-A062-41F1-AED3-FF8975FAADF3.svs
TCGA-4Z-AA89-01
../SVS_Raw/1ccdaf02-b51c-4624-a958-5c7d78c11d20/TCGA-5N-A9KI-01Z-00-DX1.1BAE5EFF-859F-4D0A-8DDC-527E2901F7D4.svs
TCGA-5N-A9KI-01
../SVS_Raw/1f05e999-0586-4467-8a29-2556e6a840c5/TCGA-G2-AA3F-01Z-00-DX8.D27EA057-F808-44FE-AF5D-9F7237D59488.svs
TCGA-G2-AA3F-01
../SVS_Raw/186a13bd-b0af-488f-a658-ab57736a6a6b/TCGA-E7-A3Y1-01Z-00-DX1.D3E81A0B-1D20-4916-8872-81D469A1E276.svs
TCGA-E7-A3Y1-01
../SVS_Raw/0e0b2c60-6f20-4eab-b41c-d046b63261b4/TCGA-BT-A2LA-01Z-00-DX1.B76379CE-99AB-4598-B55B-0FA800B8DB74.svs
TCGA-BT-A2LA-01
../SVS_Raw/1bb84e18-4962-4e14-8be5-957d3d1d90ac/TCGA-BT-A20J-01Z-00-DX1.EB1BC7DB-9BF1-467D-B897-9BA2130320CB.svs
TCGA-BT-A20J-01
../SVS_Raw/0fa7cb35-e412-4350-9fc8-b4fafb0399a9/TCGA-BT-A42B-01Z-00-DX1.00FCBDE2-728B-4024-A51

In [5]:
# Merge the patient IDs from this run with those recorded by earlier runs and
# write the combined list back to disk.
patients_file = Path('../all_patients_id.txt')
if patients_file.exists():
    print("Found existing patients, adding to it...")
    with open(patients_file, 'r') as fread:
        # Drop blank lines: the file ends with a newline, which used to add
        # an empty ID on every run.
        existing_patients = [line.strip() for line in fread if line.strip()]
    # Append only IDs that are not already listed, so re-running this cell no
    # longer duplicates entries. Entries from the current run keep their
    # positions, so all_patients[i] still matches all_slides[i].
    already_listed = set(all_patients)
    all_patients.extend(p for p in dict.fromkeys(existing_patients) if p not in already_listed)

with open(patients_file, 'w') as fwrite:
    # dict.fromkeys keeps first-seen order while writing each ID once.
    for p in dict.fromkeys(all_patients):
        fwrite.write(p)
        fwrite.write('\n')

save_dir_root = Path('../tile_data/')
save_dir_root.mkdir(parents=True, exist_ok=True)


Found existing patients, adding to it...


In [6]:
# Show the collected patient IDs: those from the current slide folders first,
# followed by any IDs that were read back from ../all_patients_id.txt.
all_patients

['TCGA-XF-A8HH-01',
 'TCGA-4Z-AA89-01',
 'TCGA-5N-A9KI-01',
 'TCGA-G2-AA3F-01',
 'TCGA-E7-A3Y1-01',
 'TCGA-BT-A2LA-01',
 'TCGA-BT-A20J-01',
 'TCGA-BT-A42B-01',
 'TCGA-ZF-AA52-01',
 'TCGA-G2-A2EK-01',
 'TCGA-C4-A0F6-01',
 'TCGA-GV-A3JZ-01',
 'TCGA-DK-A3IL-01',
 'TCGA-G2-A2EF-01',
 'TCGA-ZF-A9R7-01',
 'TCGA-PQ-A6FI-01',
 'TCGA-UY-A8OC-01',
 'TCGA-G2-A2ES-01',
 'TCGA-UY-A9PA-01',
 'TCGA-E7-A7DU-01',
 'TCGA-FD-A3SS-01',
 'TCGA-E7-A6ME-01',
 'TCGA-G2-A3IB-01',
 'TCGA-G2-A2EC-01',
 'TCGA-G2-AA3D-01',
 'TCGA-GV-A3QI-01',
 'TCGA-2F-A9KW-01',
 'TCGA-GC-A3WC-01',
 'TCGA-G2-A2EC-01',
 'TCGA-XF-AAN8-01',
 'TCGA-BT-A3PK-01',
 'TCGA-G2-A2EO-01',
 'TCGA-FD-A3B7-01',
 'TCGA-ZF-AA5H-01',
 'TCGA-DK-A1AB-01',
 'TCGA-G2-A2EJ-01',
 'TCGA-CU-A3KJ-01',
 'TCGA-G2-A2EC-01',
 'TCGA-SY-A9G0-01',
 'TCGA-CF-A47S-01',
 'TCGA-BT-A2LD-01',
 'TCGA-G2-A2EJ-01',
 'TCGA-CU-A0YN-01',
 'TCGA-ZF-AA5P-01',
 'TCGA-FD-A3B4-01',
 'TCGA-K4-A4AB-01',
 'TCGA-2F-A9KR-01',
 'TCGA-HQ-A5ND-01',
 'TCGA-E7-A97Q-01',
 'TCGA-G2-A2EF-01',


In [7]:
def get_thumbnail(save_dir, slide, patient_id, target_mg=20):
    """Save a thumbnail of a slide as ``thumbnail_<patient_id>.png``.

    The requested thumbnail size is 10 pixels per extraction patch in each
    direction. One extraction patch is
    ``patch_size * magnification / target_mg`` level-0 pixels, where
    ``patch_size`` is the notebook-level setting.

    Args:
        save_dir: Directory the PNG is written to. Created if missing.
        slide: Slide exposing ``properties``, ``level_dimensions`` and
            ``get_thumbnail``.
        patient_id: Used in the output file name.
        target_mg: Target magnification used to size the extraction patch.

    Returns:
        None. Nothing is written when the slide has no ``'aperio.AppMag'``
        property.
    """
    # Skip slides without the magnification property instead of raising
    # KeyError, matching get_tile_from_original_slide.
    if 'aperio.AppMag' not in slide.properties:
        print(f"No 'aperio.AppMag' property for {patient_id}, skipping thumbnail")
        return
    magnification = float(slide.properties['aperio.AppMag'])
    print(magnification)

    # Use the `target_mg` argument; the body used to read the global
    # `target_mag` and silently ignored this parameter.
    extract_patch_size = int(patch_size * magnification / target_mg)
    print(extract_patch_size)
    w, h = slide.level_dimensions[0]

    # Never request a zero-sized thumbnail for very small slides.
    th_w = max(1, int(w / extract_patch_size * 10))
    th_h = max(1, int(h / extract_patch_size * 10))
    thumbnail = slide.get_thumbnail((th_w, th_h))

    # The save used to fail when the directory had not been created by hand.
    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, f"thumbnail_{patient_id}.png")
    thumbnail.save(save_path)

In [None]:
# example = "../slide_data/TCGA-DK-A2I6-01/TCGA-DK-A2I6-01Z-00-DX1.7BF86D3B-D4F7-47F9-B021-7FD6B673A238.svs"
# slide = openslide.open_slide(str(example))
# save_dir = Path('../tile_data/')
# patient = 'TCGA-DK-A2I6-01'
# Write a thumbnail and the filtered tiles for every discovered slide.
thumb_save_dir = '../thumbnail_data'
os.makedirs(thumb_save_dir, exist_ok=True)

# all_patients may be longer than all_slides (IDs from earlier runs are
# appended to it). zip stops at the shorter list, so only slides discovered in
# this run are processed, as with the previous index-based loop.
for slide_path, patient_id in zip(all_slides, all_patients):
    # open_one_slide returns None for unreadable or missing files, so one bad
    # slide no longer aborts the whole run.
    slide = open_one_slide(str(slide_path))
    if slide is None:
        print(f"Could not open {slide_path}, skipping")
        continue
    try:
        get_thumbnail(thumb_save_dir, slide, patient_id)
        get_tile_from_original_slide(slide, patch_size, target_mag, save_dir_root, patient_id)
    finally:
        # Release the slide handle even if processing raised.
        slide.close()

40.0
1024
Processing tile extraction for patient  TCGA-XF-A8HH-01
40.0
1024
Processing tile extraction for patient  TCGA-4Z-AA89-01
40.0
1024
Processing tile extraction for patient  TCGA-5N-A9KI-01
40.0
1024
Processing tile extraction for patient  TCGA-G2-AA3F-01
40.0
1024
Processing tile extraction for patient  TCGA-E7-A3Y1-01
40.0
1024
Processing tile extraction for patient  TCGA-BT-A2LA-01
40.0
1024
Processing tile extraction for patient  TCGA-BT-A20J-01
40.0
1024
Processing tile extraction for patient  TCGA-BT-A42B-01


In [21]:
import glob

# Count the PNG tiles present in one example patient's output directory.
example = Path('../tile_data/TCGA-DK-A2I6-01/')
tiles = glob.glob(str(example / '*.png'))
len(tiles)

4997