# IMPORTS

In [1]:
# Install openslide
%%capture
!apt-get install openslide-tools 
!apt-get install python3-openslide
!pip install gcsfs

In [2]:
import os, sys, math
import glob
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

import openslide
from openslide import OpenSlideError
import numpy as np
import multiprocessing

from IPython.display import Image
from IPython.display import clear_output
from google.cloud import storage

if 'google.colab' in sys.modules: # Colab-only Tensorflow version selector
  %tensorflow_version 2.x
import tensorflow as tf
import gcsfs
print("Tensorflow version " + tf.__version__)

Tensorflow version 2.2.0


# PARAMETERS

In [3]:
DATA_PATH = 'gs://kds-ada7d35cbdd41095c77a3724f32409454cb4258b266b089616a4cc5a' # PANDA bucket
DATA_PATTERN = DATA_PATH + '/*/*.tiff'
PROJECT_ID = 'prostate-cancer-grade-assessment'  # prefix for output file names
SRC_BUCKET = DATA_PATH[5:]
DEST_BUCKET = 'huynhdoo'
DEST_FOLDER = 'prostate-cancer'

OUTPUT_DIR = './'
TRAIN_DIR = 'train_images'
TRAIN_PATH = os.path.join(DATA_PATH, TRAIN_DIR)
TRAIN_EXT = '.tiff'
MASK_DIR = 'train_label_masks'
MASK_PATH = os.path.join(DATA_PATH, MASK_DIR)
MASK_EXT = "_mask.tiff"

In [4]:
!gsutil ls $DATA_PATH

gs://kds-ada7d35cbdd41095c77a3724f32409454cb4258b266b089616a4cc5a/sample_submission.csv
gs://kds-ada7d35cbdd41095c77a3724f32409454cb4258b266b089616a4cc5a/test.csv
gs://kds-ada7d35cbdd41095c77a3724f32409454cb4258b266b089616a4cc5a/train.csv
gs://kds-ada7d35cbdd41095c77a3724f32409454cb4258b266b089616a4cc5a/train_images/
gs://kds-ada7d35cbdd41095c77a3724f32409454cb4258b266b089616a4cc5a/train_label_masks/


# DATASET

In [5]:
# Add filenames to dataframe
train_df = pd.read_csv(DATA_PATH + '/train.csv')
image_path = tf.io.gfile.glob(TRAIN_PATH + '/*' + TRAIN_EXT)

# Add train file column for each existing file in train folder
train_df['image_path'] = image_path #Images are in the same order

In [6]:
train_df.head()

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,image_path
0,0005f7aaab2800f6170c399693a96917,karolinska,0,0+0,gs://kds-ada7d35cbdd41095c77a3724f32409454cb42...
1,000920ad0b612851f8e01bcc880d9b3d,karolinska,0,0+0,gs://kds-ada7d35cbdd41095c77a3724f32409454cb42...
2,0018ae58b01bdadc8e347995b69f99aa,radboud,4,4+4,gs://kds-ada7d35cbdd41095c77a3724f32409454cb42...
3,001c62abd11fa4b57bf7a6c603a11bb9,karolinska,4,4+4,gs://kds-ada7d35cbdd41095c77a3724f32409454cb42...
4,001d865e65ef5d2579c190a0e0350d8f,karolinska,0,0+0,gs://kds-ada7d35cbdd41095c77a3724f32409454cb42...


In [7]:
# Data abnormalty
train_df.loc[(train_df['isup_grade'] == 2) & (train_df['gleason_score'] == '4+3')]

Unnamed: 0,image_id,data_provider,isup_grade,gleason_score,image_path
7273,b0a92a74cb53899311acc30b7405e101,karolinska,2,4+3,gs://kds-ada7d35cbdd41095c77a3724f32409454cb42...


In [8]:
# Data correction
train_df.loc[(train_df['isup_grade'] == 2) & (train_df['gleason_score'] == '4+3'), 'isup_grade'] = 3

# I/O files

In [9]:
# Open a slide
def open_slide(filename, tmp_file='temp.tmp'):
    """
    Open a whole-slide image (WSI)
    :filename : Name of the slide file.
    return: an OpenSlide object representing a whole-slide image.
    """
    try:
        download(SRC_BUCKET, filename, tmp_file)
        slide = openslide.open_slide(tmp_file)
    except OpenSlideError:
        slide = None
    except FileNotFoundError:
        slide = None
    return slide 

def download_image(name, path):
    remote_path = 'https://storage.googleapis.com/' + path[len('gs://'):]
    local_path = tf.keras.utils.get_file(name+'.tiff', origin = remote_path)
    return local_path

def remove_image(path):
    os.remove(path)

def open_image(path, level):
    # Open slide
    slide = openslide.open_slide(path)
    dimensions = slide.level_dimensions[level]
    image = slide.read_region((0, 0), level, dimensions)
    image = image.convert('RGB')

    # close slide
    slide.close()
    return image

def save_image(image, path):    
    image.save(path)

def upload_image(src, dest):
    storage_client = storage.Client(project=PROJECT_ID)
    bucket = storage_client.bucket(DEST_BUCKET)
    blob_name = os.path.join(DEST_FOLDER, dest)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(src)

def display_image(img):
    img_array = np.array(img)
    print('shape :', img_array.shape)
    display(tf.keras.preprocessing.image.array_to_img(img_array))

# GOOGLE CLOUD CREDENTIALS

In [10]:
if 'google.colab' in sys.modules:
   from google.colab import auth
   auth.authenticate_user()

# PATCHES EXTRACTION
ref : https://www.kaggle.com/ajenningsfrankston/scaled-tiles-with-efficient-net

In [11]:
def get_patches(img, mode=0, tile_size=128, n_tiles=16, threshold=0.8):
    result = []
    h, w, c = img.shape

    # Padding image
    pad_s = (tile_size * mode) // 2 # Stride +0% or +50%
    pad_h = (tile_size - h % tile_size) % tile_size + pad_s # Pad on height
    pad_w = (tile_size - w % tile_size) % tile_size + pad_s # Pad on width
    img = np.pad(img,[[pad_h // 2, pad_h - pad_h // 2], [pad_w // 2,pad_w - pad_w//2], [0,0]], constant_values=255)

    # Reshape image into tile (row, size, col, size, 3)
    img = img.reshape(img.shape[0] // tile_size, tile_size, 
                      img.shape[1] // tile_size, tile_size, 
                      3)
    # Reshape image into (row x col, size, size, 3)
    img = img.transpose(0,2,1,3,4).reshape(-1, tile_size, tile_size, 3)
    
    #n_tiles_with_info = (img.reshape(img.shape[0],-1).sum(1) < tile_size ** 2 * 3 * 255).sum()
    # Count non-blank tiles
    n_tiles_with_info = (img.reshape(img.shape[0],-1).sum(1) / (tile_size ** 2 * 3 * 255) < threshold).sum()
    
    # Complete with blank tiles
    if len(img) < n_tiles:
        img = np.pad(img,[[0,n_tiles-len(img)],[0,0],[0,0],[0,0]], constant_values=255)

    # Keep N tiles sort by pixel density (darker to lighter)
    idxs = np.argsort(img.reshape(img.shape[0],-1).sum(-1))[:n_tiles]
    img = img[idxs]
    for i in range(len(img)):
        result.append({'img':img[i], 'idx':i})
    #return result, n_tiles_with_info >= n_tiles
    return result, n_tiles_with_info

def get_mounting(patches, tile_size=128, n_tiles=16):
    # sub_imgs = False
    # idxes = np.asarray(idxes) + n_tiles if sub_imgs else idxes    
    idxes = list(range(n_tiles))
    n_row_tiles = int(np.sqrt(n_tiles))
    mounting = np.zeros((tile_size * n_row_tiles, tile_size * n_row_tiles, 3))
    for h in range(n_row_tiles):
        for w in range(n_row_tiles):
            i = h * n_row_tiles + w
            if len(patches) > idxes[i]:
                this_img = patches[idxes[i]]['img']
            else:
                this_img = np.ones((tile_size, tile_size, 3)).astype(np.uint8) * 255
            this_img = 255 - this_img
            h1 = h * tile_size
            w1 = w * tile_size
            mounting[h1:h1+tile_size, w1:w1+tile_size] = this_img
    mounting = 255 - mounting
    mounting = mounting.astype(np.float32)
    mounting /= 255
    #mounting = mounting.transpose(2, 0, 1)
    return mounting

# PROCESSING

## Export utilities

In [12]:
def mount_patches(idx, name, path, label, provider):
    slide = download_image(name, path)
    image = np.array(open_image(slide, 1))

    for mode in (0, 2):
        for SIZE, TILES in SIZE_TILES:
            patches, infos = get_patches(image, mode=mode, tile_size=SIZE, n_tiles=TILES, threshold=THRESHOLD) # Extract patches
            mounting = get_mounting(patches, tile_size=SIZE, n_tiles=TILES) # Make montage from patch
            mounting = tf.keras.preprocessing.image.array_to_img(mounting)
            filename = name + '_' + str(mode) + EXTENSION

            EXPORT_FOLDER = os.path.join('mounting', str(SIZE) + 'x' + str(TILES))
            if not(os.path.exists(EXPORT_FOLDER)):
                os.makedirs(EXPORT_FOLDER) # Create export folder

            src_img = os.path.join(EXPORT_FOLDER, filename)
            dest_img = os.path.join(EXPORT_FOLDER, provider, str(label), filename)
            save_image(mounting, src_img)        
            upload_image(src_img, dest_img)
            #os.remove(src_img) # Remove patch image
    
    remove_image(slide) # Remove slide image to save local disk space
    return idx

def proceed(result):
    clear_output()
    print('Proceed file', result)

def multiprocess(num_files):
    # Processes available
    num_processes = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(num_processes)

    # Image per process split
    if num_processes > num_files:
        num_processes = num_files

    print("Number of processes: " + str(num_processes))
    print("Number of files: " + str(num_files))
    return pool

## Multiprocess export

In [None]:
# Export patches from slides
SIZE_TILES = ((256, 6**2))
THRESHOLD = 0.85 # Limit of a relevant tile
EXTENSION = '.jpg' # Output extension
LEVEL = -1 # Slide level definition
START = 0 # To restart extraction if needed
END = -1

# start tasks pooling
pool = multiprocess(len(train_df[START:END]))
for idx, row in train_df[START:END].iterrows():
    name, path, label, provider = row.image_id, row.image_path, row.isup_grade, row.data_provider
    result = pool.apply_async(mount_patches, args = (idx, name, path, label, provider), callback = proceed)
pool.close()
pool.join()

Proceed file 10614


# CLEANING

In [None]:
# To save local disk space
#!rm -r mounting

In [None]:
# WARNING: only for test
#!gsutil -m rm -r gs://huynhdoo/prostate-cancer/mounting

CommandException: 1 files/objects could not be removed.
