In [1]:
!pip install lmdb

Collecting lmdb
  Downloading lmdb-1.3.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (305 kB)
[K     |████████████████████████████████| 305 kB 10.7 MB/s eta 0:00:01
[?25hInstalling collected packages: lmdb
Successfully installed lmdb-1.3.0


In [2]:
import openslide
from matplotlib.colors import ListedColormap
from skimage import color
import skimage.morphology  as skmp
from histolab.slide import Slide
import numpy as np
from PIL import Image
import tifffile
from histolab.filters.image_filters import (
    ApplyMaskImage,
    Compose,
    OtsuThreshold,
    RgbToGrayscale,GreenPenFilter,BluePenFilter
)
from histolab.filters.morphological_filters import BinaryDilation,BinaryErosion,BinaryClosing,BinaryOpening,RemoveSmallObjects,RemoveSmallHoles
import cv2
import glob
import time
import lmdb

In [3]:
def composed_filters(image_rgb):
    filters = Compose(
        [
            RgbToGrayscale(),
            OtsuThreshold(),
#             BinaryDilation(),
#             BinaryErosion(),
            BinaryClosing(),
            RemoveSmallHoles(),
#             BinaryClosing(),
#             BinaryDilation(),
            RemoveSmallObjects(),
            ApplyMaskImage(image_rgb),
            BluePenFilter(),
            GreenPenFilter(),

        ]
    )
    return filters(image_rgb)



class TileGenerator:
    def __init__(
        self,
        slide_path,
        tile_size,
        overlap=0
    ):
        """
        Create a DeepZoomGenerator wrapping an OpenSlide object.
        :param slide_path: location for the slide
        :param slide_id: id of current slide (e.g., slide name)
        :param tile_size: the width and height of a single tile.  For best viewer
                          performance, tile_size + 2 * overlap should be a power
                          of two.
        :param overlap: Overlapping size (only used when tiles are extracted from the grid)
        """
        self.slide = openslide.OpenSlide(slide_path)
        self.slide_path = slide_path
        self.slide_id = slide_path.split("/")[-1].split(".")[0]
        self.tile_size = tile_size
        self.overlap = int(overlap)
        self.extraction_tile_size = int(self.tile_size)
        self.extraction_overlap = int(self.overlap)
        
        #extract thumbnail and mask non-tissue
        thumb_down_rate = 8
        size = np.array(self.slide.level_dimensions)[0] // thumm_down_rate
        self.thumbnail = np.array(slide.get_thumbnail([size[0], size[1]]).convert('RGB'))
        self.thumbnail_masked = np.array(composed_filters(Image.fromarray(self.thumbnail)))
        
    def get_tile_locations(self, tissue_thred):
        """
        Generate tiles locations at the highest resolution from the grid
        :param tissue_thred: tissue threshold (i.e., tile contains tissue region greater than thred)
        :return:
        """
        # Get the downsample rate from level 0 to mask level
        downsample_rate = self.slide.level_dimensions[0][0] / self.thumbnail_masked.shape[1]
        tile_size = self.tile_size 
        overlap = self.overlap
        interval = tile_size - overlap
        # Calculate the tile size to be used at the mask level
        mask_tile_size = int(tile_size / downsample_rate)
        highest_dimension = self.slide.level_dimensions[0]
        lowest_dimension = self.thumbnail_masked.shape[:2]
        counter = 0
        location_tracker = {}
        for i in range(0, int(highest_dimension[0]), int(interval)):
            for j in range(0, int(highest_dimension[1]), int(interval)):
                mask_i = i / downsample_rate
                mask_j = j / downsample_rate
                if (
                    (mask_i + mask_tile_size) < lowest_dimension[1]
                    and (mask_j + mask_tile_size) < lowest_dimension[0]
                    and (i + tile_size < highest_dimension[0])
                    and (j + tile_size < highest_dimension[1])
                ):
                    mask_tile = self.thumbnail_masked[
                        int(mask_j) : int(mask_j + mask_tile_size),
                        int(mask_i) : int(mask_i + mask_tile_size),
                    ]
                    if (
                        float(np.sum(np.array(mask_tile > 0).astype(np.uint8))) / float(mask_tile_size ** 2)
                    ) >= tissue_thred:
                        location_tracker[counter] = (
                            int(i),
                            int(j),
                        )
                        counter += 1

        return counter, location_tracker


In [4]:
### function to exctract tile given its location from slide
def generate_tile(slide, tile_size, location, normalizer):
    orig_tile = slide.read_region((location[0], location[1]), 0, (tile_size, tile_size))
    orig_tile = np.asarray(orig_tile.convert('RGB'))
    return orig_tile
    return orig_tile

In [38]:
### generate list of tiles locations for given slide_id
t1 = time.time()
slide_id = '00c058_0'
slide_path = '/data/neuro/stroke/kaggle_strip_ai/train/{0}.tif'.format(slide_id)
slide = openslide.OpenSlide(slide_path)

slide_name = slide_path.split('/')[::-1][0].split('.')[0]
tile_size = 256
overlap = 0
tissue_thred = 0.5 

tile_generator = TileGenerator(
    slide_path,
    tile_size,
    overlap
)
counter, location_tracker = tile_generator.get_tile_locations(
    tissue_thred=tissue_thred
)
location_dict: dict = {}
location_dict[slide_name] = location_tracker
print('finished extracting tile location', (time.time() - t1))

finished extracting tile location 64.12947058677673


In [27]:
location_dict

{'029c68_0': {0: (4608, 13568),
  1: (4608, 13824),
  2: (4608, 14080),
  3: (4864, 11776),
  4: (4864, 12544),
  5: (4864, 12800),
  6: (4864, 13056),
  7: (4864, 13312),
  8: (4864, 13568),
  9: (4864, 13824),
  10: (4864, 14080),
  11: (5120, 11520),
  12: (5120, 11776),
  13: (5120, 12032),
  14: (5120, 12544),
  15: (5120, 12800),
  16: (5120, 13056),
  17: (5120, 13312),
  18: (5120, 13568),
  19: (5120, 13824),
  20: (5120, 14080),
  21: (5120, 14336),
  22: (5376, 11264),
  23: (5376, 11520),
  24: (5376, 11776),
  25: (5376, 12800),
  26: (5376, 13056),
  27: (5376, 13312),
  28: (5376, 13568),
  29: (5376, 13824),
  30: (5376, 14080),
  31: (5376, 14336),
  32: (5376, 14592),
  33: (5376, 14848),
  34: (5376, 15104),
  35: (5376, 15360),
  36: (5376, 15616),
  37: (5632, 11008),
  38: (5632, 11264),
  39: (5632, 11520),
  40: (5632, 11776),
  41: (5632, 13568),
  42: (5632, 13824),
  43: (5632, 14080),
  44: (5632, 14592),
  45: (5632, 14848),
  46: (5632, 15104),
  47: (5632

In [35]:
### find out how many tiles were exctracted
counter = len(location_dict[slide_id].keys())
counter

2439

In [31]:
### extract all tiles and save to orig_tiles array
t1 = time.time()
orig_tiles = np.zeros((counter, tile_size, tile_size, 3), dtype=np.uint8)
for tile_id in range(counter):
    cur_loc = location_dict['00c058_0'][tile_id]
    orig_tile = generate_tile(slide, tile_size, [int(cur_loc[0]), int(cur_loc[1])], normalizer=False)
    orig_tiles[tile_id] =  orig_tile
print('finished extracting tiles to array', (time.time() - t1))

finished extracting tiles to array 9.637192487716675


### lmdb database array writing

In [6]:
import lmdb

In [26]:
lmdb_loc = '/data/eredekop/stroke_kaggle/processed/lmdb'
env = lmdb.open("%s" % lmdb_loc, map_size=6e+13)

  env = lmdb.open("%s" % lmdb_loc, map_size=6e+13)


In [29]:
with env.begin(write=True) as txn:
    write_start = time.time()
    txn.put(str(slide_id).encode(), orig_tiles.astype(np.uint8).tobytes())
    # put location
#     txn_locs.put(("%s_loc" % str(data['slide_id'])).encode(), data['locations'].astype(np.int64).tobytes())
print("Finish writing time: %f" % (time.time() - write_start))

Finish writing time: 1.654622


In [39]:
### always need to close environment
env.close()

### lmdb database array reading

In [16]:
env = lmdb.open("%s" % (lmdb_loc), max_readers=3, readonly=True,
                             lock=False, readahead=False, meminit=False)

In [20]:
with env.begin(write=False) as txn:
    tilebuf = txn.get(str(slide_id).encode())
    tilebuf = np.frombuffer(tilebuf, dtype=np.uint8)
    tilebuf = tilebuf.reshape(-1, tile_size, tile_size, 3)

TypeError: a bytes-like object is required, not 'NoneType'

In [28]:
tilebuf.shape

AttributeError: 'NoneType' object has no attribute 'shape'

In [21]:
### always need to close environment
env.close()

### database for all slides

In [22]:
lmdb_loc = '/data/eredekop/stroke_kaggle/processed/lmdb'
env = lmdb.open("%s" % lmdb_loc, map_size=6e+13)

  env = lmdb.open("%s" % lmdb_loc, map_size=6e+13)


In [23]:
all_files = glob.glob('/data/neuro/stroke/kaggle_strip_ai/train/*.tif')
all_files

['/data/neuro/stroke/kaggle_strip_ai/train/006388_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/008e5c_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/00c058_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/01adc5_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/026c97_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/028989_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/029c68_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/032f10_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/0372b0_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/037300_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/03d1ec_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/03e6b7_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/0415c3_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/04439c_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/045eb0_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/0468a8_0.tif',
 '/data/neuro/stroke/kaggle_strip_ai/train/0468a8_1.tif',
 '/data/neuro/

In [24]:
with env.begin(write=True) as txn:
    for i in range(len(all_files)):
        t1 = time.time()
        slide_path = all_files[i]
        slide_id = slide_path.split('/')[::-1][0].split('.')[0]
        slide = openslide.OpenSlide(slide_path)
        
        tile_size = 256
        overlap = 0
        tissue_thred = 0.5 

        tile_generator = TileGenerator(
            slide_path,
            tile_size,
            overlap
        )
        counter, location_tracker = tile_generator.get_tile_locations(
            tissue_thred=tissue_thred
        )
        location_dict: dict = {}
        location_dict[slide_id] = location_tracker
        orig_tiles = np.zeros((counter, tile_size, tile_size, 3), dtype=np.uint8)
        for tile_id in range(counter):
            cur_loc = location_dict[slide_id][tile_id]
            orig_tile = generate_tile(slide, tile_size, [int(cur_loc[0]), int(cur_loc[1])], normalizer=False)
            orig_tiles[tile_id] =  orig_tile
        txn.put(str(slide_id).encode(), orig_tiles.astype(np.uint8).tobytes())
        print('Finished: ', i, slide_id, time.time() - t1)

Finished:  0 006388_0 381.42585134506226
Finished:  1 008e5c_0 26.912149906158447
Finished:  2 00c058_0 82.83898758888245
Finished:  3 01adc5_0 228.55203580856323
Finished:  4 026c97_0 13.725768566131592
Finished:  5 028989_0 443.6755635738373


Error: mdb_put: Input/output error

In [11]:
slide_id, orig_tiles.shape

('029c68_0', (2649, 256, 256, 3))

In [15]:
env.close()

In [None]:
pqueue = Queue()
start_idx = 0
for i in range(opts.num_ps-1):
    end_idx = start_idx + batch_size
    reader_p = Process(target=generate_helper, args=(pqueue, opts,
                                                     slides_to_process[start_idx: end_idx]))
    reader_p.start()
    reader_processes.append(reader_p)
    start_idx = end_idx
# Ensure all slides are processed by processes.
reader_p = Process(target=generate_helper, args=(pqueue, opts, slides_to_process[start_idx: len(slides_to_process)]))
reader_p.start()
reader_processes.append(reader_p)

counter, ihc_counter, num_done = 0, 0, 0
batches = []

while True:
    # Block if necessary until an item is available.
    data = pqueue.get()
    print("Queue len: %d" % pqueue.qsize())
    # Done indicates job on one process is finished.
    if data == "Done":
        num_done += 1
        print("One part is done!")
        if num_done == opts.num_ps:
            break
    elif data == "ihc":
        counter += 1
        ihc_counter += 1
    else:
        batches.append(data)
    # Write a batch of data.
    if len(batches) == opts.write_batch_size:
        counter = write_batch_data(env_5x, env_10x, env_locs, batches, len(slides_to_process), counter)
# Write the rest data.
if len(batches) > 0:
    counter = write_batch_data(env_5x, env_10x, env_locs, batches, len(slides_to_process), counter)
for process in reader_processes:
    process.join()
assert counter == len(slides_to_process), "%d processed slides, %d slides to be processed" \
                                          % (counter, len(slides_to_process))
print("Number of ihc slides: %d" % ihc_counter)

In [None]:
def generate_helper(pqueue, opts, slides_to_process):
    tile_normalizer = reinhard_bg.ReinhardNormalizer()
    # use the pre-computed LAB mean and std values
    tile_normalizer.fit(None)
    counter = 0
    for slide_info in slides_to_process:
        slide_id = slide_info['slide_id']
        slide_loc = opts.slide_loc
        if not os.path.isfile("%s/%s.svs" % (opts.slide_loc, str(slide_id))):
            slide_loc = opts.slide_archive_loc
        tile_generator = generate_grid.TileGeneratorGrid(slide_loc, slide_id, opts.tile_size_10x, opts.mag,
                                                         opts.overlap, check_ihc=True, verbose=opts.verbose)
        if tile_generator.ihc is True:
            counter += 1
            pqueue.put("ihc")
            continue
        _, norm_tiles_high, locations, _ = tile_generator.gen_and_extract_tiles(tile_normalizer, opts.ts_thres)
        if len(norm_tiles_high) == 0:
            counter += 1
            pqueue.put("ihc")
            continue
        im_size_high = norm_tiles_high[0].shape[0]
        norm_tiles_low = np.zeros((len(norm_tiles_high), im_size_high // 2, im_size_high // 2, 3), dtype=np.uint8)
        for tile_id in range(len(norm_tiles_high)):
            norm_tile_high = Image.fromarray(norm_tiles_high[tile_id].astype(np.uint8))
            norm_tile_low = norm_tile_high.resize((im_size_high // 2, im_size_high // 2), Image.ANTIALIAS)
            norm_tiles_low[tile_id, :, :, :] = norm_tile_low

        data = {
            "slide_id": slide_id,
            "norm_tiles_10x": norm_tiles_high,
            "norm_tiles_5x": norm_tiles_low,
            "locations": locations
        }
        pqueue.put(data)
        counter += 1
        print("Put tiled slide [%s] on to queue: [%d]/[%d]" % (slide_id, counter, len(slides_to_process)))
    pqueue.put('Done')