In [11]:
import logging
import os
from pathlib import Path
from timeit import default_timer as timer

import cv2
import humanize
import isupgrader.data.tiler.v4
import pandas as pd
import tifffile
from joblib import Parallel, delayed
from pandas.core.frame import DataFrame
from tqdm import tqdm
from isupgrader.utils.tqdm_joblib import tqdm_joblib

In [12]:
# CONFIG
# Root of the raw PANDA dataset; "train_images/<slide_id>.tiff" and "train.csv" are read from here.
PANDA_PATH = "/data/raw/panda"
# Directory holding "train.processed.csv", the list of slides to tile.
PANDA_INTERIM_PATH = "/data/interim/panda"
# Destination for the per-slide tile folders and "slides_train.csv".
OUTPUT_PATH = "/data/processed/panda"

# Passed to tile_wrap as `level`, i.e. the `key` handed to tifffile.imread.
SLIDE_LEVEL = 1
# Passed to joblib.Parallel as n_jobs.
THREADS = 40
# Passed to the tiler as the tile size.
TILE_SIZE = 256
# When True, a single slide is tiled in-process instead of the full parallel run.
DEBUG = False

In [13]:
# Root logger: INFO and above, prefixed with the level name and a day/month + time stamp.
logging.basicConfig(
    format='[%(levelname)s] - %(asctime)s: %(message)s',
    datefmt="%d/%m %H:%M:%S",
    level=logging.INFO,
)

In [14]:
def tile_wrap(path_to_slide: str, slide_id: str, tile_size: int, out_dir: str, level: int) -> DataFrame:
    """Tile one slide and write every tile to ``out_dir`` as a numbered PNG.

    Args:
        path_to_slide: Path of the TIFF slide to read.
        slide_id: Identifier stored in the returned summary row.
        tile_size: Tile size forwarded to ``generate_tiles``.
        out_dir: Directory that receives ``0.png``, ``1.png``, ...; it is
            created (including parents) if it does not exist.
        level: Forwarded to ``tifffile.imread`` as ``key`` to select which
            image of the file is read (presumably the pyramid level -- confirm).

    Returns:
        A one-row DataFrame with the columns ``slide_id`` and ``n_tiles``.

    Raises:
        IOError: If OpenCV reports that a tile could not be written.
    """
    image = tifffile.imread(path_to_slide, key=level)
    tiles = isupgrader.data.tiler.v4.generate_tiles(image, tile_size)

    # create output dir for slides
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    # output tiles in order of info (high -> low). I.e. 0.png has most info
    # NOTE(review): the ordering is whatever generate_tiles returns -- confirm it sorts by information.
    for i, tile in enumerate(tiles):
        tile_out_path = os.path.join(out_dir, f"{i}.png")
        # cv2.imwrite expects BGR channel order while the slide data is presumably RGB,
        # so the channels are swapped first. COLOR_RGB2BGR performs the same swap as
        # the previously used COLOR_BGR2RGB; it just names the direction correctly.
        written = cv2.imwrite(tile_out_path, cv2.cvtColor(tile, cv2.COLOR_RGB2BGR))
        if not written:
            # imwrite signals failure through its return value instead of raising;
            # surface it so n_tiles never counts tiles that are missing on disk.
            raise IOError(f"Failed to write tile to {tile_out_path}")

    df = pd.DataFrame([{"slide_id": slide_id, "n_tiles": len(tiles)}])

    return df


In [15]:
# Load the list of slides to tile.
panda_df_path = os.path.join(PANDA_INTERIM_PATH, "train.processed.csv")
df = pd.read_csv(panda_df_path)

logging.info("Starting: PANDA Tiling.")
logging.info(f"panda_path={PANDA_PATH}")
logging.info(f"output_path={OUTPUT_PATH}")
logging.info(f"slide_level={SLIDE_LEVEL}")
logging.info(f"threads={THREADS}")
logging.info(f"tile_size={TILE_SIZE}")

start = timer()

if DEBUG:
    logging.getLogger().setLevel(logging.DEBUG)
    logging.debug("~~DEBUGGING~~")
    # Tile a single slide in-process. The path, the recorded id and the output
    # directory must all come from the SAME row; previously the id was taken from
    # row 0 while the image and output directory came from row 1, so the tile
    # count of one slide was merged onto another slide's metadata.
    debug_slide_id = df.iloc[0].slide_id
    df_out = tile_wrap(path_to_slide=os.path.join(PANDA_PATH, "train_images", f"{debug_slide_id}.tiff"),
                        slide_id=debug_slide_id,
                        tile_size=TILE_SIZE,
                        out_dir=os.path.join(OUTPUT_PATH, "train_images", debug_slide_id),
                        level=SLIDE_LEVEL)

    df_slides = [df_out]
else:
    # One tile_wrap job per slide; tqdm_joblib ties the progress bar to the joblib run.
    with tqdm_joblib(tqdm(total=len(df))) as progress_bar:
        # Iterate the slide_id column directly instead of building a row Series
        # with df.iloc[i] three times per slide.
        df_slides = Parallel(n_jobs=THREADS)(delayed(tile_wrap)(
            path_to_slide=os.path.join(PANDA_PATH, "train_images", f"{slide_id}.tiff"),
            slide_id=slide_id,
            tile_size=TILE_SIZE,
            out_dir=os.path.join(OUTPUT_PATH, "train_images", slide_id),
            level=SLIDE_LEVEL
        ) for slide_id in df["slide_id"])

# Stack the one-row summaries returned by tile_wrap into a single frame.
df_slides = pd.concat(df_slides)

# Attach n_tiles to the slide metadata; the default inner join keeps only slides that were tiled.
df_out = pd.merge(df, df_slides, on="slide_id")

df_out.to_csv(os.path.join(OUTPUT_PATH, "slides_train.csv"), index=False)

end = timer()

time_taken = humanize.naturaldelta(end - start)

n_slides = df_out["n_tiles"].notnull().sum()
time_per_slide = (end - start) / n_slides

# Row count of the raw train.csv, used to extrapolate the runtime for the whole dataset.
total_panda_size = len(pd.read_csv(os.path.join(PANDA_PATH, "train.csv")))

logging.info("Done tiling")
logging.info("-----------")
logging.info(f"Total time: {time_taken}")
logging.info(f"Total slides: {n_slides}")
logging.info(f"Time per slide: {time_per_slide:.2f}s")

# Only extrapolate when fewer slides were processed than train.csv lists.
if n_slides < total_panda_size:
    panda_time_estimation = humanize.naturaldelta(time_per_slide * total_panda_size)
    logging.info(f"Estimated time to process complete PANDA dataset: {panda_time_estimation}")


[INFO] - 04/08 10:56:49: Starting: PANDA Tiling.
[INFO] - 04/08 10:56:49: panda_path=/data/raw/panda
[INFO] - 04/08 10:56:49: output_path=/data/processed/panda
[INFO] - 04/08 10:56:49: slide_level=1
[INFO] - 04/08 10:56:49: threads=40
[INFO] - 04/08 10:56:49: tile_size=256
  3%|▎         | 316/10615 [00:21<11:30, 14.91it/s]


KeyboardInterrupt: 