# Extract non-blank tiles from PNG WSIs and save them in label-specific folders
A fast and simple method to produce a relatively balanced dataset for tuning and validating classification models before generalising them to the whole train_images dataset.

In [1]:
import os
import cv2
import numpy as np
import pandas as pd

In [2]:
def generate_tiles(image_id, label, image_dir, output_dir, tile_size, threshold_mean, threshold_std):
    """Cut one image into square tiles and save the non-blank ones in a per-label folder.

    The image ``<image_dir>/<image_id>.png`` is split into a grid of
    ``tile_size`` x ``tile_size`` tiles. Pixels on the right/bottom edge that
    do not fill a complete tile are dropped. A tile is written to
    ``<output_dir>/<label>/<image_id>_tile_<col>_<row>.png`` only when both its
    mean and its standard deviation (computed over every pixel and channel of
    the tile) reach the given thresholds; all other tiles are treated as blank
    and skipped.

    Missing or unreadable images are reported with ``print`` and skipped, so a
    loop over many images keeps going.

    Args:
        image_id: Identifier of the image; the file name is ``f"{image_id}.png"``.
        label: Class label used as the output sub-folder name. Converted with
            ``str`` so numeric labels work too.
        image_dir: Directory that contains the source PNG images.
        output_dir: Root directory under which label folders are created.
        tile_size: Side length of a tile in pixels; must be positive.
        threshold_mean: Minimum tile mean required to keep a tile.
        threshold_std: Minimum tile standard deviation required to keep a tile.

    Returns:
        None.

    Raises:
        ValueError: If ``tile_size`` is not positive.
    """
    if tile_size <= 0:
        raise ValueError(f"tile_size must be positive, got {tile_size}")

    # Labels loaded from a CSV can be numeric; os.path.join accepts only str/path parts.
    label_output_dir = os.path.join(output_dir, str(label))
    os.makedirs(label_output_dir, exist_ok=True)

    # Construct the path for the image using its ID
    image_path = os.path.join(image_dir, f"{image_id}.png")
    if not os.path.exists(image_path):
        print(f"Image not found: {image_path}")
        return

    try:
        img = cv2.imread(image_path)
    except Exception as e:
        # Best-effort batch processing: report the failure and move on to the next image.
        print(f"Error reading image {image_path}: {e}")
        return

    # cv2.imread reports an undecodable file by returning None instead of raising.
    if img is None:
        print(f"Error reading image {image_path}: file could not be decoded")
        return

    height, width = img.shape[:2]

    # Only complete tiles are produced; floor division drops the partial edge strip.
    rows_count = height // tile_size
    cols_count = width // tile_size

    for row_idx in range(rows_count):
        y = row_idx * tile_size
        for col_idx in range(cols_count):
            x = col_idx * tile_size

            # Slicing yields a view on the image array, so no per-tile copy is needed.
            tile = img[y:y + tile_size, x:x + tile_size]

            # Keep the tile only if it is not mostly blank according to both thresholds
            if tile.mean() >= threshold_mean and tile.std() >= threshold_std:
                tile_name = f"{image_id}_tile_{col_idx}_{row_idx}.png"
                tile_path = os.path.join(label_output_dir, tile_name)
                # cv2.imwrite returns False when the file could not be written.
                if not cv2.imwrite(tile_path, tile):
                    print(f"Error writing tile {tile_path}")

In [3]:
# Locations of the source images, the tile output root and the label table
image_dir = "/train_images"
output_dir = "/output_tiles"
csv_file_path = "/train.csv"

# Load the table that lists every image ID together with its label
data = pd.read_csv(csv_file_path)

# Tile each listed image into the folder of its label
for _, record in data.iterrows():
    generate_tiles(
        image_id=record['image_id'],
        label=record['label'],
        image_dir=image_dir,
        output_dir=output_dir,
        tile_size=256,
        threshold_mean=170,
        threshold_std=15,
    )