### Necessary Imports

In [4]:
import os
import numpy as np
import pandas as pd
import shutil
from PIL import Image
import cv2
# The path can also be read from a config file, etc.
OPENSLIDE_PATH = r'E:\KSA Project\data_preprocessing\openslide-bin-4.0.0.3-windows-x64\bin'

if hasattr(os, 'add_dll_directory'):
    # Windows
    with os.add_dll_directory(OPENSLIDE_PATH):
        import openslide
else:
    import openslide
from WSI_Stiching_Code.wsi_core.WholeSlideImage import WholeSlideImage

In [5]:
# Example usage
input_dir = r'E:\KSA Project\dataset\svs_files'
output_dir = r'E:\KSA Project\dataset\downsample_crop'
target_size = (1024, 1024)  # Example target size for resizing if required. 
label = 'example_label'

### Dynamic Downsampling of the WSI images 

In [None]:
def get_wsi_magnification(slide):
    """
    Get the magnification of a WSI image using OpenSlide properties.
    """
    objective_power = slide.properties.get('openslide.objective-power')
    if objective_power:
        return int(objective_power)
    else:
        raise ValueError("WSI does not contain objective power information.")

def downsample_and_resize_wsi(wsi_path, output_dir, label,target_size=None):
    slide = openslide.OpenSlide(wsi_path)
    mag = get_wsi_magnification(slide)
    print(f"Magnification: {mag}x")
    scale_factor = mag / 5.0
    print(f"Scale factor: {scale_factor}")
    # print original dimensions of the WSI 
    print(f"Original dimensions: {slide.dimensions}")
    # Calculate level to read from for downsampling to 5x
    level = slide.get_best_level_for_downsample(scale_factor)
    downsample = slide.level_downsamples[level]
    print(f"Using scale factor: {scale_factor} the best level {level} with downsample factor {downsample}")
    # Downsample to 5x
    downsampled_width = int(slide.level_dimensions[0][0] / scale_factor)
    downsampled_height = int(slide.level_dimensions[0][1] / scale_factor)
    print(f"Downsampled dimensions: {downsampled_width} x {downsampled_height}")
    region = slide.read_region((0, 0), level, slide.level_dimensions[level])
    # print the region size after downsample
    print(f"Downsampled region dimensions using read region: {region.size}")
    downsampled_img = region.resize((downsampled_width, downsampled_height), Image.LANCZOS)
    print(f"Downsampled dimensions using resize: {downsampled_img.size}")
    
    # Resize the image to the exact target size if necessary
    if target_size:
        resized_img = downsampled_img.resize(target_size, Image.LANCZOS)
    
    # Save the downsampled and resized image with the label in the filename
    # output_path = os.path.join(output_dir, f"{os.path.basename(wsi_path)[:12]}_{label}.png")
    # if not os.path.exists(os.path.dirname(output_path)):
    #     os.makedirs(os.path.dirname(output_path))
    # downsampled_img.save(output_path)
    # print(f"Saved downsampled and resized WSI to {output_path}")

# Process all WSI images in the input directory
for root, dirs, files in os.walk(input_dir):
    for wsi_file in files:
        if wsi_file.endswith('.svs') or wsi_file.endswith('.tiff'):
            wsi_path = os.path.join(root, wsi_file)
            print(f"Processing WSI: {wsi_file}")
            downsample_and_resize_wsi(wsi_path, output_dir, label, target_size)

print("Finished DownSampling!")

Processing WSI: TCGA-3L-AA1B_nonMSIH.svs
Magnification: 40x
Scale factor: 8.0
Original dimensions: (95615, 74462)
Using scale factor: 8.0 the best level 1 with downsample factor 4.000116473747436
Downsampled dimensions: 11951 x 9307
Downsampled region dimensions using read region: (23903, 18615)
Downsampled dimensions using resize: (11951, 9307)
Processing WSI: TCGA-A6-2671_nonMSIH.svs
Magnification: 40x
Scale factor: 8.0
Original dimensions: (101680, 36748)
Using scale factor: 8.0 the best level 1 with downsample factor 4.0
Downsampled dimensions: 12710 x 4593
Downsampled region dimensions using read region: (25420, 9187)
Downsampled dimensions using resize: (12710, 4593)
Processing WSI: TCGA-A6-2675_nonMSIH.svs
Magnification: 40x
Scale factor: 8.0
Original dimensions: (157316, 21227)
Using scale factor: 8.0 the best level 1 with downsample factor 4.000282698831512
Downsampled dimensions: 19664 x 2653
Downsampled region dimensions using read region: (39329, 5306)
Downsampled dimension

ValueError: WSI does not contain objective power information.

### DownSampling Using CLAM approach

In [6]:
def get_wsi_magnification(slide):
    try:
        magnification = slide.properties[openslide.PROPERTY_NAME_OBJECTIVE_POWER]
        return float(magnification)
    except KeyError:
        raise ValueError("Magnification information not available in the WSI properties.")

def crop_downsample_image_old(image, output_path):
    image = np.array(image)
    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Calculate the Laplacian variance of the grayscale image
    variance = cv2.Laplacian(gray, cv2.CV_64F).var()
    # If variance is too low, indicating a uniform region, adjust the threshold
    if variance < 50:
        _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY)
    else:
        # Apply a binary threshold to get the detailed regions
        blurred = cv2.GaussianBlur(gray, (5, 5), 0)
        _, thresh = cv2.threshold(blurred, 150, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Find contours in the thresholded image
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Sort contours by area (descending) to get top 10 largest contours
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
    # Initialize an empty mask to draw the contours
    mask = np.zeros_like(gray)
    # Draw the top 10 contours on the mask
    cv2.drawContours(mask, contours, -1, 255, thickness=cv2.FILLED)
    # Find bounding box coordinates of the masked area
    x, y, w, h = cv2.boundingRect(mask)
    # Crop the image to the bounding box
    cropped_image = image[y:y+h, x:x+w]
    cropped_image = Image.fromarray(cropped_image)
    # Save the cropped image
    # cropped_image.save(output_path)
    print(f'Downsampled cropped image dimensions old method: {cropped_image.size}')

def crop_downsample_image(image, output_path):
    image = np.array(image)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)  # Adjust threshold for white background
    contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:10]
    mask = np.zeros_like(gray)
    cv2.drawContours(mask, contours, -1, 255, thickness=cv2.FILLED)
    x, y, w, h = cv2.boundingRect(mask)
    cropped_image = image[y:y+h, x:x+w]
    cropped_image = Image.fromarray(cropped_image)
    # cropped_image.save(output_path)
    print(f'Cropped image dimensions: {cropped_image.size}')

def save_downsampled_cropped_image(WSI_object, slide_id, save_dir):
    mag = get_wsi_magnification(WSI_object.getOpenSlide())
    scale_factor = mag / 5.0
    wsi = WSI_object.getOpenSlide()
    best_level = wsi.get_best_level_for_downsample(scale_factor)
    downsampled_image = WSI_object.getOpenSlide().read_region((0, 0), best_level, WSI_object.level_dim[best_level])
    downsampled_image = downsampled_image.convert("RGB")
    # downsampled_image_path = os.path.join(save_dir, slide_id + '_downsampled.png')
    # downsampled_image.save(downsampled_image_path)
    # Crop the downsampled image
    cropped_save_path = os.path.join(save_dir, slide_id + '.png')
    print(f'Downsampled image dimensions: {downsampled_image.size}')
    crop_downsample_image(downsampled_image,cropped_save_path)
    crop_downsample_image_old(downsampled_image,cropped_save_path)

# Process all WSI images in the input directory
for root, dirs, files in os.walk(input_dir):
    for wsi_file in files:
        if wsi_file.endswith('.svs') or wsi_file.endswith('.tiff'):
            wsi_path = os.path.join(root, wsi_file)
            slide_id = os.path.basename(wsi_path).split('.')[0]
            slide = openslide.OpenSlide(wsi_path)
            WSI_object = WholeSlideImage(wsi_path)
            # print original dimensions of the WSI
            print(f"Original dimensions: {slide.dimensions} of wsi {slide_id}")
            save_downsampled_cropped_image(WSI_object, slide_id, output_dir)

Original dimensions: (95615, 74462) of wsi TCGA-3L-AA1B_nonMSIH
Downsampled image dimensions: (23903, 18615)
Cropped image dimensions: (23903, 16991)
Downsampled cropped image dimensions old method: (23903, 17151)
Original dimensions: (101680, 36748) of wsi TCGA-A6-2671_nonMSIH
Downsampled image dimensions: (25420, 9187)
Cropped image dimensions: (25041, 8394)
Downsampled cropped image dimensions old method: (25046, 8542)
Original dimensions: (157316, 21227) of wsi TCGA-A6-2675_nonMSIH
Downsampled image dimensions: (39329, 5306)
Cropped image dimensions: (26302, 2688)


KeyboardInterrupt: 

### Code for verification of already downsampled data

In [None]:
import os
import csv
import openslide
from PIL import Image

def get_wsi_metadata(slide, slide_id):
    """
    Extract metadata from a slide object, including magnification, scale factors, and downsample level.
    Parameters:
        slide (OpenSlide): The loaded WSI slide.
        slide_id (str): The identifier of the slide (file name without extension).
    Returns:
        dict: Metadata including slide_id, dimensions, magnification, mpp, scale factors, and downsample level.
    """
    try:
        width, height = slide.dimensions
        objective_power = slide.properties.get('openslide.objective-power', 'Unknown')
        mpp_x = slide.properties.get('openslide.mpp-x', 'Unknown')
        mpp_y = slide.properties.get('openslide.mpp-y', 'Unknown')
        scale_factor = slide.properties.get('aperio.AppMag', 'Unknown')

        # Safely calculate scale_factor2 and downsample_level
        scale_factor2 = float(objective_power) / 5 if objective_power != 'Unknown' else None
        downsample_level = slide.get_best_level_for_downsample(scale_factor2) if scale_factor2 else None

        # Prepare metadata dictionary
        metadata = {
            "slide_id": slide_id,
            "width": width,
            "height": height,
            "magnification_power": float(objective_power) if objective_power != 'Unknown' else None,
            "mpp_x": round(float(mpp_x), 3) if mpp_x != 'Unknown' else None,
            "mpp_y": round(float(mpp_y), 3) if mpp_y != 'Unknown' else None,
            "scale_factor": scale_factor,
            "scale_factor2": scale_factor2,
            "downsample_level": downsample_level
        }
        return metadata
    except Exception as e:
        print(f"Error retrieving metadata for slide {slide_id}: {e}")
        return None

def calculate_cropped_downsampling(original_width, cropped_width, level):
    org_downsample_level = original_width / cropped_width
    org_level_downsample = original_width / level
    return org_downsample_level, org_level_downsample


def save_metadata_to_csv(input_dir, png_dir, output_csv):
    """
    Process all slides in the input directory, calculate metadata, and update with downsampled image dimensions.
    Parameters:
        input_dir (str): Directory containing .svs files.
        png_dir (str): Directory containing the downsampled PNG images.
        output_csv (str): Path to save the metadata CSV file.
    """
    fieldnames = [
        "slide_id", "width", "height", "magnification_power", "mpp_x", "mpp_y", "scale_factor", "scale_factor2", "downsample_level", "downsampled_width", "downsampled_height", "calculated_downsample_factor", "calculated_downsample_level"
    ]
    results = []

    for root, _, files in os.walk(input_dir):
        for wsi_file in files:
            if wsi_file.endswith('.svs') or wsi_file.endswith('.tiff'):
                wsi_path = os.path.join(root, wsi_file)
                slide_id = os.path.splitext(wsi_file)[0]

                try:
                    slide = openslide.OpenSlide(wsi_path)
                    metadata = get_wsi_metadata(slide, slide_id)

                    if metadata:
                        # Add downsampled dimensions
                        png_path = os.path.join(png_dir, f"{slide_id}.png")
                        if os.path.exists(png_path):
                            with Image.open(png_path) as img:
                                metadata["downsampled_width"], metadata["downsampled_height"] = img.size

                                # Calculate cropped downsampling factor and level
                                org_downsample_level, org_level_downsample = calculate_cropped_downsampling(
                                    metadata["width"], metadata["height"],metadata["downsample_level"] , img.size[0], img.size[1]
                                )
                                metadata["org_downsample_level"] = org_downsample_level
                                metadata["org_level_downsample"] = org_level_downsample
                        else:
                            metadata["downsampled_width"] = "Not Found"
                            metadata["downsampled_height"] = "Not Found"
                            metadata["calculated_downsample_factor"] = "Not Found"
                            metadata["calculated_downsample_level"] = "Not Found"

                        results.append(metadata)
                        print(f"Processed slide: {slide_id}")
                        print(metadata)
                except Exception as e:
                    print(f"Error processing slide {slide_id}: {e}")
    
    # Write results to CSV
    # with open(output_csv, mode='w', newline='') as csvfile:
    #     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    #     writer.writeheader()
    #     writer.writerows(results)

    print(f"Metadata saved to {output_csv}")

# Example usage
input_dir = r'E:\\KSA Project\\dataset\\svs_files'  # Replace with the actual path to your slides
png_dir = r'E:\\KSA Project\\dataset\\cropped_data'  # Directory containing downsampled PNG images
output_csv = r'E:\\KSA Project\\dataset\\svs_metadata2.csv'  # Path to save the metadata CSV
save_metadata_to_csv(input_dir, png_dir, output_csv)
