# process the datasets for DINO
1. Traverse the immediate subfolders of a parent folder (which contains only subfolders).
2. Read each 4-channel .tif/.TIF (uint16) in the subfolders.
3. Compute how many 224×224 patches fit fully in the image, then place that grid centered within the image.
4. Skip any patch that contains 0 or 65535 (“NaNs”).
5. Perform per-patch min–max scaling (valid range mapped to 0–255).
6. Save each valid chip as a 4-channel .tif, with the file name format
- OriginalFileName_rowStart_colStart.tif

7. Show a progress bar for:

- The list of TIF images being processed, and
- The row/column tiling for each image (nested progress bars).


In [1]:
import os
import glob
import numpy as np
import tifffile
from tqdm import tqdm


def _scale_chip_to_uint8(chip):
    """
    Min-max scale one uint16 chip to uint8, or return None if it is dropped.

    A chip is dropped when it contains 0 or 65535 (treated as no-data) or
    when all of its values are equal (the scale would divide by zero).
    The min and max are taken over the whole chip, i.e. across all channels.
    """
    # For uint16 data "contains 0" is the same as min == 0, and "contains
    # 65535" the same as max == 65535, so a single min/max pass serves both
    # the no-data test and the scaling range.
    lo = chip.min()
    hi = chip.max()
    if lo == 0 or hi == 65535 or lo == hi:
        return None

    # Do the arithmetic in float32; uint16 subtraction would wrap around.
    lo = np.float32(lo)
    hi = np.float32(hi)
    scaled = 255.0 * (chip.astype(np.float32) - lo) / (hi - lo)

    # astype truncates toward zero (no rounding).
    return scaled.astype(np.uint8)


def chip_and_save_tifs_centered(
    input_folder: str,
    output_folder: str,
    chip_size: int = 224
) -> None:
    """
    Tile 4-channel TIFs into square chips and save the valid ones as uint8.

    Visits each immediate subfolder of `input_folder` (one level deep, not
    recursive), reads every .tif/.TIF in it, and cuts the image into
    `chip_size` x `chip_size` patches on a grid that is centered in the
    image. Patches containing 0 or 65535, or with constant value, are
    dropped; the rest are min-max scaled to [0, 255] and written to
    `output_folder` as `<source stem>_<row_start>_<col_start>.tif`.
    Progress is shown with tqdm bars.

    Args:
        input_folder: Parent directory whose subfolders hold the TIFs.
        output_folder: Directory for the chips; created if missing.
        chip_size: Side length of each square chip, in pixels.

    Raises:
        ValueError: If `chip_size` is not positive.

    Note:
        All chips go into one flat folder and are named from the source
        file's base name only, so source files sharing a base name in
        different subfolders write to the same chip paths.
    """
    if chip_size <= 0:
        raise ValueError(f"chip_size must be a positive integer, got {chip_size}.")

    os.makedirs(output_folder, exist_ok=True)

    # Gather all subdirectories (closing the scandir iterator when done)
    with os.scandir(input_folder) as entries:
        subdirs = [d for d in entries if d.is_dir()]

    for subdir in subdirs:
        # Escape the folder path so characters like '[' in its name are not
        # read as glob wildcards.
        pattern_base = glob.escape(subdir.path)

        # Collect both .tif and .TIF files. Where glob matching is
        # case-insensitive both patterns return the same files, so merge
        # through a set to avoid chipping an image twice.
        tif_files = sorted(
            set(glob.glob(os.path.join(pattern_base, "*.tif")))
            | set(glob.glob(os.path.join(pattern_base, "*.TIF")))
        )

        # If no TIFs found in this subdir, move on
        if not tif_files:
            continue

        # Create a sub-progress bar for TIF files in this subdir
        for tif_path in tqdm(tif_files, desc=f"Processing TIFs in {subdir.name}", unit="file"):
            filename = os.path.splitext(os.path.basename(tif_path))[0]

            # Read the image with tifffile
            img = tifffile.imread(tif_path)
            # Expected shape: (height, width, channels) or (channels, height, width)

            if img.ndim != 3:
                print(f"Skipping {tif_path}: not a 3D array (found shape: {img.shape}).")
                continue

            # If channels are first, transpose to (H, W, C)
            # so we end up with shape: (height, width, 4)
            if img.shape[0] == 4 and (img.shape[1] != 4 and img.shape[2] != 4):
                img = np.transpose(img, (1, 2, 0))
            elif img.shape[-1] != 4:
                print(f"Skipping {tif_path}: not a 4-channel image.")
                continue

            # Ensure uint16
            if img.dtype != np.uint16:
                print(f"Warning: {tif_path} not in uint16. Found {img.dtype}, converting.")
                img = img.astype(np.uint16)

            height, width = img.shape[:2]

            # How many full patches along each dimension?
            n_rows = height // chip_size
            n_cols = width // chip_size

            # If the image is too small for even one patch, skip
            if n_rows == 0 or n_cols == 0:
                print(f"Skipping {tif_path}: image too small for {chip_size}x{chip_size} patches.")
                continue

            # Split the leftover border evenly so the grid is centered
            offset_row = (height - n_rows * chip_size) // 2
            offset_col = (width - n_cols * chip_size) // 2

            # Nested progress bars for row and column tiling
            for row_idx in tqdm(range(n_rows), desc="Rows", leave=False):
                row_start = offset_row + row_idx * chip_size
                for col_idx in tqdm(range(n_cols), desc="Cols", leave=False):
                    col_start = offset_col + col_idx * chip_size

                    chip = img[row_start:row_start + chip_size,
                               col_start:col_start + chip_size, :]

                    chip_scaled = _scale_chip_to_uint8(chip)
                    if chip_scaled is None:
                        # No-data or constant chip: nothing to save
                        continue

                    # Output name: originalfilename_row_col.tif
                    out_chip_name = f"{filename}_{row_start}_{col_start}.tif"
                    out_chip_path = os.path.join(output_folder, out_chip_name)

                    tifffile.imwrite(
                        out_chip_path,
                        chip_scaled,
                        dtype=chip_scaled.dtype,
                    )

    print("Chipping complete.")


In [None]:

# Example usage
input_dir = "../data/processed"  # contains only subfolders, each with .tif/.TIF
output_dir = "../data/chipped_224"

# Chip every TIF found in the subfolders of input_dir into 224x224 patches
# written to output_dir.
chip_and_save_tifs_centered(
    input_folder=input_dir,
    output_folder=output_dir,
    chip_size=224
)