# process the datasets for DINO
1. Recursively traverse a parent folder (which contains only subfolders).
2. Read each 4-channel .tif/.TIF (uint16) in the subfolders.
3. Compute how many 224×224 patches fit fully in the image, then place that grid centered within the image.
4. Skip any patch in which more than 30% of the band-pixels are 0 or 65535 (treated as no-data “NaNs”).
5. Perform per-patch min–max scaling (valid range to 0–255).
6. Save each valid chip as a 4-channel .tif, with the file name format
- OriginalFileName_rowStart_colStart.tif

7. Show a progress bar for:

- The list of TIF images being processed, and
- The row/column tiling for each image (nested progress bars).


In [1]:
import os
import glob
import numpy as np
import tifffile
from tqdm import tqdm
import re
import multiprocessing
import numpy as np
from scipy.signal import convolve2d


import numpy as np

def skip_chip(chip: np.ndarray, factor: float = 0.3) -> bool:
    """
    Decide whether a chip holds too many no-data samples to be kept.

    A sample (one band of one pixel) counts as bad when it equals 0 or 65535.

    Args:
        chip: Array of samples; every element is inspected, whatever the shape.
        factor: Largest tolerated fraction of bad samples (default 0.3, i.e. 30%).

    Returns:
        True if the fraction of bad samples is strictly greater than ``factor``,
        or if the chip contains no samples at all; False otherwise.
    """
    sample_count = chip.size

    # An empty chip has nothing worth saving, and would otherwise divide by zero.
    if sample_count == 0:
        return True

    # Element-wise test across all bands at once.
    bad_count = np.count_nonzero((chip == 0) | (chip == 65535))

    return bool(bad_count / sample_count > factor)


        
def clean_filename(filename: str):
    """Strip every trailing '.tif' suffix (any letter case, possibly repeated) from the name."""
    trailing_tif = re.compile(r'(\.tif)+$', re.IGNORECASE)
    return trailing_tif.sub('', filename)

def _find_rgb_companion(dirname, filename):
    """Return the path of the RGB image paired with *filename*, or None if there is none.

    The companion name is built by replacing the first underscore-separated
    token with 'RGB' and dropping the second token. Both '.TIF' and '.tif'
    extensions are tried, in that order.
    """
    parts = filename.split("_")
    if len(parts) < 2:
        # The name does not follow the <token>_<token>_... pattern; no companion can be derived.
        return None

    stem = "_".join(["RGB"] + parts[2:])
    for ext in (".TIF", ".tif"):
        candidate = os.path.join(dirname, stem + ext)
        if os.path.exists(candidate):
            return candidate
    return None


def process_folder(subdir, output_folder, chip_size=224):
    """Cut every .tif/.TIF image in *subdir* into square chips and save them.

    For each image that has an RGB companion file, a grid of
    ``chip_size`` x ``chip_size`` tiles is centered in the image. Tiles rejected
    by ``skip_chip`` or with a constant value are dropped. Each kept tile is
    min-max scaled to uint8 and written to *output_folder* as
    ``<name>_<row>_<col>.tif``; the matching crop of the RGB companion is written
    unscaled as ``<name>_<row>_<col>_RGB.tif``.

    Args:
        subdir: Directory whose direct .tif/.TIF children are processed.
        output_folder: Existing directory that receives the chips.
        chip_size: Edge length of the square chips, in pixels.

    Returns:
        None. Images that cannot be processed are skipped, most with a printed message.
    """
    # Escape the directory so that characters such as '[' are matched literally.
    pattern_root = glob.escape(subdir)
    matches = glob.glob(os.path.join(pattern_root, "*.tif")) + glob.glob(os.path.join(pattern_root, "*.TIF"))
    # Where matching is case-insensitive both patterns return the same files;
    # drop the duplicates (keeping order) so no image is chipped twice.
    tif_files = list(dict.fromkeys(matches))

    if not tif_files:
        return

    for tif_path in tif_files:
        raw_filename = os.path.basename(tif_path)
        dirname = os.path.dirname(tif_path)
        filename = clean_filename(os.path.splitext(raw_filename)[0])

        # Locate the companion before any pixel data is read.
        rgb_path = _find_rgb_companion(dirname, filename)
        if rgb_path is None:
            continue  # skip the missing aligned file

        # Read the multi-channel image with tifffile
        img = tifffile.imread(tif_path)

        if img.ndim != 3:
            print(f"Skipping {tif_path}: not a 3D array (found shape: {img.shape}).")
            continue

        # Ensure (H, W, C) layout; only an unambiguous channel-first layout is transposed.
        if img.shape[0] in [4, 5] and img.shape[1] not in [4, 5] and img.shape[2] not in [4, 5]:
            img = np.transpose(img, (1, 2, 0))  # Convert (C, H, W) → (H, W, C)
        elif img.shape[-1] not in [4, 5]:
            print(f"Skipping {tif_path}: not a 4 or 5-channel image.")
            continue

        # Ensure uint16
        if img.dtype != np.uint16:
            print(f"Warning: {tif_path} not in uint16. Found {img.dtype}, converting.")
            img = img.astype(np.uint16)

        height, width = img.shape[:2]

        # Compute the number of tiles that fit
        n_rows = height // chip_size
        n_cols = width // chip_size

        if n_rows == 0 or n_cols == 0:
            print(f"Skipping {tif_path}: image too small for {chip_size}x{chip_size} patches.")
            continue

        # The main image is usable; only now pay for reading the companion.
        rgb = tifffile.imread(rgb_path)

        if rgb.ndim != 3:
            print(f"Skipping {tif_path}: RGB companion {rgb_path} is not a 3D array (found shape: {rgb.shape}).")
            continue

        # Both images are sliced with the same indices, so their (H, W) must agree.
        if rgb.shape[:2] != (height, width):
            if rgb.shape[1:] == (height, width):
                rgb = np.transpose(rgb, (1, 2, 0))  # Convert (C, H, W) → (H, W, C)
            else:
                print(
                    f"Skipping {tif_path}: RGB companion {rgb_path} has shape {rgb.shape}, "
                    f"which does not match the {height}x{width} image."
                )
                continue

        # Center the tile grid so the unused margin is split between both sides.
        offset_row = (height - n_rows * chip_size) // 2
        offset_col = (width - n_cols * chip_size) // 2

        # Process tiles
        for row_idx in range(n_rows):
            for col_idx in range(n_cols):
                row_start = offset_row + row_idx * chip_size
                col_start = offset_col + col_idx * chip_size

                # Extract the same window from both images
                chip = img[row_start:row_start + chip_size, col_start:col_start + chip_size, :]
                chip_rgb = rgb[row_start:row_start + chip_size, col_start:col_start + chip_size, :]

                # Drop chips with too many 0 / 65535 samples.
                if skip_chip(chip):
                    continue

                # Convert to float for min-max scaling.
                # NOTE(review): min/max are taken over all bands together and include any
                # 0 / 65535 samples left in the chip; confirm this is the intended range.
                chip_float = chip.astype(np.float32)
                valid_min = chip_float.min()
                valid_max = chip_float.max()

                # Avoid divide-by-zero if min == max
                if valid_min == valid_max:
                    continue

                # Scale to [0, 255]; the uint8 cast truncates the fractional part.
                chip_scaled = 255.0 * (chip_float - valid_min) / (valid_max - valid_min)
                chip_scaled = chip_scaled.astype(np.uint8)

                out_chip_name = f"{filename}_{row_start}_{col_start}.tif"
                out_rgb_name = f"{filename}_{row_start}_{col_start}_RGB.tif"
                out_chip_path = os.path.join(output_folder, out_chip_name)
                out_rgb_path = os.path.join(output_folder, out_rgb_name)

                tifffile.imwrite(
                    out_chip_path,
                    chip_scaled,
                    dtype=chip_scaled.dtype,
                )

                tifffile.imwrite(
                    out_rgb_path,
                    chip_rgb,
                    dtype=chip_rgb.dtype,
                )

import os
import glob
import numpy as np
import tifffile
from tqdm import tqdm
import re
from concurrent.futures import ThreadPoolExecutor, as_completed


def parallel_process_folders(input_folder: str, output_folder: str, max_threads=4, chip_size=224):
    """
    Chip every immediate subdirectory of *input_folder* using a thread pool.

    Each subdirectory is submitted as one ``process_folder`` task. A failure in
    one subdirectory is printed and does not stop the others.

    Args:
        input_folder: Directory whose direct subdirectories are processed.
        output_folder: Directory that receives all chips; created if missing.
        max_threads: Number of worker threads in the pool.
        chip_size: Chip edge length in pixels, forwarded to ``process_folder``.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_folder, exist_ok=True)

    # Gather all subdirectories; the context manager closes the directory handle promptly.
    with os.scandir(input_folder) as entries:
        subdirs = [entry.path for entry in entries if entry.is_dir()]

    with ThreadPoolExecutor(max_threads) as executor:
        future_to_subdir = {
            executor.submit(process_folder, subdir, output_folder, chip_size): subdir
            for subdir in subdirs
        }

        # Progress bar advances once per finished folder, in completion order.
        for future in tqdm(as_completed(future_to_subdir), total=len(future_to_subdir), desc="Processing folders"):
            subdir = future_to_subdir[future]
            try:
                future.result()  # Re-raises any exception from the worker
            except Exception as e:
                print(f"Error processing {subdir}: {e}")

    print("Parallel chipping complete.")


In [2]:
# Example usage: chip every subfolder of the input directory into 512x512 tiles.
input_dir = "../../data/msrgb_processed/"  # contains only subfolders, each with .tif/.TIF
output_dir = "../../data/output_multi/RGBMS/"

# Use one worker per CPU, but never more than 8.
num_cores = min(8, multiprocessing.cpu_count())

parallel_process_folders(
    input_folder=input_dir,
    output_folder=output_dir,
    max_threads=num_cores,
    chip_size=512,
)

Processing folders: 100%|██████████████████████████████████████████████████████████████| 44/44 [30:47<00:00, 41.98s/it]

Parallel chipping complete.



