In [36]:
import os

def count_files_in_directory(directory):
    """Return how many files live under ``directory``, including nested subfolders.

    The tree is traversed with ``os.walk``; only file entries are counted,
    directories themselves are not. A path that does not exist yields 0
    because ``os.walk`` produces nothing for it.
    """
    return sum(len(filenames) for _dirpath, _dirnames, filenames in os.walk(directory))

# Directory whose files are counted by count_files_in_directory; the path is
# relative, so it is resolved against the notebook's working directory.
directory_path = "../data/chipped_224/"

In [37]:
# Total number of files under directory_path, including all nested subfolders.
count_files_in_directory(directory_path)

536844

In [34]:
import os
import collections
def count_files_by_prefix(directory):
    """
    Count the TIFF files directly inside ``directory``, grouped by two tokens
    of the file name.

    Each file name is split on "_":
    - the first token (index 0) is the outer key
    - the fourth token (index 3) is the inner key

    Only names ending in ".tif" or ".TIF" are considered, subfolders are not
    descended into, and names with fewer than four "_"-separated tokens are
    ignored (they have no fourth token to group by). When a name has exactly
    four tokens, the fourth one still carries the file extension.

    Example:
    - "REDEDGE_CAL_rededge_potato_f_0_0.tif" → ('REDEDGE', 'potato')
    - "REDEDGE_CAL_rededge_potato_f_0_224.tif" → ('REDEDGE', 'potato')
    - "REDEDGE_CAL_rededge_rivers_kk_0_0.tif" → ('REDEDGE', 'rivers')

    Output:
    {
        'REDEDGE': {'potato': 2, 'rivers': 1},
        ...
    }

    Returns:
        A ``defaultdict`` mapping outer key -> ``defaultdict(int)`` mapping
        inner key -> number of matching files.
    """
    file_counts = collections.defaultdict(lambda: collections.defaultdict(int))

    for filename in os.listdir(directory):
        if not filename.endswith((".tif", ".TIF")):
            continue

        parts = filename.split("_")

        # Index 3 is read below, so at least four tokens are required;
        # shorter names are skipped instead of raising IndexError.
        if len(parts) < 4:
            continue

        file_counts[parts[0]][parts[3]] += 1

    return file_counts


In [35]:
output_dir = "../data/chipped_224/"  # Update this path
file_counts = count_files_by_prefix(output_dir)

# Report the counts as an indented two-level listing; both levels are walked
# in ascending string order of their keys.
for first_prefix in sorted(file_counts):
    print(f"{first_prefix}:")
    for second_prefix in sorted(file_counts[first_prefix]):
        count = file_counts[first_prefix][second_prefix]
        print(f"  {second_prefix}: {count} files")

3M:
  miningwaste: 14758 files
  nitrogen: 7389 files
  olivegrove: 81600 files
  portugal: 20551 files
  uk: 239269 files
ALTUM:
  beechforest: 316 files
  macroalgae: 3006 files
P4M:
  cocao: 44555 files
  tropical: 6860 files
REDEDGE:
  blueberry: 1702 files
  botrytis: 63846 files
  contamination: 14669 files
  forestfuel: 22158 files
  potato: 1780 files
  rivers: 335 files
SEQUOIA:
  cherry: 53 files
  diurnal: 297 files
  localization: 489 files
  nature: 7827 files
  subtropical: 3650 files


In [11]:
import os
import numpy as np
import tifffile as tiff
import matplotlib.pyplot as plt
from pathlib import Path

import tifffile as tiff
import numpy as np
from skimage.transform import resize
import cv2

def load_tif_image(filepath, max_dim=3000, scale=0.1):
    """
    Load a TIFF image as a NumPy array, shrinking very large rasters.

    Intended for 4-band images laid out as (H, W, 4). If the height or the
    width exceeds ``max_dim`` pixels, the image is resized with OpenCV so that
    both sides are multiplied by ``scale`` (default: 10% of the original size)
    and the result is cast to uint16. Smaller images are returned exactly as
    read, with whatever dtype the file has.

    Args:
        filepath: Path of the TIFF file to read.
        max_dim: Largest height/width (in pixels) returned without downscaling.
        scale: Factor applied to both sides when downscaling.

    Returns:
        The image array, downscaled if it was larger than ``max_dim``.
    """
    img = tiff.imread(filepath)
    print(f"Original shape: {img.shape}, dtype: {img.dtype}")

    if img.shape[0] > max_dim or img.shape[1] > max_dim:
        print(f"Downscaling with OpenCV to {scale:.0%} size...")
        # Only one side has to exceed max_dim, so the other side may be tiny;
        # clamp to 1 px because a zero-sized target is not a valid resize.
        new_height = max(1, int(img.shape[0] * scale))
        new_width = max(1, int(img.shape[1] * scale))
        # Note: OpenCV expects (width, height)
        resized = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_AREA)
        # NOTE(review): this cast truncates non-uint16 (e.g. float) data, while
        # images below the threshold keep their dtype — confirm inputs are uint16.
        img = resized.astype(np.uint16)

    return img
    

def plot_images(image, title_prefix, save_path):
    """Generate and save a 2x3 panel of visualizations for a 4-band image.

    Layout: the first four bands as grayscale panels in the two left columns,
    a false-color composite (bands 3, 1, 0 shown as R, G, B) in the top-right
    panel, and a centered crop of that composite in the bottom-right panel.

    Args:
        image: Array indexed as (row, column, band) with at least 4 bands.
            Bands 0-3 are labeled GRE, RED, REDEDGE, NIR in the titles; the
            function does not verify that the data is in that order.
        title_prefix: Text placed at the start of every panel title.
        save_path: Destination of the figure, written at 300 dpi.

    Raises:
        ValueError: If ``image`` is not 3-D or has fewer than 4 bands.
    """
    bands = ['GRE', 'RED', 'REDEDGE', 'NIR']

    # Validate before creating the figure so a bad input cannot leak one.
    shape = image.shape
    if len(shape) != 3 or shape[2] < len(bands):
        raise ValueError(
            f"Expected an image of shape (H, W, >={len(bands)}), got {shape}"
        )

    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    try:
        # Plot individual bands in the 2x2 grid formed by the two left columns
        for i, band_name in enumerate(bands):
            ax = axes[i // 2, i % 2]
            ax.imshow(image[:, :, i], cmap='gray')
            ax.set_title(f"{title_prefix} - {band_name}")
            ax.axis("off")

        # False Color Composite (NIR, RED, GREEN)
        false_color = np.stack([image[:, :, 3], image[:, :, 1], image[:, :, 0]], axis=-1)  # (H, W, 3)
        print(image.shape)
        # Normalize for visualization; an all-zero composite is left as zeros
        # rather than dividing by zero and producing NaNs.
        peak = np.max(false_color)
        false_color = false_color / peak if peak > 0 else false_color.astype(float)

        ax_false = axes[0, 2]
        ax_false.imshow(false_color)
        ax_false.set_title(f"{title_prefix} - False Color (NIR, RED, GREEN)")
        ax_false.axis("off")

        # Zoomed-in False Color (20x): a centered window 1/20 of each side.
        zoom_factor = 20
        h, w, _ = shape
        center_x, center_y = w // 2, h // 2
        # Keep at least one pixel on each side of the center so images smaller
        # than 2 * zoom_factor pixels do not yield an empty crop.
        zoom_size_x = max(1, w // (2 * zoom_factor))
        zoom_size_y = max(1, h // (2 * zoom_factor))

        zoomed = false_color[
            max(center_y - zoom_size_y, 0):center_y + zoom_size_y,
            max(center_x - zoom_size_x, 0):center_x + zoom_size_x,
            :
        ]

        ax_zoom = axes[1, 2]
        ax_zoom.imshow(zoomed)
        ax_zoom.set_title(f"{title_prefix} - False Color Zoomed (20x)")
        ax_zoom.axis("off")

        # Save the visualization
        fig.tight_layout()
        fig.savefig(save_path, dpi=300)
    finally:
        # Close this specific figure even if plotting or saving failed.
        plt.close(fig)

def plot_histogram(image, title_prefix, save_path):
    """Draw one step-style histogram per band on shared axes and save the figure.

    The first four bands of ``image`` (indexed as row, column, band) are
    flattened and binned into 256 bins each, labeled GRE, RED, REDEDGE and NIR
    in the legend. The figure is written to ``save_path`` at 300 dpi and then
    closed.
    """
    band_labels = ('GRE', 'RED', 'REDEDGE', 'NIR')
    fig, ax = plt.subplots(figsize=(8, 6))

    for band_index, band_label in enumerate(band_labels):
        band_values = image[:, :, band_index].ravel()
        ax.hist(band_values, bins=256, alpha=0.6, label=band_label, histtype='step')

    ax.set_title(f"{title_prefix} - Histogram of Bands")
    ax.set_xlabel("Pixel Intensity")
    ax.set_ylabel("Frequency")
    ax.legend()

    plt.savefig(save_path, dpi=300)
    plt.close()


def process_directory(root_dir, output_dir):
    """Process every subdirectory of ``root_dir`` that has no output folder yet.

    For each subdirectory (visited in sorted order) the matching folder
    ``output_dir/<name>`` is checked: if it already exists the subdirectory is
    reported as skipped, otherwise it is handed to
    ``process_single_subfolder``. Entries of ``root_dir`` that are not
    directories are ignored.

    Note that the skip test only looks at the folder's existence, so an output
    folder left behind by an interrupted run is not retried until it is
    removed.

    Args:
        root_dir: Directory containing one subfolder per dataset.
        output_dir: Directory receiving one output subfolder per dataset;
            created if missing.
    """
    root_path = Path(root_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)  # Ensure output directory exists

    # Sorted so the processing order (and the log) is reproducible.
    for sub_dir in sorted(root_path.iterdir()):
        if not sub_dir.is_dir():
            continue

        sub_output_dir = output_path / sub_dir.name
        if sub_output_dir.exists():
            print(f"Skipping {sub_output_dir}")
            continue

        process_single_subfolder(root_dir, sub_dir.name, output_dir)

def _save_image_plots(image, subfolder_name, label, sub_output_dir):
    """Write the band-panel PNG and the histogram PNG for one image, tagged with ``label``."""
    title = f"{subfolder_name} - {label.capitalize()}"
    plot_images(image, title, sub_output_dir / f"{subfolder_name}_{label}.png")
    plot_histogram(image, title, sub_output_dir / f"{subfolder_name}_hist_{label}.png")


def process_single_subfolder(root_dir, subfolder_name, output_dir):
    """Plot the first (and, if different, the last) TIFF of one subfolder.

    The TIFF files directly inside ``root_dir/subfolder_name`` are sorted by
    path; the first one is always plotted and, when there is more than one
    file, the last one as well. Plots are written to
    ``output_dir/subfolder_name`` as ``<name>_first.png``,
    ``<name>_hist_first.png``, ``<name>_last.png`` and ``<name>_hist_last.png``.

    The ".tif" extension is matched case-insensitively so that ".TIF" files
    are picked up on case-sensitive file systems too.

    Args:
        root_dir: Directory that contains the subfolder.
        subfolder_name: Name of the subfolder to process.
        output_dir: Directory in which the per-subfolder output folder is created.

    Returns:
        None. A message is printed and nothing is written when the subfolder
        does not exist or contains no TIFF files.
    """
    sub_path = Path(root_dir) / subfolder_name
    if not sub_path.is_dir():
        print(f"Subfolder {subfolder_name} not found in {root_dir}")
        return

    # Sort to get a well-defined first and last file
    tif_files = sorted(
        entry for entry in sub_path.iterdir()
        if entry.name.lower().endswith(".tif")
    )
    if not tif_files:
        print(f"Skipping {subfolder_name}: No TIFF files found.")
        return

    first_tif = tif_files[0]  # Always process the first image
    last_tif = tif_files[-1] if len(tif_files) > 1 else None  # Only set when distinct from first

    print(f"Processing {subfolder_name}: {first_tif.name}" + (f", {last_tif.name}" if last_tif else " (Only 1 image)"))

    # Load before creating the output folder: process_directory treats an
    # existing output folder as "already done", so a failed read must not
    # leave one behind.
    img_first = load_tif_image(first_tif)

    sub_output_dir = Path(output_dir) / subfolder_name
    sub_output_dir.mkdir(parents=True, exist_ok=True)

    _save_image_plots(img_first, subfolder_name, "first", sub_output_dir)
    print(f"Saved plots for {subfolder_name} in {sub_output_dir}")

    if last_tif is not None:
        img_last = load_tif_image(last_tif)
        _save_image_plots(img_last, subfolder_name, "last", sub_output_dir)
        print(f"Saved additional plots for last image in {sub_output_dir}")


In [13]:

# Input and output locations for the visualization run. Both paths are
# relative, so they are resolved against the notebook's working directory.
# NOTE: this rebinds directory_path, which an earlier cell pointed at
# "../data/chipped_224/"; re-run that cell before reusing the file count.
directory_path = "../data/processed"  # Change to your actual input directory
output_directory = "../data/outputs"  # Change to your desired output directory
# process_directory also creates the output directory, so this call is only a
# harmless safeguard.
os.makedirs(output_directory, exist_ok=True)
# Writes plots for every input subfolder that has no output folder yet.
process_directory(directory_path, output_directory)

Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Processing rededge_potato_f: REDEDGE_CAL_rededge_potato_f_Ecobreed_krompir_konv_20_07_2022_transparent_reflectance.tif (Only 1 image)
Original shape: (3923, 3262, 4), dtype: uint16
Downscaling with OpenCV to 10% size...
(392, 326, 4)
Saved plots for rededge_potato_f in ..\data\outputs\rededge_potato_f
Processing rededge_rivers_kk: REDEDGE_CAL_rededge_rivers_kk_KK_2020_03_20200707_MS_DLS_ORTHO_E