In [23]:
import os
import numpy as np
import pandas as pd
from PIL import Image, ImageEnhance, ImageFilter
import random
from scipy.ndimage import rotate, shift
import math

def safe_open_image(path, mode="L"):
    """Load the image at ``path`` and hand it back in the requested PIL mode.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.
        mode: Target PIL mode, e.g. ``"L"`` (grayscale) or ``"RGB"`` (color).

    Returns:
        The opened image, converted to ``mode`` when its native mode differs.
        If opening or converting raises, the error is printed and a blank
        256x256 image of ``mode`` is returned instead of propagating it.
    """
    try:
        picture = Image.open(path)
        if picture.mode == mode:
            # Already in the requested mode: nothing to convert.
            return picture
        print(f"⚠️ Converting {path} from {picture.mode} to {mode}")
        return picture.convert(mode)
    except Exception as err:
        print(f"❌ Error loading image {path}: {err}")
        # Best-effort fallback: the caller gets a placeholder, not an exception.
        return Image.new(mode, (256, 256))

def z_score_standardize(image_array):
    """Standardize an array to zero mean and unit variance (z-score).

    The mean and standard deviation are taken over every element of
    ``image_array``. A constant array (standard deviation of exactly 0) is
    only mean-centred, which avoids a division by zero.

    Args:
        image_array: Numeric NumPy array.

    Returns:
        Array of the same shape holding the standardized values.
    """
    centered = image_array - np.mean(image_array)
    spread = np.std(image_array)
    return centered if spread == 0 else centered / spread

def augment_image(img_array, augmentation_type):
    """Return one randomly parameterised augmentation of an image array.

    Supported ``augmentation_type`` values:

    * ``"rotation"``        - rotate by a random angle in [-30, 30] degrees
    * ``"shift"``           - translate by random offsets in [-20, 20] per axis
    * ``"flip_horizontal"`` - mirror left/right
    * ``"flip_vertical"``   - mirror up/down
    * ``"contrast"``        - PIL contrast enhancement, factor in [0.7, 1.3]
    * ``"brightness"``      - PIL brightness enhancement, factor in [0.8, 1.2]
    * ``"gaussian_filter"`` - PIL Gaussian blur, radius in [0.5, 2.0]

    Any other value leaves the pixel data unchanged.

    The contrast, brightness and gaussian_filter branches clip the input to
    [0, 1] and quantise it to 8 bits before handing it to PIL, and they
    return float32 values in [0, 1].

    Args:
        img_array: Image array. An array of shape (H, W, 1) is squeezed for
            processing and gets its trailing channel axis back on return.
        augmentation_type: Name of the augmentation to apply (see above).

    Returns:
        The augmented array.
    """
    def _through_pil(pixels, transform):
        # Round-trip through an 8-bit grayscale PIL image: clip to [0, 1],
        # scale to 0-255, apply ``transform``, then rescale to float32.
        eight_bit = (np.clip(pixels, 0, 1) * 255).astype(np.uint8)
        pil_result = transform(Image.fromarray(eight_bit, mode='L'))
        return np.array(pil_result).astype(np.float32) / 255.0

    # Operate on a 2D view when the input carries a single trailing channel.
    has_unit_channel = len(img_array.shape) == 3 and img_array.shape[2] == 1
    work = img_array.squeeze() if has_unit_channel else img_array.copy()

    if augmentation_type == "rotation":
        work = rotate(work, random.uniform(-30, 30), reshape=False, mode='nearest')
    elif augmentation_type == "shift":
        dx = random.uniform(-20, 20)
        dy = random.uniform(-20, 20)
        work = shift(work, [dy, dx], mode='nearest')
    elif augmentation_type == "flip_horizontal":
        work = np.fliplr(work)
    elif augmentation_type == "flip_vertical":
        work = np.flipud(work)
    elif augmentation_type == "contrast":
        work = _through_pil(
            work,
            lambda im: ImageEnhance.Contrast(im).enhance(random.uniform(0.7, 1.3)),
        )
    elif augmentation_type == "brightness":
        work = _through_pil(
            work,
            lambda im: ImageEnhance.Brightness(im).enhance(random.uniform(0.8, 1.2)),
        )
    elif augmentation_type == "gaussian_filter":
        work = _through_pil(
            work,
            lambda im: im.filter(
                ImageFilter.GaussianBlur(radius=random.uniform(0.5, 2.0))
            ),
        )

    # Restore the channel axis that was squeezed away above.
    return np.expand_dims(work, axis=-1) if has_unit_channel else work

def create_patches(image_array, patch_size=(256, 256), overlap=0.5):
    """Cut an image into fixed-size, optionally overlapping patches.

    Patches are taken on a regular grid starting at the top-left corner.
    Only patches that fit completely inside the image are produced, so rows
    or columns left over at the bottom/right edge are not covered.

    Args:
        image_array: 2D (H, W) or 3D (H, W, C) NumPy array.
        patch_size: ``(patch_height, patch_width)``; any two-element
            sequence (tuple or list) is accepted.
        overlap: Fraction of a patch shared with its neighbour. Must be
            less than 1. The stride along each axis is
            ``int(patch_dim * (1 - overlap))``, but never smaller than 1.

    Returns:
        ``(patches, positions)``: ``patches`` is a list of (patch_height,
        patch_width, C) arrays (2D inputs get a trailing channel axis of
        size 1) and ``positions`` is the matching list of ``(y, x)``
        top-left coordinates. Both lists are empty when the image is
        smaller than one patch.

    Raises:
        ValueError: If ``image_array`` is not 2D or 3D, if a patch
            dimension is not positive, or if ``overlap`` is not below 1.
    """
    if image_array.ndim not in (2, 3):
        raise ValueError(
            f"image_array must be 2D or 3D, got {image_array.ndim} dimensions"
        )
    if not overlap < 1:
        raise ValueError(f"overlap must be less than 1, got {overlap}")

    patch_h, patch_w = patch_size
    if patch_h <= 0 or patch_w <= 0:
        raise ValueError(f"patch_size must be positive, got {patch_size}")

    h, w = image_array.shape[:2]
    needs_channel_axis = image_array.ndim == 2

    # int() truncation would give a stride of 0 for high overlaps (e.g. a
    # 2-pixel patch at 90% overlap), which range() rejects; clamp to 1.
    step_h = max(1, int(patch_h * (1 - overlap)))
    step_w = max(1, int(patch_w * (1 - overlap)))

    patches = []
    positions = []

    # The range bounds guarantee every slice is a full-size patch, so no
    # per-patch shape check is needed.
    for y in range(0, h - patch_h + 1, step_h):
        for x in range(0, w - patch_w + 1, step_w):
            patch = image_array[y:y + patch_h, x:x + patch_w]
            if needs_channel_axis:
                # Add channel dimension for consistency with 3D inputs.
                patch = np.expand_dims(patch, axis=-1)
            patches.append(patch)
            positions.append((y, x))

    return patches, positions

def preprocess_and_save_images_method1(csv_path, raw_image_folder, processed_folder, image_size=(256, 256)):
    """Resize every image listed in a CSV, standardize it and save augmentations.

    For each unique, non-null filename in the CSV's ``image`` column the
    image is loaded as grayscale, resized to ``image_size``, scaled to
    [0, 1] and z-score standardized. The standardized image is saved to
    ``<processed_folder>/method1_resized/original/<name>.npy`` and one
    augmented variant per augmentation type is saved to
    ``<processed_folder>/method1_resized/augmented/<name>_<aug_type>.npy``.

    Augmentations are computed on the [0, 1] image and only afterwards
    standardized with the source image's own mean and standard deviation.
    The contrast, brightness and gaussian_filter branches of
    ``augment_image`` clip their input to [0, 1], so feeding them the
    already standardized image would discard every negative value and save
    those variants on a different scale than the others.

    Args:
        csv_path: CSV file with an ``image`` column of filenames.
        raw_image_folder: Folder that contains the raw image files.
        processed_folder: Root folder for the generated ``.npy`` files.
        image_size: Size passed to ``PIL.Image.resize``.

    Returns:
        None. Progress and a summary report are printed.
    """
    base_folder = os.path.join(processed_folder, "method1_resized")
    original_folder = os.path.join(base_folder, "original")
    augmented_folder = os.path.join(base_folder, "augmented")

    os.makedirs(original_folder, exist_ok=True)
    os.makedirs(augmented_folder, exist_ok=True)

    df = pd.read_csv(csv_path)
    used_filenames = df['image'].dropna().unique()

    augmentation_types = ["rotation", "shift", "flip_horizontal", "flip_vertical",
                         "contrast", "brightness", "gaussian_filter"]

    print(f"🔄 Processing Method 1 (Resize to {image_size})...")

    # Tracking variables
    total_images = len(used_filenames)
    missing_images = []
    error_images = []
    processed_images = []

    for filename in used_filenames:
        raw_path = os.path.join(raw_image_folder, filename)
        base_name = os.path.splitext(filename)[0]

        if not os.path.exists(raw_path):
            print(f"⚠️ Missing image: {raw_path}")
            missing_images.append(filename)
            continue

        try:
            img = safe_open_image(raw_path, mode="L")
            img = img.resize(image_size)
            img_array = np.array(img).astype("float32") / 255.0

            # Add channel dimension for consistency
            img_array = np.expand_dims(img_array, axis=-1)
            img_standardized = z_score_standardize(img_array)

            original_path = os.path.join(original_folder, f"{base_name}.npy")
            np.save(original_path, img_standardized)

            # Statistics of the un-augmented image; reused so every variant
            # ends up on the same scale as the saved original.
            mean = np.mean(img_array)
            std = np.std(img_array)
            scale = std if std > 0 else 1.0  # constant image: only centre it

            for aug_type in augmentation_types:
                # Augment in the [0, 1] domain that augment_image expects,
                # then standardize the result.
                augmented_img = augment_image(img_array, aug_type)
                augmented_img = (augmented_img - mean) / scale
                augmented_path = os.path.join(augmented_folder, f"{base_name}_{aug_type}.npy")
                np.save(augmented_path, augmented_img)

            processed_images.append(filename)

        except Exception as e:
            # Keep going: one bad image must not abort the whole batch.
            print(f"❌ Error processing {filename}: {e}")
            error_images.append((filename, str(e)))

    # Summary report
    print(f"\n📊 METHOD 1 SUMMARY:")
    print(f"📝 Total images in CSV: {total_images}")
    print(f"✅ Successfully processed: {len(processed_images)}")
    print(f"❌ Missing images: {len(missing_images)}")
    print(f"💥 Error images: {len(error_images)}")

    if missing_images:
        print(f"\n📋 Missing images ({len(missing_images)}):")
        for img in missing_images:
            print(f"  - {img}")

    if error_images:
        print(f"\n📋 Error images ({len(error_images)}):")
        for img, error in error_images:
            print(f"  - {img}: {error}")

    print(f"📁 Original images: {original_folder}")
    print(f"📁 Augmented images: {augmented_folder}")

def preprocess_and_save_images_method2(csv_path, raw_image_folder, processed_folder, patch_size=(256, 256)):
    """Split every image listed in a CSV into patches, standardize and augment them.

    For each unique, non-null filename in the CSV's ``image`` column the
    image is loaded as grayscale at its native size, scaled to [0, 1] and
    cut into patches with ``create_patches``. Every patch is z-score
    standardized and saved to
    ``<processed_folder>/method2_patched/original/<name>_patch_<i>.npy``;
    one augmented variant per augmentation type is saved to
    ``<processed_folder>/method2_patched/augmented/<name>_patch_<i>_<aug_type>.npy``.

    Augmentations are computed on the [0, 1] patch and only afterwards
    standardized with that patch's own mean and standard deviation. The
    contrast, brightness and gaussian_filter branches of ``augment_image``
    clip their input to [0, 1], so feeding them the already standardized
    patch would discard every negative value and save those variants on a
    different scale than the others.

    Args:
        csv_path: CSV file with an ``image`` column of filenames.
        raw_image_folder: Folder that contains the raw image files.
        processed_folder: Root folder for the generated ``.npy`` files.
        patch_size: ``(height, width)`` of the patches to extract.

    Returns:
        None. Progress and a summary report are printed.
    """
    base_folder = os.path.join(processed_folder, "method2_patched")
    original_folder = os.path.join(base_folder, "original")
    augmented_folder = os.path.join(base_folder, "augmented")

    os.makedirs(original_folder, exist_ok=True)
    os.makedirs(augmented_folder, exist_ok=True)

    df = pd.read_csv(csv_path)
    used_filenames = df['image'].dropna().unique()

    augmentation_types = ["rotation", "shift", "flip_horizontal", "flip_vertical",
                         "contrast", "brightness", "gaussian_filter"]

    print(f"🔄 Processing Method 2 (Patch to {patch_size})...")

    # Tracking variables
    total_images = len(used_filenames)
    missing_images = []
    error_images = []
    no_patches_images = []
    processed_images = []
    total_patches = 0

    for filename in used_filenames:
        raw_path = os.path.join(raw_image_folder, filename)
        base_name = os.path.splitext(filename)[0]

        if not os.path.exists(raw_path):
            print(f"⚠️ Missing image: {raw_path}")
            missing_images.append(filename)
            continue

        try:
            img = safe_open_image(raw_path, mode="L")
            img_array = np.array(img).astype("float32") / 255.0

            # Don't add channel dimension here yet - let create_patches handle it
            patches, positions = create_patches(img_array, patch_size)

            if not patches:
                print(f"⚠️ No patches created for {filename} - image might be too small")
                no_patches_images.append((filename, img_array.shape))
                continue

            patch_count = len(patches)
            total_patches += patch_count

            for i, patch in enumerate(patches):
                patch_standardized = z_score_standardize(patch)
                original_path = os.path.join(original_folder, f"{base_name}_patch_{i:03d}.npy")
                np.save(original_path, patch_standardized)

                # Statistics of the un-augmented patch; reused so every
                # variant ends up on the same scale as the saved original.
                mean = np.mean(patch)
                std = np.std(patch)
                scale = std if std > 0 else 1.0  # constant patch: only centre it

                for aug_type in augmentation_types:
                    # Augment in the [0, 1] domain that augment_image
                    # expects, then standardize the result.
                    augmented_patch = augment_image(patch, aug_type)
                    augmented_patch = (augmented_patch - mean) / scale
                    augmented_path = os.path.join(augmented_folder, f"{base_name}_patch_{i:03d}_{aug_type}.npy")
                    np.save(augmented_path, augmented_patch)

            processed_images.append((filename, patch_count))

        except Exception as e:
            # Keep going: one bad image must not abort the whole batch.
            print(f"❌ Error processing {filename}: {e}")
            error_images.append((filename, str(e)))

    # Summary report
    print(f"\n📊 METHOD 2 SUMMARY:")
    print(f"📝 Total images in CSV: {total_images}")
    print(f"✅ Successfully processed: {len(processed_images)} images")
    print(f"🧩 Total patches created: {total_patches}")
    print(f"❌ Missing images: {len(missing_images)}")
    print(f"📏 No patches (too small): {len(no_patches_images)}")
    print(f"💥 Error images: {len(error_images)}")

    if missing_images:
        print(f"\n📋 Missing images ({len(missing_images)}):")
        for img in missing_images:
            print(f"  - {img}")

    if no_patches_images:
        print(f"\n📋 Images too small for patches ({len(no_patches_images)}):")
        for img, shape in no_patches_images:
            print(f"  - {img}: shape {shape}, needed at least {patch_size}")

    if error_images:
        print(f"\n📋 Error images ({len(error_images)}):")
        for img, error in error_images:
            print(f"  - {img}: {error}")

    if processed_images:
        print(f"\n📋 Patch counts per image:")
        for img, count in processed_images:
            print(f"  - {img}: {count} patches")

    print(f"📁 Original patches: {original_folder}")
    print(f"📁 Augmented patches: {augmented_folder}")

def preprocess_all_methods(csv_path, raw_image_folder, processed_folder):
    """Run both preprocessing pipelines for one dataset and print the layout.

    Method 1 (resize) runs first, then method 2 (patching); both receive
    the same three arguments and use their own default sizes. Afterwards
    the output folder structure is printed.

    Args:
        csv_path: CSV file with an ``image`` column of filenames.
        raw_image_folder: Folder that contains the raw image files.
        processed_folder: Root folder for the generated output.
    """
    print("🚀 Starting comprehensive image preprocessing...")

    pipelines = (
        preprocess_and_save_images_method1,
        preprocess_and_save_images_method2,
    )
    for position, run_pipeline in enumerate(pipelines):
        if position:
            # Visual separator between the two pipeline reports.
            print("\n" + "="*50 + "\n")
        run_pipeline(csv_path, raw_image_folder, processed_folder)

    print("\n🎉 All preprocessing completed!")
    print(f"📁 Results saved in: {processed_folder}")

    layout_lines = (
        "📋 Folder structure:",
        "  ├── method1_resized/",
        "  │   ├── original/",
        "  │   └── augmented/",
        "  └── method2_patched/",
        "      ├── original/",
        "      └── augmented/",
    )
    for line in layout_lines:
        print(line)

In [24]:
# Run the full preprocessing for each dataset in turn; every entry holds the
# keyword arguments for one preprocess_all_methods call.
for _job in (
    {
        "csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/7_24_matched.csv",
        "raw_image_folder": "/Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/",
        "processed_folder": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/preprocessing_images/processed_images/7_24/",
    },
    {
        "csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/10_19_matched.csv",
        "raw_image_folder": "/Users/Shai/OneDrive/Desktop/THESIS_data/images/1019 images/",
        "processed_folder": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/preprocessing_images/processed_images/10_19/",
    },
    {
        "csv_path": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/data_matching/matched/11_10_matched.csv",
        "raw_image_folder": "/Users/Shai/OneDrive/Desktop/THESIS_data/images/1110 images/",
        "processed_folder": "/Users/Shai/OneDrive/Documents/THESIS/thesis-airq/preprocessing_images/processed_images/11_10/",
    },
):
    preprocess_all_methods(**_job)


🚀 Starting comprehensive image preprocessing...
🔄 Processing Method 1 (Resize to (256, 256))...
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152338.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152342.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152348.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152418.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152422.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152510.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152512.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152604.JPG from RGB to L
⚠️ Converting /Users/Shai/OneDrive/Desktop/THESIS_data/images/724 images/152838.JPG from RGB to L
⚠️ Converting /Users/S