# Image Normalization for Tiger-Fox-Elephant Dataset

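This notebook normalizes the images of the Tiger-Fox-Elephant dataset to a consistent size. Every class subdirectory found in the raw data directory is processed: each image is resized to the square size configured in `Config.IMG_SIZE` and saved to the matching subdirectory of the processed data directory, creating a standardized dataset for our classification model. Note that the resize does not preserve aspect ratio, so non-square images are stretched to fit.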

In [1]:
# Import necessary libraries
import os
import cv2
import numpy as np
from pathlib import Path
from IPython.display import display, Image
import matplotlib.pyplot as plt
import sys
from pathlib import Path

# Set up project paths.
# NOTE(review): this assumes the notebook is started from a directory one level
# below the project root, so that `config/` is importable — confirm against the repo layout.
project_root = Path.cwd().parent
# Only add the entry once so that re-running this cell does not stack duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from config.config import Config

# Set up base directories. Coerce to Path so the cell also works when Config
# stores these locations as plain strings.
raw_base_dir = Path(Config.RAW_DATA_DIR)
processed_base_dir = Path(Config.PROCESSED_DATA_DIR)

# Fail early with a readable message instead of an error from inside the listing below.
if not raw_base_dir.is_dir():
    raise FileNotFoundError(f"Raw data directory not found: {raw_base_dir}")

# Get all subdirectories in the raw directory.
# iterdir() yields entries in arbitrary, OS-dependent order; sort them
# (case-insensitively, name as tie-breaker) so every run processes the
# classes in the same order.
raw_subdirs = sorted(
    (d for d in raw_base_dir.iterdir() if d.is_dir()),
    key=lambda d: (d.name.lower(), d.name),
)

print("Found the following directories to process:")
for subdir in raw_subdirs:
    print(f"- {subdir.name}")

# Create corresponding processed directories (one per raw class directory)
for raw_subdir in raw_subdirs:
    processed_subdir = processed_base_dir / raw_subdir.name
    processed_subdir.mkdir(parents=True, exist_ok=True)

Found the following directories to process:
- elephant
- Elephant_negative_class
- fox
- Fox_negative_class
- not_tiger_for_test
- tiger
- tiger_for_test
- Tiger_negative_class


In [2]:
def display_comparison(original_img, normalized_img, filename, title1="Original", title2="Normalized"):
    """
    Render a source image and its processed counterpart next to each other.

    Both images are converted from BGR to RGB before plotting, and each panel
    is titled with its label, the file name, and its width x height in pixels.

    Args:
        original_img: The source image (treated as BGR)
        normalized_img: The processed image (treated as BGR)
        filename: Name of the image file being processed
        title1: Label for the original image
        title2: Label for the processed image
    """
    # Left panel first, right panel second.
    panels = ((title1, original_img), (title2, normalized_img))

    fig = plt.figure(figsize=(10, 4))
    for position, (label, image) in enumerate(panels, start=1):
        ax = fig.add_subplot(1, 2, position)
        ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

        # shape is (rows, cols, ...), i.e. height first, width second.
        height, width = image.shape[0], image.shape[1]
        ax.set_title(f"{label}\nFile: {filename}\nSize: {width}x{height}")
        ax.axis('off')

    fig.tight_layout()
    plt.show()

In [3]:
def process_directory(input_dir, output_dir, target_size, num_samples=10):
    """
    Resize every image in a directory to a square and save the results.

    Each image file in ``input_dir`` is read, resized to
    ``target_size`` x ``target_size`` (aspect ratio is not preserved) and
    written to ``output_dir`` as ``resized_<original name>``. A handful of
    evenly spaced files are additionally shown as before/after comparisons.
    A failure on one file is recorded and does not stop the run.

    Args:
        input_dir (Path or str): Directory containing source images
        output_dir (Path or str): Directory where processed images will be
            saved; it is created if it does not exist yet
        target_size (int): Target size for both width and height
        num_samples (int): Number of sample images to display for comparison

    Returns:
        tuple: ``(processed_count, failed_files)`` where ``processed_count`` is
        the number of images handled without error and ``failed_files`` is a
        list of ``(filename, error message)`` pairs.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    # Do not rely on the caller having created the destination: cv2.imwrite
    # cannot write into a directory that does not exist.
    output_dir.mkdir(parents=True, exist_ok=True)

    processed_count = 0
    failed_files = []

    print(f"\nProcessing directory: {input_dir.name}")
    print(f"Saving to: {output_dir}")

    # Get list of all image files first. os.listdir returns names in arbitrary
    # order, so sort them to make both the processing order and the choice of
    # sample images reproducible.
    image_files = sorted(
        f for f in os.listdir(input_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.tiff'))
    )

    # If we have fewer images than requested samples, show all images
    num_samples = min(num_samples, len(image_files))

    # Select sample images evenly distributed through the directory
    if num_samples > 0:
        sample_indices = np.linspace(0, len(image_files) - 1, num_samples, dtype=int)
        sample_files = {image_files[i] for i in sample_indices}
    else:
        sample_files = set()

    for filename in image_files:
        input_path = input_dir / filename
        output_path = output_dir / f"resized_{filename}"

        print(f"Processing {filename}...", end=' ')
        try:
            # cv2.imread signals failure by returning None instead of raising
            img = cv2.imread(str(input_path))
            if img is None:
                raise ValueError(f"Failed to read image: {filename}")

            # Print original dimensions
            print(f"Original size: {img.shape}", end=' -> ')

            # Simple resize to target dimensions
            resized = cv2.resize(img, (target_size, target_size))

            # cv2.imwrite reports failure through its boolean return value;
            # check it so an unwritten file is not counted as processed.
            if not cv2.imwrite(str(output_path), resized):
                raise IOError(f"Failed to write image: {output_path.name}")

            print(f"Resized to: {resized.shape}")

            # Display comparison if this is a sample file
            if filename in sample_files:
                print("\nDisplaying sample comparison:")
                display_comparison(img, resized, filename)

            processed_count += 1

        except Exception as e:
            # Best effort: record the failure and carry on with the next file
            failed_files.append((filename, str(e)))
            print(f"FAILED: {str(e)}")
            continue

    return processed_count, failed_files

In [None]:
# Get target size from config.
# IMG_SIZE may be a single number or a sequence; process_directory resizes to a
# square, so only the first element of a sequence is used. Lists are accepted
# as well as tuples so a list-valued setting is not passed on whole.
target_size = (
    Config.IMG_SIZE[0]
    if isinstance(Config.IMG_SIZE, (tuple, list))
    else Config.IMG_SIZE
)

# Number of before/after comparisons to display per directory
SAMPLES_PER_DIR = 10

# Process all subdirectories
total_processed = 0
total_failed = []  # (directory name, filename, error message) triples

for raw_subdir in raw_subdirs:
    processed_subdir = processed_base_dir / raw_subdir.name

    print(f"\n{'='*50}")
    print(f"Processing {raw_subdir.name} directory")
    print(f"{'='*50}")

    # Process this directory with more samples
    processed_count, failed_files = process_directory(
        raw_subdir,
        processed_subdir,
        target_size,
        num_samples=SAMPLES_PER_DIR
    )

    total_processed += processed_count
    # Tag each failure with its directory so the final summary is unambiguous
    total_failed.extend([(raw_subdir.name, f, e) for f, e in failed_files])

    print(f"\nCompleted processing {raw_subdir.name}:")
    print(f"- Processed {processed_count} images")
    if failed_files:
        print(f"- Failed to process {len(failed_files)} images")

# Print final summary
print("\n" + "="*50)
print("Processing Complete")
print("="*50)
print(f"Total images processed: {total_processed}")
if total_failed:
    print("\nFailed files:")
    for dir_name, filename, error in total_failed:
        print(f"- {dir_name}/{filename}: {error}")