In [None]:
import numpy as np
import os
import zipfile
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
import shutil  # Added for deletion
# Define paths: source archive, extraction target, and preprocessing outputs
zip_path = "/content/Waste Classification Dataset.zip"
extract_path = "/content/Waste_Classification"

saved_images_dir = "/content/Processed Data/preprocessed_batches"
saved_labels_path = "/content/Processed Data/preprocessed_labels.npy"

# Force re-extraction: remove any previously extracted copy so files from an
# older archive cannot linger next to the freshly extracted ones.
if os.path.exists(extract_path):
    print("Deleting old extracted dataset...")
    shutil.rmtree(extract_path)  # Delete the extracted folder

print("Extracting dataset...")
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(extract_path)
print("Extraction complete!")

# Count the extracted images. Only files whose name ends in ".jpg"
# (case-insensitive) are counted, one by one; counting every file in a folder
# that merely contains a .jpg would include non-image files and would miss
# folders holding only ".JPG" files.
total_images = sum(
    1
    for _, _, files in os.walk(extract_path)
    for file in files
    if file.lower().endswith(".jpg")
)
print(f"Total extracted images: {total_images}")

# Load image paths and labels.
# The label of an image is the name of the folder that directly contains it.
labels, img_paths = [], []
for root, dirs, files in os.walk(extract_path):
    # os.walk yields entries in arbitrary filesystem order. Sorting `dirs` in
    # place fixes the order in which subfolders are visited, and sorting
    # `files` fixes the order within a folder, so the resulting path/label
    # sequence is the same on every run.
    dirs.sort()
    category = os.path.basename(root)
    if category not in ("recyclable", "organic"):  # Adjust categories if needed
        continue
    for file in sorted(files):
        if file.lower().endswith(".jpg"):
            labels.append(category)
            img_paths.append(os.path.join(root, file))

# Debugging: check total images and labels
print(f"Total images found: {len(img_paths)}")
print(f"Total labels found: {len(labels)}")
assert len(img_paths) == len(labels), "Mismatch between images and labels!"

# Encode the string category names as integer class ids
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Write the encoded labels to disk, creating the parent folder if it is missing
labels_parent_dir = os.path.dirname(saved_labels_path)
os.makedirs(labels_parent_dir, exist_ok=True)
np.save(saved_labels_path, y)
print(f"Saved {len(y)} labels successfully.")

# Remove batches left over from an earlier run before new ones are written
if os.path.exists(saved_images_dir):
    print("Deleting old preprocessed image batches...")
    shutil.rmtree(saved_images_dir)

# Standardization helper
def standardize_images(images):
    """Return `images` centred on zero mean and scaled by the standard deviation.

    The mean and standard deviation are computed jointly over axes 0, 1 and 2
    and kept as size-1 dimensions so they broadcast against `images`.
    """
    reduce_axes = (0, 1, 2)
    centre = np.mean(images, axis=reduce_axes, keepdims=True)
    spread = np.std(images, axis=reduce_axes, keepdims=True)
    centred = images - centre
    # The small constant keeps the division defined when the spread is zero.
    return centred / (spread + 1e-7)

# Batch processing function
def preprocess_images(image_paths, batch_size=500, image_size=(256, 256)):
    """Resize, normalize and save images to disk in fixed-size batches.

    Each image is converted to RGB, resized to `image_size`, scaled to the
    [0, 1] range as float32 and stacked with the other images of its batch
    into ``batch_<k>.npy`` inside the module-level ``saved_images_dir``.
    Only one batch is held in memory at a time.

    Args:
        image_paths: Sequence of image file paths to process, in order.
        batch_size: Number of paths handled per saved batch file.
        image_size: ``(width, height)`` handed to PIL's ``resize``.

    Returns:
        A list with the positions (indices into `image_paths`) of the images
        that could not be read. Those images are absent from the saved
        batches, so any per-image data kept alongside the batches (e.g.
        labels) must drop the same positions to stay aligned.
    """
    os.makedirs(saved_images_dir, exist_ok=True)  # Ensure directory exists
    width, height = image_size
    skipped_indices = []

    for start in range(0, len(image_paths), batch_size):
        batch_index = start // batch_size
        batch_file_path = os.path.join(saved_images_dir, f"batch_{batch_index}.npy")

        images = []
        for offset, img_path in enumerate(image_paths[start : start + batch_size]):
            try:
                # The context manager closes the underlying file as soon as
                # the pixel data has been converted and resized.
                with Image.open(img_path) as img:
                    resized = img.convert("RGB").resize(image_size)
                images.append(np.array(resized, dtype=np.float32) / 255.0)  # Normalize
            except Exception as e:
                # Best effort: keep going, but remember which image was
                # dropped and report why it failed.
                skipped_indices.append(start + offset)
                print(f"⚠ Skipping corrupted image: {img_path} ({e})")

        if images:
            batch = np.array(images)
        else:
            # Every image of this batch failed: still save a batch file, with
            # the same rank and dtype as a regular batch.
            batch = np.empty((0, height, width, 3), dtype=np.float32)
        np.save(batch_file_path, batch)  # Save batch
        print(f"Saved batch {batch_index}: {batch.shape}")

    return skipped_indices

# Process and save images in batches
print("Preprocessing images in batches...")
skipped_indices = preprocess_images(img_paths, batch_size=500)
if skipped_indices:
    # The label file was written for every discovered path. Rewrite it without
    # the entries of skipped images so that label i matches saved image i.
    # The in-memory `y` is left untouched and still matches `img_paths`.
    aligned_labels = np.delete(y, skipped_indices)
    np.save(saved_labels_path, aligned_labels)
    print(
        f"Removed {len(skipped_indices)} labels of skipped images; "
        f"{len(aligned_labels)} labels saved."
    )
print("Image preprocessing complete!")
