In [None]:
# import numpy as np
# import os
# import zipfile
# import shutil
# import tensorflow as tf
# from sklearn.preprocessing import LabelEncoder
# from PIL import Image
# from concurrent.futures import ThreadPoolExecutor
# from collections import Counter
# import random

# # ‚úÖ Define Paths
# ZIP_PATH = "Waste Classification Dataset.zip"
# EXTRACT_PATH = "Waste_Classification"
# SAVED_IMAGES_PATH = "Processed_Data/images.npy"
# SAVED_LABELS_PATH = "Processed_Data/labels.npy"

# # ‚úÖ Force Re-Extraction: Delete old dataset and re-extract
# if os.path.exists(EXTRACT_PATH):
#     print("üö® Deleting old extracted dataset...")
#     shutil.rmtree(EXTRACT_PATH)

# print("üìÇ Extracting dataset...")
# with zipfile.ZipFile(ZIP_PATH, "r") as z:
#     z.extractall(EXTRACT_PATH)
# print("‚úÖ Extraction complete!")

# # ‚úÖ Load Image Paths & Labels
# labels, img_paths = [], []
# for root, dirs, files in os.walk(EXTRACT_PATH):
#     category = os.path.basename(root)
#     if category in ["recyclable", "organic"]:  # Adjust categories if needed
#         for file in files:
#             if file.lower().endswith((".jpg", ".jpeg", ".png")):
#                 labels.append(category)
#                 img_paths.append(os.path.join(root, file))

# print(f"üîπ Total images found: {len(img_paths)}")
# print(f"üîπ Total labels found: {len(labels)}")
# assert len(img_paths) == len(labels), "‚ùå Mismatch between images and labels!"

# # ‚úÖ Balance the Dataset (Undersampling to Match the Smallest Class)
# class_counts = Counter(labels)
# min_count = min(class_counts.values())  # Get the smallest class size
# print(f"üîç Class distribution before balancing: {class_counts}")

# balanced_img_paths, balanced_labels = [], []
# for category in class_counts.keys():
#     category_indices = [i for i, lbl in enumerate(labels) if lbl == category]
#     sampled_indices = random.sample(category_indices, min_count)  # Undersampling
    
#     for idx in sampled_indices:
#         balanced_img_paths.append(img_paths[idx])
#         balanced_labels.append(labels[idx])

# print(f"‚úÖ Class distribution after balancing: {Counter(balanced_labels)}")

# # ‚úÖ Replace original lists with balanced versions
# img_paths = balanced_img_paths
# labels = balanced_labels

# # ‚úÖ Encode Labels
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(labels)

# # ‚úÖ Ensure Directory Exists Before Saving Labels & Images
# os.makedirs(os.path.dirname(SAVED_LABELS_PATH), exist_ok=True)

# # ‚úÖ Save Labels
# np.save(SAVED_LABELS_PATH, y)
# print(f"üìÅ Saved {len(y)} labels successfully.")

# # ‚úÖ Image Processing Function (Optimized)
# IMG_SIZE = (128, 128)

# def preprocess_image(img_path):
#     """Loads and preprocesses an image (resizing, normalization)."""
#     try:
#         img = Image.open(img_path).convert("RGB").resize(IMG_SIZE)
#         img = np.array(img, dtype=np.float32) / 255.0  # Normalize
#         return img
#     except Exception as e:
#         print(f"‚ö† Skipping corrupted image: {img_path}")
#         return None

# # ‚úÖ Use ThreadPoolExecutor for Faster Processing
# print("üîÑ Processing images using multiprocessing...")
# with ThreadPoolExecutor(max_workers=8) as executor:  # Use 8 threads for speed
#     images = list(executor.map(preprocess_image, img_paths))

# # ‚úÖ Remove failed loads (None values)
# valid_data = [(img, label) for img, label in zip(images, y) if img is not None]

# # ‚úÖ Split into separate arrays
# X, y = zip(*valid_data)  
# X = np.array(X)
# y = np.array(y)

# # ‚úÖ Save Processed Images
# np.save(SAVED_IMAGES_PATH, X)
# print(f"‚úÖ Saved processed images! Shape: {X.shape}")

# # ‚úÖ Load Dataset from Saved Files
# print("üìÇ Loading saved dataset...")
# X = np.load(SAVED_IMAGES_PATH, mmap_mode="r")
# y = np.load(SAVED_LABELS_PATH, mmap_mode="r")
# print(f"üìä Loaded images: {X.shape}, Labels: {y.shape}")

# # ‚úÖ Create TensorFlow Dataset (Efficient Streaming)
# dataset = tf.data.Dataset.from_tensor_slices((X, y))

# # ‚úÖ Apply Data Augmentation for Robustness
# def augment(image, label):
#     image = tf.image.random_flip_left_right(image)
#     image = tf.image.random_brightness(image, max_delta=0.2)
#     return image, label

# dataset = dataset.shuffle(len(X)).map(augment).batch(32).prefetch(tf.data.AUTOTUNE)
# print("üöÄ Dataset ready for training!")


NEW DATA PROCESSING

In [None]:
import numpy as np
import os
import zipfile
import shutil
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
from collections import Counter
import random

# Locations of the raw archive, the extraction target, and the cached arrays.
ZIP_PATH = "Waste Classification Dataset.zip"
EXTRACT_PATH = "Waste_Classification"
SAVED_IMAGES_PATH = "Processed_Data/images.npy"
SAVED_LABELS_PATH = "Processed_Data/labels.npy"

# Force a clean re-extraction on every run.
# The archive is opened *before* the previous extraction is removed: if the zip
# is missing or unreadable, ZipFile raises here and the existing extracted
# dataset is left untouched instead of being deleted for nothing.
with zipfile.ZipFile(ZIP_PATH, "r") as z:
    if os.path.exists(EXTRACT_PATH):
        print("üö® Deleting old extracted dataset...")
        shutil.rmtree(EXTRACT_PATH)

    print("üìÇ Extracting dataset...")
    z.extractall(EXTRACT_PATH)
print("‚úÖ Extraction complete!")

# Walk the extracted tree and collect every image file that sits directly in a
# folder named after one of the target classes; the folder name is the label.
labels, img_paths = [], []
for root, dirs, files in os.walk(EXTRACT_PATH):
    category = os.path.basename(root)
    if category not in ("recyclable", "organic"):
        continue
    for file in files:
        # str.endswith accepts a tuple, so one call covers every extension.
        if file.lower().endswith((".jpg", ".jpeg", ".png")):
            labels.append(category)
            img_paths.append(os.path.join(root, file))

print(f"üîπ Total images found: {len(img_paths)}")
print(f"üîπ Total labels found: {len(labels)}")
# Internal invariant only: both lists are appended together above.
assert len(img_paths) == len(labels), "‚ùå Mismatch between images and labels!"

# Fail here with an actionable message rather than later with an opaque
# "min() arg is an empty sequence" when the class balancing runs on no data.
if not img_paths:
    raise ValueError(
        f"No .jpg/.jpeg/.png files found under {EXTRACT_PATH!r} in folders "
        "named 'recyclable' or 'organic'; check the archive layout."
    )

# Undersample so every class contributes as many images as the rarest class.
class_counts = Counter(labels)
min_count = min(class_counts.values())
print(f"üîç Class distribution before balancing: {class_counts}")

# Bucket the sample positions by class in a single pass. Classes appear in
# first-seen order and positions stay ascending within each bucket.
indices_by_class = {}
for position, name in enumerate(labels):
    indices_by_class.setdefault(name, []).append(position)

balanced_img_paths, balanced_labels = [], []
for category, category_indices in indices_by_class.items():
    # Draw min_count distinct positions from this class without replacement.
    sampled_indices = random.sample(category_indices, min_count)
    balanced_img_paths.extend(img_paths[i] for i in sampled_indices)
    balanced_labels.extend(labels[i] for i in sampled_indices)

print(f"‚úÖ Class distribution after balancing: {Counter(balanced_labels)}")

# Fit the encoder on the balanced class names, then map each name to its code.
label_encoder = LabelEncoder().fit(balanced_labels)
y = label_encoder.transform(balanced_labels)

# The output folder must exist before anything is written into it.
output_dir = os.path.dirname(SAVED_LABELS_PATH)
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): this snapshot is written before any unreadable images are
# filtered out, so it only lines up with the saved image array if no image
# fails to load or if the labels file is rewritten after filtering.
np.save(SAVED_LABELS_PATH, y)
print(f"üìÅ Saved {len(y)} labels successfully.")

# Target (width, height) handed to PIL's resize for every image.
IMG_SIZE = (128, 128)


def preprocess_image(img_path):
    """Load one image file as a normalized RGB array.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A float32 NumPy array of the image converted to RGB, resized to
        ``IMG_SIZE`` and scaled from 0-255 down to 0-1, or ``None`` when the
        file could not be loaded for any reason.
    """
    try:
        # Image.open keeps the file handle open until the image is closed;
        # the context manager releases it as soon as the pixels have been
        # converted, instead of leaving thousands of handles to the GC.
        with Image.open(img_path) as raw:
            resized = raw.convert("RGB").resize(IMG_SIZE)
        return np.asarray(resized, dtype=np.float32) / 255.0  # Normalize
    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole batch.
        # The reason is reported so real problems are not mistaken for
        # corruption.
        print(f"‚ö† Skipping corrupted image: {img_path} ({e})")
        return None

# Decode and resize all images on a pool of 8 worker threads. executor.map
# returns results in input order, so images[i] belongs to y[i].
print("üîÑ Processing images using multiprocessing...")
with ThreadPoolExecutor(max_workers=8) as executor:
    images = list(executor.map(preprocess_image, balanced_img_paths))

# Positions of the images that loaded; failed loads come back as None.
valid_indices = [i for i, img in enumerate(images) if img is not None]
if not valid_indices:
    raise ValueError("No images could be loaded; nothing to save.")

# Keep only the labels whose image survived so X and y stay aligned.
y = np.asarray(y)[valid_indices]

# Copy the frames into one preallocated array and drop each source frame as
# soon as it is copied. This avoids holding the list, a tuple of frames and the
# stacked array all at once (each is several GiB for this dataset).
first = images[valid_indices[0]]
X = np.empty((len(valid_indices), *first.shape), dtype=first.dtype)
for row, src in enumerate(valid_indices):
    X[row] = images[src]
    images[src] = None
del images, first

# Save images AND the filtered labels together, so the two files on disk
# always have the same length even when some images were skipped.
os.makedirs(os.path.dirname(SAVED_IMAGES_PATH), exist_ok=True)
np.save(SAVED_IMAGES_PATH, X)
np.save(SAVED_LABELS_PATH, y)
print(f"‚úÖ Saved processed images! Shape: {X.shape}")

# Reload from disk to confirm the saved files round-trip. The in-memory
# copies are released first; otherwise np.load has to allocate a second full
# image array while the first is still alive, which is what raised
# "MemoryError: Unable to allocate 3.96 GiB".
print("üìÇ Loading saved dataset...")
del X, y
X = np.load(SAVED_IMAGES_PATH)
y = np.load(SAVED_LABELS_PATH)
print(f"üìä Loaded images: {X.shape}, Labels: {y.shape}")

def augment(image, label):
    """Apply random training-time augmentation to one (image, label) pair.

    Args:
        image: Image to augment; converted to a float32 tensor first.
            Assumes pixel values are already scaled to [0, 1] -- TODO confirm
            against whatever feeds this function.
        label: Label belonging to the image; passed through unchanged.

    Returns:
        A tuple ``(image, label)`` where the image has been randomly flipped
        left-right and had its brightness shifted by up to +/-0.2.
    """
    image = tf.convert_to_tensor(image, dtype=tf.float32)
    image = tf.image.random_flip_left_right(image)
    image = tf.image.random_brightness(image, max_delta=0.2)
    # random_brightness adds a delta to every pixel, which can push values
    # below 0 or above 1; clip so the output stays in the same range as the
    # un-augmented input.
    image = tf.clip_by_value(image, 0.0, 1.0)
    return image, label

def _make_sample_generator(images, targets):
    """Return a generator function yielding (image, label) pairs in a fresh random order.

    The arrays are bound here rather than read from module globals, so
    rebinding ``X`` or ``y`` later does not change what the dataset yields.
    """
    def _samples():
        # A new permutation on every call gives a full reshuffle per epoch.
        for idx in np.random.permutation(len(images)):
            yield images[idx], targets[idx]

    return _samples


# Stream samples straight from the NumPy arrays instead of using
# from_tensor_slices + shuffle(len(X)). That combination first copies the whole
# image array into a tensor and then fills a shuffle buffer as large as the
# dataset, i.e. two extra multi-GiB copies on top of X itself. Shuffling the
# indices gives the same uniform per-epoch shuffle without those copies.
dataset = tf.data.Dataset.from_generator(
    _make_sample_generator(X, y),
    output_signature=(
        tf.TensorSpec(shape=X.shape[1:], dtype=tf.as_dtype(X.dtype)),
        tf.TensorSpec(shape=(), dtype=tf.as_dtype(y.dtype)),
    ),
)
# A generator dataset has unknown length; declare it so len(dataset) and
# progress reporting keep working as they did with from_tensor_slices.
dataset = dataset.apply(tf.data.experimental.assert_cardinality(len(X)))
dataset = dataset.map(augment).batch(32).prefetch(tf.data.AUTOTUNE)

print("üöÄ Dataset ready for training!")


üö® Deleting old extracted dataset...
üìÇ Extracting dataset...
‚úÖ Extraction complete!
üîπ Total images found: 24705
üîπ Total labels found: 24705
üîç Class distribution before balancing: Counter({'organic': 13880, 'recyclable': 10825})
‚úÖ Class distribution after balancing: Counter({'organic': 10825, 'recyclable': 10825})
üìÅ Saved 21650 labels successfully.
üîÑ Processing images using multiprocessing...
‚úÖ Saved processed images! Shape: (21650, 128, 128, 3)
üìÇ Loading saved dataset...


MemoryError: Unable to allocate 3.96 GiB for an array with shape (1064140800,) and data type float32