In [1]:
import numpy as np
import os
import shutil
from PIL import Image, UnidentifiedImageError
from skimage.transform import resize
import pickle

In [2]:
def resize_with_padding(image, target_height, target_width):
    """
    Resize image with padding to maintain aspect ratio.

    """
    # Calculate aspect ratio of original image
    original_height, original_width, _ = image.shape
    aspect_ratio = original_width / original_height
    
    # Resize image while preserving aspect ratio and fill with white pixels
    if aspect_ratio > target_width / target_height:
        # Image is wider, resize based on width
        new_width = target_width
        new_height = int(target_width / aspect_ratio)
    elif aspect_ratio < target_width / target_height:
        # Image is taller, resize based on height
        new_height = target_height
        new_width = int(aspect_ratio * target_height)
    else:
        # Image has the same aspect ratio as target
        new_height = target_height
        new_width = target_width
    
    resized_image = resize(image, (new_height, new_width), mode='constant') * 255  # Fill with white pixels
    
    # Pad to target dimensions with white pixels if necessary
    padded_image = np.ones((target_height, target_width, 3), dtype=np.uint8) * 255  # Fill with white pixels
    y_offset = (target_height - new_height) // 2
    x_offset = (target_width - new_width) // 2
    padded_image[y_offset:y_offset+new_height, x_offset:x_offset+new_width] = resized_image
    
    return padded_image.astype(np.uint8)

In [3]:
def remove_transparency(im, bg_colour=(255, 255, 255)):
    # Only process if image has transparency (http://stackoverflow.com/a/1963146)
    if im.mode in ('RGBA', 'LA') or (im.mode == 'P' and 'transparency' in im.info):

        # Need to convert to RGBA if LA format due to a bug in PIL (http://stackoverflow.com/a/1963146)
        alpha = im.convert('RGBA').split()[-1]

        # Create a new background image of our matt color.
        # Must be RGBA because paste requires both images have the same format
        # (http://stackoverflow.com/a/8720632  and  http://stackoverflow.com/a/9459208)
        bg = Image.new("RGBA", im.size, bg_colour + (255,))
        bg.paste(im, mask=alpha)
        return bg.convert('RGB')

    else:
        return im

In [4]:
def process_image(image_path, target_height, target_width):
    with Image.open(path) as img:
        # Remove transparency and resize the image
        img = remove_transparency(img)
        new_size = (224, 224) # Adjust the new size as needed
        img = img.resize(new_size).convert('RGB')
        img_array = np.array(img).astype(np.uint8)
        
        # Resize and pad the image
        processed_image = resize_with_padding(img_array, target_height, target_width)

    return processed_image

In [5]:
trashbox_path = 'TrashBox/rm_deleted/'
target_height = 224
target_width = 224

image_paths = []
labels = []
allowed_extensions = {".jpg", ".png"}

for foldername in os.listdir(trashbox_path):
    folder = os.path.join(trashbox_path, foldername)
    for root, dirs, files in os.walk(folder):
        # Check if the current folder contains any subfolders
        subfolders = [d for d in dirs if os.path.isdir(os.path.join(root, d))]

        # If there are no subfolders, collect images from the current folder
        if not subfolders:
            for file in files:
                _, extension = os.path.splitext(file.lower())
                if extension in allowed_extensions:
                    image_paths.append(os.path.join(root, file))
                    # Extract the label from the current subfolder
                    labels.append(os.path.basename(root))
        else:
            # Iterate through each subfolder
            for subfolder in subfolders:
                subfolder_path = os.path.join(root, subfolder)
                for file in os.listdir(subfolder_path):
                    _, extension = os.path.splitext(file.lower())
                    if extension in allowed_extensions:
                        image_paths.append(os.path.join(subfolder_path, file))
                        # Extract the label from the current subfolder
                        labels.append(os.path.basename(subfolder_path))

In [6]:
processed_images = []
for path in image_paths:
    processed_image = process_image(path, target_height, target_width)
    processed_images.append(processed_image)

In [7]:
X = np.array(processed_images)
y = np.array(labels)

In [8]:
# Save the arrays to files with pickle protocol 2
with open('trashbox_224x224.npy', 'wb') as file:
    np.save(file, X, allow_pickle=False)
    
with open('labels.npy', 'wb') as file:
    np.save(file, y, allow_pickle=False)

In [9]:
X.shape

(23685, 224, 224, 3)

In [10]:
y.shape

(23685,)