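### This notebook reduces the on-disk size of both the damage and body datasets by resizing the images and re-saving them as JPEG (the body dataset is additionally down-sampled to a random 25% of its images).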

In [3]:
import os
import random
import shutil
from PIL import Image

def process_and_copy_images(source_dir, dest_dir, percentage, target_size=(128, 128), format='JPEG'):
    """
    Resizes and re-encodes a random sample of images from each subfolder of
    ``source_dir`` and writes them under ``dest_dir``, mirroring the subfolder names.

    Only files ending in ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive) are
    considered. Subfolders containing no such files are skipped and no
    corresponding destination subfolder is created for them.

    Note: output files are named after the source file's stem, so two sources
    that differ only by extension (e.g. ``a.png`` and ``a.jpg``) map to the same
    output file and the later one overwrites the earlier one.

    :param source_dir: The source directory containing subfolders with images.
    :param dest_dir: The destination directory where processed images will be written.
                     Created (including parents) if it does not exist.
    :param percentage: The percentage of images to select from each subfolder
                       (int or float). At least one image is always selected from
                       a non-empty subfolder, and the sample size is capped at the
                       number of available images.
    :param target_size: The ``(width, height)`` to which images will be resized.
    :param format: The image format used for saving. The output extension is
                   ``.jpg`` for JPEG (or when ``format`` is None) and the
                   lower-cased format name otherwise.
    """
    # Modes that Pillow's JPEG encoder can write directly; anything else
    # (e.g. RGBA or palette PNGs) must be converted or save() raises OSError.
    jpeg_writable_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
    is_jpeg = format is None or format.upper() == 'JPEG'
    extension = '.jpg' if is_jpeg else '.' + format.lower()

    # Ensure the destination directory exists
    os.makedirs(dest_dir, exist_ok=True)

    # Iterate through the subdirectories in the source directory
    for subdir in os.listdir(source_dir):
        subdir_path = os.path.join(source_dir, subdir)

        # Skip if it's not a directory
        if not os.path.isdir(subdir_path):
            continue

        # List all image files in the subdirectory
        files = [f for f in os.listdir(subdir_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        # Nothing to sample from: random.sample would raise on an empty population
        if not files:
            continue

        # Keep the sample size an int within [1, len(files)] so that float or
        # out-of-range percentages do not make random.sample raise
        sample_size = min(len(files), max(1, int(len(files) * percentage // 100)))
        selected_files = random.sample(files, sample_size)

        # Create corresponding subdirectory in destination directory
        dest_subdir = os.path.join(dest_dir, subdir)
        os.makedirs(dest_subdir, exist_ok=True)

        # Process and write the selected files to the destination subdirectory
        for file in selected_files:
            file_path = os.path.join(subdir_path, file)
            with Image.open(file_path) as img:
                processed = img
                # Drop alpha/palette data that JPEG cannot represent
                if is_jpeg and processed.mode not in jpeg_writable_modes:
                    processed = processed.convert('RGB')
                processed = processed.resize(target_size)
                # Save processed image in destination directory
                new_filepath = os.path.join(dest_subdir, os.path.splitext(file)[0] + extension)
                processed.save(new_filepath, format=format)

# Source and destination directories for each dataset.
# Replace these with the actual paths on your machine.
body_source_test_dir = '/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/body/test'
body_source_train_dir = '/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/body/train'
body_dest_dir = '/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/bodyCNN'

damage_source_test_dir = '/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/damage/validation'
damage_source_train_dir = '/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/damage/training'
damage_dest_dir = '/Users/sachin/My Files/Data Science/Playground/group-coursework-sa3n/data/damageCNN'

# One entry per split: (source directory, destination directory, percentage kept).
# The body dataset keeps a 25% sample per class folder; the damage dataset keeps
# every image. The damage 'validation'/'training' folders are written out as
# 'test'/'train' so both reduced datasets share the same layout.
reduction_jobs = [
    (body_source_test_dir, os.path.join(body_dest_dir, 'test'), 25),
    (body_source_train_dir, os.path.join(body_dest_dir, 'train'), 25),
    (damage_source_test_dir, os.path.join(damage_dest_dir, 'test'), 100),
    (damage_source_train_dir, os.path.join(damage_dest_dir, 'train'), 100),
]

for job_source, job_dest, job_percentage in reduction_jobs:
    process_and_copy_images(job_source, job_dest, job_percentage)

### Both datasets have been reduced in size; each is now around 12 MB.