In [None]:
# Libraries

import os
from PIL import Image, ImageEnhance
import numpy as np
from collections import defaultdict

def get_folders(directory):
    '''
    Param: directory - str
    Use: lists the immediate subfolders of directory (plain files are ignored)
    Returns: list of folder names (names only, not full paths)
    '''
    # scandir is used as a context manager so the directory handle is
    # released as soon as the listing has been built
    with os.scandir(directory) as entries:
        return [entry.name for entry in entries if entry.is_dir()]


# Directories
RAW_IMG_DIR = "DATA/"
OUTPUT_DIR = "STANDARDIZED_AUGMENTED_DATA/"

# NOTE: get_folders is defined above on purpose -- this call runs at module
# load time, so the function has to exist before this line is reached
INSECT_FOLDERS_LIST = get_folders(RAW_IMG_DIR)

# make dir if doesn't exist already
os.makedirs(OUTPUT_DIR, exist_ok=True)


def get_image_file(img_path):
    '''
    Param: img_path - str
    Use: Extracts the final path component (everything after the last
         path separator) using the separator rules of the running OS.
    Returns: filename as str (empty str if img_path ends with a separator)
    '''

    # os.path.basename instead of splitting on '/' by hand: paths built with
    # os.path.join use the platform separator, which is not always '/'
    return os.path.basename(img_path)


def preprocess_image(image_path, desired_square_dim = 224, verbose = True):
    '''
    Param: image_path - str, desired_square_dim - int (side length in pixels),
           verbose - bool (print the error when processing fails)
    Use:
        # convert to RGB format
        # standardize to desired_square_dim x desired_square_dim
        # normalize: pixels normalized to [-1, 1], then mapped back to 8-bit
        # nothing is written to disk here; the returned Image is saved by the caller

    Returns: processed Image (None if processing failed),
             log as dict {image filename: exception raised} (empty on success)
    '''

    error_log = defaultdict(list)
    img_name = get_image_file(image_path)

    try:
        # Open image & convert to RGB; the with-block closes the underlying
        # file handle once the pixel data has been copied by convert()
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Standardize to fixed image size
        image = image.resize((desired_square_dim, desired_square_dim))

        # Convert to array & normalize pixel values to [-1, 1].
        # The divisor is 127.5 (half of the 0..255 channel range); it must not
        # depend on the image size, otherwise values leave the [-1, 1] range.
        image_array = np.array(image).astype(np.float32)
        image_array = (image_array / 127.5) - 1.0

        # Map back to 8-bit for an Image object; round and clip so float error
        # can neither truncate a value downwards nor wrap around on the cast
        restored = np.clip(np.rint((image_array + 1) * 127.5), 0, 255)
        preprocessed_image = Image.fromarray(restored.astype(np.uint8))

        return preprocessed_image, error_log  #successful preprocessing (error log will be empty)

    except Exception as e:
        # deliberate best-effort: one unreadable file must not stop the batch

        if verbose:
            print(f"Error processing {image_path}: {e}")

        #update error log
        error_log[img_name] = e

        return None, error_log   #unsuccessful preprocessing

    
def augment_image(img_object, output_subdir, base_filename, verbose = True):
    '''
    Param: img_object - Image object, output_subdir - str (existing folder to save into),
           base_filename - str (filename without extension),
           verbose - bool (print the error when augmentation fails)
    Use:
        # create new images by flipping horizontally, vertically, rotating (90, 180),
        #   and enhancing brightness
        # saves the original plus the 5 augmented images in output_subdir as
        #   <base_filename>_<augmentation name>.jpg

    Returns: boolean reflecting image augmentation success,
             log as dict {base_filename: exception raised} (empty on success)
    '''

    error_log = defaultdict(list)

    try:

        image = img_object

        # augmentation transformations to be applied
        augmentations = {
            "original": image,
            "flipped_horizontally": image.transpose(Image.FLIP_LEFT_RIGHT),
            "flipped_vertically": image.transpose(Image.FLIP_TOP_BOTTOM),
            "rotated_90": image.rotate(90),
            "rotated_180": image.rotate(180),
            "brightness_enhanced": ImageEnhance.Brightness(image).enhance(1.5)
        }

        # Apply each augmentation and save
        for aug_name, aug_image in augmentations.items():

            #save the 5 generated images AND the original (6 images total)
            aug_save_path = os.path.join(output_subdir, f"{base_filename}_{aug_name}.jpg")
            aug_image.save(aug_save_path)

        return True, error_log

    except Exception as e:
        # deliberate best-effort: a failed augmentation is reported, not raised

        if verbose:
            # only base_filename is known here (there is no image path in this
            # function), so it is what identifies the failing image
            print(f"Error processing {base_filename}: {e}")

        #update error log
        error_log[base_filename] = e

        return False, error_log


def main():
    '''
    Use:
        # walks every folder listed in INSECT_FOLDERS_LIST under RAW_IMG_DIR
        # standardizes each .jpg/.jpeg/.png file with preprocess_image
        # saves the result and its augmentations with augment_image into the
        #   matching folder (same relative layout) under OUTPUT_DIR
        # prints the error log of every image that failed and a final summary

    Returns: None
    '''

    total_files = 0
    successfully_processed_and_standardized = 0

    # Process selected folders
    for folder in INSECT_FOLDERS_LIST:

        # input folder and its mirror in the output tree
        folder_path = os.path.join(RAW_IMG_DIR, folder)
        output_folder = os.path.join(OUTPUT_DIR, folder)

        # make insect subfolders if DNE
        os.makedirs(output_folder, exist_ok=True)

        for root, _subdirs, files in os.walk(folder_path):

            # these depend only on the directory being walked, not on the file
            relative_path = os.path.relpath(root, folder_path)
            output_subdir = os.path.join(output_folder, relative_path)

            # nested folders need their own output folder, otherwise saving
            # the images found in them fails
            os.makedirs(output_subdir, exist_ok=True)

            for file in files:
                # every file is counted, including the ones skipped below
                total_files += 1

                if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
                    print(f"Skipped non-image file: {file}")
                    continue

                # get complete path to raw image
                input_path = os.path.join(root, file)

                # filename without extension, used to name the output files
                base_filename, _extension = os.path.splitext(file)

                # standardize this img
                processed_img, process_error_log = preprocess_image(input_path)

                # unsuccessfully processed img
                if processed_img is None:
                    print(process_error_log)
                    continue

                # increase data set with augmentation on processed image
                final_img_success, augment_error_log = augment_image(processed_img, output_subdir, base_filename)

                if final_img_success:
                    # augment_image writes 6 files per image (original + 5 variants)
                    successfully_processed_and_standardized += 6
                else:
                    print(augment_error_log)

    # summary
    print(f"\n\nStandardizing, normalization, and data augmentation on images complete!\n{successfully_processed_and_standardized}, total raw images: {total_files}")


# Run the whole pipeline only when this file is executed directly,
# not when it is imported from another module
if __name__ == "__main__":
    main()
    

Error processing DATA/silverfish-160/image_15_silverfish reddit.jpg: cannot identify image file 'DATA/silverfish-160/image_15_silverfish reddit.jpg'
defaultdict(<class 'list'>, {'image_15_silverfish reddit.jpg': UnidentifiedImageError("cannot identify image file 'DATA/silverfish-160/image_15_silverfish reddit.jpg'")})
Error processing DATA/silverfish-160/image_16_silverfish reddit.jpg: cannot identify image file 'DATA/silverfish-160/image_16_silverfish reddit.jpg'
defaultdict(<class 'list'>, {'image_16_silverfish reddit.jpg': UnidentifiedImageError("cannot identify image file 'DATA/silverfish-160/image_16_silverfish reddit.jpg'")})
Skipped non-image file: image_128_silverfish.webp
Error processing DATA/flea-160/image_42_flea_google.jpg: cannot identify image file 'DATA/flea-160/image_42_flea_google.jpg'
defaultdict(<class 'list'>, {'image_42_flea_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/flea-160/image_42_flea_google.jpg'")})
Error processing DATA/flea-160/i

Error processing DATA/flea-160/image_17_flea_google.jpg: cannot identify image file 'DATA/flea-160/image_17_flea_google.jpg'
defaultdict(<class 'list'>, {'image_17_flea_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/flea-160/image_17_flea_google.jpg'")})
Error processing DATA/flea-160/image_8_flea_google.jpg: cannot identify image file 'DATA/flea-160/image_8_flea_google.jpg'
defaultdict(<class 'list'>, {'image_8_flea_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/flea-160/image_8_flea_google.jpg'")})
Error processing DATA/flea-160/image_33_flea_google.jpg: cannot identify image file 'DATA/flea-160/image_33_flea_google.jpg'
defaultdict(<class 'list'>, {'image_33_flea_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/flea-160/image_33_flea_google.jpg'")})
Error processing DATA/flea-160/image_49_flea_google.jpg: cannot identify image file 'DATA/flea-160/image_49_flea_google.jpg'
defaultdict(<class 'list'>, {'image_49_flea_g

Error processing DATA/carpenter-ant-160/image_22_carpenterant_google.jpg: cannot identify image file 'DATA/carpenter-ant-160/image_22_carpenterant_google.jpg'
defaultdict(<class 'list'>, {'image_22_carpenterant_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/carpenter-ant-160/image_22_carpenterant_google.jpg'")})
Error processing DATA/carpenter-ant-160/image_31_carpenterant_google.jpg: cannot identify image file 'DATA/carpenter-ant-160/image_31_carpenterant_google.jpg'
defaultdict(<class 'list'>, {'image_31_carpenterant_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/carpenter-ant-160/image_31_carpenterant_google.jpg'")})
Error processing DATA/carpenter-ant-160/image_40_carpenterant_google.jpg: cannot identify image file 'DATA/carpenter-ant-160/image_40_carpenterant_google.jpg'
defaultdict(<class 'list'>, {'image_40_carpenterant_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/carpenter-ant-160/image_40_carpenterant_googl

Error processing DATA/carpenter-ant-160/image_15_carpenterant_google.jpg: cannot identify image file 'DATA/carpenter-ant-160/image_15_carpenterant_google.jpg'
defaultdict(<class 'list'>, {'image_15_carpenterant_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/carpenter-ant-160/image_15_carpenterant_google.jpg'")})
Error processing DATA/carpenter-ant-160/image_46_carpenterant_google.jpg: cannot identify image file 'DATA/carpenter-ant-160/image_46_carpenterant_google.jpg'
defaultdict(<class 'list'>, {'image_46_carpenterant_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/carpenter-ant-160/image_46_carpenterant_google.jpg'")})
Error processing DATA/carpenter-ant-160/image_37_carpenterant_google.jpg: cannot identify image file 'DATA/carpenter-ant-160/image_37_carpenterant_google.jpg'
defaultdict(<class 'list'>, {'image_37_carpenterant_google.jpg': UnidentifiedImageError("cannot identify image file 'DATA/carpenter-ant-160/image_37_carpenterant_googl