In [2]:
import os
import cv2
import sys
from pathlib import Path

# Compress the image to 128x128 and convert it to grayscale

In [2]:
# Target size (width, height) intended for the preprocessed images.
# NOTE(review): the resize step currently emits 128x128 images regardless of
# this value -- confirm which size is intended before relying on it.
new_size = (64, 64)

# Dataset locations: raw images are read from the first folder and the
# preprocessed copies are written to the second one.
input_directory = Path('Tomato_Dataset/')
output_directory = Path('Tomato_Dataset_Preprocessed/')

# Validate the input first so that an aborted run does not leave an empty
# output directory behind.
if not input_directory.is_dir():
    print(f"Directory '{input_directory}' not found.")
    sys.exit(1)

# Create the output directory in case it does not exist yet.
output_directory.mkdir(parents=True, exist_ok=True)

# Function to resize the image 
def paddedResize(image, target_size=128):
    """Scale an image to fit a square canvas and pad it to exactly that size.

    The longer side is scaled to ``target_size`` pixels and the shorter side
    by the same factor, so the aspect ratio is preserved. The remaining space
    is filled with a constant black border, split as evenly as possible
    between opposite sides (any odd pixel goes to the bottom/right).

    Args:
        image: Image array exposing ``shape`` as ``(height, width, ...)``,
            e.g. the value returned by ``cv2.imread``.
        target_size: Side length in pixels of the square output. Defaults to
            128, the size this function has always produced.

    Returns:
        The padded image returned by ``cv2.copyMakeBorder``; its height and
        width both equal ``target_size``.

    Raises:
        ValueError: If ``image`` is None, has a zero-length side, or
            ``target_size`` is not a positive whole number.
    """
    if image is None:
        raise ValueError("paddedResize received None instead of an image")
    if target_size != int(target_size) or target_size <= 0:
        raise ValueError(f"target_size must be a positive integer, got {target_size!r}")
    target_size = int(target_size)

    old_height, old_width = (int(side) for side in image.shape[:2])
    if old_height == 0 or old_width == 0:
        raise ValueError("paddedResize received an empty image")

    # Integer arithmetic keeps the longer side at exactly target_size; the
    # float form int(side * (target / side)) can come out one pixel short
    # (e.g. a 49-pixel side at target 128 gives 127).
    longest_side = max(old_height, old_width)
    # Clamp to 1 so extremely thin images never ask cv2.resize for a 0-pixel side.
    new_height = max(1, old_height * target_size // longest_side)
    new_width = max(1, old_width * target_size // longest_side)

    # Centre the resized image on the square canvas.
    pad_top = (target_size - new_height) // 2
    pad_bottom = target_size - new_height - pad_top
    pad_left = (target_size - new_width) // 2
    pad_right = target_size - new_width - pad_left

    # INTER_AREA is the interpolation OpenCV recommends for shrinking images.
    resized_image = cv2.resize(
        image,
        (new_width, new_height),
        interpolation=cv2.INTER_AREA,
    )

    # Pad instead of stretching so the image content is not distorted.
    return cv2.copyMakeBorder(
        resized_image,
        pad_top,
        pad_bottom,
        pad_left,
        pad_right,
        cv2.BORDER_CONSTANT,
        value=0,
    )

# Walk through every folder below the input directory and preprocess each image found.
image_suffixes = {'.jpg', '.jpeg', '.png'}

for current_address, _subdirectories, file_names in os.walk(input_directory):
    # Mirror the input folder layout below the output directory.
    output_folder = Path(output_directory) / Path(current_address).relative_to(input_directory)

    for individual_file in file_names:
        # Only JPEG and PNG files are processed; compare the real extension
        # (including the dot) so names that merely end in "jpg" are ignored.
        if Path(individual_file).suffix.lower() not in image_suffixes:
            continue

        image_individual_path = os.path.join(current_address, individual_file)

        # cv2.imread returns None instead of raising when a file is missing,
        # corrupt or unsupported; skip those files rather than aborting the run.
        raw_image = cv2.imread(image_individual_path)
        if raw_image is None:
            print("Skipping unreadable image:", image_individual_path)
            continue

        # Convert to grayscale, then resize with padding.
        gray_image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2GRAY)
        processed_image = paddedResize(gray_image)

        # Make sure the destination folder exists before writing.
        output_folder.mkdir(parents=True, exist_ok=True)
        file_output_path = output_folder / individual_file

        # cv2.imwrite reports failure through its return value, so only
        # announce success when the file was really written.
        if not cv2.imwrite(str(file_output_path), processed_image):
            print("Failed to write image:", file_output_path)
            continue

        print("File successfully processed:", image_individual_path)

File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(3165).JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(1158).JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(3535).JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(1508).JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/3d9cca85-96cf-4186-b863-933bbbbc8075___GHLB2 Leaf 117.4.JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/aa2b76f2-a7e8-443c-81eb-20c00c0fd488___GHLB_PS Leaf 39.1 Day 16.jpg
File successfully processed: Tomato_Dataset/Tomato Late blight/72e49321-5eeb-4669-9641-0e9146da9159___RS_Late.B 5102.JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(2774).JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(3866).JPG
File successfully processed: Tomato_Dataset/Tomato Late blight/TomatoLateBlight(255).

In [None]:
## Reference:
#             - https://stackoverflow.com/questions/57394135/split-image-dataset-into-train-test-datasets

import shutil
import os
import numpy as np
import argparse
import random

def get_files_from_folder(path):
    """Return the names of the entries directly inside ``path``.

    Args:
        path: Directory to list.

    Returns:
        A 1-D NumPy string array of entry names (not full paths), sorted
        alphabetically. Sub-directories are included, exactly as
        ``os.listdir`` reports them. An empty directory yields an empty
        string array.
    """
    # os.listdir returns entries in arbitrary order; sorting makes every
    # downstream split reproducible. dtype=str keeps the result a string
    # array even for an empty folder (np.asarray([]) would be float64).
    return np.asarray(sorted(os.listdir(path)), dtype=str)

# Nominal share of each class that goes to the validation and test splits.
# The training share is passed to main() as train_ratio; the test split
# receives whatever remains after train and validation so no image is dropped.
val_ratio = 0.15
test_ratio = 0.15


def main(path_to_data, path_to_test_data, path_to_val_data, train_ratio,
         path_to_train_data="Tomato_Dataset_Splitted/train", seed=None):
    """Copy a class-per-folder dataset into train / validation / test folders.

    Every immediate sub-directory of ``path_to_data`` is treated as one class.
    The files of each class are shuffled and then copied (the source dataset
    is left untouched) into ``<split folder>/<class name>/`` so the class
    labels survive the split.

    Args:
        path_to_data: Directory holding one sub-directory per class.
        path_to_test_data: Destination root for the test split.
        path_to_val_data: Destination root for the validation split.
        train_ratio: Fraction (0..1) of each class copied to the training
            split. The module-level ``val_ratio`` sets the validation share
            and the test split receives the remainder.
        path_to_train_data: Destination root for the training split.
        seed: Optional seed for the shuffle; pass a value to get the same
            split on every run.

    Raises:
        ValueError: If ``train_ratio`` is outside the range 0..1.
        FileNotFoundError: If ``path_to_data`` is not an existing directory.
    """
    train_ratio = float(train_ratio)
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")
    if not os.path.isdir(path_to_data):
        raise FileNotFoundError(f"Data directory '{path_to_data}' not found.")

    # Only the immediate sub-directories are classes.
    _, class_names, _ = next(os.walk(path_to_data))

    shuffler = random.Random(seed)
    split_roots = (path_to_train_data, path_to_val_data, path_to_test_data)

    for class_name in sorted(class_names):
        class_directory = os.path.join(path_to_data, class_name)

        # Keep regular files only (nested folders cannot be copied as images)
        # and sort before shuffling so a given seed always yields the same split.
        file_names = sorted(
            str(entry)
            for entry in get_files_from_folder(class_directory)
            if os.path.isfile(os.path.join(class_directory, str(entry)))
        )
        shuffler.shuffle(file_names)

        # Per-class counts; clamp so train + validation never exceed the total.
        total = len(file_names)
        train_count = min(total, int(round(total * train_ratio)))
        val_count = min(total - train_count, int(round(total * val_ratio)))
        val_end = train_count + val_count

        subsets = (
            file_names[:train_count],         # training
            file_names[train_count:val_end],  # validation
            file_names[val_end:],             # test (the remainder)
        )

        for split_root, subset in zip(split_roots, subsets):
            destination = os.path.join(split_root, class_name)
            os.makedirs(destination, exist_ok=True)
            for file_name in subset:
                shutil.copy(
                    os.path.join(class_directory, file_name),
                    os.path.join(destination, file_name),
                )


            

def parse_args(argv=None):
    """Parse the command-line options of the dataset divider.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            the process arguments (``sys.argv[1:]``).

    Returns:
        An ``argparse.Namespace`` with ``data_path``,
        ``test_data_path_to_save``, ``val_data_path_to_save`` and
        ``train_ratio`` (already converted to float).
    """
    parser = argparse.ArgumentParser(description="Dataset divider")
    parser.add_argument("--data_path", required=True,
                        help="Path to data")
    parser.add_argument("--test_data_path_to_save", required=True,
                        help="Path to test data where to save")
    # Optional so existing command lines keep working; the default is the
    # validation folder name already used for the split dataset.
    parser.add_argument("--val_data_path_to_save",
                        default="Tomato_Dataset_Splitted/val",
                        help="Path to validation data where to save")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as %% or --help fails with a formatting error.
    parser.add_argument("--train_ratio", required=True, type=float,
                        help="Train ratio - 0.7 means splitting data in "
                             "70 %% train and 15 %% test and 15 %% val")
    return parser.parse_args(argv)


# NOTE(review): inside a Jupyter kernel the process arguments do not contain
# these options, so this guard ends in an argparse usage error there; call
# main(...) directly from a cell instead, or run the code as a script.
if __name__ == "__main__":
    args = parse_args()
    # main() takes the test path, the validation path and then the ratio.
    main(args.data_path, args.test_data_path_to_save,
         args.val_data_path_to_save, args.train_ratio)