# **Data Preparation**

## Objectives

* Clean and prepare the data set for further processes.

* Split the data into train, validation, test sets. 

## Additional Comments

* No additional comments.



---

# Change the working directory

In [1]:
import os

In [2]:
# Record and display the directory the notebook is currently running in.
current_dir = os.getcwd()
current_dir

'/workspace/mildew-detection/jupyter_notebooks'

In [3]:
# Move up from the notebooks folder to its parent so that the relative
# paths used later (e.g. 'inputs/datasets') resolve from there.
# NOTE(review): the path is absolute and hard-coded; it must be edited
# if the project is checked out somewhere else.
os.chdir('/workspace/mildew-detection')
print("You set a new current directory")

You set a new current directory


In [4]:
# Re-read the working directory to confirm the change took effect.
current_dir = os.getcwd()
current_dir

'/workspace/mildew-detection'

---

## Data cleaning

In this section we check for and remove dodgy (unreadable or corrupted) and non-image files, so that the dataset contains only images with the correct file types for the subsequent steps.

## Remove dodgy images

In [5]:
# Code used to remove dodgy images, borrowed from nicknochnack - ImageClassification project. Combined with the non-image-remove section from Walkthrough project 1.

import cv2
import os

# Function to remove non-image files
def remove_non_image_files(data_dir):
    """Delete files that are not usable images from the class folders.

    Only the 'healthy' and 'powdery_mildew' sub-folders of ``data_dir`` are
    scanned; a folder that does not exist is skipped silently. Inside each
    folder a regular file is deleted when either

    * its name does not end in '.png', '.jpg' or '.jpeg' (compared
      case-insensitively), or
    * OpenCV cannot decode it (``cv2.imread`` returns ``None`` or raises).

    Sub-directories are left untouched. The deletion is permanent.

    Args:
        data_dir: Path of the directory that contains the class folders.
    """
    image_extension = ('.png', '.jpg', '.jpeg')
    dataset_folders = ['healthy', 'powdery_mildew']
    for dataset_folder in dataset_folders:
        folder_path = os.path.join(data_dir, dataset_folder)
        if not os.path.isdir(folder_path):
            continue

        for file_name in os.listdir(folder_path):
            file_path = os.path.join(folder_path, file_name)
            if not os.path.isfile(file_path):
                continue

            # Cheap check first: a wrong extension never needs decoding.
            if not file_name.lower().endswith(image_extension):
                os.remove(file_path)  # Remove non-image file
                continue

            # Keep the try body to the one call that can fail on bad data.
            try:
                img = cv2.imread(file_path)
            except Exception as e:
                print(f"Issue with image {file_path}: {str(e)}")
                img = None

            if img is None:
                os.remove(file_path)  # Remove unreadable (dodgy) image

        print(f"Folder: {dataset_folder} - non-image files removed")

# Directory path containing the images. It is relative, so it is resolved
# against the current working directory; the function looks for the
# 'healthy' and 'powdery_mildew' sub-folders inside it.
data_dir = 'inputs/datasets'

# Call the function to remove non-image files with the specified directory.
# Files are deleted in place.
remove_non_image_files(data_dir)


Folder: healthy - non-image files removed
Folder: powdery_mildew - non-image files removed


## Split train validation test sets

* In this section we split the collected and cleaned data into train, validation and test sets, each stored in a separate folder.

* The training set receives 70% of the data (ratio 0.70).
* The validation set receives 10% of the data (ratio 0.10).
* The test set receives the remaining 20% of the data (ratio 0.20).

In [7]:
# Code from linx02 - genederpredictor project, used together with the Walkthrough 1 project section for Split train validation test sets. 

import os
import shutil
import random
import joblib

def split_images(data_dir, train_set_ratio, validation_set_ratio, test_set_ratio):
    """Move each class folder's files into train/validation/test folders.

    Every sub-directory of ``data_dir`` is treated as a class label. For each
    label the files are shuffled and moved into
    ``data_dir/<train|validation|test>/<label>/``, after which the emptied
    label folder is removed. Plain files lying directly in ``data_dir`` are
    ignored.

    The train and validation counts are ``int(file_count * ratio)`` (rounded
    down); the test set receives whatever is left. The shuffle uses the
    global ``random`` state, so the split is only reproducible if the caller
    seeds it beforehand.

    If ``data_dir`` already contains an entry named 'test' the data is
    assumed to be split and nothing is done. If the three ratios do not sum
    to 1.0 (within floating-point tolerance) a message is printed and
    nothing is done.

    Args:
        data_dir: Directory that holds one sub-folder per class label.
        train_set_ratio: Fraction of each class moved to the train set.
        validation_set_ratio: Fraction moved to the validation set.
        test_set_ratio: Fraction intended for the test set; only used to
            validate that the ratios sum to 1.0.

    Returns:
        None.
    """
    import math  # local import: keeps the block independent of the cell's imports

    # Exact float equality rejects valid inputs such as 0.7 + 0.2 + 0.1,
    # which evaluates to 0.9999999999999999.
    total_ratio = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(total_ratio, 1.0):
        print("train_set_ratio + validation_set_ratio + test_set_ratio should sum to 1.0")
        return

    entries = os.listdir(data_dir)
    if 'test' in entries:
        # Already split on a previous run.
        return

    # Only directories are class labels; stray files would otherwise break
    # os.makedirs / os.listdir below.
    labels = [
        entry for entry in entries
        if os.path.isdir(os.path.join(data_dir, entry))
    ]

    # Create train, validation and test folders with a sub-folder per label.
    for split_name in ('train', 'validation', 'test'):
        for label in labels:
            os.makedirs(os.path.join(data_dir, split_name, label))

    for label in labels:
        label_dir = os.path.join(data_dir, label)
        files = os.listdir(label_dir)
        random.shuffle(files)

        train_end = int(len(files) * train_set_ratio)
        validation_end = train_end + int(len(files) * validation_set_ratio)

        files_by_split = {
            'train': files[:train_end],
            'validation': files[train_end:validation_end],
            'test': files[validation_end:],  # remainder, absorbs rounding
        }

        for split_name, split_files in files_by_split.items():
            target_dir = os.path.join(data_dir, split_name, label)
            for file_name in split_files:
                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(target_dir, file_name))

        # Every file has been moved, so the label folder is empty.
        os.rmdir(label_dir)

In [8]:
# Split the cleaned dataset in place: 70% train, 10% validation, 20% test.
split_images(
    data_dir="inputs/datasets",
    train_set_ratio=0.7,
    validation_set_ratio=0.1,
    test_set_ratio=0.2,
)

---