# Data Collection for Mildew Detection in Cherry Leaves

## Objectives
* Fetch the dataset of cherry leaf images from the provided source and examine its structure.
* Save the raw image data in an organized directory structure for easy access in later stages.

## Inputs
* Dataset URL or access key if the dataset is hosted on platforms like Kaggle.

## Outputs
* Directory structure containing the raw dataset divided into training, validation, and test sets.

## Additional Comments
* Ensure compliance with any data use agreements or NDAs associated with the dataset.
---

# Import packages


In [None]:
%pip install -r "/Users/jordanfletorides/Desktop/github repos/ml-mildew-detector/requirements.txt"

In [None]:
import numpy
import os

---

# Change working directory

* We assume the notebooks are stored in a subfolder; therefore, when running a notebook in the editor, you need to change the working directory.

We need to change the working directory from its current folder to its parent folder
* We access the current directory with os.getcwd()

In [None]:
import os
# Record the directory the notebook is currently running in; a later cell
# derives the parent directory from this value.
current_dir = os.getcwd()
current_dir  # last expression of the cell, so the notebook displays it

We want to make the parent of the current directory the new current directory
* os.path.dirname() gets the parent directory
* os.chdir() defines the new current directory

In [None]:
# Make the parent of the directory stored in current_dir the working directory.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

Confirm the new current directory

In [None]:
# Refresh current_dir so it holds the working directory as it is now.
current_dir = os.getcwd()
current_dir

# Changing file permissions (for Unix-like OS)
# Recursively grants the owner write permission on the dataset folder.
# NOTE(review): this line appears before the cell that downloads the dataset;
# presumably the folder exists from a previous run - confirm the intended order.
! chmod -R u+w inputs/cherry_leaves_dataset

---

# Install Kaggle

In [None]:
# install kaggle package (version pinned to 1.5.12)
%pip install kaggle==1.5.12

Run the cell below **to change the Kaggle configuration directory to the current working directory and set permissions for the Kaggle authentication JSON**.

In [None]:
# Set the Kaggle configuration directory to the current working directory.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict kaggle.json to read/write for the owner only.
! chmod 600 kaggle.json

* Get the dataset path from the [Kaggle URL](https://www.kaggle.com/datasets/codeinstitute/cherry-leaves/data). When viewing the dataset on Kaggle, look at the part of the URL that follows https://www.kaggle.com/ (in some cases https://www.kaggle.com/datasets/) and assign it to KaggleDatasetPath.
* Set your destination folder.

Set the Kaggle Dataset and Download it.

In [None]:
KaggleDatasetPath = "codeinstitute/cherry-leaves/data"
DestinationFolder = "inputs/cherry_leaves_dataset"
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Unzip the downloaded file, and delete the zip file.

In [None]:
import zipfile

# Path of the archive inside the destination folder.
archive_path = DestinationFolder + '/cherry-leaves.zip'

# Unpack everything into the destination folder, then delete the archive.
with zipfile.ZipFile(archive_path, 'r') as archive:
    archive.extractall(DestinationFolder)

os.remove(archive_path)

---

# Data Preparation

## Data cleaning

### Check and remove non-image files

In [None]:
def clear_non_images(directory_path):
    """
    Delete every non-image file found directly inside each subfolder of a directory.

    Only the immediate subfolders of ``directory_path`` are visited, and only the
    entries directly inside each of them are inspected. An entry counts as an
    image when its name ends (case-insensitively) with .png, .jpg or .jpeg.
    Nested directories are left untouched. A summary line is printed per subfolder.

    Args:
    directory_path (str): The directory whose subfolders are cleaned.

    Returns:
    None. Problems (invalid path, files that cannot be removed) are reported
    with print() instead of being raised.
    """
    valid_extensions = ('.png', '.jpg', '.jpeg')  # Valid image file extensions

    # Ensure the base directory exists and is a directory
    if not os.path.isdir(directory_path):
        print(f"Provided path '{directory_path}' is not a directory.")
        return

    # Sorted so the summary lines come out in a stable order between runs
    for subfolder in sorted(os.listdir(directory_path)):
        path = os.path.join(directory_path, subfolder)
        if not os.path.isdir(path):
            continue

        image_count, non_image_count = 0, 0
        for file in os.listdir(path):
            if file.lower().endswith(valid_extensions):
                image_count += 1
                continue

            non_image_path = os.path.join(path, file)  # Construct full file path
            if os.path.isdir(non_image_path):
                # os.remove() cannot delete directories and would raise; nested
                # folders are not files, so they are skipped rather than removed.
                continue

            try:
                os.remove(non_image_path)  # Attempt to remove the non-image file
                non_image_count += 1
            except PermissionError:
                print(f"Permission denied: {non_image_path}")
            except OSError as error:
                # Any other filesystem failure: report it and keep cleaning.
                print(f"Could not remove '{non_image_path}': {error}")

        print(f"Subfolder '{subfolder}': {image_count} images, {non_image_count} non-images removed")

In [None]:
# Remove non-image files from every subfolder of the dataset folder
clear_non_images(directory_path='inputs/cherry_leaves_dataset')

## Split train validation test set

In [None]:
import os
import shutil
import random

def _move_files(source_dir, destination_dir, file_names):
    """Move the named entries from source_dir into destination_dir, creating it if needed."""
    os.makedirs(destination_dir, exist_ok=True)
    for file_name in file_names:
        shutil.move(os.path.join(source_dir, file_name),
                    os.path.join(destination_dir, file_name))


def distribute_dataset_images(dataset_path, train_ratio, validation_ratio, test_ratio, seed=None):
    """
    Distribute images across train, validation, and test folders based on specified ratios.

    Every class folder found directly inside ``dataset_path`` is shuffled and split;
    its entries are moved to ``<dataset_path>/<subset>/<class>`` and the original
    class folder is removed once it is empty. If a 'test' folder already exists the
    dataset is considered split and nothing is moved. Progress and errors are
    reported with print(); nothing is raised for filesystem failures.

    Args:
    dataset_path (str): The directory containing the class folders.
    train_ratio (float): The proportion of images to be used for the training set.
    validation_ratio (float): The proportion of images to be used for the validation set.
    test_ratio (float): The proportion of images to be used for the test set.
    seed (int, optional): Seed for a reproducible shuffle. When None (default) the
        global ``random`` state is used, as before.
    """
    import math  # local import keeps this cell self-contained

    # Check if ratios sum to 1. A tolerance is required because exact float
    # equality rejects valid inputs such as 0.6 + 0.3 + 0.1 (== 0.9999999999999999).
    if not math.isclose(train_ratio + validation_ratio + test_ratio, 1.0, abs_tol=1e-9):
        print("The sum of ratios must be 1.0.")
        return

    subsets = ('train', 'validation', 'test')

    try:
        folders = [name for name in sorted(os.listdir(dataset_path))
                   if os.path.isdir(os.path.join(dataset_path, name))]

        # Check if 'test' directory exists to avoid recreating structure
        if 'test' in folders:
            print("Dataset already contains a 'test' folder; nothing was moved.")
            return

        # Subset folders left over from an interrupted run are not classes.
        categories = [name for name in folders if name not in subsets]

        # A dedicated generator keeps a seeded run independent of the global state.
        shuffler = random if seed is None else random.Random(seed)

        for category in categories:
            category_path = os.path.join(dataset_path, category)

            # Sort first: os.listdir() order is arbitrary, so a seed alone would
            # not make the split reproducible.
            files = sorted(os.listdir(category_path))
            shuffler.shuffle(files)

            train_end = int(len(files) * train_ratio)
            validation_end = train_end + int(len(files) * validation_ratio)

            # The test subset takes whatever remains after the two truncated cuts.
            splits = {
                'train': files[:train_end],
                'validation': files[train_end:validation_end],
                'test': files[validation_end:],
            }
            for subset, subset_files in splits.items():
                _move_files(category_path,
                            os.path.join(dataset_path, subset, category),
                            subset_files)

            # Remove original category directory if empty
            if not os.listdir(category_path):
                os.rmdir(category_path)

        print("Dataset successfully distributed into train, validation, and test sets.")
    except OSError as e:
        # Filesystem problems (missing path, permissions, failed move) are reported, not raised.
        print(f"An error occurred: {e}")


Conventionally, the data is split as follows:
* The training set receives 70% of the data (ratio 0.70).
* The validation set receives 10% of the data (ratio 0.10).
* The test set receives 20% of the data (ratio 0.20).

In [None]:
# Split the dataset folder: 70% train, 10% validation, 20% test
distribute_dataset_images(dataset_path="inputs/cherry_leaves_dataset",
                          train_ratio=0.7,
                          validation_ratio=0.1,
                          test_ratio=0.2)

---