# **Data Collection**

## Objectives

- Fetch the data from Kaggle, save it as raw data and prepare it for further processing.

## Inputs

- Kaggle JSON file (`kaggle.json`) - the API token required for Kaggle authentication.

## Outputs

- Generate the Dataset: inputs/cherry_leaves_dataset.

## Additional Comments

- No additional comments here.


---


# Import packages

In [1]:
# Install the project's dependencies from its requirements file.
# NOTE(review): the absolute /workspace/... path ties this cell to one
# workspace layout - confirm it matches the environment before running.
! pip install -r /workspace/mildew-detection-in-cherry-leaves/requirements.txt




In [2]:
import numpy
import os
import random

Change the working directory

In [3]:
# Record the directory the notebook is currently running in; the bare
# expression on the last line makes the notebook display it.
current_dir = os.getcwd()
current_dir

'/workspace/mildew-detection-in-cherry-leaves/jupyter_notebooks'

In [4]:
# Move up to the parent of the recorded directory, so the relative paths
# used in later cells (e.g. "inputs/...") resolve against that parent.
os.chdir(os.path.dirname(current_dir))
print("You set a new current directory")

You set a new current directory



You set a new current directory

In [5]:
# Re-read the working directory to confirm the change took effect.
current_dir = os.getcwd()
current_dir

'/workspace/mildew-detection-in-cherry-leaves'

---

# Install Kaggle

In [6]:
# Install the Kaggle command-line client used below to download the dataset.
pip install kaggle


Note: you may need to restart the kernel to use updated packages.


In [7]:
# Point the Kaggle client at the current working directory for its
# configuration, which is where kaggle.json is expected to have been placed.
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()
# Restrict the token file to owner read/write only.
! chmod 600 kaggle.json

In [8]:
# Kaggle dataset identifier and the local folder the archive is saved to.
KaggleDatasetPath = "codeinstitute/cherry-leaves"
DestinationFolder = "inputs/cherry_leaves_dataset"   
# Download the dataset archive; the {...} placeholders are filled in from the
# Python variables above before the shell command runs.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

Dataset URL: https://www.kaggle.com/datasets/codeinstitute/cherry-leaves
License(s): unknown
Downloading cherry-leaves.zip to inputs/cherry_leaves_dataset
 93%|███████████████████████████████████▏  | 51.0M/55.0M [00:02<00:00, 37.6MB/s]
100%|██████████████████████████████████████| 55.0M/55.0M [00:02<00:00, 27.4MB/s]


In [9]:
import zipfile
# Unpack the downloaded archive into the destination folder it was saved in.
with zipfile.ZipFile(DestinationFolder + '/cherry-leaves.zip', 'r') as zip_ref:
    zip_ref.extractall(DestinationFolder)

# The archive is no longer needed once its contents are extracted.
os.remove(DestinationFolder + '/cherry-leaves.zip')

---

# Data Preparation
## Data cleaning
Check files, remove all non-image data and excess images

In [12]:
import os
import random

def remove_non_img_data(my_data_dir):
    """
    Remove non-image files from every subdirectory of the given directory.

    Each immediate subdirectory of ``my_data_dir`` is scanned. Regular files
    whose name does not end (case-insensitively) with one of the recognised
    image extensions are deleted, and the number of image and non-image
    files found is printed for each subdirectory.

    Entries of ``my_data_dir`` that are not directories, and directories
    nested inside a subdirectory, are left untouched and are not counted.

    Args:
        my_data_dir: Path of the directory whose subdirectories are cleaned.
    """
    image_ext = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif')

    for folder in os.listdir(my_data_dir):
        folder_path = os.path.join(my_data_dir, folder)
        # A stray file sitting next to the class folders cannot be listed
        # like a directory, so skip anything that is not a directory.
        if not os.path.isdir(folder_path):
            continue

        image_count = 0
        non_image_count = 0

        for given_file in os.listdir(folder_path):
            file_location = os.path.join(folder_path, given_file)
            # os.remove cannot delete directories; only regular files are
            # classified as image / non-image.
            if not os.path.isfile(file_location):
                continue

            if given_file.lower().endswith(image_ext):
                image_count += 1
            else:
                os.remove(file_location)
                non_image_count += 1

        print(f"Folder: {folder} - has image file/s", image_count)
        print(f"Folder: {folder} - has non-image file/s", non_image_count)

In [13]:
# Clean the extracted dataset: one sub-folder per class is expected here.
remove_non_img_data(my_data_dir='inputs/cherry_leaves_dataset/cherry-leaves')


Folder: healthy - has image file/s 2104
Folder: healthy - has non-image file/s 0
Folder: powdery_mildew - has image file/s 2104
Folder: powdery_mildew - has non-image file/s 0


## Split the data into train, validation and test sets


In [14]:
import os
import shutil
import random
import joblib

- Get the class labels, then create the train, validation and test folders, each with one sub-folder per class label.
- Loop through the image files of each class and move each file into one of the folders created.

In [15]:
def delete_half_images(my_data_dir):
    """
    Randomly delete half of the image files in each subdirectory of the given directory.

    Only files with a recognised image extension are considered. When a
    subdirectory holds an odd number of images, the extra one is kept.
    """
    image_ext = ('.jpg', '.jpeg', '.png', '.bmp', '.tiff', '.gif')

    for entry in os.listdir(my_data_dir):
        subdir = os.path.join(my_data_dir, entry)
        # Only class folders are processed; plain files at this level are ignored
        if not os.path.isdir(subdir):
            continue

        images = []
        for name in os.listdir(subdir):
            if name.lower().endswith(image_ext):
                images.append(name)

        # Shuffle first so the deleted half is a random selection
        random.shuffle(images)
        doomed_count = len(images) // 2

        for name in images[:doomed_count]:
            os.remove(os.path.join(subdir, name))

def count_files_in_folder(folder_path):
    """
    Count every file found under the given folder, including all of its subfolders.

    Returns the total number of files as an integer.
    """
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        total += len(filenames)
    return total

def _list_subdirectories(parent_dir):
    """Return the names of the entries of ``parent_dir`` that are directories."""
    if not os.path.isdir(parent_dir):
        return []
    return [name for name in os.listdir(parent_dir)
            if os.path.isdir(os.path.join(parent_dir, name))]


def split_images_train_validation_test(my_data_dir, train_ratio, validation_ratio, test_ratio):
    """
    Split the files of each class folder into train, validation and test sets.

    When ``my_data_dir`` has no ``test`` entry yet, half of the images in
    every class folder are first deleted (via ``delete_half_images``). The
    remaining files of each class are shuffled and moved into
    ``<my_data_dir>/train/<label>``, ``<my_data_dir>/validation/<label>`` and
    ``<my_data_dir>/test/<label>``, and the emptied class folder is removed.

    When a ``test`` entry already exists the dataset is treated as already
    split: nothing is deleted or moved and only the summary is printed.

    In both cases the number of files per set and class is printed.

    Args:
        my_data_dir: Directory containing one sub-folder per class label.
        train_ratio: Fraction of each class moved to the train set.
        validation_ratio: Fraction of each class moved to the validation set.
        test_ratio: Fraction of each class for the test set. The test set
            receives every file left over after the other two sets are filled.

    Returns:
        None. If the three ratios do not sum to 1.0 a message is printed and
        the function returns without touching any file.
    """
    import math  # local import so this cell does not rely on another cell

    # Compare with a tolerance: an exact `!= 1.0` test rejects valid inputs
    # such as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
    if not math.isclose(train_ratio + validation_ratio + test_ratio, 1.0):
        print("train_ratio + validation_ratio + test_ratio should sum 1.0")
        return

    split_folders = ['train', 'validation', 'test']

    if 'test' in os.listdir(my_data_dir):
        # Already split: read the class labels back from the test folder so
        # the summary below still reports per-class counts.
        labels = _list_subdirectories(os.path.join(my_data_dir, 'test'))
    else:
        # First delete 50% of the images in each folder
        delete_half_images(my_data_dir)

        # Only directories are class labels; stray files are ignored, and the
        # names of the split folders themselves are never treated as classes.
        labels = [name for name in _list_subdirectories(my_data_dir)
                  if name not in split_folders]

        for folder in split_folders:
            for label in labels:
                os.makedirs(name=os.path.join(my_data_dir, folder, label), exist_ok=True)

        for label in labels:
            label_dir = os.path.join(my_data_dir, label)
            files = os.listdir(label_dir)
            random.shuffle(files)

            train_set_qty = int(len(files) * train_ratio)
            validation_set_qty = int(len(files) * validation_ratio)
            validation_end = train_set_qty + validation_set_qty

            for index, file_name in enumerate(files):
                # The first block of shuffled files goes to train, the next
                # to validation, and whatever remains goes to test.
                if index < train_set_qty:
                    target = 'train'
                elif index < validation_end:
                    target = 'validation'
                else:
                    target = 'test'

                shutil.move(os.path.join(label_dir, file_name),
                            os.path.join(my_data_dir, target, label, file_name))

            # Every file has been moved out, so the class folder is now empty
            os.rmdir(label_dir)

    # Print the number of files in each folder
    for folder in split_folders:
        print(f"Number of files in {folder} folder:")
        for label in labels:
            folder_path = os.path.join(my_data_dir, folder, label)
            if os.path.isdir(folder_path):
                file_count = len(os.listdir(folder_path))
                print(f"  {label}: {file_count} files")


### The ratios used to split the data into each set:

- Train set: 0.70 of the images in each class.
- Validation set: 0.10 of the images in each class.
- Test set: 0.20 of the images in each class.

In [16]:

# Split the cleaned dataset 70/10/20 into train, validation and test sets.
# The path is a plain string: it has no placeholders, so no f-prefix is needed.
split_images_train_validation_test(my_data_dir="inputs/cherry_leaves_dataset/cherry-leaves",
                                   train_ratio=0.7,
                                   validation_ratio=0.1,
                                   test_ratio=0.2
                                   )

Number of files in train folder:
  healthy: 736 files
  powdery_mildew: 736 files
Number of files in validation folder:
  healthy: 105 files
  powdery_mildew: 105 files
Number of files in test folder:
  healthy: 211 files
  powdery_mildew: 211 files


---


## Conclusions and Next Steps

- The image data has been collected, cleaned and split into train, validation and test sets.
- Next step: Data Visualization.