In [1]:
import os

In [2]:
# Root folder of the dataset; each split lives in its own sub-folder beneath it.
data_path = 'data'

# Build the three split paths in one pass (order: train, val, test).
train_path, val_path, test_path = (
    os.path.join(data_path, split_dir)
    for split_dir in ("agri_net_train", "agri_net_val", "agri_net_test")
)



In [3]:
# Check the train, val, and test directories for folders with spaces or parentheses
# in their names and fix them by renaming: spaces become underscores, parentheses
# are dropped.
#
# The cleaned name is computed once and the folder is renamed once. Renaming in two
# separate steps (spaces first, then parentheses) fails for names that contain both,
# because the second rename would still point at the original, already-renamed path.
#
# Re-running this cell is safe: folders whose names are already clean are left alone.

for path in [train_path, val_path, test_path]:
    for root, dirs, _files in os.walk(path):
        for index, dir_name in enumerate(dirs):
            new_dir_name = dir_name.replace(' ', '_').replace('(', '').replace(')', '')
            if new_dir_name == dir_name:
                continue

            print(f"Directory '{dir_name}' in '{root}' contains spaces or parentheses.")
            old_dir_path = os.path.join(root, dir_name)
            new_dir_path = os.path.join(root, new_dir_name)

            # Never clobber (or merge into) a folder that already carries the clean
            # name; report it so it can be resolved by hand.
            if os.path.exists(new_dir_path):
                print(f"Skipped '{dir_name}': '{new_dir_name}' already exists in '{root}'")
                continue

            os.rename(old_dir_path, new_dir_path)
            # os.walk (top-down) descends using the entries of `dirs`, so the renamed
            # entry must be written back for the walk to find the folder again.
            dirs[index] = new_dir_name
            print(f"Renamed '{dir_name}' to '{new_dir_name}' in '{root}'")


In [4]:
# Ensure that the train, val, and test directories contain the same class folders.
# Only the immediate sub-directories of each split are compared; plain files that
# happen to sit next to the class folders (e.g. OS metadata files) are ignored so
# they neither trigger a false mismatch nor end up in the class lists.

def _list_class_dirs(path):
    """Return the sorted names of the immediate sub-directories of `path`."""
    return sorted(
        name for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    )

# Sorted class-folder names per split
train_dirs = _list_class_dirs(train_path)
val_dirs = _list_class_dirs(val_path)
test_dirs = _list_class_dirs(test_path)

# Check if the directories in train, val, and test paths are consistent
if train_dirs != val_dirs or train_dirs != test_dirs:
    print("Inconsistent directories found:")
    print(f"Train directories: {train_dirs}")
    print(f"Validation directories: {val_dirs}")
    print(f"Test directories: {test_dirs}")

    # Pinpoint the offending folders instead of leaving the reader to diff the lists.
    all_class_dirs = set(train_dirs) | set(val_dirs) | set(test_dirs)
    for split_name, split_dirs in [("Train", train_dirs), ("Validation", val_dirs), ("Test", test_dirs)]:
        missing_dirs = sorted(all_class_dirs - set(split_dirs))
        if missing_dirs:
            print(f"Missing from {split_name}: {missing_dirs}")
else:
    print("All directories are consistent across train, val, and test paths.")
    print(f"Directories: {train_dirs}")


All directories are consistent across train, val, and test paths.
Directories: ['Alstonia_Scholaris_Diseased', 'Alstonia_Scholaris_Healthy', 'Apple_Apple_Scab', 'Apple_Black_Rot', 'Apple_Cedar_Apple_Rust', 'Apple_Healthy', 'Arjun_Diseased', 'Arjun_Healthy', 'Background_Without_Leaves', 'Bean_Angular_Leaf_Spot', 'Bean_Healthy', 'Bean_Rust', 'Cassava_Bacterial_Blight', 'Cassava_Green_Mite', 'Cassava_Healthy', 'Cassava__Brown_Streak_Disease', 'Cassava__Mosaic_Disease', 'Cherry_Healthy', 'Cherry_Powdery_Mildew', 'Chinar_Diseased', 'Chinar_Healthy', 'Citrus_Diseased', 'Citrus_Healthy', 'Coffee_Cercospora', 'Coffee_Healthy', 'Coffee_Miner', 'Coffee_Phoma', 'Coffee_Rust', 'Corn_Cercospora__Gray_Leaf_Spot', 'Corn_Common_Rust', 'Corn_Healthy', 'Corn_Northern_Leaf_Blight', 'Cotton_Leaf_Diseased', 'Cotton_Leaf_Healthy', 'Cotton_Plant_Diseased', 'Cotton_Plant_Healthy', 'Gauva__Diseased', 'Gauva__Healthy', 'Grape_Black_Rot', 'Grape_Esca_Black_Measles', 'Grape_Healthy', 'Grape_Leaf_Blight_Isariopsis

In [5]:
# Go through the directories and check the 1st word of each directory name.
# If there is only one directory with that name, print it out

def check_unique_directories(path, ignore=()):
    """Print every name prefix that belongs to exactly one sub-directory of `path`.

    The prefix is the part of a folder name before its first underscore. A prefix
    that occurs only once is reported with one printed line.

    Args:
        path: Directory whose immediate sub-directories are inspected. Plain files
            inside `path` are not counted.
        ignore: Prefix (or iterable of prefixes) to leave out of the report, e.g.
            ``("Background",)``. Defaults to reporting every single-occurrence prefix.

    Returns:
        None. The findings are printed.
    """
    from collections import Counter

    # A bare string would otherwise be split into single characters by set().
    if isinstance(ignore, str):
        ignore = (ignore,)
    ignored_prefixes = set(ignore)

    # Counter keeps first-seen order, so with sorted names the report is alphabetical.
    prefix_counts = Counter(
        name.split('_')[0]
        for name in sorted(os.listdir(path))
        if os.path.isdir(os.path.join(path, name))
    )

    for prefix, count in prefix_counts.items():
        if count == 1 and prefix not in ignored_prefixes:
            print(f"Unique directory: {prefix} in {path}")

# Run the single-occurrence prefix report on each split in turn: train, val, test
for split_path in (train_path, val_path, test_path):
    check_unique_directories(split_path)

Unique directory: Background in data/agri_net_train
Unique directory: Background in data/agri_net_val
Unique directory: Background in data/agri_net_test


* I manually deleted the directories for Bael, Basil, Blueberry, Raspberry and Squash from train, val, test dirs.
* I manually moved images from the `Lemon_Healthy/[subfolder]` and `Lemon_Diseased/[subfolder]` directories into `Lemon_Healthy` and `Lemon_Diseased`, and renamed those to `Citrus_[Healthy, Diseased]`.
* Renamed `Coffee_Health` to `Coffee_Healthy`.
* `Rice_Bacterial_Leaf_Blight` only has 28 images and there are no healthy rice images. Remove all rice folders.
* Soybean also seems not to fit the mold of others, with no disease states. Remove those.

In [6]:
# Tally the image files (.png / .jpg / .jpeg, case-insensitive) in every folder found
# under the training path, warn about folders holding fewer than 100 of them, and
# collect the tallies in a table.
# This script assumes that the directory structure is consistent and that the directories are not deeply nested.
import pandas as pd

image_counts = []
path = train_path  # only the training split is inspected here
for parent, subdirs, _ in os.walk(path):
    for subdir in subdirs:
        dir_path = os.path.join(parent, subdir)
        # Summing booleans yields the number of entries with an image extension
        image_count = sum(
            entry.lower().endswith(('.png', '.jpg', '.jpeg'))
            for entry in os.listdir(dir_path)
        )
        record = {'Directory': subdir, 'Path': path, 'Image Count': image_count}
        image_counts.append(record)
        if image_count >= 100:
            continue
        print(f"Warning: Directory '{dir_path}' in '{path}' contains less than 100 images ({image_count} images).")

# One row per folder: name, split path, number of images
df = pd.DataFrame(image_counts)
df




Unnamed: 0,Directory,Path,Image Count
0,Pomegranate_Healthy,data/agri_net_train,200
1,Pongamia_Pinnata_Diseased,data/agri_net_train,193
2,Coffee_Phoma,data/agri_net_train,324
3,Tomato_Late_Blight,data/agri_net_train,1482
4,Tomato_Septoria_Leaf_Spot,data/agri_net_train,1345
...,...,...,...
68,Tomato_Spider_Mites_Two-Spotted_Spider_Mite,data/agri_net_train,1174
69,Bean_Rust,data/agri_net_train,305
70,Jatropha__Diseased,data/agri_net_train,86
71,Coffee_Cercospora,data/agri_net_train,226


The smallest training folder, `data/agri_net_train/Tomato_Gray_Spot`, contains only 58 images. Let's subsample so that each folder has at most 100 images.

In [7]:
# For the training folder, make a subsample of each category, randomly selecting up
# to MAX_IMAGES_PER_CLASS images. The subsampled images are hard-linked (not copied)
# into their corresponding folders in data/agri_net_train100.
#
# The selection is reproducible: file names are sorted before sampling (os.listdir
# returns them in arbitrary order) and a dedicated, seeded generator is used.
# Re-running the cell is therefore safe; links that already exist are skipped.
# NOTE: if data/agri_net_train100 was populated earlier with a different selection,
# delete it before re-running, otherwise folders may end up above the cap.
import random

MAX_IMAGES_PER_CLASS = 100
SUBSAMPLE_SEED = 42

# Create the new directory for the subsampled images
subsampled_train_path = os.path.join(data_path, "agri_net_train100")
os.makedirs(subsampled_train_path, exist_ok=True)

subsample_rng = random.Random(SUBSAMPLE_SEED)
for dir_name in train_dirs:
    dir_path = os.path.join(train_path, dir_name)
    if not os.path.isdir(dir_path):
        # Stray file next to the class folders; nothing to sample from
        continue

    subsampled_dir_path = os.path.join(subsampled_train_path, dir_name)
    os.makedirs(subsampled_dir_path, exist_ok=True)

    image_files = sorted(
        f for f in os.listdir(dir_path)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    # Classes with fewer images than the cap are taken in full
    sample_size = min(MAX_IMAGES_PER_CLASS, len(image_files))
    selected_files = subsample_rng.sample(image_files, sample_size)

    for file_name in selected_files:
        src_path = os.path.join(dir_path, file_name)
        dst_path = os.path.join(subsampled_dir_path, file_name)
        if os.path.exists(dst_path):
            continue  # already linked by a previous run
        os.link(src_path, dst_path)  # Create a hard link to the original file

# Verify the subsample: count the image files (.png / .jpg / .jpeg, case-insensitive)
# in every folder found under the subsampled train path and tabulate the result.
subsampled_image_counts = []
for parent, subdirs, _ in os.walk(subsampled_train_path):
    for subdir in subdirs:
        entries = os.listdir(os.path.join(parent, subdir))
        # Summing booleans yields the number of entries with an image extension
        n_images = sum(entry.lower().endswith(('.png', '.jpg', '.jpeg')) for entry in entries)
        subsampled_image_counts.append(
            {'Directory': subdir, 'Path': subsampled_train_path, 'Image Count': n_images}
        )

# One row per folder: name, subsample root, number of images
subsampled_df = pd.DataFrame(subsampled_image_counts)
subsampled_df

Unnamed: 0,Directory,Path,Image Count
0,Pomegranate_Healthy,data/agri_net_train100,100
1,Pongamia_Pinnata_Diseased,data/agri_net_train100,100
2,Coffee_Phoma,data/agri_net_train100,100
3,Tomato_Late_Blight,data/agri_net_train100,100
4,Tomato_Septoria_Leaf_Spot,data/agri_net_train100,100
...,...,...,...
68,Tomato_Spider_Mites_Two-Spotted_Spider_Mite,data/agri_net_train100,100
69,Bean_Rust,data/agri_net_train100,100
70,Jatropha__Diseased,data/agri_net_train100,86
71,Coffee_Cercospora,data/agri_net_train100,100


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-folder
# image counts in the subsampled training set
subsampled_df.describe()

Unnamed: 0,Image Count
count,73.0
mean,98.520548
std,6.335766
min,58.0
25%,100.0
50%,100.0
75%,100.0
max,100.0


OK, now we have a training set with up to 100 images per class and clean classes with healthy and one or more disease states.