In [1]:
!pip install kagglehub
from google.colab import files

import kagglehub

# Download the latest version of the dataset
path = kagglehub.dataset_download("vipoooool/new-plant-diseases-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/vipoooool/new-plant-diseases-dataset?dataset_version_number=2...


100%|██████████| 2.70G/2.70G [00:40<00:00, 70.9MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/vipoooool/new-plant-diseases-dataset/versions/2


In [2]:
import os

# Define the dataset path
root_folder = "/root/.cache/kagglehub/datasets/vipoooool/new-plant-diseases-dataset/versions/2"

data_dir = os.path.join(root_folder, 'new plant diseases dataset(augmented)/New Plant Diseases Dataset(Augmented)/')

test_path = os.path.join(root_folder, 'test/test')
train_path = os.path.join(data_dir, 'train')
valid_path = os.path.join(data_dir, 'valid')

# Count the number of folders in a directory
def count_folders(path):
    if os.path.exists(path):
        return len([d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))])
    else:
        print(f"Path does not exist: {path}")
        return 0

# Count the number of files in a directory
def count_files(path):
    if os.path.exists(path):
        return len([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))])
    else:
        print(f"Path does not exist: {path}")
        return 0

# Function to count files in each folder of a given directory
def count_files_in_folders(path):
    if not os.path.exists(path):
        print(f"Path does not exist: {path}")
        return {}

    folder_file_counts = {}
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        if os.path.isdir(folder_path):
            file_count = len([f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))])
            folder_file_counts[folder] = file_count

    return folder_file_counts

# Print counts
print(f"Number of folders in train_path: {count_folders(train_path)}")
print(f"Number of folders in valid_path: {count_folders(valid_path)}")
print(f"Number of files in test_path: {count_files(test_path)}")
print('-------------------------------------------------------------------')
# Count files in each folder
train_folder_counts = count_files_in_folders(train_path)
valid_folder_counts = count_files_in_folders(valid_path)
# Display results
print("Files in each folder (train_path):")
for folder, count in train_folder_counts.items():
    print(f"{folder}: {count} files")

print("\nFiles in each folder (valid_path):")
for folder, count in valid_folder_counts.items():
    print(f"{folder}: {count} files")

Number of folders in train_path: 38
Number of folders in valid_path: 38
Number of files in test_path: 33
-------------------------------------------------------------------
Files in each folder (train_path):
Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot: 1642 files
Pepper,_bell___healthy: 1988 files
Tomato___Late_blight: 1851 files
Tomato___Tomato_mosaic_virus: 1790 files
Soybean___healthy: 2022 files
Squash___Powdery_mildew: 1736 files
Tomato___Leaf_Mold: 1882 files
Tomato___Target_Spot: 1827 files
Potato___Late_blight: 1939 files
Grape___healthy: 1692 files
Potato___healthy: 1824 files
Tomato___healthy: 1926 files
Raspberry___healthy: 1781 files
Peach___Bacterial_spot: 1838 files
Apple___Cedar_apple_rust: 1760 files
Corn_(maize)___Common_rust_: 1907 files
Orange___Haunglongbing_(Citrus_greening): 2010 files
Tomato___Bacterial_spot: 1702 files
Blueberry___healthy: 1816 files
Grape___Leaf_blight_(Isariopsis_Leaf_Spot): 1722 files
Corn_(maize)___Northern_Leaf_Blight: 1908 files
Gra

In [3]:
import pandas as pd
import numpy as np
from PIL import Image
import glob
import os
import shutil
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [4]:
# Initialize lists
image_paths = []
species_labels = []
disease_labels = []
dataset_split = []

# Function to process a dataset directory
def process_directory(base_path, split_name):
    for class_folder in os.listdir(base_path):
        class_folder_path = os.path.join(base_path, class_folder)
        if os.path.isdir(class_folder_path):
            species, disease = class_folder.split('___')
            for image_name in os.listdir(class_folder_path):
                image_paths.append(os.path.join(class_folder_path, image_name))
                species_labels.append(species)
                disease_labels.append(disease)
                dataset_split.append(split_name)

# Process train and valid
process_directory(train_path, 'train')
process_directory(valid_path, 'valid')

# Create DataFrame
df = pd.DataFrame({
    'image_path': image_paths,
    'species': species_labels,
    'disease': disease_labels,
    'split': dataset_split
})

# Display first few rows
print(df.head())

                                          image_path       species  \
0  /root/.cache/kagglehub/datasets/vipoooool/new-...  Corn_(maize)   
1  /root/.cache/kagglehub/datasets/vipoooool/new-...  Corn_(maize)   
2  /root/.cache/kagglehub/datasets/vipoooool/new-...  Corn_(maize)   
3  /root/.cache/kagglehub/datasets/vipoooool/new-...  Corn_(maize)   
4  /root/.cache/kagglehub/datasets/vipoooool/new-...  Corn_(maize)   

                               disease  split  
0  Cercospora_leaf_spot Gray_leaf_spot  train  
1  Cercospora_leaf_spot Gray_leaf_spot  train  
2  Cercospora_leaf_spot Gray_leaf_spot  train  
3  Cercospora_leaf_spot Gray_leaf_spot  train  
4  Cercospora_leaf_spot Gray_leaf_spot  train  


Reduced training data to 20K

In [5]:
train_df = df[df['split'] == 'train']

# Calculate the desired proportion for each class
class_proportions = train_df['species'].value_counts(normalize=True)

# Calculate the number of images per class for the reduced dataset
class_counts = (class_proportions * 20000).astype(int)

# Create a new directory for the reduced dataset
reduced_train_path = '/content/reduced_train'  # Or any desired path
os.makedirs(reduced_train_path, exist_ok=True)

# Copy images to the new directory, maintaining proportions
for species, count in class_counts.items():
    # Filter DataFrame for the current species
    species_df = train_df[train_df['species'] == species]

    # Randomly select 'count' images from the species
    selected_images = species_df.sample(count, random_state=42)  # Set random_state for reproducibility

    # Create a subdirectory for the species in the reduced dataset
    species_dir = os.path.join(reduced_train_path, species)
    os.makedirs(species_dir, exist_ok=True)

    # Copy the selected images to the new subdirectory
    for _, row in selected_images.iterrows():
        shutil.copy(row['image_path'], species_dir)

print(f"Reduced training dataset created at: {reduced_train_path}")

Reduced training dataset created at: /content/reduced_train
