# 🌱 AI PlantDoc Bot: Intelligent Plant Disease Diagnosis

**Project:** AI PlantDoc Bot (Infosys Springboard Virtual Internship)
**Domain:** Artificial Intelligence 
**Author:** SHIVAM SINGH
**Date:** 03 December 2025

---

## 🎯 Objective
To develop an AI-powered chatbot that allows users (farmers, gardeners) to diagnose plant diseases by uploading leaf images or describing symptoms. The system will utilize **Computer Vision (CNNs)** for image analysis and **NLP (BERT/LLMs)** for symptom interpretation.

## 📅 Day 1 Goals
1.  **Environment Setup**: Configure the workspace and dependencies.
2.  **Data Acquisition**: Download the **PlantVillage** (Classification) and **PlantDoc** (Object Detection/Noise) datasets.
3.  **Data Verification**: Validate directory structures and file integrity.
4.  **Exploratory Data Analysis (EDA)**: Analyze class distributions and visualize sample data.

---

In [None]:
# 🛠️ 1. Environment Setup & Imports
# Importing necessary libraries for file handling, visualization, and system operations.

import os
import glob
import shutil
import pathlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from dataclasses import dataclass

# Using a Configuration class to manage paths and constants.
@dataclass
class Config:
    """Project name and on-disk layout for the notebook.

    Any path field left as ``None`` is derived in ``__post_init__`` from the
    fields above it. Overriding ``PROJECT_NAME``, ``BASE_DIR`` or ``DATA_DIR``
    therefore relocates everything beneath it, instead of leaving the other
    paths pointing at the default tree.

    Attributes:
        PROJECT_NAME: Name used for the default base directory.
        BASE_DIR: Project root; defaults to ``/content/<PROJECT_NAME>``.
        DATA_DIR: Data root; defaults to ``BASE_DIR / "data"``.
        PLANT_VILLAGE_DIR: Defaults to ``DATA_DIR / "plantvillage"``.
        PLANT_DOC_DIR: Defaults to ``DATA_DIR / "plantdoc"``.
        TEXT_CORPUS_DIR: Defaults to ``DATA_DIR / "text_corpus"``.
    """

    PROJECT_NAME: str = "PlantDocBot"
    # None is a sentinel meaning "derive from the fields above".
    # String annotations keep the union syntax valid on older interpreters.
    BASE_DIR: "pathlib.Path | None" = None
    DATA_DIR: "pathlib.Path | None" = None
    PLANT_VILLAGE_DIR: "pathlib.Path | None" = None
    PLANT_DOC_DIR: "pathlib.Path | None" = None
    TEXT_CORPUS_DIR: "pathlib.Path | None" = None

    def __post_init__(self):
        """Fill in every path that was not given explicitly."""
        if self.BASE_DIR is None:
            self.BASE_DIR = pathlib.Path(f"/content/{self.PROJECT_NAME}")
        else:
            self.BASE_DIR = pathlib.Path(self.BASE_DIR)

        if self.DATA_DIR is None:
            self.DATA_DIR = self.BASE_DIR / "data"
        else:
            self.DATA_DIR = pathlib.Path(self.DATA_DIR)

        # The dataset folders all hang off DATA_DIR unless overridden.
        for field_name, folder in (
            ("PLANT_VILLAGE_DIR", "plantvillage"),
            ("PLANT_DOC_DIR", "plantdoc"),
            ("TEXT_CORPUS_DIR", "text_corpus"),
        ):
            explicit = getattr(self, field_name)
            resolved = self.DATA_DIR / folder if explicit is None else pathlib.Path(explicit)
            setattr(self, field_name, resolved)

config = Config()

# Create the three dataset folders up front (parents included); re-running
# the cell is harmless because existing folders are accepted.
for directory in (
    config.PLANT_VILLAGE_DIR,
    config.PLANT_DOC_DIR,
    config.TEXT_CORPUS_DIR,
):
    os.makedirs(directory, exist_ok=True)

print(f"‚úÖ Project Structure Created at: {config.BASE_DIR}")

In [None]:
# 📥 2. Data Acquisition
# Cloning the required datasets from GitHub. 
# We check if the data already exists to prevent redundant downloads on re-runs.

def clone_repo(repo_url, target_dir):
    """
    Clones a git repository into ``target_dir`` unless data is already there.

    Args:
        repo_url: URL (or local path) of the repository to clone.
        target_dir: Destination directory. A missing or empty directory
            triggers a clone; a non-empty one is treated as already downloaded.

    Returns:
        None. Progress, success and failure are reported with ``print``; a
        failed clone is reported rather than raised.
    """
    # Local import: the notebook's import cell does not bring in subprocess.
    import subprocess

    # isdir() guards listdir() so a missing target means "not downloaded yet"
    # instead of an exception.
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print(f"‚ÑπÔ∏è Data already exists in {target_dir}. Skipping download.")
        return

    print(f"‚¨áÔ∏è Cloning {repo_url}...")
    try:
        # An argv list (no shell) keeps spaces or shell metacharacters in the
        # arguments from being interpreted; "--" stops option parsing.
        result = subprocess.run(
            ["git", "clone", "--", str(repo_url), os.fspath(target_dir)],
            check=False,
        )
    except OSError as e:
        # Typically: the git executable is not installed or not on PATH.
        print(f"‚ùå Failed to clone {repo_url} into {target_dir}: {e}")
        return

    if result.returncode != 0:
        print(
            f"‚ùå Failed to clone {repo_url} into {target_dir}: "
            f"git exited with status {result.returncode}"
        )
        return

    print(f"‚úÖ Successfully cloned to {target_dir}")

# PlantVillage: the high-quality classification dataset.
clone_repo(
    repo_url="https://github.com/spMohanty/plantvillage-Dataset.git",
    target_dir=config.PLANT_VILLAGE_DIR,
)

# PlantDoc: the noisier, real-world dataset.
clone_repo(
    repo_url="https://github.com/pratikkayal/PlantDoc-Dataset.git",
    target_dir=config.PLANT_DOC_DIR,
)

In [None]:
# 🔍 3. Data Verification
# Verifying the contents of the downloaded datasets.

def list_contents(directory, name, limit=5):
    """Prints the item count and the first few entry names of a directory.

    Args:
        directory: Directory to inspect, as a ``pathlib.Path`` or a string.
        name: Human-readable label used in the printed messages.
        limit: Maximum number of (alphabetically sorted) names to show.

    Returns:
        None. A directory that cannot be read is reported with ``print``
        instead of raising.
    """
    try:
        # Only the filesystem access can fail, so it is the only guarded line.
        contents = sorted(entry.name for entry in pathlib.Path(directory).iterdir())
    except OSError as e:
        print(f"‚ùå Error reading {name}: {e}")
        return

    print(f"\nüìÇ Contents of {name} ({len(contents)} items):")
    print(f"   {contents[:limit]} ...")

# Peek at the top level of each downloaded dataset.
list_contents(directory=config.PLANT_VILLAGE_DIR, name="PlantVillage")
list_contents(directory=config.PLANT_DOC_DIR, name="PlantDoc")

In [None]:
# 📊 4. Exploratory Data Analysis (EDA)
# Searching for image directories and calculating statistics.

def analyze_dataset(base_path, dataset_name):
    """
    Walks ``base_path`` recursively and counts images per class folder.

    A "class" is the name of the folder that directly contains image files.
    When the same folder name occurs in several places of the tree, the
    counts are added together, so the reported total equals the number of
    image files found.

    Args:
        base_path: Root directory to scan.
        dataset_name: Label used in the printed report.

    Returns:
        A list of ``(class_name, image_count)`` tuples sorted by count
        (largest first, ties broken by name), or ``None`` when no image
        files were found.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp'}
    class_counts = {}

    print(f"\nüîé Analyzing {dataset_name}...")

    for root, _dirs, files in os.walk(base_path):
        image_total = sum(
            1 for f in files if pathlib.Path(f).suffix.lower() in image_extensions
        )
        if image_total:
            class_name = pathlib.Path(root).name
            # Accumulate instead of assigning: a repeated folder name must not
            # overwrite the count gathered from an earlier directory.
            class_counts[class_name] = class_counts.get(class_name, 0) + image_total

    if not class_counts:
        print(f"‚ö†Ô∏è No image directories found in {dataset_name}. Check the folder structure.")
        return None

    # Largest classes first; the name tie-break makes the order independent
    # of the filesystem's traversal order.
    sorted_counts = sorted(class_counts.items(), key=lambda item: (-item[1], item[0]))

    print(f"‚úÖ Found {len(class_counts)} classes.")
    print(f"   Total Images: {sum(class_counts.values())}")
    print("   Top 5 Classes by size:")
    for cls, count in sorted_counts[:5]:
        print(f"     - {cls}: {count} images")

    return sorted_counts

# PlantVillage may keep its class folders several levels deep (e.g. raw/color),
# which is fine here because the analysis scans the tree recursively.
pv_stats = analyze_dataset(
    base_path=config.PLANT_VILLAGE_DIR,
    dataset_name="PlantVillage",
)

In [None]:
# 🖼️ 5. Visualization
# Displaying a sample image to ensure data integrity.

def show_sample_image(base_path):
    """Displays one sample image found anywhere under ``base_path``.

    File extensions are matched case-insensitively, so files named ``*.JPG``
    are found as well. The sample is the alphabetically first matching path,
    which makes the choice repeatable across runs.

    Args:
        base_path: Root directory to search, as a ``pathlib.Path`` or string.

    Returns:
        None. The image is shown with Matplotlib; if nothing matches, a
        message is printed instead.
    """
    image_extensions = {'.jpg', '.jpeg', '.png'}

    candidates = (
        str(path)
        for path in pathlib.Path(base_path).rglob("*")
        if path.suffix.lower() in image_extensions and path.is_file()
    )
    # min() selects the first path in sorted order without building the
    # full list of images in memory.
    sample_img_path = min(candidates, default=None)

    if sample_img_path is None:
        print("‚ùå No images found to display.")
        return

    img = mpimg.imread(sample_img_path)
    plt.figure(figsize=(6, 6))
    plt.imshow(img)
    # The containing folder's name is shown as the sample's label.
    plt.title(f"Sample: {pathlib.Path(sample_img_path).parent.name}")
    plt.axis('off')
    plt.show()
    print(f"üì∑ Displaying sample from: {sample_img_path}")

show_sample_image(base_path=config.PLANT_VILLAGE_DIR)

---
## 📅 Day 2 Goals: Data Preprocessing & Mapping
1.  **Robust Image Visualization**: Ensure images are correctly loaded in RGB format.
2.  **Dataset Mapping**: Create a structured CSV file mapping every image path to its label (disease class). This is crucial for training custom models later.

---

In [None]:
# 🎨 6. Robust Color Display
# Displaying a random image and ensuring it is in RGB format.
# This fixes potential issues with RGBA or Grayscale images in the dataset.

import random
import numpy as np
from PIL import Image

def display_random_image(base_path):
    """
    Picks a random image under ``base_path``, converts it to RGB if needed,
    and displays it with Matplotlib.

    Args:
        base_path: Root directory that is searched recursively for image files.

    Returns:
        None. Missing images and errors while opening or drawing the image
        are reported with ``print`` rather than raised.
    """
    img_exts = ('.jpg', '.jpeg', '.png', '.bmp')

    # Collect all image files
    all_files = [
        os.path.join(root, f)
        for root, _dirs, files in os.walk(base_path)
        for f in files
        if f.lower().endswith(img_exts)
    ]

    if not all_files:
        print("‚ùå No images found.")
        return

    # Pick a random file
    sample_file = random.choice(all_files)
    print(f"üì∑ Displaying random image: {sample_file}")

    try:
        # The context manager closes the underlying file once the pixel data
        # has been copied out, instead of leaving it open until garbage
        # collection.
        with Image.open(sample_file) as img:
            print(f"   Original Mode: {img.mode}")

            # Convert to RGB if necessary
            if img.mode != 'RGB':
                img = img.convert('RGB')
                print("   ‚úÖ Converted to RGB")

            # Materialise the pixels while the file is still open.
            pixels = np.asarray(img)

        # Display using Matplotlib
        plt.figure(figsize=(6, 6))
        plt.imshow(pixels)
        plt.axis('off')
        plt.title(f"Label: {pathlib.Path(sample_file).parent.name}")
        plt.show()

    except Exception as e:
        # Deliberately broad: this is a best-effort notebook preview, so one
        # unreadable file is reported instead of aborting the cell.
        print(f"‚ùå Error opening image: {e}")

# Preview one randomly chosen PlantVillage image.
display_random_image(base_path=config.PLANT_VILLAGE_DIR)

In [None]:
# 📝 7. Build CSV Mapping
# Creating a CSV file that maps every image path to its corresponding label.
# This DataFrame will be the foundation for our PyTorch/TensorFlow data loaders.

import pandas as pd

def create_image_dataframe(base_path, output_csv_name="image_data.csv", output_dir=None):
    """
    Walks ``base_path``, labels every image by its parent folder, and saves
    the mapping as a CSV file.

    Args:
        base_path: Root directory that is searched recursively for images.
        output_csv_name: File name of the CSV to write.
        output_dir: Directory that receives the CSV. Defaults to the global
            ``config.DATA_DIR``; it is created if it does not exist.

    Returns:
        A DataFrame with the columns ``image_path`` and ``label`` (one row
        per image, in sorted traversal order), or ``None`` when no images
        were found. Nothing is written in the latter case.
    """
    img_exts = ('.jpg', '.jpeg', '.png', '.bmp')
    records = []

    print(f"\nüìä Building Dataset Mapping for {base_path}...")

    for root, dirs, files in os.walk(base_path):
        # Sorting in place steers os.walk, so the row order no longer depends
        # on the order in which the filesystem lists entries.
        dirs.sort()

        # The label is the folder that directly contains the image, which
        # also works for nested layouts such as raw/color/Tomato_Healthy.
        label = pathlib.Path(root).name
        records.extend(
            {"image_path": os.path.join(root, f), "label": label}
            for f in sorted(files)
            if f.lower().endswith(img_exts)
        )

    # Create DataFrame
    df = pd.DataFrame(records, columns=["image_path", "label"])

    if df.empty:
        print("‚ö†Ô∏è No images found to map.")
        return None

    # Save to CSV
    if output_dir is None:
        output_dir = config.DATA_DIR
    output_dir = pathlib.Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_csv_path = output_dir / output_csv_name
    df.to_csv(output_csv_path, index=False)

    print(f"‚úÖ Total images mapped: {len(df)}")
    print(f"‚úÖ Saved mapping to: {output_csv_path}")
    print("\nSample Rows:")
    print(df.head())

    return df

# Build the PlantVillage path-to-label table and keep it for later cells.
df = create_image_dataframe(
    base_path=config.PLANT_VILLAGE_DIR,
    output_csv_name="plantvillage_mapping.csv",
)