# Data Exploration - Multi-Crop Leaf Disease Dataset

This notebook explores the multi-crop leaf disease dataset used for training the detection model.

**Dataset Overview:**
- Total Images: ~50,000
- Crops: Tomato, Potato, Corn, Rice, Wheat
- Classes: 30+ disease categories + healthy leaves
- Split: 70% Train, 15% Val, 15% Test

## 1. Import Required Libraries

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this hides *all* warnings, including deprecation notices —
# narrow the filter if those matter for this project.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*"; older
# releases only know "seaborn-darkgrid". Pick whichever name this installation
# provides instead of failing with OSError on an unknown style name.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("‚úÖ Libraries imported successfully!")

## 2. Dataset Configuration

In [None]:
# Dataset paths: everything lives under DATASET_ROOT, with the three splits
# stored as sub-folders of the "processed" directory.
DATASET_ROOT = "../dataset"
RAW_DATA_PATH, PROCESSED_DATA_PATH = (
    os.path.join(DATASET_ROOT, stage) for stage in ("raw", "processed")
)

TRAIN_DIR, VAL_DIR, TEST_DIR = (
    os.path.join(PROCESSED_DATA_PATH, split) for split in ("train", "val", "test")
)

# Echo the resolved locations, one per line.
print(
    f"Dataset root: {DATASET_ROOT}",
    f"Training data: {TRAIN_DIR}",
    f"Validation data: {VAL_DIR}",
    f"Test data: {TEST_DIR}",
    sep="\n",
)

## 3. Dataset Statistics

Let's analyze the dataset structure and compute basic statistics.

In [None]:
def count_images_per_class(data_dir):
    """Count the image files in each class sub-directory of ``data_dir``.

    Every immediate sub-directory of ``data_dir`` is treated as one class.
    An entry inside a class directory is counted when its name ends in
    ``.png``, ``.jpg`` or ``.jpeg`` (case-insensitive). Class directories
    are not searched recursively.

    Args:
        data_dir: Path of a directory containing one folder per class.

    Returns:
        dict mapping class (folder) name to its image count, with keys in
        sorted name order. The dict is empty when ``data_dir`` is not an
        existing directory (a message is printed in that case).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    class_counts = {}

    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(data_dir):
        print(f"‚ö†Ô∏è Directory not found: {data_dir}")
        return class_counts

    # Sorted so the result (and anything plotted from it) does not depend on
    # the order in which the file system happens to list the folders.
    for class_name in sorted(os.listdir(data_dir)):
        class_path = os.path.join(data_dir, class_name)
        if not os.path.isdir(class_path):
            continue
        class_counts[class_name] = sum(
            1
            for entry in os.listdir(class_path)
            if entry.lower().endswith(image_extensions)
        )

    return class_counts

# Tally per-class image counts for the train, validation and test splits
# (evaluated in that order).
train_counts, val_counts, test_counts = (
    count_images_per_class(split_dir)
    for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR)
)

# Report the number of classes and the total number of images per split.
print(
    f"Training classes: {len(train_counts)}",
    f"Validation classes: {len(val_counts)}",
    f"Test classes: {len(test_counts)}",
    f"\nTotal training images: {sum(train_counts.values())}",
    f"Total validation images: {sum(val_counts.values())}",
    f"Total test images: {sum(test_counts.values())}",
    sep="\n",
)

## 4. Class Distribution Visualization

In [None]:
# Bar chart of how many training images each class contains, followed by
# the largest and smallest class.
if not train_counts:
    print("‚ö†Ô∏è No training data found. Please prepare the dataset first.")
else:
    classes = list(train_counts)
    counts = [train_counts[label] for label in classes]
    tick_positions = range(len(classes))

    plt.figure(figsize=(16, 6))
    plt.bar(tick_positions, counts, color='steelblue', alpha=0.7)
    plt.title('Training Dataset - Class Distribution', fontsize=14, fontweight='bold')
    plt.xlabel('Disease Class', fontsize=12)
    plt.ylabel('Number of Images', fontsize=12)
    plt.xticks(tick_positions, classes, rotation=90, ha='right')
    plt.tight_layout()
    plt.grid(axis='y', alpha=0.3)
    plt.show()

    # max()/min() return the first entry on ties, matching a key-based lookup.
    largest_name, largest_count = max(train_counts.items(), key=lambda pair: pair[1])
    smallest_name, smallest_count = min(train_counts.items(), key=lambda pair: pair[1])
    print(f"Most common class: {largest_name} ({largest_count} images)")
    print(f"Least common class: {smallest_name} ({smallest_count} images)")

## 5. Sample Image Visualization

Display sample images from different disease classes.

In [None]:
def display_sample_images(data_dir, num_classes=6, images_per_class=3):
    """Show a grid of sample images with one row per class.

    The first ``num_classes`` class folders of ``data_dir`` (sorted by name)
    each get one row holding up to ``images_per_class`` images (sorted by
    file name). The class name is shown as the title of the first image in
    its row. Cells without an image, or whose file cannot be read, stay blank.

    Args:
        data_dir: Directory containing one folder per class.
        num_classes: Maximum number of classes (rows) to show.
        images_per_class: Number of images (columns) per class.

    Returns:
        None. A message is printed instead of a figure when ``data_dir`` is
        not a directory or there is nothing to display.
    """
    if not os.path.isdir(data_dir):
        print(f"‚ö†Ô∏è Directory not found: {data_dir}")
        return

    image_extensions = ('.png', '.jpg', '.jpeg')
    classes = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )[:num_classes]

    # An empty grid cannot be created, so stop before touching matplotlib.
    if not classes or images_per_class < 1:
        print(f"No sample images to display from: {data_dir}")
        return

    # Size the grid by the classes actually found so no empty framed rows are
    # drawn; squeeze=False keeps `axes` two-dimensional even for 1 row/column.
    n_rows = len(classes)
    fig, axes = plt.subplots(
        n_rows, images_per_class, figsize=(12, n_rows * 2), squeeze=False
    )
    fig.suptitle('Sample Images from Different Disease Classes', fontsize=16, fontweight='bold', y=0.995)

    for row_axes, class_name in zip(axes, classes):
        class_dir = os.path.join(data_dir, class_name)
        images = sorted(
            f for f in os.listdir(class_dir) if f.lower().endswith(image_extensions)
        )

        for j, ax in enumerate(row_axes):
            ax.axis('off')
            if j >= len(images):
                continue

            try:
                # The context manager closes the file once imshow has copied
                # the pixel data.
                with Image.open(os.path.join(class_dir, images[j])) as img:
                    ax.imshow(img)
            except OSError:
                # Unreadable image: leave this cell blank rather than abort.
                continue

            if j == 0:
                ax.set_title(f"{class_name}", fontsize=10, fontweight='bold', loc='left')

    plt.tight_layout()
    plt.show()

# Display samples: up to 4 images per class (overriding the default of 3)
# for the first 6 class folders, by sorted name, of the training split.
display_sample_images(TRAIN_DIR, num_classes=6, images_per_class=4)

## 6. Image Size Analysis

Analyze the distribution of image widths, heights, and aspect ratios over a sample of the training images.

In [None]:
def analyze_image_sizes(data_dir, sample_size=500):
    """Print and plot size statistics for a sample of the images in ``data_dir``.

    Roughly ``sample_size`` images are inspected, split evenly across the
    class folders (always at least one per class). For the sampled images
    the minimum, maximum and mean of width, height and aspect ratio (W/H)
    are printed, and a histogram of each is drawn with the mean marked.

    Args:
        data_dir: Directory containing one folder per class.
        sample_size: Approximate total number of images to inspect.

    Returns:
        None. A message is printed instead of the statistics when
        ``data_dir`` is not a directory or no readable image is found.
    """
    if not os.path.isdir(data_dir):
        print(f"‚ö†Ô∏è Directory not found: {data_dir}")
        return

    image_extensions = ('.png', '.jpg', '.jpeg')
    classes = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )
    if not classes:
        print(f"No class folders found in: {data_dir}")
        return

    # Without the floor of 1, having more classes than sample_size would
    # sample nothing at all.
    per_class = max(1, sample_size // len(classes))

    widths = []
    heights = []
    aspects = []
    skipped = 0

    for class_name in classes:
        class_dir = os.path.join(data_dir, class_name)
        images = sorted(
            f for f in os.listdir(class_dir) if f.lower().endswith(image_extensions)
        )

        for img_name in images[:per_class]:
            try:
                # Only the header is needed for the size; the context manager
                # closes the file handle right away.
                with Image.open(os.path.join(class_dir, img_name)) as img:
                    w, h = img.size
            except Exception:
                # Best-effort scan: an unreadable file is skipped, not fatal.
                skipped += 1
                continue

            if w <= 0 or h <= 0:
                # Guard the aspect ratio and keep the three lists aligned.
                skipped += 1
                continue

            widths.append(w)
            heights.append(h)
            aspects.append(w / h)

    count = len(widths)
    if count == 0:
        print(f"No readable images found in: {data_dir}")
        return

    print(f"Analyzed {count} images")
    if skipped:
        print(f"Skipped {skipped} unreadable images")
    print(f"\nWidth  - Min: {min(widths)}, Max: {max(widths)}, Mean: {np.mean(widths):.1f}")
    print(f"Height - Min: {min(heights)}, Max: {max(heights)}, Mean: {np.mean(heights):.1f}")
    print(f"Aspect Ratio - Min: {min(aspects):.2f}, Max: {max(aspects):.2f}, Mean: {np.mean(aspects):.2f}")

    # Plot distributions: one histogram per measurement, mean as a dashed line.
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    panels = (
        (widths, 'Width (pixels)', 'Image Width Distribution', 'skyblue', '.0f'),
        (heights, 'Height (pixels)', 'Image Height Distribution', 'lightcoral', '.0f'),
        (aspects, 'Aspect Ratio (W/H)', 'Aspect Ratio Distribution', 'lightgreen', '.2f'),
    )
    for ax, (values, xlabel, title, color, mean_format) in zip(axes, panels):
        mean_value = np.mean(values)
        ax.hist(values, bins=30, color=color, edgecolor='black', alpha=0.7)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Frequency')
        ax.set_title(title)
        ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:{mean_format}}')
        ax.legend()

    plt.tight_layout()
    plt.show()

# Inspect the training split; sample_size is divided evenly across the class folders.
analyze_image_sizes(TRAIN_DIR, sample_size=500)

## 7. Data Quality Checks

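Identify potential issues in the training images: corrupted files, images smaller than 50 px in width or height, and grayscale images. At most 100 images per class are checked.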

In [None]:
def check_data_quality(data_dir, max_per_class=100, min_size=50):
    """Scan a sample of images for common data quality issues and print a report.

    For each class folder of ``data_dir`` the first ``max_per_class`` images
    (sorted by file name) are opened and checked. A file that cannot be
    opened or fails Pillow's integrity check is recorded as corrupted; a
    readable image is additionally flagged when either dimension is below
    ``min_size`` or when its mode is ``'L'`` (grayscale).

    Args:
        data_dir: Directory containing one folder per class.
        max_per_class: Maximum number of images checked per class; ``None``
            checks every image.
        min_size: Minimum acceptable width and height in pixels.

    Returns:
        dict with the keys ``'corrupted'``, ``'too_small'`` and
        ``'grayscale'``, each mapping to a list of image paths, or ``None``
        when ``data_dir`` is not a directory.
    """
    if not os.path.isdir(data_dir):
        print(f"‚ö†Ô∏è Directory not found: {data_dir}")
        return

    image_extensions = ('.png', '.jpg', '.jpeg')
    issues = {
        'corrupted': [],
        'too_small': [],
        'grayscale': []
    }

    classes = sorted(
        d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))
    )

    total_checked = 0  # counts images that could be opened and verified
    for class_name in classes:
        class_dir = os.path.join(data_dir, class_name)
        images = sorted(
            f for f in os.listdir(class_dir) if f.lower().endswith(image_extensions)
        )

        for img_name in images[:max_per_class]:
            img_path = os.path.join(class_dir, img_name)
            try:
                with Image.open(img_path) as img:
                    # Read the header fields first: the image object must not
                    # be used for loading after verify().
                    w, h = img.size
                    mode = img.mode
                    # Image.open only parses the header; verify() also checks
                    # the file for damage without decoding the pixel data.
                    img.verify()
            except Exception:
                # Any failure to open or verify marks the file as corrupted.
                issues['corrupted'].append(img_path)
                continue

            # Check size
            if w < min_size or h < min_size:
                issues['too_small'].append(img_path)

            # Check if grayscale
            if mode == 'L':
                issues['grayscale'].append(img_path)

            total_checked += 1

    print("‚úÖ Data Quality Report")
    print(f"Total images checked: {total_checked}")
    print("\nüìä Issues Found:")
    print(f"  - Corrupted files: {len(issues['corrupted'])}")
    print(f"  - Too small (<{min_size}px): {len(issues['too_small'])}")
    print(f"  - Grayscale images: {len(issues['grayscale'])}")

    if not any(issues.values()):
        print("\n‚ú® No issues found!")
    else:
        print("\n‚ö†Ô∏è Review and fix identified issues before training")

    return issues

# Keep the report so the flagged file paths can be inspected later;
# the result is None when the training directory is not found.
quality_issues = check_data_quality(TRAIN_DIR)

## 8. Summary & Recommendations

Based on the dataset analysis, here are the key findings and recommendations for model training.

In [None]:
# Final report: headline statistics for the training split plus training
# recommendations derived from class balance and average class size.
print("="*60)
print("DATASET SUMMARY & RECOMMENDATIONS")
print("="*60)

# Class folders that exist but contain no images are counted as 0, so the
# total can be zero even when train_counts is not empty.
total_images = sum(train_counts.values()) if train_counts else 0

if total_images:
    num_classes = len(train_counts)
    avg_per_class = total_images / num_classes

    largest_class = max(train_counts.values())
    smallest_class = min(train_counts.values())
    empty_classes = [name for name, n_images in train_counts.items() if n_images == 0]

    # Computed once and reused below. An empty class would otherwise divide
    # by zero; treat it as an unbounded imbalance instead of crashing.
    imbalance = largest_class / smallest_class if smallest_class else float('inf')

    print(f"\nüìä Dataset Statistics:")
    print(f"  ‚Ä¢ Total classes: {num_classes}")
    print(f"  ‚Ä¢ Total training images: {total_images}")
    print(f"  ‚Ä¢ Average per class: {avg_per_class:.0f}")
    print(f"  ‚Ä¢ Class imbalance ratio: {imbalance:.2f}:1")
    if empty_classes:
        print(f"  - Classes with no images: {', '.join(empty_classes)}")

    print(f"\n‚úÖ Recommendations:")

    # Check class balance
    if imbalance > 2:
        print(f"  1. Apply class weights during training (imbalance: {imbalance:.1f}:1)")
    else:
        print(f"  1. Dataset is relatively balanced ‚úì")

    # Check dataset size
    if avg_per_class < 500:
        print(f"  2. Consider data augmentation (avg {avg_per_class:.0f} images/class)")
    else:
        print(f"  2. Dataset size is adequate ‚úì")

    print(f"  3. Use transfer learning with ImageNet weights")
    print(f"  4. Target input size: 224x224 pixels")
    print(f"  5. Apply standard augmentations (rotation, flip, zoom)")

else:
    print("\n‚ö†Ô∏è No dataset found!")
    print("\nTo prepare the dataset:")
    print("  1. Download PlantVillage dataset")
    print("  2. Organize into train/val/test splits")
    print("  3. Run preprocessing script")

print("\n" + "="*60)