# Generate Data Splits for PlantVillage Dataset

This notebook splits the JPG images of the PlantVillage dataset into stratified train/validation/test sets, copies them into a per-split directory structure, and loads them as TensorFlow tensors.


In [None]:
import pandas as pd
import numpy as np
import keras
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil
from tqdm import tqdm
import os

# Report which Keras release is installed and which backend it is using.
for info_line in (
    f"Keras version: {keras.__version__}",
    f"Keras backend: {keras.backend.backend()}",
):
    print(info_line)



## 1. Load Metadata and Configure Parameters


In [None]:
# Configuration: locations
BASE_DATA_PATH = Path("../data")
SPLITS_PATH = BASE_DATA_PATH / "splits"

# Configuration: image geometry, split ratios and seed
IMAGE_SIZE = (224, 224)  # Standard size for many pre-trained models
TRAIN_SPLIT = 0.7
VAL_SPLIT = 0.15
TEST_SPLIT = 0.15
RANDOM_STATE = 42

# Use only color images for training (you can change this to 'grayscale' or 'segmented' if needed)
IMAGE_TYPE_TO_USE = 'color'

# Load the per-image metadata table
metadata_path = Path("../data/plantvillage_images_metadata.parquet")
df = pd.read_parquet(metadata_path)

# Summarise what was loaded and how it is going to be split
print(f"Total images in metadata: {len(df):,}")
print(f"Image types available: {df['image_type'].unique()}")
print(f"\nUsing image type: {IMAGE_TYPE_TO_USE}")
print("\nSplit ratios:")
for ratio_label, ratio in (
    ("Train", TRAIN_SPLIT),
    ("Validation", VAL_SPLIT),
    ("Test", TEST_SPLIT),
):
    print(f"  {ratio_label}: {ratio*100}%")


In [None]:
# Keep only rows of the selected image type and derive two columns:
#   label           - "<plant_type>___<condition>", used as the class name
#   full_image_path - metadata path resolved against BASE_DATA_PATH (the
#                     result is still relative to the working directory,
#                     because BASE_DATA_PATH itself is a relative path)
# `assign` returns a new frame, so `df` itself is left untouched.
df_filtered = df.loc[df['image_type'] == IMAGE_TYPE_TO_USE].assign(
    label=lambda frame: frame['plant_type'] + '___' + frame['condition'],
    full_image_path=lambda frame: frame['image_path'].map(
        lambda rel_path: BASE_DATA_PATH / rel_path
    ),
)

print(f"Images after filtering for {IMAGE_TYPE_TO_USE}: {len(df_filtered):,}")
print(f"\nNumber of classes: {df_filtered['label'].nunique()}")
print("\nClass distribution:")
print(df_filtered['label'].value_counts())


## 2. Create Stratified Train/Validation/Test Splits


In [None]:
# Stratified two-stage split so every subset keeps the class distribution.
# Stage 1: hold out the combined validation + test share; the rest is train.
holdout_fraction = VAL_SPLIT + TEST_SPLIT
train_df, temp_df = train_test_split(
    df_filtered,
    test_size=holdout_fraction,
    stratify=df_filtered['label'],
    random_state=RANDOM_STATE,
)

# Stage 2: divide the held-out rows into validation and test. test_size is
# expressed relative to the hold-out, not to the full dataset.
val_df, test_df = train_test_split(
    temp_df,
    test_size=TEST_SPLIT / holdout_fraction,
    stratify=temp_df['label'],
    random_state=RANDOM_STATE,
)

# Report absolute and relative subset sizes
total_images = len(df_filtered)
for subset_label, subset_df in (
    ("Train", train_df),
    ("Validation", val_df),
    ("Test", test_df),
):
    print(f"{subset_label} set size: {len(subset_df):,} ({len(subset_df)/total_images*100:.1f}%)")

# Verify that every subset contains the same number of classes
print(f"\nTrain classes: {train_df['label'].nunique()}")
print(f"Val classes: {val_df['label'].nunique()}")
print(f"Test classes: {test_df['label'].nunique()}")


## 3. Create Directory Structure and Save Images


In [None]:
# Create base splits directory
SPLITS_PATH.mkdir(exist_ok=True, parents=True)

# Build a fresh <SPLITS_PATH>/<split>/<class label>/ tree for each split.
for split_name in ['train', 'validation', 'test']:
    split_path = SPLITS_PATH / split_name

    # Start from an empty directory. Images copied by an earlier run (other
    # seed, ratios or image type) would otherwise stay in place and be loaded
    # together with the new split, leaking samples between the subsets.
    # Only the three split folders are removed; other files in SPLITS_PATH
    # are left alone.
    if split_path.exists():
        shutil.rmtree(split_path)
    split_path.mkdir(exist_ok=True, parents=True)

    # Create class subdirectories
    for label in df_filtered['label'].unique():
        class_path = split_path / label
        class_path.mkdir(exist_ok=True, parents=True)

print("Directory structure created successfully!")
print(f"\nBase path: {SPLITS_PATH}")
print("Subdirectories: train, validation, test")
print(f"Classes per subdirectory: {df_filtered['label'].nunique()}")


In [None]:
def copy_images_to_split(df, split_name):
    """Copy the images listed in ``df`` into ``SPLITS_PATH/<split_name>/<label>/``.

    Args:
        df: DataFrame with a ``full_image_path`` column (path objects pointing
            at the source files) and a ``label`` column (name of the class
            sub-directory).
        split_name: Name of the split directory below ``SPLITS_PATH``.

    Returns:
        A ``(copied, missing)`` tuple: the number of files actually copied and
        the number of rows whose source file did not exist.
    """
    print(f"\nCopying images to {split_name}...")

    split_root = SPLITS_PATH / split_name
    prepared_dirs = set()  # class directories already known to exist
    copied = 0
    missing = 0

    # Only two columns are needed, so iterate over them directly instead of
    # building a Series per row with iterrows().
    rows = zip(df['full_image_path'], df['label'])
    for src_path, label in tqdm(rows, total=len(df)):
        if not src_path.exists():
            missing += 1
            print(f"Warning: Source file not found: {src_path}")
            continue

        # Make sure the class directory exists (checked once per class) so the
        # function also works when the directory tree was not created up front.
        class_dir = split_root / label
        if class_dir not in prepared_dirs:
            class_dir.mkdir(parents=True, exist_ok=True)
            prepared_dirs.add(class_dir)

        shutil.copy2(src_path, class_dir / src_path.name)
        copied += 1

    # Report what really happened rather than the number of rows requested.
    print(f"Completed copying {copied:,} of {len(df):,} images to {split_name}")
    if missing:
        print(f"Warning: {missing:,} source files were missing and skipped")

    return copied, missing

# Populate the three split directories in turn
for subset_df, split_name in (
    (train_df, 'train'),
    (val_df, 'validation'),
    (test_df, 'test'),
):
    copy_images_to_split(subset_df, split_name)

print("\n✓ All images copied successfully!")


## 4. Create Keras Datasets

Now we'll create Keras datasets from the organized images. These datasets will:
- Load images as tensors
- Resize them to a standard size
- Normalize pixel values
- Apply data augmentation (for training set)


In [None]:
# The tf.data / tf.keras calls from here on need the `tf` name; without this
# import the first use of `tf` below raises NameError.
import tensorflow as tf

# Configuration for TensorFlow datasets
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE  # let tf.data choose parallelism / buffer sizes

# Get class names (sorted for consistency)
class_names = sorted(df_filtered['label'].unique())
num_classes = len(class_names)

print(f"Number of classes: {num_classes}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Image size: {IMAGE_SIZE}")


In [None]:
# Loader options shared by all three splits: resize to IMAGE_SIZE, batch,
# and emit integer class indices as labels.
loader_options = dict(
    image_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    label_mode='int',
)

# Training data is shuffled with a fixed seed for reproducibility.
train_ds = tf.keras.utils.image_dataset_from_directory(
    str(SPLITS_PATH / 'train'),
    shuffle=True,
    seed=RANDOM_STATE,
    **loader_options,
)

# Validation and test data keep their on-disk order.
val_ds = tf.keras.utils.image_dataset_from_directory(
    str(SPLITS_PATH / 'validation'),
    shuffle=False,
    **loader_options,
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    str(SPLITS_PATH / 'test'),
    shuffle=False,
    **loader_options,
)

print("\n✓ TensorFlow datasets created successfully!")


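## 5. Normalize and Optimize Datasets

The code for this step (pixel normalization, training-set augmentation, and prefetching) is in the last cell of this notebook.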


## 6. Verify Datasets


In [None]:
# Get dataset information
def get_dataset_info(ds, name):
    """Print batch count, approximate sample count and first-batch statistics.

    Args:
        ds: Batched ``tf.data.Dataset`` yielding ``(images, labels)`` pairs.
        name: Human-readable dataset name used in the printed header.
    """
    # Ask tf.data for the number of batches first. Only when it is not known
    # up front (negative sentinel value) fall back to a full pass, which has
    # to load and decode every image just to count.
    num_batches = int(ds.cardinality())
    if num_batches < 0:
        num_batches = sum(1 for _ in ds)

    print(f"\n{name} Dataset:")
    print(f"  Number of batches: {num_batches}")
    # Approximate: computed as if every batch held BATCH_SIZE samples.
    print(f"  Approximate number of samples: {num_batches * BATCH_SIZE}")

    # Get a sample batch
    for images, labels in ds.take(1):
        print(f"  Batch shape: {images.shape}")
        print(f"  Label shape: {labels.shape}")
        print(f"  Image dtype: {images.dtype}")
        print(f"  Pixel value range: [{tf.reduce_min(images).numpy():.3f}, {tf.reduce_max(images).numpy():.3f}]")

# Print the summary for each of the three datasets
for dataset, title in (
    (train_ds, "Training"),
    (val_ds, "Validation"),
    (test_ds, "Test"),
):
    get_dataset_info(dataset, title)


In [None]:
import matplotlib.pyplot as plt

# Visualize a batch of images from the training set
plt.figure(figsize=(15, 10))

# The directory loader attaches `class_names` to the dataset it returns, but
# the attribute is not present on a dataset that has since been re-bound to a
# mapped/prefetched pipeline. Fall back to the sorted label list in that case
# (the loader orders class directories alphanumerically as well).
display_names = getattr(train_ds, "class_names", class_names)

for images, labels in train_ds.take(1):
    batch = images.numpy()

    # imshow expects float RGB data in [0, 1]. Batches straight from the
    # loader are floats in [0, 255] and would be clipped to near-white, so
    # rescale them. Already-normalized batches stay as they are; 1.5 leaves
    # head-room for augmented values slightly above 1.
    if batch.max() > 1.5:
        batch = batch / 255.0
    batch = np.clip(batch, 0.0, 1.0)

    for i in range(min(9, len(batch))):
        plt.subplot(3, 3, i + 1)
        plt.imshow(batch[i])
        plt.title(f"Class: {display_names[int(labels[i])]}", fontsize=8)
        plt.axis("off")

plt.suptitle("Sample Images from Training Set (with augmentation)", fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


## 7. Save Dataset Configuration and Metadata


In [None]:
import json

# Save the split information for reproducibility
train_df.to_csv(SPLITS_PATH / 'train_metadata.csv', index=False)
val_df.to_csv(SPLITS_PATH / 'validation_metadata.csv', index=False)
test_df.to_csv(SPLITS_PATH / 'test_metadata.csv', index=False)

# Save class names, one per line, in label-index order. Prefer the names
# attached by the directory loader; if `train_ds` has been re-bound to a
# mapped/prefetched pipeline the attribute is gone, so fall back to the sorted
# label list (the loader orders class directories alphanumerically as well).
saved_class_names = getattr(train_ds, "class_names", class_names)
with open(SPLITS_PATH / 'class_names.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{class_name}\n" for class_name in saved_class_names)

# Save configuration (the IMAGE_SIZE tuple is written as a JSON array)
config = {
    'image_size': IMAGE_SIZE,
    'batch_size': BATCH_SIZE,
    'train_split': TRAIN_SPLIT,
    'val_split': VAL_SPLIT,
    'test_split': TEST_SPLIT,
    'random_state': RANDOM_STATE,
    'image_type': IMAGE_TYPE_TO_USE,
    'num_classes': num_classes,
    'train_samples': len(train_df),
    'val_samples': len(val_df),
    'test_samples': len(test_df),
    'total_samples': len(df_filtered)
}

with open(SPLITS_PATH / 'dataset_config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=4)

print("✓ Metadata and configuration saved!")
print(f"\nSaved files in {SPLITS_PATH}:")
print("  - train_metadata.csv")
print("  - validation_metadata.csv")
print("  - test_metadata.csv")
print("  - class_names.txt")
print("  - dataset_config.json")


## 8. Summary

### What we've accomplished:

1. **Loaded and filtered the PlantVillage metadata** - Selected color images for training
2. **Created stratified splits** - Maintained class distribution across train/validation/test sets
3. **Organized images** - Copied images to `data/splits/` with proper directory structure
4. **Created TensorFlow datasets** - Images are now loaded as tensors ready for training
5. **Applied preprocessing** - Normalized pixel values and added data augmentation for training
6. **Saved metadata** - Stored configuration and split information for reproducibility

### Next steps for model training:

```python
# To use these datasets in your training script:
train_ds = tf.keras.utils.image_dataset_from_directory(
    'data/splits/train',
    image_size=(224, 224),
    batch_size=32
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    'data/splits/validation',
    image_size=(224, 224),
    batch_size=32
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    'data/splits/test',
    image_size=(224, 224),
    batch_size=32
)

# Apply normalization and train your model
model.fit(train_ds, validation_data=val_ds, epochs=10)
```


## 9. (Optional) Save TensorFlow Datasets to Disk

You can optionally save the preprocessed TensorFlow datasets to disk for faster loading in the future. This is useful when you have expensive preprocessing steps.


In [None]:
# Optional step, disabled by default: uncomment the lines below to write the
# current train/validation/test datasets to disk in TensorFlow format.

# # Save datasets
# train_ds.save(str(SPLITS_PATH / 'train_dataset'))
# val_ds.save(str(SPLITS_PATH / 'validation_dataset'))
# test_ds.save(str(SPLITS_PATH / 'test_dataset'))
#
# print("✓ TensorFlow datasets saved to disk!")
# print(f"\nTo load them later:")
# print("train_ds = tf.data.Dataset.load(str(SPLITS_PATH / 'train_dataset'))")
# print("val_ds = tf.data.Dataset.load(str(SPLITS_PATH / 'validation_dataset'))")
# print("test_ds = tf.data.Dataset.load(str(SPLITS_PATH / 'test_dataset'))")

for note_line in (
    "\nNote: Saving TensorFlow datasets to disk is commented out by default.",
    "The current approach (organizing images in folders) is more flexible and recommended.",
):
    print(note_line)


In [None]:
# Rescaling layer: multiplies pixel values by 1/255, i.e. [0, 255] -> [0, 1]
normalization_layer = tf.keras.layers.Rescaling(1.0 / 255)

# Random transformations applied to the training set only
augmentation_steps = [
    tf.keras.layers.RandomFlip("horizontal_and_vertical"),
    tf.keras.layers.RandomRotation(0.2),
    tf.keras.layers.RandomZoom(0.2),
    tf.keras.layers.RandomContrast(0.2),
]
data_augmentation = tf.keras.Sequential(augmentation_steps)

def prepare_dataset(ds, augment=False):
    """Return ``ds`` with normalization, optional augmentation and prefetching.

    Args:
        ds: ``tf.data.Dataset`` yielding ``(images, labels)`` batches.
        augment: When True, additionally pass the normalized images through
            ``data_augmentation`` in training mode.

    Returns:
        The transformed dataset. If ``ds`` carried a ``class_names``
        attribute, the same attribute is attached to the result.
    """
    # map()/prefetch() build new dataset objects that do not carry the
    # `class_names` attribute set by the directory loader. Remember it so that
    # code reading `<dataset>.class_names` keeps working after preparation.
    names = getattr(ds, "class_names", None)

    def _normalize(images, labels):
        return normalization_layer(images), labels

    def _augment(images, labels):
        # training=True keeps the random layers active outside of fit()
        return data_augmentation(images, training=True), labels

    # Normalize
    ds = ds.map(_normalize, num_parallel_calls=AUTOTUNE)

    # Apply augmentation if requested
    if augment:
        ds = ds.map(_augment, num_parallel_calls=AUTOTUNE)

    # Prefetch for performance
    ds = ds.prefetch(buffer_size=AUTOTUNE)

    if names is not None:
        ds.class_names = names

    return ds

# Re-bind the three datasets to their prepared versions; only the training
# pipeline is augmented.
train_ds, val_ds, test_ds = (
    prepare_dataset(train_ds, augment=True),
    prepare_dataset(val_ds, augment=False),
    prepare_dataset(test_ds, augment=False),
)

print("✓ Datasets normalized and optimized!")
