In [2]:
import os
from pathlib import Path
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image

# Define the base data directory - resolve to absolute path first
data_dir = Path("../data/plantvillage dataset").resolve()
project_root = Path("..").resolve()

# List to store all image information
image_data = []

print("Processing images and collecting metadata...")
processed_count = 0

# Iterate through all subdirectories (color, grayscale, segmented)
for image_type_dir in data_dir.iterdir():
    if not image_type_dir.is_dir():
        continue
    
    image_type = image_type_dir.name  # color, grayscale, or segmented
    print(f"\nProcessing {image_type} images...")
    
    # For segmented folder (no subfolders with classifications)
    if image_type == "segmented":
        for image_file in image_type_dir.glob("*.[jJ][pP][gG]"):
            try:
                # Use relative path from project root
                relative_path = image_file.relative_to(project_root)
                
                # Get file size in bytes
                file_size = image_file.stat().st_size
                
                # Get image dimensions
                with Image.open(image_file) as img:
                    width, height = img.size
                
                image_data.append({
                    "image_path": str(relative_path),
                    "image_type": image_type,
                    "plant_type": "segmented",
                    "condition": "segmented",
                    "file_size_bytes": file_size,
                    "width": width,
                    "height": height
                })
                
                processed_count += 1
                if processed_count % 5000 == 0:
                    print(f"  Processed {processed_count} images...")
                    
            except Exception as e:
                print(f"  Error processing {image_file}: {e}")
                continue
    else:
        # For color and grayscale folders (have classification subfolders)
        for classification_dir in image_type_dir.iterdir():
            if not classification_dir.is_dir():
                continue
            
            # Split classification into plant_type and condition
            # Format: [PLANT_TYPE]___[Condition]
            classification = classification_dir.name
            if "___" in classification:
                plant_type, condition = classification.split("___", 1)
            else:
                # Fallback if no separator found
                plant_type = classification
                condition = "unknown"
            
            # Find all image files (JPG, jpg, jpeg, png)
            for ext in ["*.JPG", "*.jpg", "*.jpeg", "*.png"]:
                for image_file in classification_dir.glob(ext):
                    try:
                        # Use relative path from project root
                        relative_path = image_file.relative_to(project_root)
                        
                        # Get file size in bytes
                        file_size = image_file.stat().st_size
                        
                        # Get image dimensions
                        with Image.open(image_file) as img:
                            width, height = img.size
                        
                        image_data.append({
                            "image_path": str(relative_path),
                            "image_type": image_type,
                            "plant_type": plant_type,
                            "condition": condition,
                            "file_size_bytes": file_size,
                            "width": width,
                            "height": height
                        })
                        
                        processed_count += 1
                        if processed_count % 5000 == 0:
                            print(f"  Processed {processed_count} images...")
                            
                    except Exception as e:
                        print(f"  Error processing {image_file}: {e}")
                        continue

print(f"\nTotal images processed: {processed_count}")

# Create DataFrame
df = pd.DataFrame(image_data)

# Add file size in KB and MB for easier reading
df['file_size_kb'] = df['file_size_bytes'] / 1024
df['file_size_mb'] = df['file_size_bytes'] / (1024 * 1024)

# Display summary statistics
print("\n" + "=" * 80)
print("SUMMARY STATISTICS")
print("=" * 80)
print(f"\nTotal images found: {len(df)}")
print(f"\nImages by type:")
print(df['image_type'].value_counts())
print(f"\nUnique plant types: {df['plant_type'].nunique()}")
print(df['plant_type'].value_counts())
print(f"\nUnique conditions: {df['condition'].nunique()}")
print(df['condition'].value_counts().head(10))

print(f"\n\nFile Size Statistics:")
print(f"  Min: {df['file_size_kb'].min():.2f} KB")
print(f"  Max: {df['file_size_kb'].max():.2f} KB")
print(f"  Mean: {df['file_size_kb'].mean():.2f} KB")
print(f"  Median: {df['file_size_kb'].median():.2f} KB")

print(f"\nImage Dimension Statistics:")
print(f"  Width - Min: {df['width'].min()}, Max: {df['width'].max()}, Mean: {df['width'].mean():.1f}")
print(f"  Height - Min: {df['height'].min()}, Max: {df['height'].max()}, Mean: {df['height'].mean():.1f}")
print(f"  Unique dimensions: {df[['width', 'height']].drop_duplicates().shape[0]}")

print(f"\nDataFrame shape: {df.shape}")
print(f"\nFirst few rows:")
df.head()


Processing images and collecting metadata...

Processing grayscale images...
  Processed 5000 images...
  Processed 10000 images...
  Processed 15000 images...
  Processed 20000 images...
  Processed 25000 images...
  Processed 30000 images...
  Processed 35000 images...
  Processed 40000 images...
  Processed 45000 images...
  Processed 50000 images...

Processing segmented images...

Processing color images...
  Processed 55000 images...
  Processed 60000 images...
  Processed 65000 images...
  Processed 70000 images...
  Processed 75000 images...
  Processed 80000 images...
  Processed 85000 images...
  Processed 90000 images...
  Processed 95000 images...
  Processed 100000 images...
  Processed 105000 images...

Total images processed: 108610

SUMMARY STATISTICS

Total images found: 108610

Images by type:
image_type
grayscale    54305
color        54305
Name: count, dtype: int64

Unique plant types: 14
plant_type
Tomato                     36320
Orange                     11014
S

Unnamed: 0,image_path,image_type,plant_type,condition,file_size_bytes,width,height,file_size_kb,file_size_mb
0,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20310,256,256,19.833984,0.019369
1,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20650,256,256,20.166016,0.019693
2,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,17905,256,256,17.485352,0.017076
3,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20219,256,256,19.745117,0.019282
4,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20746,256,256,20.259766,0.019785


In [14]:
# Display the DataFrame structure and metadata details
print("=" * 80)
print("DATAFRAME STRUCTURE")
print("=" * 80)
print(f"\nColumns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"\nColumn Data Types:")
print(df.dtypes)

print("\n" + "=" * 80)
print("METADATA ANALYSIS")
print("=" * 80)

print(f"\n1. Unique plant types ({df['plant_type'].nunique()}):")
print(sorted(df['plant_type'].unique()))

print(f"\n2. Unique conditions ({df['condition'].nunique()}):")
print(sorted(df['condition'].unique()))

print(f"\n3. Image types ({df['image_type'].nunique()}):")
print(df['image_type'].unique())

print(f"\n4. Image Dimensions:")
print(f"   Unique width x height combinations: {df[['width', 'height']].drop_duplicates().shape[0]}")
print(f"\n   Most common dimensions:")
print(df.groupby(['width', 'height']).size().sort_values(ascending=False).head(10))

print(f"\n5. File Size Distribution:")
print(df['file_size_kb'].describe())

print("\n" + "=" * 80)
print("SAMPLE DATA (all columns)")
print("=" * 80)
df.head(10)

DATAFRAME STRUCTURE

Columns: ['image_path', 'image_type', 'plant_type', 'condition', 'file_size_bytes', 'width', 'height', 'file_size_kb', 'file_size_mb']
Shape: (108610, 9)

Column Data Types:
image_path          object
image_type          object
plant_type          object
condition           object
file_size_bytes      int64
width                int64
height               int64
file_size_kb       float64
file_size_mb       float64
dtype: object

METADATA ANALYSIS

1. Unique plant types (14):
['Apple', 'Blueberry', 'Cherry_(including_sour)', 'Corn_(maize)', 'Grape', 'Orange', 'Peach', 'Pepper,_bell', 'Potato', 'Raspberry', 'Soybean', 'Squash', 'Strawberry', 'Tomato']

2. Unique conditions (21):
['Apple_scab', 'Bacterial_spot', 'Black_rot', 'Cedar_apple_rust', 'Cercospora_leaf_spot Gray_leaf_spot', 'Common_rust_', 'Early_blight', 'Esca_(Black_Measles)', 'Haunglongbing_(Citrus_greening)', 'Late_blight', 'Leaf_Mold', 'Leaf_blight_(Isariopsis_Leaf_Spot)', 'Leaf_scorch', 'Northern_Leaf_Bl

Unnamed: 0,image_path,image_type,plant_type,condition,file_size_bytes,width,height,file_size_kb,file_size_mb
0,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20310,256,256,19.833984,0.019369
1,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20650,256,256,20.166016,0.019693
2,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,17905,256,256,17.485352,0.017076
3,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20219,256,256,19.745117,0.019282
4,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,20746,256,256,20.259766,0.019785
5,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,19378,256,256,18.923828,0.01848
6,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,19179,256,256,18.729492,0.018291
7,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,19173,256,256,18.723633,0.018285
8,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,19681,256,256,19.219727,0.018769
9,data/plantvillage dataset/grayscale/Strawberry...,grayscale,Strawberry,healthy,19501,256,256,19.043945,0.018598


In [15]:
# Save DataFrame as parquet file
output_path = "../data/plantvillage_images_metadata.parquet"
df.to_parquet(output_path, index=False, engine='pyarrow')

print("=" * 80)
print("SAVING PARQUET FILE")
print("=" * 80)
print(f"\n✓ Parquet file saved to: {output_path}")
print(f"✓ File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")
print(f"✓ Total records: {len(df):,}")

# Verify the parquet file can be read back
df_verify = pd.read_parquet(output_path)
print(f"\n✓ Verification successful - rows read back: {len(df_verify):,}")
print(f"✓ Columns ({len(df_verify.columns)}): {list(df_verify.columns)}")

print("\n" + "=" * 80)
print("SAMPLE VERIFICATION DATA")
print("=" * 80)
print("\nBasic Info:")
print(df_verify[['image_path', 'image_type', 'plant_type', 'condition']].head(5))

print("\nMetadata Info:")
print(df_verify[['image_path', 'width', 'height', 'file_size_kb']].head(5))

print(f"\n\n✓ All columns preserved: {list(df.columns) == list(df_verify.columns)}")


SAVING PARQUET FILE

✓ Parquet file saved to: ../data/plantvillage_images_metadata.parquet
✓ File size: 5.58 MB
✓ Total records: 108,610

✓ Verification successful - rows read back: 108,610
✓ Columns (9): ['image_path', 'image_type', 'plant_type', 'condition', 'file_size_bytes', 'width', 'height', 'file_size_kb', 'file_size_mb']

SAMPLE VERIFICATION DATA

Basic Info:
                                          image_path image_type  plant_type  \
0  data/plantvillage dataset/grayscale/Strawberry...  grayscale  Strawberry   
1  data/plantvillage dataset/grayscale/Strawberry...  grayscale  Strawberry   
2  data/plantvillage dataset/grayscale/Strawberry...  grayscale  Strawberry   
3  data/plantvillage dataset/grayscale/Strawberry...  grayscale  Strawberry   
4  data/plantvillage dataset/grayscale/Strawberry...  grayscale  Strawberry   

  condition  
0   healthy  
1   healthy  
2   healthy  
3   healthy  
4   healthy  

Metadata Info:
                                          image_path  w