In [7]:
import pandas as pd
import os
from collections import Counter

def summarize_image_data(train_csv_path, val_csv_path):
    """
    Summarize image metadata from training and validation CSVs.
    
    Args:
        train_csv_path (str): Path to training CSV
        val_csv_path (str): Path to validation CSV
        
    Returns:
        pd.DataFrame: Summary table of images with usage, crop type, stage, and time info
    """
    # Load CSVs
    train_df = pd.read_csv(train_csv_path)
    val_df = pd.read_csv(val_csv_path)
    
    # Add usage column
    train_df['usage'] = 'training'
    val_df['usage'] = 'validation'
    
    # Combine datasets
    combined_df = pd.concat([train_df, val_df], ignore_index=True)
    
    # Extract unique image information
    image_summary = []
    for img_name, group in combined_df.groupby('origin_img'):
        # Get the first row since image-level data is the same for all chips
        sample_row = group.iloc[0]
        
        # Count chips per image
        chip_count = len(group)
        
        # Count crop types (in case there are mixed labels)
        crop_type_counts = Counter(group['crop_type'])
        primary_crop_type = max(crop_type_counts.items(), key=lambda x: x[1])[0]
        
        # Create the relative image path
        image_path = f"data/all_sv_imgs/{img_name}.jpg"
        
        # Create summary row
        image_summary.append({
            'image_name': img_name,
            'image_path': image_path,
            'usage': sample_row['usage'],
            'crop_type': primary_crop_type,
            'crop_stage': sample_row['crop_stage'],
            'acquisition_date': pd.to_datetime(sample_row['time_of_acquisition']).strftime('%Y-%m-%d')
        })
    
    # Convert to DataFrame and sort
    summary_df = pd.DataFrame(image_summary)
    summary_df = summary_df.sort_values(['usage', 'crop_type', 'acquisition_date'])
    
    return summary_df

def print_dataset_statistics(summary_df):
    """Print statistics about the dataset."""
    print(f"Total unique images: {len(summary_df)}")
    
    # Usage statistics
    usage_counts = summary_df['usage'].value_counts()
    print("\nUsage distribution:")
    for usage, count in usage_counts.items():
        print(f"  {usage}: {count} images ({count/len(summary_df)*100:.1f}%)")
    
    # Crop type statistics
    print("\nCrop type distribution:")
    crop_counts = summary_df['crop_type'].value_counts()
    for crop, count in crop_counts.items():
        print(f"  {crop}: {count} images ({count/len(summary_df)*100:.1f}%)")
    
    # Crop stage statistics
    print("\nCrop stage distribution:")
    stage_counts = summary_df['crop_stage'].value_counts()
    for stage, count in stage_counts.items():
        print(f"  {stage}: {count} images ({count/len(summary_df)*100:.1f}%)")
    
    # Distribution by month (for display only, not stored in final DataFrame)
    temp_df = summary_df.copy()
    temp_df['month'] = pd.to_datetime(temp_df['acquisition_date']).dt.month
    month_counts = temp_df['month'].value_counts().sort_index()
    
    print("\nAcquisition month distribution:")
    months = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
              7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
    for month, count in month_counts.items():
        print(f"  {months[month]}: {count} images ({count/len(summary_df)*100:.1f}%)")


In [8]:
# Usage example
train_csv = "/workspace/data/masked_data_csiss/training/train_chipping_csv_w_anc.csv"
val_csv = "/workspace/data/masked_data_csiss/validation/validation_chipping_csv_w_anc.csv"

summary_df = summarize_image_data(train_csv, val_csv)
print_dataset_statistics(summary_df)

# Display the first few rows of the summary
print("\nSample of image summary:")
print(summary_df.head())

# Export to CSV if needed
summary_df.to_csv("/workspace/data/image_summary.csv", index=False)

Total unique images: 687

Usage distribution:
  training: 549 images (79.9%)
  validation: 138 images (20.1%)

Crop type distribution:
  Maize: 389 images (56.6%)
  Soybean: 186 images (27.1%)
  Other: 112 images (16.3%)

Crop stage distribution:
  growing: 321 images (46.7%)
  harvesting: 254 images (37.0%)

Acquisition month distribution:
  Aug: 345 images (50.2%)
  Oct: 342 images (49.8%)

Sample of image summary:
      image_name                         image_path     usage crop_type  \
0   IMG_2022_001  data/all_sv_imgs/IMG_2022_001.jpg  training     Maize   
4   IMG_2022_007  data/all_sv_imgs/IMG_2022_007.jpg  training     Maize   
7   IMG_2022_010  data/all_sv_imgs/IMG_2022_010.jpg  training     Maize   
8   IMG_2022_012  data/all_sv_imgs/IMG_2022_012.jpg  training     Maize   
12  IMG_2022_018  data/all_sv_imgs/IMG_2022_018.jpg  training     Maize   

   crop_stage acquisition_date  
0     growing       2022-08-09  
4     growing       2022-08-09  
7     growing       2022-08-0