In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input mount and echo the full path of every file in it.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
"""
Onion Shelf Life Data Generator
==============================

This script generates synthetic data for onion shelf life prediction based on 
visual parameters that can be extracted through image processing and computer vision.

Parameters included:
- Physical dimensions (length, width, height, diameter)
- Visual defects (black spots count)
- Surface texture indicators (from image analysis)
- Skin/neck condition scoring
- Visible damage flags (bruises, cuts, lesions)
- Estimated shelf life and quality grading

Author: Generated for onion storage optimization project
Date: September 2025
"""

import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

def _classify_size(diameter):
    """Map a bulb diameter (mm) to its size-class label."""
    if diameter < 50:
        return "Small"
    if diameter < 75:
        return "Medium"
    if diameter < 100:
        return "Large"
    return "Extra Large"


def _grade_quality(shelf_life):
    """Map an estimated shelf life (days) to a quality grade, 'A' (best) to 'D'."""
    if shelf_life > 25:
        return "A"
    if shelf_life > 18:
        return "B"
    if shelf_life > 10:
        return "C"
    return "D"


def generate_onion_data(n_samples=200, seed=42):
    """
    Generate synthetic onion data for shelf life prediction using image-based parameters

    Every row is driven by a hidden ``freshness_factor`` drawn uniformly from
    [0, 1): fresher onions get fewer black spots, better texture/skin scores and
    fewer damage flags, which in turn yields a longer estimated shelf life.

    Parameters:
    -----------
    n_samples : int
        Number of data points to generate (default: 200). ``0`` returns an
        empty DataFrame that still carries all 15 columns.
    seed : int or None
        Seed applied to the global ``numpy.random`` and ``random`` generators
        before sampling (default: 42, which reproduces the original dataset).
        Pass ``None`` to leave the global generators untouched and sample from
        whatever state they are currently in.

    Returns:
    --------
    pandas.DataFrame
        DataFrame containing synthetic onion data with visual parameters

    Notes:
    ------
    When ``seed`` is not ``None`` this function reseeds the *global* NumPy and
    ``random`` generators as a side effect.
    """

    # Explicit column order so that an empty result still has the full schema.
    columns = [
        'sample_id',
        'length_mm',
        'width_mm',
        'height_mm',
        'diameter_mm',
        'size_class',
        'black_spots_count',
        'surface_texture_score',
        'skin_condition_score',
        'has_bruises',
        'has_cuts',
        'has_lesions',
        'visible_damage_flag',
        'estimated_shelf_life_days',
        'quality_grade',
    ]

    # Set random seed for reproducibility (skipped when the caller opts out)
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)

    data = []

    # NOTE: the order of the random draws below defines the generated values for
    # a given seed; reordering them changes the dataset.
    for i in range(n_samples):
        # Basic dimensions (in mm)
        # Typical onion sizes range from small (50-70mm) to large (80-120mm) diameter
        base_size = np.random.normal(75, 15)  # Base size with some variation

        # Length (vertical dimension), floored at 40 mm
        length = max(40, np.random.normal(base_size * 0.9, 8))

        # Width (horizontal diameter), floored at 35 mm
        width = max(35, np.random.normal(base_size, 10))

        # Height (depth, usually similar to width), floored at 30 mm
        height = max(30, np.random.normal(width * 0.95, 8))

        # Bulb diameter (primary measurement for size classification)
        diameter = max(width, height)
        size_class = _classify_size(diameter)

        # Black spots (count visible on surface from image analysis)
        # Fresh onions have fewer spots, deteriorating ones have more
        freshness_factor = np.random.uniform(0, 1)
        if freshness_factor > 0.8:  # Very fresh
            black_spots = np.random.poisson(0.5)  # Very few spots
        elif freshness_factor > 0.6:  # Moderately fresh
            black_spots = np.random.poisson(2)
        elif freshness_factor > 0.3:  # Starting to deteriorate
            black_spots = np.random.poisson(5)
        else:  # Poor condition
            black_spots = np.random.poisson(12)

        # Surface texture indicators (from image analysis)
        # Smooth = 1 (fresh), Wrinkled = 2, Very wrinkled = 3, Soft appearance = 4
        if freshness_factor > 0.7:
            surface_texture = np.random.choice([1, 2], p=[0.8, 0.2])
        elif freshness_factor > 0.4:
            surface_texture = np.random.choice([2, 3], p=[0.6, 0.4])
        else:
            surface_texture = np.random.choice([3, 4], p=[0.4, 0.6])

        # Skin/neck condition scoring (1-5 scale, 1=excellent, 5=poor)
        # Factors: dryness, cracks, open necks, visible rot
        if freshness_factor > 0.8:
            skin_condition = np.random.choice([1, 2], p=[0.7, 0.3])
        elif freshness_factor > 0.6:
            skin_condition = np.random.choice([2, 3], p=[0.5, 0.5])
        elif freshness_factor > 0.3:
            skin_condition = np.random.choice([3, 4], p=[0.6, 0.4])
        else:
            skin_condition = np.random.choice([4, 5], p=[0.4, 0.6])

        # Visible damage flags (binary - detectable from images)
        # The `and` short-circuits: a random number is only consumed when the
        # freshness condition holds, which is part of the reproducible stream.
        has_bruises = 1 if freshness_factor < 0.5 and np.random.random() < 0.4 else 0

        # Cuts: random damage during handling, independent of freshness
        has_cuts = 1 if np.random.random() < 0.15 else 0

        # Visible lesions/soft spots (only possible for the least fresh onions)
        has_lesions = 1 if freshness_factor < 0.3 and np.random.random() < 0.6 else 0

        # Overall damage flag
        has_visible_damage = 1 if (has_bruises or has_cuts or has_lesions) else 0

        # Estimated shelf life (days) - target variable
        # Start from the baseline for a perfect onion and subtract penalties.
        shelf_life = 30
        shelf_life -= black_spots * 0.8  # Each spot reduces life
        shelf_life -= (surface_texture - 1) * 3  # Texture degradation
        shelf_life -= (skin_condition - 1) * 4  # Skin condition impact
        shelf_life -= has_visible_damage * 8  # Visible damage penalty

        # Add some randomness and ensure positive values
        shelf_life += np.random.normal(0, 3)  # Natural variation
        shelf_life = max(1, min(45, shelf_life))  # Constrain between 1-45 days

        # Quality grade (A, B, C, D) is derived from the unrounded shelf life
        quality_grade = _grade_quality(shelf_life)

        data.append({
            'sample_id': f'ONI_{i+1:03d}',
            'length_mm': round(length, 1),
            'width_mm': round(width, 1),
            'height_mm': round(height, 1),
            'diameter_mm': round(diameter, 1),
            'size_class': size_class,
            'black_spots_count': int(black_spots),
            'surface_texture_score': int(surface_texture),  # 1=smooth, 4=very soft
            'skin_condition_score': int(skin_condition),  # 1=excellent, 5=poor
            'has_bruises': has_bruises,
            'has_cuts': has_cuts,
            'has_lesions': has_lesions,
            'visible_damage_flag': has_visible_damage,
            'estimated_shelf_life_days': round(shelf_life, 1),
            'quality_grade': quality_grade
        })

    return pd.DataFrame(data, columns=columns)

def save_dataset(df, filename='onion_shelf_life_dataset.csv'):
    """
    Write the dataset to disk as CSV and report where it went.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset to persist; the index is not written.
    filename : str
        Destination path (default: 'onion_shelf_life_dataset.csv')
    """
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = "Dataset saved as '{}'".format(filename)
    print(confirmation)

def print_dataset_summary(df):
    """
    Print an overview of the dataset to stdout.

    Emits, in order: the sample count and shape, the first rows, the
    ``describe()`` statistics, value counts for the size class, quality grade,
    surface texture and skin condition columns, and the totals of the four
    binary damage-flag columns.

    Parameters:
    -----------
    df : pandas.DataFrame
        Dataset containing the columns 'size_class', 'quality_grade',
        'surface_texture_score', 'skin_condition_score', 'has_bruises',
        'has_cuts', 'has_lesions' and 'visible_damage_flag'.
    """
    sample_count = len(df)
    print("Dataset created with {} samples".format(sample_count))
    print("Shape: {}".format(df.shape))

    print("\nFirst few rows:")
    print(df.head())

    print("\nDataset statistics:")
    print(df.describe())

    print("\nCategorical variable distributions:")
    # (heading, column) pairs, printed as heading followed by value counts.
    distribution_sections = (
        ("\nSize class distribution:", 'size_class'),
        ("\nQuality grade distribution:", 'quality_grade'),
        ("\nSurface texture distribution:", 'surface_texture_score'),
        ("\nSkin condition distribution:", 'skin_condition_score'),
    )
    for heading, column in distribution_sections:
        print(heading)
        print(df[column].value_counts())

    print("\nDamage indicators:")
    # (label, column) pairs; each flag column is 0/1 so its sum is a count.
    damage_totals = (
        ("Samples with bruises", 'has_bruises'),
        ("Samples with cuts", 'has_cuts'),
        ("Samples with lesions", 'has_lesions'),
        ("Samples with any visible damage", 'visible_damage_flag'),
    )
    for label, column in damage_totals:
        print(f"{label}: {df[column].sum()}")

if __name__ == "__main__":
    # Build the synthetic dataset, report on it, then persist it as CSV.
    print("Generating 200 onion data points for image-based shelf life prediction...")
    df = generate_onion_data(200)
    print_dataset_summary(df)
    save_dataset(df)

    # Column-by-column data dictionary, framed by 60-character rules.
    rule = "=" * 60
    print(
        "\n" + rule,
        "DATA DICTIONARY",
        rule,
        "sample_id: Unique identifier for each onion sample",
        "length_mm: Vertical dimension of onion (mm)",
        "width_mm: Horizontal diameter of onion (mm)",
        "height_mm: Depth dimension of onion (mm)",
        "diameter_mm: Maximum diameter for size classification (mm)",
        "size_class: Size category (Small/Medium/Large/Extra Large)",
        "black_spots_count: Number of visible black spots from image analysis",
        "surface_texture_score: Surface condition (1=smooth to 4=very soft)",
        "skin_condition_score: Skin/neck condition (1=excellent to 5=poor)",
        "has_bruises: Binary flag for visible bruising (0/1)",
        "has_cuts: Binary flag for cuts or wounds (0/1)",
        "has_lesions: Binary flag for soft spots or lesions (0/1)",
        "visible_damage_flag: Binary flag for any visible damage (0/1)",
        "estimated_shelf_life_days: Predicted remaining shelf life (days)",
        "quality_grade: Overall quality grade (A/B/C/D)",
        rule,
        sep="\n",
    )


Generating 200 onion data points for image-based shelf life prediction...
Dataset created with 200 samples
Shape: (200, 15)

First few rows:
  sample_id  length_mm  width_mm  height_mm  diameter_mm size_class  \
0   ONI_001       73.1      88.9       96.7         96.7      Large   
1   ONI_002       71.1      78.5       76.5         78.5      Large   
2   ONI_003       61.0      55.3       55.6         55.6     Medium   
3   ONI_004       59.8      46.9       46.3         46.9      Small   
4   ONI_005       41.8      48.2       54.3         54.3     Medium   

   black_spots_count  surface_texture_score  skin_condition_score  \
0                  8                      4                     5   
1                  4                      2                     4   
2                 19                      4                     4   
3                  7                      2                     3   
4                 12                      4                     4   

   has_bruises  h