# Synthetic Dataset Generation for Indian Plant Recommendations

This notebook generates a synthetic dataset that combines air-quality features (adjusted for Indian conditions, and drawn from the same distributions for every plant) with soil and plot-area features tailored to five common Indian plants:
- Mango
- Neem
- Jamun
- Amla
- Drumstick (Moringa oleifera)

The dataset contains 2,000 rows per plant (10,000 rows in total), is shuffled, and is saved as `synthetic_combined_dataset_india_plants.csv`.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global RNG so the np.random draws made later in the notebook
# are repeatable when the notebook is run top to bottom from a fresh kernel.
np.random.seed(42)
# Number of synthetic rows generated for each plant class.
n_per_class = 2000
# Class labels; each one becomes a value of the 'Plant_Type' column.
plant_classes = ['Mango', 'Neem', 'Jamun', 'Amla', 'Drumstick']

In [3]:
# Hard (min, max) bounds per feature. generate_feature() looks a feature up
# here by name and clips every sampled value into its bounds, so each key
# must match a feature name used in air_quality_means or plant_params.
ranges = {
    # Air-quality features
    'AQI': (0, 500),
    'CO2_Level_ppm': (350, 500),
    'NO2_Level_ppm': (0, 50),
    'PM2_5_ug_m3': (0, 200),
    'PM10_ug_m3': (0, 300),
    # Soil and plot features
    'pH_Level': (4.0, 9.0),
    'Nitrogen_mg_kg': (0, 200),
    'Phosphorus_mg_kg': (0, 100),
    'Potassium_mg_kg': (0, 100),
    'Moisture_Level_%': (0, 100),
    'Organic_Matter_%': (0, 10),
    'Plot_Area_m2': (10, 1000)
}

In [4]:
# Mean of the normal distribution sampled for each air-quality feature.
# The same means are used for every plant class, so these columns are
# identically distributed across classes. The key order here is the order in
# which the features are sampled and the order of the resulting columns.
# NOTE(review): a mean of 12 ppm for NO2 looks far higher than ambient levels,
# which are usually reported in ppb or ug/m3 - confirm the intended unit.
air_quality_means = {
    'AQI': 100,
    'CO2_Level_ppm': 415,
    'NO2_Level_ppm': 12,
    'PM2_5_ug_m3': 54.4,
    'PM10_ug_m3': 116
}
# Standard deviation paired with each mean above; must have the same keys.
air_quality_std = {
    'AQI': 20,
    'CO2_Level_ppm': 10,
    'NO2_Level_ppm': 3,
    'PM2_5_ug_m3': 5,
    'PM10_ug_m3': 5
}

In [5]:
def generate_feature(mean, std, size, feature_name):
    """Sample a feature from a normal distribution, bounded and rounded.

    Draws `size` values from N(mean, std) using NumPy's global RNG, clips
    them to the (min, max) pair stored under `feature_name` in the
    module-level `ranges` table, and rounds the result to 2 decimal places.

    Parameters
    ----------
    mean, std : float
        Location and scale of the normal distribution.
    size : int or tuple of int
        Number (or shape) of samples to draw.
    feature_name : str
        Key into `ranges` that selects the clipping bounds.

    Returns
    -------
    numpy.ndarray
        The clipped, rounded samples.

    Raises
    ------
    KeyError
        If `feature_name` is not a key of `ranges`.
    """
    # Sample first, then look up the bounds, so the RNG is advanced even
    # when the lookup fails (same ordering as the draws have always had).
    draws = np.random.normal(loc=mean, scale=std, size=size)
    lower, upper = ranges[feature_name]
    bounded = np.clip(draws, lower, upper)
    return np.round(bounded, 2)

In [6]:
# Per-plant sampling parameters: plant name -> feature name -> (mean, std).
# Each (mean, std) pair is unpacked positionally into generate_feature(), and
# each feature name must also be a key of `ranges` (used there for clipping).
# The key order inside each plant's dict is the order in which the features
# are sampled and the order of the resulting columns.
plant_params = {
    'Mango': {
        'pH_Level': (6.8, 0.3),
        'Nitrogen_mg_kg': (100, 10),
        'Phosphorus_mg_kg': (50, 5),
        'Potassium_mg_kg': (80, 5),
        'Moisture_Level_%': (55, 5),
        'Organic_Matter_%': (3.0, 0.5),
        'Plot_Area_m2': (500, 50)
    },
    'Neem': {
        'pH_Level': (7.0, 0.3),
        'Nitrogen_mg_kg': (70, 8),
        'Phosphorus_mg_kg': (40, 5),
        'Potassium_mg_kg': (40, 5),
        'Moisture_Level_%': (40, 5),
        'Organic_Matter_%': (2.0, 0.3),
        'Plot_Area_m2': (300, 30)
    },
    'Jamun': {
        'pH_Level': (6.5, 0.3),
        'Nitrogen_mg_kg': (90, 8),
        'Phosphorus_mg_kg': (60, 5),
        'Potassium_mg_kg': (60, 5),
        'Moisture_Level_%': (60, 5),
        'Organic_Matter_%': (3.5, 0.5),
        'Plot_Area_m2': (400, 40)
    },
    'Amla': {
        'pH_Level': (7.0, 0.3),
        'Nitrogen_mg_kg': (80, 8),
        'Phosphorus_mg_kg': (55, 5),
        'Potassium_mg_kg': (65, 5),
        'Moisture_Level_%': (50, 5),
        'Organic_Matter_%': (3.0, 0.5),
        'Plot_Area_m2': (350, 40)
    },
    'Drumstick': {
        'pH_Level': (6.2, 0.3),
        'Nitrogen_mg_kg': (60, 8),
        'Phosphorus_mg_kg': (45, 5),
        'Potassium_mg_kg': (55, 5),
        'Moisture_Level_%': (45, 5),
        'Organic_Matter_%': (2.5, 0.5),
        'Plot_Area_m2': (200, 20)
    }
}

In [7]:
# Build one labelled DataFrame of n_per_class rows per plant and collect them
# in data_list for later concatenation.
data_list = []
for plant in plant_classes:
    # Air-quality columns: sampled from the same distributions for every plant.
    aq_features = dict(
        (feat, generate_feature(aq_mean, air_quality_std[feat], n_per_class, feat))
        for feat, aq_mean in air_quality_means.items()
    )
    # Soil/plot columns: sampled from this plant's own (mean, std) pairs.
    soil_features = dict(
        (feat, generate_feature(*mean_std, n_per_class, feat))
        for feat, mean_std in plant_params[plant].items()
    )
    # Air-quality columns come first, then the soil columns, then the label.
    df = pd.DataFrame({**aq_features, **soil_features}).assign(Plant_Type=plant)
    data_list.append(df)

In [8]:
# Stack the per-plant frames, shuffle the rows with a fixed seed so classes
# are interleaved reproducibly, renumber the index, and write the CSV.
synthetic_data = (
    pd.concat(data_list, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
synthetic_data.to_csv('synthetic_combined_dataset_india_plants.csv', index=False)

In [9]:
# Report, in order: the save confirmation, the per-class row counts, and a
# preview of the first rows. Items are separated by newlines, which gives the
# same text as printing each one on its own.
print(
    "Dataset generated and saved as 'synthetic_combined_dataset_india_plants.csv'",
    "\nPlant Type Distribution:",
    synthetic_data['Plant_Type'].value_counts(),
    "\nSample Data:",
    synthetic_data.head(),
    sep="\n",
)

Dataset generated and saved as 'synthetic_combined_dataset_india_plants.csv'

Plant Type Distribution:
Plant_Type
Amla         2000
Jamun        2000
Mango        2000
Neem         2000
Drumstick    2000
Name: count, dtype: int64

Sample Data:
      AQI  CO2_Level_ppm  NO2_Level_ppm  PM2_5_ug_m3  PM10_ug_m3  pH_Level  \
0  109.52         410.04          14.42        57.37      110.74      6.27   
1   96.27         415.37          13.37        55.77      119.11      6.87   
2   93.01         406.72          12.13        55.62      121.78      7.53   
3  129.31         412.50          11.86        54.48      118.54      6.30   
4   76.19         411.25          10.61        52.21      116.01      6.52   

   Nitrogen_mg_kg  Phosphorus_mg_kg  Potassium_mg_kg  Moisture_Level_%  \
0           75.82             51.63            67.70             58.26   
1           93.74             56.60            62.01             58.04   
2           99.48             43.02            68.70             