In [1]:
import numpy as np
import pandas as pd

In [2]:
# Seed NumPy's global generator so repeated runs draw identical samples.
np.random.seed(42)

# Rows generated per plant type; the full dataset holds four plant types.
n_per_class = 2000
total_samples = 4 * n_per_class

In [3]:
# Clipping interval (min, max) applied to every generated feature.
ranges = dict([
    ('AQI', (0, 500)),
    ('CO2_Level_ppm', (350, 2000)),
    ('NO2_Level_ppm', (0, 200)),
    ('PM2_5_ug_m3', (0, 500)),
    ('PM10_ug_m3', (0, 500)),
    ('VOC_Level_ppm', (0, 10)),
    ('pH_Level', (4.0, 9.0)),
    ('Nitrogen_mg_kg', (0, 200)),
    ('Phosphorus_mg_kg', (0, 100)),
    ('Potassium_mg_kg', (0, 100)),
    ('Moisture_Level_%', (0, 100)),
    ('Organic_Matter_%', (0, 10)),
    ('Plot_Area_m2', (10, 1000)),
])

# Class-specific features, listed in the order they are stored per plant type.
_soil_feature_names = (
    'pH_Level',
    'Nitrogen_mg_kg',
    'Phosphorus_mg_kg',
    'Potassium_mg_kg',
    'Moisture_Level_%',
    'Organic_Matter_%',
    'Plot_Area_m2',
)

# One row of (mean, std) pairs per plant type, aligned with _soil_feature_names.
_plant_rows = {
    'Ficus':     [(7.0, 0.3), (150, 10), (80, 5), (80, 5), (70, 5), (4.0, 0.5), (500, 50)],
    'Bonsai':    [(6.0, 0.3), (60, 8), (40, 5), (40, 5), (50, 5), (1.5, 0.3), (50, 10)],
    'Orchid':    [(5.5, 0.3), (90, 8), (60, 5), (60, 5), (40, 5), (2.0, 0.3), (200, 20)],
    'Succulent': [(8.0, 0.3), (30, 5), (20, 3), (20, 3), (20, 4), (0.5, 0.2), (300, 30)],
}

# plant type -> {feature name -> (mean, std)}
plant_params = {
    plant_name: dict(zip(_soil_feature_names, row))
    for plant_name, row in _plant_rows.items()
}

# Air quality features share one distribution across all classes:
# each entry is (feature name, mean, standard deviation).
_air_quality_spec = [
    ('AQI', 150, 20),
    ('CO2_Level_ppm', 800, 50),
    ('NO2_Level_ppm', 50, 10),
    ('PM2_5_ug_m3', 35, 5),
    ('PM10_ug_m3', 40, 5),
    ('VOC_Level_ppm', 2.5, 0.5),
]
air_quality_means = {name: mean for name, mean, _unused_std in _air_quality_spec}
air_quality_std = {name: std for name, _unused_mean, std in _air_quality_spec}

In [4]:
def generate_features(mean, std, size, feature_name, bounds=None, rng=None):
    """Draw normally distributed samples for one feature, clip, and round them.

    Parameters
    ----------
    mean, std : float
        Location and scale forwarded to the normal sampler.
    size : int or tuple of int
        Number (or shape) of samples to draw.
    feature_name : str
        Key used to look up the ``(min, max)`` clipping interval.
    bounds : mapping, optional
        Maps feature names to ``(min, max)`` pairs. When omitted, the
        notebook-level ``ranges`` dictionary is used, as before.
    rng : optional
        Any object exposing ``normal(mean, std, size)``, for example
        ``np.random.default_rng(seed)``. When omitted, the global
        ``np.random`` state is used so existing seeded runs are unchanged.

    Returns
    -------
    numpy.ndarray
        Samples clipped to the feature's interval and rounded to 2 decimals.

    Raises
    ------
    KeyError
        If ``feature_name`` has no entry in the bounds mapping.
    """
    if bounds is None:
        bounds = ranges

    # Resolve the interval before sampling so an unknown feature fails fast
    # without consuming any random numbers.
    try:
        min_val, max_val = bounds[feature_name]
    except KeyError:
        raise KeyError(
            f"No clipping range defined for feature {feature_name!r}"
        ) from None

    sampler = np.random if rng is None else rng
    samples = sampler.normal(mean, std, size)
    return np.clip(samples, min_val, max_val).round(2)

In [5]:
# One DataFrame per plant type is accumulated here
data_list = []

for plant, params in plant_params.items():
    # Draw order is kept fixed (air quality first, then the class-specific
    # features) so results stay reproducible under the global seed.
    columns = {}

    # Air quality: identical distribution regardless of plant type
    for feat, mean in air_quality_means.items():
        columns[feat] = generate_features(mean, air_quality_std[feat], n_per_class, feat)

    # Class-specific features: each spec is a (mean, std) pair
    for feat, spec in params.items():
        columns[feat] = generate_features(*spec, n_per_class, feat)

    # Build this class's frame and tag every row with its label
    df = pd.DataFrame(columns).assign(Plant_Type=plant)
    data_list.append(df)

In [6]:
# Stack the per-class frames, shuffle the rows with a fixed seed,
# and renumber the index from zero.
balanced_data = (
    pd.concat(data_list, ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Write the shuffled dataset to CSV without the index column
balanced_data.to_csv('synthetic_combined_dataset_balanced.csv', index=False)

# Report rows per class and preview the first few records
class_counts = balanced_data['Plant_Type'].value_counts()
preview = balanced_data.head()
print("Balanced Plant Type Distribution:")
print(class_counts)
print("\nSample data:")
print(preview)

Balanced Plant Type Distribution:
Plant_Type
Bonsai       2000
Ficus        2000
Orchid       2000
Succulent    2000
Name: count, dtype: int64

Sample data:
      AQI  CO2_Level_ppm  NO2_Level_ppm  PM2_5_ug_m3  PM10_ug_m3  \
0  188.55         778.11          40.69        40.01       36.50   
1  171.49         730.50          54.51        32.19       33.43   
2  107.56         818.67          50.07        31.64       38.28   
3  168.79         780.86          34.77        43.98       45.21   
4  131.17         812.54          47.18        31.72       35.70   

   VOC_Level_ppm  pH_Level  Nitrogen_mg_kg  Phosphorus_mg_kg  Potassium_mg_kg  \
0           3.50      6.11           41.09             37.35            42.70   
1           2.45      5.53           68.01             33.77            50.18   
2           3.27      7.47          152.16             75.01            81.14   
3           2.32      6.08           43.26             35.53            37.94   
4           2.42      5.57   