In [6]:
import numpy as np
import pandas as pd
import random
from datetime import datetime, timedelta

# Seed both random sources (stdlib `random` and NumPy) for reproducible output
random.seed(42)
np.random.seed(42)

# Date window that record dates are sampled from
start_date, end_date = datetime(2022, 1, 1), datetime(2024, 12, 30)

# Rows to generate per crop; iteration order of this dict is the order in
# which the crops appear in the generated table
crops = dict([
    ("potato", 300),
    ("barley", 300),
    ("wheat", 200),
    ("strawberries", 100),
    ("oilseed rape", 100),
])

# Total number of rows; equals the sum of the per-crop counts above
n_records = 1000

# Candidate values for the Location column
locations = [
    "Yorkshire",
    "Norfolk",
    "Kent",
    "Cambridgeshire",
    "Essex",
    "Lincolnshire",
]

def random_date(start=None, end=None):
    """Return a random datetime a whole number of days after ``start``.

    The day offset is drawn with ``np.random.randint(0, days)``, whose upper
    bound is exclusive, so the result lies in ``[start, end)`` when the window
    is a whole number of days.

    Args:
        start: First selectable datetime. Defaults to the module-level
            ``start_date``.
        end: Exclusive upper limit of the window. Defaults to the
            module-level ``end_date``.

    Returns:
        ``start`` plus a random whole number of days. If the window spans
        less than one full day, ``start`` itself is returned.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    # Fall back to the module-level window only when the caller gave none,
    # so a call without arguments behaves exactly as before.
    start = start_date if start is None else start
    end = end_date if end is None else end

    span_days = (end - start).days
    if span_days < 0:
        raise ValueError("end must not be earlier than start")
    if span_days == 0:
        # randint(0, 0) would raise an opaque "low >= high" error.
        return start

    # int() keeps timedelta happy even if NumPy hands back a NumPy integer.
    return start + timedelta(days=int(np.random.randint(0, span_days)))

def vegetation_status(ndvi):
    """Translate an NDVI reading into one of two descriptive labels.

    Readings of 0.5 or higher are labelled as very healthy, dense
    vegetation; every other reading is labelled as moderately healthy.
    """
    if ndvi >= 0.5:
        return "Very healthy, dense vegetation"
    return "Moderately healthy vegetation"

def assign_disease(crop, temp, hum, lw, rain):
    """Pick a disease label for one record using fixed per-crop rules.

    Args:
        crop: Crop name; only "potato", "wheat", "barley", "oilseed rape"
            and "strawberries" have rules.
        temp: Temperature reading.
        hum: Humidity reading.
        lw: Leaf-wetness reading.
        rain: Rainfall reading.

    Returns:
        The name of the first disease whose thresholds are met for the
        crop, or "No Risk" when none match or the crop is not recognised.
    """
    if crop == "potato":
        # Late blight: humid + wet leaves + temp >= 10
        blight = temp >= 10 and hum >= 85 and lw >= 1
        return "Late Blight" if blight else "No Risk"

    if crop == "wheat":
        # Septoria is checked first: long leaf wetness + moderate temperature
        if lw >= 2 and 15 <= temp <= 25:
            return "Septoria Leaf Blotch"
        # Yellow rust: cool + moist
        if 8 <= temp <= 15 and hum >= 90:
            return "Yellow Rust"
        return "No Risk"

    if crop == "barley":
        # Net blotch: humid + moderate temperature
        blotch = hum >= 80 and 10 <= temp <= 25
        return "Net Blotch" if blotch else "No Risk"

    if crop == "oilseed rape":
        # Phoma: wet, autumn-like conditions with enough rainfall
        phoma = 10 <= temp <= 20 and rain >= 100
        return "Phoma Stem Canker" if phoma else "No Risk"

    if crop == "strawberries":
        # Botrytis: cool + prolonged wetness
        botrytis = 15 <= temp <= 22 and lw >= 3
        return "Botrytis (Grey Mould)" if botrytis else "No Risk"

    # Crops without a rule set never carry a disease label
    return "No Risk"

# Column order of the output table; every generated row follows this order
columns = [
    "Date", "Location", "Crop",
    "Temperature", "Humidity", "Rainfall",
    "Soil_Moisture", "Soil_pH", "Leaf_Wetness",
    "NDVI", "Vegetation_Status", "Yield", "Disease"
]


def _draw(low, high, decimals):
    """Sample one uniform value between low and high, rounded to `decimals`."""
    return np.round(np.random.uniform(low, high), decimals)


# Expand the per-crop counts into one crop name per row to generate
crop_per_row = [name for name, count in crops.items() for _ in range(count)]

# Build the rows. The order of the random draws (date, location, then each
# measurement) is fixed so that the seeded output stays the same.
records = []
for crop in crop_per_row:
    sampled_on = random_date()
    location = random.choice(locations)

    temp = _draw(10, 37, 1)
    hum = _draw(40, 95, 1)
    rain = _draw(30, 300, 1)
    soil_moist = _draw(10, 60, 1)
    soil_ph = _draw(3.5, 9.9, 1)
    leaf_wet = _draw(0.3, 5, 1)
    ndvi = _draw(0.3, 0.7, 2)
    yield_kg = _draw(20000, 35000, 1)

    # Derived columns: both are pure functions of the sampled values
    veg_status = vegetation_status(ndvi)
    disease = assign_disease(crop, temp, hum, leaf_wet, rain)

    row = [
        sampled_on.strftime("%Y-%m-%d"),
        location,
        crop,
        temp,
        hum,
        rain,
        soil_moist,
        soil_ph,
        leaf_wet,
        ndvi,
        veg_status,
        yield_kg,
        disease,
    ]
    records.append(row)

# Assemble the table and write it out without the index column
df = pd.DataFrame(data=records, columns=columns)
df.to_csv("final_1dataset.csv", index=False)

# Report the shape and show the first rows
print("Synthetic dataset created:", df.shape)
print(df.head())

Synthetic dataset created: (1000, 13)
         Date      Location    Crop  Temperature  Humidity  Rainfall  \
0  2024-05-10  Lincolnshire  potato         15.0      82.9     191.1   
1  2022-05-11     Yorkshire  potato         10.6      93.3     254.8   
2  2024-01-18     Yorkshire  potato         10.6      68.9     138.0   
3  2024-04-11  Lincolnshire  potato         22.6      87.3     213.7   
4  2023-07-19          Kent  potato         16.2      53.3     214.5   

   Soil_Moisture  Soil_pH  Leaf_Wetness  NDVI               Vegetation_Status  \
0           32.3      4.1           2.5  0.43   Moderately healthy vegetation   
1           20.6      4.7           1.2  0.42   Moderately healthy vegetation   
2           12.3      9.7           1.4  0.34   Moderately healthy vegetation   
3           32.5      3.6           4.7  0.53  Very healthy, dense vegetation   
4           40.5      8.8           1.1  0.46   Moderately healthy vegetation   

     Yield      Disease  
0  22143.0      