In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
from geopy.distance import geodesic

# Load and clean location data
locations_df = pd.read_csv("locations_latlng.csv")
locations = []
for _, row in locations_df.iterrows():
    try:
        latlng_clean = row["LatLng"].replace(";", ",").strip()
        lat, lon = map(float, latlng_clean.split(","))
        locations.append({
            "Province": row["Province"].strip(),
            "City": row["City"].strip(),
            "LatLng": latlng_clean,
            "Latitude": lat,
            "Longitude": lon
        })
    except:
        continue

# Define valid major origin cities
major_origins = {
    "NCR": None,  # All NCR cities allowed
    "Laguna": ["Santa Rosa", "Cabuyao"],
    "Cavite": ["Trece Martires", "Dasmariñas"],
    "Batangas": ["Lipa", "Batangas City"],
    "Bulacan": ["Malolos", "San Jose del Monte"],
    "Pampanga": ["San Fernando", "Angeles"],
    "Baguio": ["Baguio"],
    "Cebu": ["Cebu City"],
    "Iloilo": ["Iloilo City"],
    "Davao": ["Davao City"],
    "Bacolod": ["Bacolod City"],
    "Zamboanga": ["Zamboanga City"]
}

# Filter valid origin locations
valid_origin_locations = []
for loc in locations:
    if loc["Province"] == "NCR":
        valid_origin_locations.append(loc)
    elif loc["Province"] in major_origins:
        allowed_cities = major_origins[loc["Province"]]
        if allowed_cities is None or loc["City"] in allowed_cities:
            valid_origin_locations.append(loc)

# Function to pick a valid origin
def get_valid_origin():
    return random.choice(valid_origin_locations)

# Create a realistic delivery path
def generate_realistic_route(origin_lat, origin_lon, delivery_lat, delivery_lon):
    steps = random.randint(2, 5)
    route = [(origin_lat, origin_lon)]
    for _ in range(steps):
        origin_lat += random.uniform(-0.02, 0.02)
        origin_lon += random.uniform(-0.02, 0.02)
        route.append((round(origin_lat, 6), round(origin_lon, 6)))
    route.append((delivery_lat, delivery_lon))
    return route

# Delivery days based on actual km
def get_delivery_days_km(distance_km):
    if distance_km < 5:
        return 1
    elif distance_km < 20:
        return 2
    elif distance_km < 100:
        return random.randint(3, 4)
    elif distance_km < 500:
        return random.randint(5, 7)
    else:
        return random.randint(8, 10)

# Generate records
num_records = 100
forced_delivered_count = int(num_records * 0.15)
delivered_indices = random.sample(range(1, num_records + 1), forced_delivered_count)

raw_data = []

for i in range(1, num_records + 1):
    is_forced_delivered = i in delivered_indices
    delivery = random.choice(locations)
    origin = get_valid_origin()

    origin_location = f"{origin['City']}, {origin['Province']}"
    origin_coordinates = origin["LatLng"]
    origin_lat, origin_lon = origin["Latitude"], origin["Longitude"]

    delivery_location = f"{delivery['City']}, {delivery['Province']}"
    delivery_coordinates = delivery["LatLng"]
    delivery_lat, delivery_lon = delivery["Latitude"], delivery["Longitude"]

    distance_km = geodesic((origin_lat, origin_lon), (delivery_lat, delivery_lon)).km
    delivery_days = get_delivery_days_km(distance_km)

    order_date = datetime.now() - timedelta(days=np.random.randint(3, 6))
    delivery_date = order_date + timedelta(days=delivery_days)
    timestamp = datetime.now() - timedelta(minutes=np.random.randint(0, 1440))
    delay_by_days = (timestamp.date() - delivery_date.date()).days

    if is_forced_delivered:
        current_lat, current_lon = delivery_lat, delivery_lon
        current_location = delivery_location
        current_coordinates = delivery_coordinates
        status = "Delivered"
    else:
        route = generate_realistic_route(origin_lat, origin_lon, delivery_lat, delivery_lon)
        step = random.randint(0, len(route) - 1)
        current_lat, current_lon = route[step]
        current_coordinates = f"{current_lat},{current_lon}"
        current_location = origin_location if step == 0 else delivery_location

        coord_distance = geodesic((current_lat, current_lon), (delivery_lat, delivery_lon)).km
        if delay_by_days > 1 and coord_distance >= 5:
            status = "Delayed"
        elif coord_distance < 2:
            status = random.choice(["Delivered", "Awaiting Pickup"])
        else:
            status = random.choice(["In Transit", "Awaiting Pickup"])

    perishable = random.choice(["Yes", "No"])
    temperature = round(np.random.uniform(2.0, 12.0), 2) if perishable == "Yes" else round(np.random.uniform(10.0, 30.0), 2)
    temperature_issue = "Exceeded" if perishable == "Yes" and temperature > 7.0 else "Normal" if perishable == "Yes" else "N/A"

    raw_data.append({
        "timestamp": timestamp,
        "order_date": order_date,
        "delivery_date": delivery_date,
        "origin": origin_location,
        "origin_coordinates": origin_coordinates,
        "current_location": current_location,
        "current_coordinates": current_coordinates,
        "delivery_location": delivery_location,
        "delivery_coordinates": delivery_coordinates,
        "perishable": perishable,
        "temperature_celsius": temperature,
        "temperature_issue": temperature_issue,
        "status": status
    })

# Final formatting
sorted_data = sorted(raw_data, key=lambda x: x["timestamp"])
for idx, record in enumerate(sorted_data, start=1):
    record["package_id"] = f"PKG{str(idx).zfill(3)}"
    record["timestamp"] = record["timestamp"].strftime("%Y-%m-%d %H:%M:%S")
    record["order_date"] = record["order_date"].strftime("%Y-%m-%d")
    record["delivery_date"] = record["delivery_date"].strftime("%Y-%m-%d")

columns = [
    "timestamp", "package_id", "order_date", "delivery_date",
    "origin", "origin_coordinates",
    "current_location", "current_coordinates",
    "delivery_location", "delivery_coordinates",
    "perishable", "temperature_celsius", "temperature_issue", "status"
]

df = pd.DataFrame(sorted_data)[columns]
df.to_csv("iot_data.csv", index=False)
df.to_json("iot_data.json", orient="records")

# Show first rows
df.head()

Unnamed: 0,timestamp,package_id,order_date,delivery_date,origin,origin_coordinates,current_location,current_coordinates,delivery_location,delivery_coordinates,perishable,temperature_celsius,temperature_issue,status
0,2025-05-17 16:18:03,PKG001,2025-05-15,2025-05-22,"Cebu City, Cebu","10.32, 123.75","Sulat, Eastern Samar","11.817,125.45","Sulat, Eastern Samar","11.817, 125.450",Yes,5.95,Normal,Delivered
1,2025-05-17 16:19:03,PKG002,2025-05-15,2025-05-21,"Cebu City, Cebu","10.32, 123.75","Tarragona, Davao Oriental","7.05,126.45","Tarragona, Davao Oriental","7.050, 126.450",No,18.78,,Awaiting Pickup
2,2025-05-17 16:25:03,PKG003,2025-05-15,2025-05-20,"Iloilo City, Iloilo","11, 122.67","Calintaan, Occidental Mindoro","10.986034,122.675452","Calintaan, Occidental Mindoro","12.57556, 120.94278",No,19.19,,In Transit
3,2025-05-17 16:25:03,PKG004,2025-05-14,2025-05-20,"Batangas City, Batangas","13.750, 121.050","Lupi, Camarines Sur","13.817,122.9","Lupi, Camarines Sur","13.817, 122.900",Yes,5.57,Normal,Awaiting Pickup
4,2025-05-17 16:26:03,PKG005,2025-05-13,2025-05-18,"Iloilo City, Iloilo","11, 122.67","Northern Kabuntalan, Maguindanao","11.011525,122.671956","Northern Kabuntalan, Maguindanao","7.13333, 124.46667",Yes,9.61,Exceeded,Awaiting Pickup
