In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time
import random
from geopy.distance import geodesic

# Constants
NUM_RECORDS = 100  # how many synthetic package records the script generates
ORDER_START = datetime(year=2025, month=5, day=1)  # earliest order date
ORDER_END = datetime(year=2025, month=5, day=9)  # latest order date
TIMESTAMP_DAY = datetime(year=2025, month=5, day=9)  # calendar day used for every generated timestamp
PEAK_HOUR_PROB = 0.85  # probability that a timestamp falls in the peak hours

# Load and clean CSV
locations_df = pd.read_csv("philippine_cities_and_municipalities.csv")
locations_df.columns = [col.strip() for col in locations_df.columns]
# The second column is treated as the population column whatever its header is
pop_col = locations_df.columns[1]
locations_df = locations_df.rename(columns={pop_col: "Population"})
# Rows without coordinates cannot be used for distance calculations
locations_df = locations_df.dropna(subset=["Latitude", "Longitude"])
# Non-numeric population values become NaN and therefore fail every threshold test
locations_df["Population"] = pd.to_numeric(locations_df["Population"], errors="coerce")

# Capture the HUC rows BEFORE the class codes are collapsed: once "HUC" has
# been rewritten to "City" a later `Class == "HUC"` test can never match, which
# silently dropped every HUC below the population threshold.
is_huc = locations_df["Class"] == "HUC"
locations_df["Class"] = locations_df["Class"].replace({
    "CC": "City", "ICC": "City", "HUC": "City", "Mun": "Municipality"
})
is_city = locations_df["Class"] == "City"

# Valid origins: any HUC, or a City with Pop ≥ 300k
valid_origins = locations_df[
    is_huc | (is_city & (locations_df["Population"] >= 300_000))
].copy()

# Valid deliveries: any HUC, or a City with Pop ≥ 100k
valid_deliveries = locations_df[
    is_huc | (is_city & (locations_df["Population"] >= 100_000))
].copy()

# Every location with coordinates; used to name the place nearest to a point
all_locations = locations_df.copy()

def compute_order_date(idx, num_records=None, start=None, end=None):
    """Return the order date for the record at zero-based position ``idx``.

    Record positions are mapped linearly onto the whole days between
    ``start`` and ``end``: position 0 gets the start date and the last
    position (``num_records - 1``) gets the end date.

    Args:
        idx: Zero-based position of the record.
        num_records: Total number of records; defaults to ``NUM_RECORDS``.
        start: First order day as a ``datetime``; defaults to ``ORDER_START``.
        end: Last order day as a ``datetime``; defaults to ``ORDER_END``.

    Returns:
        A ``datetime.date``.
    """
    if num_records is None:
        num_records = NUM_RECORDS
    if start is None:
        start = ORDER_START
    if end is None:
        end = ORDER_END

    # With a single record there is no span to divide (the original formula
    # divided by zero here), so the only sensible date is the start date.
    if num_records <= 1:
        return start.date()

    total_days = (end - start).days
    day_offset = (idx * total_days) // (num_records - 1)
    return (start + timedelta(days=day_offset)).date()

def generate_timestamp():
    """Return a random ``datetime`` on ``TIMESTAMP_DAY``.

    With probability ``PEAK_HOUR_PROB`` the hour is drawn from the peak
    window 11–22 (inclusive); otherwise it is drawn from the remaining
    hours 0–10 and 23. Minute and second are uniform over 0–59.
    """
    in_peak_window = random.random() < PEAK_HOUR_PROB
    hh = random.randint(11, 22) if in_peak_window else random.choice([*range(11), 23])
    mm = random.randint(0, 59)
    ss = random.randint(0, 59)
    clock = time(hh, mm, ss)
    return datetime.combine(TIMESTAMP_DAY.date(), clock)

def pick_origin():
    """Return one randomly sampled row of ``valid_origins`` as a Series."""
    drawn = valid_origins.sample(n=1)
    return drawn.iloc[0]

def pick_delivery(origin_coords):
    """Return one random row of ``valid_deliveries`` away from the origin.

    Rows whose latitude AND longitude both equal ``origin_coords`` are
    excluded. If that leaves nothing to choose from, the draw falls back
    to the unfiltered ``valid_deliveries`` table.

    Args:
        origin_coords: ``(latitude, longitude)`` of the origin.

    Returns:
        The sampled row as a Series.
    """
    origin_lat = origin_coords[0]
    origin_lon = origin_coords[1]
    # A row is kept when it differs from the origin in at least one coordinate
    differs_from_origin = (
        (valid_deliveries["Latitude"] != origin_lat)
        | (valid_deliveries["Longitude"] != origin_lon)
    )
    candidates = valid_deliveries[differs_from_origin]
    if candidates.empty:
        candidates = valid_deliveries
    return candidates.sample(1).iloc[0]

def nearest_city_name(lat, lon):
    """Return ``"<City or municipality>, <Province>"`` of the closest location.

    The geodesic distance from ``(lat, lon)`` to every row of
    ``all_locations`` is computed and the row with the smallest distance
    is used for the label.
    """
    here = (lat, lon)

    def km_from_here(place):
        # Distance in kilometres between the query point and one table row
        return geodesic(here, (place["Latitude"], place["Longitude"])).km

    distances_km = all_locations.apply(km_from_here, axis=1)
    closest = all_locations.loc[distances_km.idxmin()]
    return f"{closest['City or municipality']}, {closest['Province']}"

def get_delivery_days(dist_km):
    """Return the number of days a delivery over ``dist_km`` kilometres takes.

    Distances under 5 km take 1 day and under 20 km take 2 days. Longer
    distances get a random whole number of days from the tier they fall
    into; anything of 1000 km or more takes 14–18 days.
    """
    # (exclusive upper bound in km, fewest days, most days)
    tiers = (
        (5, 1, 1),
        (20, 2, 2),
        (100, 3, 5),
        (500, 6, 9),
        (1000, 10, 13),
    )
    for limit_km, fewest, most in tiers:
        if dist_km < limit_km:
            # Fixed tiers return directly so no random number is consumed
            return fewest if fewest == most else random.randint(fewest, most)
    return random.randint(14, 18)

# Predefine indices for Cancelled (~1%) and Delayed (~2%) records.
# The counts are derived from NUM_RECORDS with integer division (no float
# rounding surprises), are at least 1, and are clamped to the number of
# indices actually available so tiny runs do not make random.sample raise.
cancelled_count = min(NUM_RECORDS, max(1, NUM_RECORDS // 100))
cancelled_indices = set(random.sample(range(NUM_RECORDS), cancelled_count))

# Delayed records are drawn only from the indices that were not cancelled
remaining_indices = [i for i in range(NUM_RECORDS) if i not in cancelled_indices]
delayed_count = min(len(remaining_indices), max(1, NUM_RECORDS // 50))
delayed_indices = set(random.sample(remaining_indices, delayed_count))

# One dict per generated package record
records = []

def _point_along(start, end, frac):
    """Return the (lat, lon) lying ``frac`` of the way from ``start`` to ``end``.

    This is plain linear interpolation on the latitude/longitude values.
    """
    return (
        start[0] + frac * (end[0] - start[0]),
        start[1] + frac * (end[1] - start[1]),
    )


# Build one record per package: pick a route, derive the dates, decide the
# status and current position, then attach a temperature reading.
for i in range(NUM_RECORDS):
    idx = i + 1  # 1-based number used in the package id
    order_date = compute_order_date(i)
    timestamp = generate_timestamp()

    origin_row = pick_origin()
    origin_coords = (origin_row["Latitude"], origin_row["Longitude"])
    origin_name = f"{origin_row['City or municipality']}, {origin_row['Province']}"

    delivery_row = pick_delivery(origin_coords)
    delivery_coords = (delivery_row["Latitude"], delivery_row["Longitude"])
    delivery_name = f"{delivery_row['City or municipality']}, {delivery_row['Province']}"

    distance_km = geodesic(origin_coords, delivery_coords).km

    # Determine delivery_date: routes of at most 100 km arrive 0–2 days after
    # the order, longer routes use the distance tiers of get_delivery_days.
    if distance_km <= 100:
        gap_days = random.choice([0, 1, 2])
    else:
        gap_days = get_delivery_days(distance_km)
    delivery_date = order_date + timedelta(days=gap_days)

    status = None
    # Default position: the package has not left its origin. Cancelled and
    # To Ship records keep this position.
    curr_lat, curr_lon = origin_coords
    hour = timestamp.hour

    # 1) Cancelled
    if i in cancelled_indices:
        status = "Cancelled"

    # 2) Delayed: stuck 50%–80% of the way to the destination
    elif i in delayed_indices:
        status = "Delayed"
        curr_lat, curr_lon = _point_along(
            origin_coords, delivery_coords, random.uniform(0.5, 0.8)
        )

    # 3) Past delivery_date → Delivered
    elif timestamp.date() > delivery_date:
        status = "Delivered"
        curr_lat, curr_lon = delivery_coords

    else:
        # 4) To Ship: recent orders on long routes have not been handed over
        # yet. The rules are expressed as the order's age on the timestamp
        # day instead of fixed calendar dates, so they keep working when the
        # date constants change (for the current constants the ages 2/1/0
        # are exactly 2025-05-07/08/09).
        order_age_days = (timestamp.date() - order_date).days
        if (
            (order_age_days == 2 and distance_km > 200)
            or (order_age_days == 1 and distance_km > 100)
            or (order_age_days == 0 and distance_km >= 30)
        ):
            status = "To Ship"

        # 5) Proximity-based status for everything else
        else:
            # Pick a point 30%–70% from origin toward delivery
            curr_lat, curr_lon = _point_along(
                origin_coords, delivery_coords, random.uniform(0.3, 0.7)
            )
            d_cur_delivery = geodesic((curr_lat, curr_lon), delivery_coords).km

            # Delivered (80%) / Awaiting Pickup (20%) if < 1 km
            if d_cur_delivery < 1.0:
                if random.random() < 0.8:
                    status = "Delivered"
                    curr_lat, curr_lon = delivery_coords
                else:
                    status = "Awaiting Pickup"
                    # Small random offset around the destination
                    curr_lat = delivery_coords[0] + random.uniform(-0.001, 0.001)
                    curr_lon = delivery_coords[1] + random.uniform(-0.001, 0.001)

            # Out for Delivery: 1 km ≤ d < 10 km, during 6–20h
            elif 1.0 <= d_cur_delivery < 10.0 and 6 <= hour <= 20:
                status = "Out for Delivery"
                # Try up to 10 random offsets (±0.009°) around the destination
                # and keep the first that is itself 1–10 km away; if none
                # qualifies the interpolated point above is kept.
                for _ in range(10):
                    jl = delivery_coords[0] + random.uniform(-0.009, 0.009)
                    jk = delivery_coords[1] + random.uniform(-0.009, 0.009)
                    if 1.0 <= geodesic((jl, jk), delivery_coords).km < 10.0:
                        curr_lat, curr_lon = jl, jk
                        break

            # Anything else is still on the road at the interpolated point
            else:
                status = "In Transit"

    current_name = nearest_city_name(curr_lat, curr_lon)

    # Perishables report 2–12 °C and raise an alert above 7 °C; other goods
    # report 10–30 °C and have no temperature check.
    perishable = random.choice(["Yes", "No"])
    if perishable == "Yes":
        temperature = round(np.random.uniform(2.0, 12.0), 2)
        temp_issue = "Temp Alert >7°C" if temperature > 7.0 else "Normal"
    else:
        temperature = round(np.random.uniform(10.0, 30.0), 2)
        temp_issue = "N/A"

    records.append({
        "timestamp": timestamp.strftime("%Y-%m-%d %H:%M:%S"),
        "package_id": f"PKG{str(idx).zfill(3)}",
        "order_date": order_date.strftime("%Y-%m-%d"),
        "delivery_date": delivery_date.strftime("%Y-%m-%d"),
        "origin": origin_name,
        "origin_coordinates": f"{origin_coords[0]:.6f},{origin_coords[1]:.6f}",
        "current_location": current_name,
        "current_coordinates": f"{curr_lat:.6f},{curr_lon:.6f}",
        "delivery_location": delivery_name,
        "delivery_coordinates": f"{delivery_coords[0]:.6f},{delivery_coords[1]:.6f}",
        "perishable": perishable,
        "temperature_celsius": f"{temperature}°C",
        "temperature_issue": temp_issue,
        "status": status
    })

# Build the table and order it by timestamp. The timestamps are zero-padded
# "YYYY-MM-DD HH:MM:SS" strings, so sorting them as text is chronological.
df = pd.DataFrame(records).sort_values("timestamp")

# Save outputs
df.to_csv("iot_data.csv", index=False)
df.to_json("iot_data.json", orient="records")

# Preview
df.head()

Unnamed: 0,timestamp,package_id,order_date,delivery_date,origin,origin_coordinates,current_location,current_coordinates,delivery_location,delivery_coordinates,perishable,temperature_celsius,temperature_issue,status
95,2025-05-09 02:21:27,PKG096,2025-05-08,2025-05-08,"Malabon, Metro Manila","14.657850,120.951126","Manila, Metro Manila","14.590449,120.980362","Manila, Metro Manila","14.590449,120.980362",Yes,4.93°C,Normal,Delivered
89,2025-05-09 02:32:16,PKG090,2025-05-08,2025-05-15,"Tarlac City, Tarlac","15.486122,120.589347","Tarlac City, Tarlac","15.486122,120.589347","Laoag, Ilocos Norte","18.197323,120.593543",No,21.48°C,,To Ship
35,2025-05-09 02:59:40,PKG036,2025-05-03,2025-05-12,"Iligan, Lanao del Norte","8.130121,124.214946","Libungan, Cotabato","7.273097,124.621498","General Santos, South Cotabato","6.112222,125.172189",Yes,2.38°C,Normal,In Transit
26,2025-05-09 03:41:04,PKG027,2025-05-03,2025-05-16,"Mandaluyong, Metro Manila","14.577439,121.033897","Cadiz, Negros Occidental","11.086337,123.263006","Panabo, Davao del Norte","7.299870,125.680709",Yes,3.78°C,Normal,In Transit
21,2025-05-09 04:01:13,PKG022,2025-05-02,2025-05-10,"Baguio, Benguet","16.411991,120.593372","Burdeos, Quezon","15.018394,122.079754","Naga, Camarines Sur","13.694500,123.491794",No,10.53°C,,In Transit
