### Synthetic Data Generation - Code

In [43]:
"""
Synthetic nested e-commerce orders generator (privacy-safe)
- Generates orders from 2025-01-01 to 2025-07-31
- Writes ONE JSON ARRAY file per month (NOT JSONL), e.g.:
    orders_2025_01.json
    orders_2025_02.json
    ...
- Each order contains nested objects (customer, payment) and an array (items)
  so you can practice "flatten/explode" in AWS Lambda.

NOTE:
- This produces standard JSON arrays (valid JSON). Each monthly file is an array: [ {...}, {...} ]
- No personal data (no names/emails/addresses). IDs are synthetic.
"""

import json
import random
from datetime import datetime, timedelta
from uuid import uuid4
from pathlib import Path

# -----------------------
# CONFIG (edit these)
# -----------------------
# Date range to generate. Both endpoints are inclusive: the main loop produces
# one batch of orders for every calendar day from START_DATE through END_DATE.
START_DATE = datetime(2025, 1, 1)
END_DATE = datetime(2025, 7, 31)

ORDERS_PER_DAY = 200              # Orders generated per calendar day; scale up/down as needed
OUTPUT_DIR = Path("out_monthly")  # Folder where monthly JSON files will be written
# Copied unchanged into the "currency" field of every order.
CURRENCY = "USD"

# Synthetic, non-personal dimension values (one is picked at random per order)
# NOTE(review): "UK" is not the ISO 3166-1 alpha-2 code for the United Kingdom
# ("GB"), unlike the other entries - confirm whether consumers expect ISO codes.
COUNTRIES = ["US", "CA", "UK", "DE", "FR", "IN"]
SEGMENTS = ["consumer", "corporate", "small_business"]
SALES_CHANNELS = ["online", "store", "mobile_app"]
PAYMENT_METHODS = ["credit_card", "debit_card", "paypal", "wallet"]

# Product catalog: each entry ties a product_id to exactly one category and
# price. Line items copy all three from the same entry, which prevents a
# product from ever appearing with a mismatched category.
PRODUCT_CATALOG = [
    {"product_id": "PROD-101", "category": "electronics",     "price": 899.99},
    {"product_id": "PROD-205", "category": "accessories",     "price": 29.99},
    {"product_id": "PROD-309", "category": "furniture",       "price": 249.50},
    {"product_id": "PROD-412", "category": "office_supplies", "price": 12.99},
    {"product_id": "PROD-501", "category": "electronics",     "price": 199.00},
]

# -----------------------
# HELPERS
# -----------------------
def random_timestamp_within_day(day_start: datetime) -> datetime:
    """Pick a random whole-second timestamp inside the day that begins at ``day_start``.

    The result is ``day_start`` plus an offset of 0..86399 seconds, so for a
    midnight ``day_start`` it never spills into the next calendar day. The
    returned datetime is naive (no tzinfo), like the input.
    """
    seconds_per_day = 24 * 60 * 60
    offset = timedelta(seconds=random.randint(0, seconds_per_day - 1))
    return day_start + offset

def generate_items() -> list[dict]:
    """Build the line items for a single order.

    Draws a line count between 1 and 5, then for each line picks a random
    PRODUCT_CATALOG entry and a random quantity between 1 and 4. The
    product_id, category and unit_price of a line all come from the same
    catalog entry, so a product's category always matches the catalog.
    The same product may appear on more than one line.
    """
    line_count = random.randint(1, 5)
    lines: list[dict] = []

    while len(lines) < line_count:
        entry = random.choice(PRODUCT_CATALOG)
        quantity = random.randint(1, 4)
        line = {
            "product_id": entry["product_id"],
            "category": entry["category"],
            "quantity": quantity,
            "unit_price": round(entry["price"], 2),
        }
        lines.append(line)

    return lines

def calculate_order_total(items: list[dict]) -> float:
    """Return the order total: quantity * unit_price summed over ``items``, rounded to 2 decimals."""
    line_totals = [line["quantity"] * line["unit_price"] for line in items]
    return round(sum(line_totals), 2)

def month_key(dt: datetime) -> str:
    """Format ``dt`` as a ``YYYY_MM`` key (for example ``2025_02``)."""
    return f"{dt:%Y_%m}"

def month_filename(dt: datetime) -> Path:
    """Build the path of the monthly output file that ``dt`` belongs to.

    The file sits directly under OUTPUT_DIR and is named
    ``orders_YYYY_MM.json``; only the year and month of ``dt`` matter.
    """
    stamp = dt.strftime('%Y_%m')
    return OUTPUT_DIR.joinpath(f"orders_{stamp}.json")

def write_month_file(path: Path, orders: list[dict]) -> None:
    """Serialize ``orders`` to ``path`` as a single pretty-printed JSON array.

    Missing parent directories are created first and an existing file is
    overwritten. The file is UTF-8 encoded and non-ASCII characters are
    written as-is rather than escaped.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    # indent=2 keeps the files human-readable at the cost of some extra size.
    with path.open(mode="w", encoding="utf-8") as out_file:
        json.dump(orders, out_file, ensure_ascii=False, indent=2)

# -----------------------
# MAIN GENERATION (monthly batching)
# -----------------------
def _build_order(day_start: datetime, deterministic_id: bool = False) -> dict:
    """Create one synthetic order whose timestamp falls on the day starting at ``day_start``.

    Args:
        day_start: Start of the calendar day the order is placed on.
        deterministic_id: When True, the 8-hex-digit ``order_id`` suffix is drawn
            from the ``random`` module (so it follows ``random.seed``) instead
            of ``uuid4()``.

    Returns:
        The nested order dict: scalar fields plus ``customer`` and ``payment``
        objects and an ``items`` array.
    """
    ts = random_timestamp_within_day(day_start)
    items = generate_items()

    # uuid4() is not driven by random.seed(), so it would make seeded runs differ
    # from each other; getrandbits(32) yields the same 8-hex-digit shape.
    suffix = f"{random.getrandbits(32):08x}" if deterministic_id else uuid4().hex[:8]

    return {
        "order_id": f"ORD-{ts.strftime('%Y%m%d')}-{suffix}",
        "order_timestamp": ts.isoformat() + "Z",  # ISO timestamp string
        "customer": {
            "customer_id": f"CUST-{random.randint(10000, 99999)}",
            "country": random.choice(COUNTRIES),
            "segment": random.choice(SEGMENTS),
        },
        "sales_channel": random.choice(SALES_CHANNELS),
        "payment": {
            "method": random.choice(PAYMENT_METHODS),
            "status": "paid",
        },
        "items": items,
        "order_total": calculate_order_total(items),
        "currency": CURRENCY,
    }


def main(seed: "int | None" = None) -> None:
    """Generate ORDERS_PER_DAY orders for each day in the configured range, one file per month.

    Orders are buffered per month and written as a JSON array as soon as the
    month is complete, so at most one month of orders is held in memory.
    If START_DATE is after END_DATE nothing is generated and no file is written.

    Args:
        seed: Optional seed for the module-level ``random`` generator. When
            given, the generated data (including order IDs) is identical on
            every run with the same configuration. When omitted, every run
            produces different data. Note that seeding resets the global
            ``random`` state.
    """
    reproducible = seed is not None
    if reproducible:
        random.seed(seed)

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    total_orders = 0
    files_written: list[Path] = []

    # Buffer for the month currently being generated
    monthly_orders: list[dict] = []

    current_day = START_DATE
    while current_day <= END_DATE:
        # Generate orders for this day
        for _ in range(ORDERS_PER_DAY):
            monthly_orders.append(_build_order(current_day, deterministic_id=reproducible))
            total_orders += 1

        next_day = current_day + timedelta(days=1)

        # The buffered month is complete once the next day belongs to another
        # month or lies past END_DATE; this is the single place files are written.
        if next_day > END_DATE or month_key(next_day) != month_key(current_day):
            out_path = month_filename(current_day)
            write_month_file(out_path, monthly_orders)
            files_written.append(out_path)
            monthly_orders = []

        current_day = next_day

    # Print summary
    print(f"Done. Generated {total_orders} orders across {len(files_written)} monthly files.")
    for p in files_written:
        print(f"  - {p}")

# Entry-point guard: run the generator when this code executes as the main
# module (script or notebook cell); skip it when the code is imported.
if __name__ == "__main__":
    main()


Done. Generated 42400 orders across 7 monthly files.
  - out_monthly\orders_2025_01.json
  - out_monthly\orders_2025_02.json
  - out_monthly\orders_2025_03.json
  - out_monthly\orders_2025_04.json
  - out_monthly\orders_2025_05.json
  - out_monthly\orders_2025_06.json
  - out_monthly\orders_2025_07.json


In [45]:
# Forward slashes are portable across operating systems and keep the path
# literal free of backslash escape sequences.
df_2 = pd.read_json('out_monthly/orders_2025_02.json')

In [46]:
# Look up a single order by ID. This ID comes from one particular generation
# run; IDs end in a random suffix, so pick a fresh one after regenerating.
df_2.loc[df_2['order_id'].eq('ORD-20250201-2048e5ad')]

Unnamed: 0,order_id,order_timestamp,customer,sales_channel,payment,items,order_total,currency
2,ORD-20250201-2048e5ad,2025-02-01T12:46:24Z,"{'customer_id': 'CUST-14295', 'country': 'DE',...",store,"{'method': 'credit_card', 'status': 'paid'}","[{'product_id': 'PROD-501', 'category': 'elect...",398.0,USD
