In [0]:
%run /Workspace/Users/kianhow2000@gmail.com/Databricks/01_Data_Engineer_Learning_Plan/Lab-Setup/common-utils

In [0]:
%python

import json, random, base64, time
from pyspark.sql import functions as F

# -----------------------------
# Config
# -----------------------------
# Target location of the generated files. These three names feed
# ensure_catalog_and_schema() / ensure_volume() further down.
CATALOG = "workspace"
SCHEMA_PREFIX = "data_engineering_labs"
VOLUME = "v01"

# NOTE(review): get_run_id, banner, ensure_catalog_and_schema and ensure_volume
# are not defined in this cell -- presumably they come from the common-utils
# notebook loaded with %run above; confirm their contracts there.
RUN_ID = get_run_id(default="00")              # uses widgets: run_id=00
SCHEMA = f"{SCHEMA_PREFIX}_{RUN_ID}"           # e.g. data_engineering_labs_00 when RUN_ID is "00"

# Shape of the generated data set.
OUT_DIR_REL = "raw/events-kafka"
NUM_FILES = 11            # files are named 000.json .. 010.json
EVENTS_PER_FILE = 500     # records per file; tweak for a bigger/smaller data set
PARTITIONS = 3            # partition is offset % PARTITIONS, i.e. 0..2
TOPIC = "clickstream"

BASE_OFFSET = 219255030   # offset of the very first record; later records count up by 1
SEED = 42

# Seed Python's global RNG so the random draws made below are repeatable.
random.seed(SEED)

banner(f"Setting up Kafka-esque events in {CATALOG}.{SCHEMA}")

# The value returned by ensure_volume() is used as the root path for the output.
ensure_catalog_and_schema(CATALOG, SCHEMA)
VOLUME_ROOT = ensure_volume(CATALOG, SCHEMA, VOLUME)

OUT_DIR = f"{VOLUME_ROOT}/{OUT_DIR_REL}"
dbutils.fs.mkdirs(OUT_DIR)

print("Output dir:", OUT_DIR)
print(f"Writing {NUM_FILES} files x {EVENTS_PER_FILE} events/file = {NUM_FILES*EVENTS_PER_FILE} total events")

# -----------------------------
# Variation pools
# -----------------------------
# One value from each of these lists is drawn with random.choice() per event.
DEVICES = ["Android", "iOS", "Web", "Windows", "MacOS"]
EVENT_NAMES = ["main", "product_view", "add_to_cart", "begin_checkout", "purchase", "search"]
TRAFFIC = ["google", "facebook", "instagram", "tiktok", "email", "direct", "referral", "bing"]

# (city, state) pairs; one pair per event becomes the payload's "geo" object.
GEO = [
    ("New York", "NY"),
    ("San Francisco", "CA"),
    ("Seattle", "WA"),
    ("Austin", "TX"),
    ("Chicago", "IL"),
    ("Boston", "MA"),
    ("Miami", "FL"),
    ("Los Angeles", "CA"),
    ("Denver", "CO"),
    ("Singapore", "SG"),
]

# (item_id, item_name, price_in_usd) tuples, unpacked in that order by
# make_items_and_ecommerce().
ITEM_CATALOG = [
    ("M_STAN_K", "Standard King Mattress", 1195.0),
    ("M_STAN_Q", "Standard Queen Mattress", 995.0),
    ("M_STAN_F", "Standard Full Mattress", 849.0),
    ("P_BASIC", "Basic Pillow", 59.0),
    ("B_FRAME_Q", "Queen Bed Frame", 399.0),
    ("SHEET_K", "King Sheet Set", 129.0),
]

# Coupon codes drawn per item with random.choice(). None is listed three
# times out of seven entries, so "no coupon" is the single most likely draw.
COUPONS = [None, "NEWBED10", "SLEEP5", "WELCOME15", "FREESHIP", None, None]

def rand_key(n=14):
    """Return a random key of ``n`` characters that looks base32-encoded.

    Each character is drawn independently with ``random.choice`` from the
    uppercase letters (without I and O) plus the digits 2-9, so the result
    depends on the state of the global ``random`` generator.
    """
    symbols = "ABCDEFGHJKLMNPQRSTUVWXYZ23456789"
    picked = []
    for _ in range(n):
        picked.append(random.choice(symbols))
    return "".join(picked)

def rand_user_id():
    """Return a synthetic user id: ``"UA"`` followed by ten random digits.

    Every digit is an independent ``random.randint(0, 9)`` draw from the
    global ``random`` generator.
    """
    suffix = ""
    for _ in range(10):
        suffix += f"{random.randint(0, 9)}"
    return f"UA{suffix}"

def make_items_and_ecommerce():
    """Build a random basket and its matching ecommerce summary.

    Returns:
        A ``(items, ecommerce)`` tuple. ``items`` is a list of 0 to 3 dicts,
        one per distinct entry sampled from ``ITEM_CATALOG``; ``ecommerce``
        holds the rounded revenue total, the summed quantity and the number
        of distinct items. An empty basket yields a revenue of ``0.0``.
    """
    # Draw order matters for reproducibility under a fixed seed:
    # basket size first, then the catalog sample, then per-item draws.
    item_count = random.choices([0, 1, 2, 3], weights=[20, 45, 25, 10], k=1)[0]
    basket = [] if item_count <= 0 else random.sample(ITEM_CATALOG, k=item_count)

    # Fraction taken off the line price per coupon; any other coupon
    # (including None) gives no discount.
    discount_by_coupon = {"SLEEP5": 0.05, "NEWBED10": 0.10, "WELCOME15": 0.15}

    line_items = []
    quantities = []
    running_revenue = 0.0

    for sku, label, unit_price in basket:
        units = random.randint(1, 2)
        code = random.choice(COUPONS)
        rate = discount_by_coupon.get(code, 0.0)

        line_revenue = round(unit_price * units * (1.0 - rate), 2)
        running_revenue += line_revenue
        quantities.append(units)

        line_item = {}
        line_item["coupon"] = code
        line_item["item_id"] = sku
        line_item["item_name"] = label
        line_item["item_revenue_in_usd"] = line_revenue
        line_item["price_in_usd"] = unit_price
        line_item["quantity"] = units
        line_items.append(line_item)

    summary = {
        "purchase_revenue_in_usd": round(running_revenue, 2),
        "total_item_quantity": int(sum(quantities)),
        "unique_items": int(len(basket)),
    }
    return line_items, summary

def b64_encode_json(obj: dict) -> str:
    """Serialize ``obj`` to compact JSON and return it Base64-encoded as text.

    The JSON uses no whitespace after separators and keeps non-ASCII
    characters unescaped; it is UTF-8 encoded before Base64 encoding.
    """
    compact = json.dumps(obj, ensure_ascii=False, separators=(",", ":"))
    raw = compact.encode("utf-8")
    return str(base64.b64encode(raw), "utf-8")

# Base timestamps (epoch milliseconds): events fall in the 7 days ending now.
# NOTE: these come from the wall clock, so timestamps differ between runs
# even though the random generator is seeded.
now_ms = int(time.time() * 1000)
base_event_ts = now_ms - (7 * 24 * 3600 * 1000)  # ~7 days ago

# -----------------------------
# Generate and write JSONL files
# -----------------------------
MS_PER_DAY = 24 * 3600 * 1000
US_PER_MS = 1000

# Offsets run continuously across all files, starting at BASE_OFFSET.
offset = BASE_OFFSET

for file_idx in range(NUM_FILES):
    file_name = f"{file_idx:03d}.json"
    file_path = f"{OUT_DIR}/{file_name}"

    lines = []
    for _ in range(EVENTS_PER_FILE):
        user_id = rand_user_id()
        device = random.choice(DEVICES)
        event_name = random.choice(EVENT_NAMES)
        traffic_source = random.choice(TRAFFIC)
        city, state = random.choice(GEO)

        # Envelope timestamp: epoch milliseconds inside the 7-day window.
        event_timestamp = base_event_ts + random.randint(0, 7 * MS_PER_DAY)

        # Payload timestamps: both in epoch MICROseconds so they share one
        # unit and can be compared. (Previously event_timestamp was
        # ms * 100000, which is not an epoch unit, while the first-touch
        # value was plain ms.) The random tail fills the sub-millisecond part.
        event_timestamp_us = event_timestamp * US_PER_MS + random.randint(0, US_PER_MS - 1)
        # First touch happened 1..180 whole days before the event.
        user_first_touch = event_timestamp_us - random.randint(1, 180) * MS_PER_DAY * US_PER_MS

        items, ecommerce = make_items_and_ecommerce()

        # A "purchase" with zero revenue makes no sense: replace it with a
        # non-purchase event name.
        if ecommerce["purchase_revenue_in_usd"] == 0.0 and event_name == "purchase":
            event_name = random.choice(["main", "product_view", "search", "add_to_cart"])

        clickstream = {
            "device": device,
            "ecommerce": ecommerce,
            "event_name": event_name,
            "event_timestamp": int(event_timestamp_us),  # epoch microseconds
            "geo": {"city": city, "state": state},
            "items": items,
            "traffic_source": traffic_source,
            "user_first_touch_timestamp": int(user_first_touch),  # epoch microseconds
            "user_id": user_id
        }

        # Kafka-style record: the clickstream payload travels Base64-encoded
        # in "value"; the partition is derived from the offset.
        envelope = {
            "key": rand_key(14),
            "offset": int(offset),
            "partition": int(offset % PARTITIONS),
            "timestamp": int(event_timestamp),  # epoch ms
            "topic": TOPIC,
            "value": b64_encode_json(clickstream)
        }

        lines.append(json.dumps(envelope, separators=(",", ":")))
        offset += 1

    # Write as a single JSONL file (one envelope per line, trailing newline).
    dbutils.fs.rm(file_path, True)
    dbutils.fs.put(file_path, "\n".join(lines) + "\n", overwrite=True)

print("✅ Done writing files:", [f.name for f in dbutils.fs.ls(OUT_DIR)])

# -----------------------------
# Quick verification: decode one record
# -----------------------------
# Read the start of the first file, take its first line (one envelope),
# then Base64-decode the "value" field back into the clickstream payload.
sample_file = f"{OUT_DIR}/000.json"
head_text = dbutils.fs.head(sample_file, 5000)
first_line = head_text.splitlines()[0]
outer = json.loads(first_line)
value_bytes = base64.b64decode(outer["value"])
decoded = json.loads(value_bytes.decode("utf-8"))

print("\nSample outer JSON keys:", list(outer))
print("Sample decoded clickstream keys:", list(decoded))
print("\nSample decoded clickstream (pretty):")
pretty = json.dumps(decoded, indent=2)
print(pretty)
