In [0]:
%run /Workspace/Users/kianhow2000@gmail.com/Databricks/01_Data_Engineer_Learning_Plan/Lab-Setup/common-utils

In [0]:

# Databricks notebook source
# ==========================================================
# Setup for Auto Loader CSV Labs
# ==========================================================

from pyspark.sql import functions as F
import json
import random

# -----------------------------
# Config
# -----------------------------
# Names used to build the catalog / schema / volume location for this lab.
CATALOG = "workspace"
SCHEMA_PREFIX = "data_engineering_labs"
VOLUME = "v01"

# get_run_id() is not defined in this file; presumably it is provided by the
# common-utils notebook pulled in via %run -- confirm.
RUN_ID = get_run_id()                # e.g. "00"
SCHEMA = f"{SCHEMA_PREFIX}_{RUN_ID}" # data_engineering_labs_00

# Row counts for the generated files: SOURCE_ROWS is used for 000.csv,
# STAGING_ROWS_1 for 001.csv and STAGING_ROWS_2 for 002.csv.
SOURCE_ROWS = 3150
STAGING_ROWS_1 = 1000
STAGING_ROWS_2 = 2000

# Seeds Python's `random` module in this process.
# NOTE(review): random_items() is invoked from a Spark UDF, which presumably
# executes in separate Python worker processes, so this seed likely does not
# make the generated `items` column reproducible -- confirm.
random.seed(42)

banner("Setting up Auto Loader CSV datasets")

# External helpers; the value returned by ensure_volume() is used below as the
# root path under which both lab directories are created.
ensure_catalog_and_schema(CATALOG, SCHEMA)
VOLUME_ROOT = ensure_volume(CATALOG, SCHEMA, VOLUME)

# SOURCE_DIR receives 000.csv; STAGING_DIR receives 001.csv and 002.csv.
SOURCE_DIR = f"{VOLUME_ROOT}/csv_files_autoloader_source"
STAGING_DIR = f"{VOLUME_ROOT}/csv_files_autoloader_staging"

dbutils.fs.mkdirs(SOURCE_DIR)
dbutils.fs.mkdirs(STAGING_DIR)

print("Source dir :", SOURCE_DIR)
print("Staging dir:", STAGING_DIR)

# -----------------------------
# Helper to generate items JSON
# -----------------------------
# Sample line items, laid out as a small table: one tuple per item, one column
# per field. Revenue and price are kept as strings; a missing coupon is None
# (serialized as JSON null).
_ITEM_FIELDS = ("coupon", "item_id", "item_name", "item_revenue_in_usd", "price")
_ITEM_ROWS = (
    ("NEWBED10", "M_STAN_F", "standard full mattress", "850.3", "849"),
    ("SLEEP5", "P_BASIC", "basic pillow", "55.0", "59"),
    (None, "B_FRAME_Q", "queen bed frame", "399.0", "399"),
)
ITEM_TEMPLATES = [dict(zip(_ITEM_FIELDS, row)) for row in _ITEM_ROWS]


def random_items():
    """Return a JSON array string holding 1-3 distinct item templates.

    The item count is drawn first, then that many entries are sampled without
    replacement from ITEM_TEMPLATES and serialized with json.dumps.
    """
    item_count = random.randint(1, 3)
    chosen = random.sample(ITEM_TEMPLATES, item_count)
    return json.dumps(chosen)

# -----------------------------
# Base dataframe generator
# -----------------------------
def generate_orders_df(start_id, rows):
    """Build a synthetic orders DataFrame with `rows` consecutive order ids.

    Args:
        start_id: First order_id (inclusive).
        rows: Number of rows to generate; ids span [start_id, start_id + rows).

    Returns:
        A DataFrame with the columns, in order:
          - order_id: consecutive ids from spark.range
          - email: "user<order_id>@example.com"
          - transactions_timestamp: current_timestamp() plus a random offset
            of 0..86399 seconds
          - total_item_quantity: random int in 1..5
          - purchase_revenue_in_usd: random value in [20, 1220), 2 decimals
          - unique_items: random int in 1..3
          - items: JSON array string produced by random_items()

    Every rand() call is unseeded and `items` is drawn per row in a Python
    UDF, so the generated values differ each time the frame is evaluated.
    """
    # Spark treats UDFs as deterministic by default and may eliminate or
    # repeat invocations during optimization. random_items() returns a fresh
    # random value on every call, so it must be declared nondeterministic.
    items_udf = F.udf(random_items, "string").asNondeterministic()

    return (
        spark.range(start_id, start_id + rows)
        .withColumnRenamed("id", "order_id")
        .withColumn("email", F.concat(F.lit("user"), F.col("order_id"), F.lit("@example.com")))
        .withColumn(
            "transactions_timestamp",
            F.expr("timestampadd(SECOND, cast(rand()*86400 as int), current_timestamp())")
        )
        .withColumn("total_item_quantity", (F.rand() * 5 + 1).cast("int"))
        .withColumn("purchase_revenue_in_usd", F.round(F.rand() * 1200 + 20, 2))
        .withColumn("unique_items", (F.rand() * 3 + 1).cast("int"))
        .withColumn("items", items_udf())
        # Explicit select pins the column order of the written CSV.
        .select(
            "order_id",
            "email",
            "transactions_timestamp",
            "total_item_quantity",
            "purchase_revenue_in_usd",
            "unique_items",
            "items"
        )
    )

# -----------------------------
# Single-file CSV writer
# -----------------------------
def write_single_csv(df, target_dir, file_name):
    """Write `df` as exactly one headered CSV file at `{target_dir}/{file_name}`.

    Spark's CSV writer produces a directory of part files, so the frame is
    coalesced to one partition and written to a scratch directory
    `{target_dir}/<stem>_tmp` (e.g. `000_tmp` for `000.csv`). The single
    `part-*` file is then moved to its final name and the scratch directory
    is removed.

    Args:
        df: DataFrame to write.
        target_dir: Directory that will contain the final file.
        file_name: Final file name, e.g. "000.csv".

    Returns:
        The path of the final CSV file.

    Raises:
        FileNotFoundError: If the write produced no `part-*` file.
    """
    stem = file_name.rsplit(".", 1)[0]
    tmp_dir = f"{target_dir}/{stem}_tmp"
    final_path = f"{target_dir}/{file_name}"

    (df
     .coalesce(1)
     .write
     .mode("overwrite")
     .option("header", "true")
     .csv(tmp_dir))

    part_files = [f.path for f in dbutils.fs.ls(tmp_dir) if f.name.startswith("part-")]
    if not part_files:
        # Fail with a clear message instead of an IndexError on [0].
        raise FileNotFoundError(f"No part-* file was written to {tmp_dir}")

    # Rename part file -> final name, then drop the scratch directory
    # (the third positional argument of mv/rm is the `recurse` flag).
    dbutils.fs.mv(part_files[0], final_path, True)
    dbutils.fs.rm(tmp_dir, True)
    return final_path

# -----------------------------
# Write 000.csv (SOURCE_ROWS rows, ids from 1)
# -----------------------------
df_000 = generate_orders_df(1, SOURCE_ROWS)
write_single_csv(df_000, SOURCE_DIR, "000.csv")

# -----------------------------
# Write 001.csv (STAGING_ROWS_1 rows, ids from 10,000)
# -----------------------------
df_001 = generate_orders_df(10_000, STAGING_ROWS_1)
write_single_csv(df_001, STAGING_DIR, "001.csv")

# -----------------------------
# Write 002.csv (STAGING_ROWS_2 rows, ids from 20,000)
# -----------------------------
df_002 = generate_orders_df(20_000, STAGING_ROWS_2)
write_single_csv(df_002, STAGING_DIR, "002.csv")

# -----------------------------
# Final verification
# -----------------------------
print("Source files:", [f.name for f in dbutils.fs.ls(SOURCE_DIR)])
print("Staging files:", [f.name for f in dbutils.fs.ls(STAGING_DIR)])

def _csv_row_count(path):
    """Return the number of data rows (header excluded) in the CSV at `path`."""
    return spark.read.option("header", "true").csv(path).count()

# (file name, directory, expected row count) for every generated file.
_EXPECTED_FILES = (
    ("000.csv", SOURCE_DIR, SOURCE_ROWS),
    ("001.csv", STAGING_DIR, STAGING_ROWS_1),
    ("002.csv", STAGING_DIR, STAGING_ROWS_2),
)

# Print each count and fail loudly on a mismatch, so the success banner below
# is only reached when every file has the configured number of rows.
for _name, _dir, _expected in _EXPECTED_FILES:
    _actual = _csv_row_count(f"{_dir}/{_name}")
    print(f"{_name} rows :", _actual)
    assert _actual == _expected, f"{_name}: expected {_expected} rows, found {_actual}"

print("✅ Auto Loader CSV setup complete.")
