In [0]:
%run /Workspace/Users/kianhow2000@gmail.com/Databricks/01_Data_Engineer_Learning_Plan/Lab-Setup/common-utils

In [0]:

from pyspark.sql import functions as F

# -----------------------------
# Lab-specific configuration
# -----------------------------
# Catalog name, schema-name prefix and volume name handed to the ensure_* helpers
# below. SCHEMA_PREFIX is not the final schema name: the run id is appended to it
# in the "Resolve run context" section.
CATALOG = "workspace"
SCHEMA_PREFIX = "data_engineering_labs"
VOLUME = "v01"

# Shape of the generated dataset: number of rows, number of Parquet part files,
# and the seed mixed into the hash that derives each user's timestamp offset.
TOTAL_RECORDS = 10_000
NUM_PART_FILES = 4
SEED = 42

# -----------------------------
# Resolve run context
# -----------------------------
# NOTE(review): get_run_id, banner, ensure_catalog_and_schema and ensure_volume are
# not defined in this notebook; presumably they are provided by the common-utils
# notebook loaded via %run in the first cell — confirm their contracts there.
RUN_ID = get_run_id()
# Final schema name = prefix + "_" + run id.
SCHEMA = f"{SCHEMA_PREFIX}_{RUN_ID}"

banner(f"Setting up Lab 01 in {CATALOG}.{SCHEMA}")

ensure_catalog_and_schema(CATALOG, SCHEMA)
# The value returned here is used as the base path for OUT_DIR
# (presumably the volume's root path — verify against the helper).
VOLUME_ROOT = ensure_volume(CATALOG, SCHEMA, VOLUME)

# Directory that will receive the generated Parquet files.
OUT_DIR = f"{VOLUME_ROOT}/raw/users-historical"
dbutils.fs.mkdirs(OUT_DIR)

print("Output directory:", OUT_DIR)


# -----------------------------
# Generate TOTAL_RECORDS synthetic user rows
# -----------------------------
# Columns produced:
#   user_id                     1..TOTAL_RECORDS (from spark.range)
#   user_first_touch_timestamp  BIGINT epoch millis (portable / common in labs)
#   email                       user<id>@example.com

# Anchor instant for all timestamps: 2024-01-01 00:00:00 UTC, in epoch millis
BASE_EPOCH_MS = 1704067200000

# Width of the window the per-user offsets are folded into: 365 days in millis
ONE_YEAR_MS = 365 * 24 * 60 * 60 * 1000

# Deterministic per-user offset: hash (user_id, SEED), then take the positive
# modulus so the offset always falls in [0, ONE_YEAR_MS) even for negative hashes.
first_touch_offset_ms = F.pmod(
    F.xxhash64(F.col("user_id"), F.lit(SEED)),
    F.lit(ONE_YEAR_MS),
).cast("bigint")

# Synthetic address built from the numeric id.
email_address = F.concat(F.lit("user"), F.col("user_id"), F.lit("@example.com"))

df = (
    spark.range(1, TOTAL_RECORDS + 1)
    .withColumnRenamed("id", "user_id")
    .withColumn("user_first_touch_timestamp", F.lit(BASE_EPOCH_MS) + first_touch_offset_ms)
    .withColumn("email", email_address)
    .select("user_id", "user_first_touch_timestamp", "email")
)

# Fail fast, before anything is written, if the row count is off
count_before = df.count()
if count_before != TOTAL_RECORDS:
    raise Exception(f"Expected {TOTAL_RECORDS} rows, got {count_before}")

# -----------------------------
# Write Parquet as NUM_PART_FILES part files
# -----------------------------
# The frame is repartitioned to NUM_PART_FILES partitions before the write so that
# the output directory ends up with that many part-xxxxx files (checked further down).
# "overwrite" replaces whatever a previous run left in OUT_DIR.
parquet_writer = df.repartition(NUM_PART_FILES).write.mode("overwrite")
parquet_writer.parquet(OUT_DIR)

# -----------------------------
# Verify outputs: NUM_PART_FILES parquet part files, TOTAL_RECORDS rows on read
# -----------------------------
# Only entries ending in ".parquet" are considered; any other files in the
# output directory are ignored.
files = [entry.path for entry in dbutils.fs.ls(OUT_DIR) if entry.path.endswith(".parquet")]
files_sorted = sorted(files)

# Basename of each part file, in the same (sorted) order as files_sorted
part_names = [path.split("/")[-1] for path in files_sorted]

print("Parquet files written:")
for part_name in part_names:
    print(" -", part_name)

if len(files_sorted) != NUM_PART_FILES:
    raise Exception(f"Expected {NUM_PART_FILES} parquet files, found {len(files_sorted)}")

# The sorted names must line up with part-00000, part-00001, ...
# (Spark typically names them like part-00000-<uuid>.snappy.parquet)
expected_prefixes = [f"part-{i:05d}" for i in range(len(part_names))]
for expected_prefix, actual_name in zip(expected_prefixes, part_names):
    if actual_name.startswith(expected_prefix):
        continue
    raise Exception(f"Expected file starting with {expected_prefix}, got {actual_name}")

# Round-trip check: what was written must read back with the configured row count
read_back = spark.read.parquet(OUT_DIR)
count_after = read_back.count()
if count_after != TOTAL_RECORDS:
    raise Exception(f"After write: expected {TOTAL_RECORDS} rows, got {count_after}")

# Final summary: success marker, where the dataset was written, and the schema of
# the data as read back from Parquet.
# The success marker is the check-mark emoji (U+2705); it must be stored as UTF-8,
# otherwise it renders as the three-character mojibake sequence "a-circumflex, oe, ellipsis".
print("✅ Setup complete.")
print(f"Dataset location: {OUT_DIR}")
print("Schema:")
read_back.printSchema()
