In [0]:

%run /Workspace/Users/kianhow2000@gmail.com/Databricks/01_Data_Engineer_Learning_Plan/Lab-Setup/common-utils

In [0]:
%python

### Setup:
### - Generates sales CSV files plus malformed CSV demo files in a Unity Catalog volume
### Creates:
### - 1. /Volumes/workspace/data_engineering_labs_<run_id>/v01/raw/sales-csv/000.csv..003.csv (1500 rows each)
### - 2. /Volumes/workspace/data_engineering_labs_<run_id>/v01/ops/csv_demo_files/malformed_example_1_data.csv
### - 3. /Volumes/workspace/data_engineering_labs_<run_id>/v01/ops/csv_demo_files/malformed_example_2_data.csv
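### - IMPORTANT: the first cell must %run your common-utils notebook by its absolute path
###   (adjust the path to your own workspace user folder), e.g.:
### `%run /Workspace/Users/kianhow2000@gmail.com/Databricks/01_Data_Engineer_Learning_Plan/Lab-Setup/common-utils`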


from pyspark.sql import functions as F
from pyspark.sql.window import Window
import json
import random

# -----------------------------
# Config
# -----------------------------
# Unity Catalog coordinates of the lab data: catalog, schema name prefix and volume.
CATALOG = "workspace"
SCHEMA_PREFIX = "data_engineering_labs"
VOLUME = "v01"

# get_run_id is not defined in this notebook; presumably it comes from the
# common-utils notebook loaded by the %run cell and yields a per-run suffix,
# falling back to "00" — TODO confirm.
RUN_ID = get_run_id(default="00")
SCHEMA = f"{SCHEMA_PREFIX}_{RUN_ID}"

# Dataset size: NUM_SALES_FILES files of ROWS_PER_FILE rows each.
ROWS_PER_FILE = 1500
NUM_SALES_FILES = 4
SEED = 42

# Seeds Python's `random` module in this process.
# NOTE(review): the F.rand() columns in generate_sales_df take no seed, and the
# items UDF presumably runs in separate worker processes, so generated values
# likely still differ between runs — confirm whether reproducibility is required.
random.seed(SEED)

banner(f"Setting up SALES CSV + malformed demos in {CATALOG}.{SCHEMA}")

# Helpers presumably provided by common-utils; the names suggest they create the
# catalog/schema/volume when missing — verify against common-utils.
ensure_catalog_and_schema(CATALOG, SCHEMA)
# The returned value is used below as the root path for all generated files.
VOLUME_ROOT = ensure_volume(CATALOG, SCHEMA, VOLUME)

# Output locations: generated sales files and the hand-crafted malformed demos.
RAW_SALES_DIR = f"{VOLUME_ROOT}/raw/sales-csv"
OPS_DEMO_DIR  = f"{VOLUME_ROOT}/ops/csv_demo_files"

dbutils.fs.mkdirs(RAW_SALES_DIR)
dbutils.fs.mkdirs(OPS_DEMO_DIR)

print("RAW sales dir:", RAW_SALES_DIR)
print("OPS demo dir :", OPS_DEMO_DIR)

# -----------------------------
# Helpers
# -----------------------------
# Pool of line-item records that random_items() samples from. All values are
# strings except `coupon`, which is None when no coupon applies. Keyword order
# is the key order of each record once serialized to JSON.
ITEM_TEMPLATES = [
    dict(
        coupon="NEWBED10",
        item_id="M_STAN_F",
        item_name="standard full mattress",
        item_revenue_in_usd="850.3",
        price="849",
    ),
    dict(
        coupon="SLEEP5",
        item_id="P_BASIC",
        item_name="basic pillow",
        item_revenue_in_usd="55.0",
        price="59",
    ),
    dict(
        coupon=None,
        item_id="B_FRAME_Q",
        item_name="queen bed frame",
        item_revenue_in_usd="399.0",
        price="399",
    ),
]

def random_items():
    """Return a JSON array string holding 1 to 3 distinct item templates.

    The count is drawn with ``random.randint`` and the members with
    ``random.sample``, so no template appears twice within one result.
    """
    how_many = random.randint(1, 3)
    chosen = random.sample(ITEM_TEMPLATES, how_many)
    return json.dumps(chosen)

# Zero-argument UDF producing a random items JSON string for each row.
# Spark assumes Python UDFs are deterministic unless told otherwise, which lets
# the optimizer eliminate or repeat invocations; flag this one as
# nondeterministic because its result comes from the `random` module.
random_items_udf = F.udf(random_items, "string").asNondeterministic()

def generate_sales_df(start_order_id: int, rows: int):
    """Build a synthetic sales DataFrame with ``rows`` consecutive order ids.

    Order ids run from ``start_order_id`` (inclusive) to
    ``start_order_id + rows`` (exclusive). Columns, in order: order_id, email,
    transactions_timestamp, total_item_quantity, purchase_revenue_in_usd,
    unique_items, items. The rand() calls take no seed.
    """
    order_id = F.col("order_id")

    # Kept as a formatted string (not a timestamp type) so a malformed demo
    # file can later replace a value with 'aaa'. The offset is a random whole
    # number of seconds in [0, 86399] added to the current timestamp.
    shifted_ts = F.expr("timestampadd(SECOND, cast(rand()*86400 as int), current_timestamp())")
    ts_text = F.date_format(shifted_ts, "yyyy-MM-dd HH:mm:ss")

    email = F.concat(F.lit("user"), order_id, F.lit("@example.com"))
    quantity = (F.rand() * 5 + 1).cast("int")        # integer in 1..5
    revenue = F.round(F.rand() * 1200 + 20, 2)       # 20.00 up to (not incl.) 1220.00
    distinct_items = (F.rand() * 3 + 1).cast("int")  # integer in 1..3

    orders = (
        spark.range(start_order_id, start_order_id + rows)
        .withColumnRenamed("id", "order_id")
    )
    return orders.select(
        order_id,
        email.alias("email"),
        ts_text.alias("transactions_timestamp"),
        quantity.alias("total_item_quantity"),
        revenue.alias("purchase_revenue_in_usd"),
        distinct_items.alias("unique_items"),
        random_items_udf().alias("items"),
    )

def write_single_csv(df, dest_csv_path: str):
    """
    Write ``df`` as exactly one CSV file (with header) at ``dest_csv_path``.

    The frame is coalesced to a single partition and written to a scratch
    directory next to the destination; the resulting part file is then moved to
    the requested filename and the scratch directory is removed.

    Args:
        df: Spark DataFrame to write.
        dest_csv_path: Full path of the CSV file to create. Anything already at
            that path is removed first.

    Raises:
        FileNotFoundError: If the write produced no ``part-*`` file.
    """
    # Strip only a trailing ".csv". A blanket str.replace would also rewrite
    # ".csv" inside directory names, and for a path without ".csv" it would make
    # the scratch directory identical to the destination (which is then deleted
    # before the move).
    suffix = ".csv"
    stem = dest_csv_path[:-len(suffix)] if dest_csv_path.endswith(suffix) else dest_csv_path
    tmp_dir = f"{stem}_tmp"

    try:
        (df.coalesce(1)
           .write.mode("overwrite")
           .option("header", "true")
           .csv(tmp_dir))

        part_files = [f.path for f in dbutils.fs.ls(tmp_dir) if f.name.startswith("part-")]
        if not part_files:
            raise FileNotFoundError(f"No part-* file found in {tmp_dir}")

        dbutils.fs.rm(dest_csv_path, True)                  # remove if exists
        dbutils.fs.mv(part_files[0], dest_csv_path, True)   # rename part -> desired name
    finally:
        # Always drop the scratch directory, even if the write or move failed.
        dbutils.fs.rm(tmp_dir, True)

def write_text_file(dest_path: str, text: str):
    """Replace whatever is at ``dest_path`` with a file containing ``text``.

    The path is first removed (recursively) and the text is then written with
    overwrite enabled.
    """
    fs = dbutils.fs
    fs.rm(dest_path, True)
    fs.put(dest_path, text, overwrite=True)

# -----------------------------
# 1) Create raw/sales-csv/000.csv..003.csv (1500 rows each)
# -----------------------------
# File k (zero-based) holds order ids starting at 1 + k * ROWS_PER_FILE, so the
# id ranges of the files are contiguous and never overlap.
for file_no in range(NUM_SALES_FILES):
    first_order_id = file_no * ROWS_PER_FILE + 1
    write_single_csv(
        generate_sales_df(first_order_id, ROWS_PER_FILE),
        f"{RAW_SALES_DIR}/{file_no:03d}.csv",
    )

created_names = [entry.name for entry in dbutils.fs.ls(RAW_SALES_DIR)]
print("✅ Sales CSV files created:", created_names)

# The malformed demo files are derived from the content of 000.csv.
df_000 = spark.read.csv(f"{RAW_SALES_DIR}/000.csv", header=True)

# -----------------------------
# 2) malformed_example_1_data.csv
#    Same as 000.csv, except FIRST ROW transactions_timestamp = 'aaa'
# -----------------------------
# Make a deterministic "first row" using row_number by order_id
# order_id was read from CSV as text, so cast it before ordering; row number 1
# is then the numerically smallest order_id.
first_row_window = Window.orderBy(F.col("order_id").cast("long"))
is_first_row = F.row_number().over(first_row_window) == 1

# Only the first row gets the unparseable timestamp; every other row is unchanged.
corrupted_ts = (
    F.when(is_first_row, F.lit("aaa"))
    .otherwise(F.col("transactions_timestamp"))
)
df_m1 = df_000.withColumn("transactions_timestamp", corrupted_ts)

m1_path = f"{OPS_DEMO_DIR}/malformed_example_1_data.csv"
write_single_csv(df_m1, m1_path)

# -----------------------------
# 3) malformed_example_2_data.csv
#    Header has only 6 columns (missing order_id),
#    but each record has 7 fields (includes order_id as extra first field)
# -----------------------------
# Header columns written to the file: deliberately omits order_id.
cols_6 = [
    "email",
    "transactions_timestamp",
    "total_item_quantity",
    "purchase_revenue_in_usd",
    "unique_items",
    "items"
]

# IMPORTANT: data rows will include order_id + the 6 cols = 7 fields per row
cols_7_for_data = ["order_id"] + cols_6

m2_path = f"{OPS_DEMO_DIR}/malformed_example_2_data.csv"
tmp_dir = f"{OPS_DEMO_DIR}/m2_tmp"

try:
    # Let Spark render the 7-field data lines (no header) into a scratch directory.
    (df_000.select(*cols_7_for_data)
       .coalesce(1)
       .write.mode("overwrite")
       .option("header", "false")
       .csv(tmp_dir))

    part_files = [f.path for f in dbutils.fs.ls(tmp_dir) if f.name.startswith("part-")]
    if not part_files:
        raise FileNotFoundError(f"No part-* file found in {tmp_dir}")
    part_file = part_files[0]

    # Pull the data lines (7-field CSV lines)
    lines = [r.value for r in spark.read.text(part_file).collect()]

    # Write a 6-column header (missing order_id) followed by the 7-field rows.
    # Joining header and rows in one list avoids a stray blank line when there
    # are no data rows.
    custom_header = ",".join(cols_6)
    write_text_file(m2_path, "\n".join([custom_header, *lines]) + "\n")
finally:
    # Remove the scratch directory even if one of the steps above failed.
    dbutils.fs.rm(tmp_dir, True)


# -----------------------------
# Verification
# -----------------------------
# The checks below print what was produced AND assert on it, so a broken setup
# stops here instead of reporting success.
print("\nRAW sales-csv directory listing:")
print([f.name for f in dbutils.fs.ls(RAW_SALES_DIR)])

print("\nOPS csv_demo_files directory listing:")
print([f.name for f in dbutils.fs.ls(OPS_DEMO_DIR)])

# Row counts for the raw files: each must hold exactly ROWS_PER_FILE records.
for i in range(NUM_SALES_FILES):
    p = f"{RAW_SALES_DIR}/{i:03d}.csv"
    c = spark.read.option("header","true").csv(p).count()
    print(f"{i:03d}.csv rows = {c}")
    assert c == ROWS_PER_FILE, f"{p}: expected {ROWS_PER_FILE} rows, found {c}"

# Malformed file 1: the row with the smallest order_id must carry the 'aaa' sentinel.
m1_first_ts = (spark.read.option("header","true").csv(m1_path)
               .orderBy(F.col("order_id").cast("long"))
               .select("transactions_timestamp").first()[0])
print("\nmalformed_example_1_data.csv first transactions_timestamp:", m1_first_ts)
assert m1_first_ts == "aaa", f"expected 'aaa' in first row of {m1_path}, found {m1_first_ts!r}"

# Malformed file 2: read the start of the file as text to confirm the header
# has only the 6 columns (order_id missing).
m2_head = dbutils.fs.head(m2_path, 200)
m2_header = m2_head.splitlines()[0]
print("\nmalformed_example_2_data.csv header line:")
print(m2_header)
assert m2_header.split(",") == cols_6, f"unexpected header in {m2_path}: {m2_header!r}"

print("\n✅ Setup complete.")
