In [0]:
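# Databricks notebook cell
# ---------------------------------------------------------------
# Generate & append Delta-format transaction data
#  • Creates default.transactions if it does not exist
#    (identity id column, change data feed, append-only, CLUSTER BY AUTO)
#  • Appends `init_rows` random transactions dated within the last `days_back` days
#  • build_transactions_df(n, None) can also build a "yesterday"-only batch
#  NOTE: earlier notes mentioned one folder per workspace user and
#        partitioning by transaction_date; neither is done in this cell.
# ---------------------------------------------------------------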

from databricks.sdk import WorkspaceClient      # Databricks SDK ≥0.18
from pyspark.sql import functions as F, DataFrame
import re

# ─── CONFIG ────────────────────────────────────────────────────
# Both values are read by the initial-load statement further down in this cell.
init_rows   = 1_000_000                       # rows generated for the initial load (earlier setting: 100K)
days_back   = 90                           # dates are drawn from the last N days, today included (earlier setting: 180)
# ───────────────────────────────────────────────────────────────


def build_transactions_df(
    n_rows: int,
    days_back: int | None,
    seed: int | None = None,
) -> DataFrame:
    """
    Create a DataFrame of random transactions.

    Output columns, in order:
      • shop_id           int, uniform in 1..15
      • transaction_date  date (see `days_back`)
      • transaction_time  timestamp, midnight of transaction_date plus
                          0..86 399 seconds
      • amount            decimal(10,2), 0.00..1000.00
      • tax               decimal(3,2), constant 0.05
      • description       md5 hex digest of shop_id rendered as a string
    The `id` column produced by spark.range is dropped, so the result can
    be appended to a table whose `id` is generated by the table itself.

    Parameters
    ----------
    n_rows : int
        Number of rows to generate.
    days_back : int | None
        • None → every row is dated yesterday.
        • N    → each row gets current_date minus a random offset in
                 0..N-1 days, i.e. today is included. Rows dated today can
                 therefore carry a transaction_time later than "now".
        • 0    → every row is dated today.
    seed : int | None, default None
        Optional base seed for the random columns. Each random column uses
        its own derived seed (seed, seed+1, ...) so the columns are not
        correlated with each other. With None the data differs on every
        call, exactly as before this parameter existed.
        NOTE(review): Spark's rand() is reproducible only while the
        partitioning of the underlying range stays the same — confirm
        if strict reproducibility across clusters is required.

    Raises
    ------
    ValueError
        If `days_back` is negative (it would silently produce future dates).
    """
    if days_back is not None and days_back < 0:
        raise ValueError(f"days_back must be >= 0 or None, got {days_back}")

    # One independent seed per random column; all None when unseeded.
    if seed is None:
        shop_seed = date_seed = time_seed = amount_seed = None
    else:
        shop_seed, date_seed, time_seed, amount_seed = (seed + k for k in range(4))

    df = spark.range(n_rows).withColumn(
        "shop_id", (F.rand(shop_seed) * 15).cast("int") + 1
    )

    # Pick the calendar day of each transaction.
    if days_back is None:
        day_offset = F.lit(1)  # fixed: yesterday
    else:
        day_offset = (F.rand(date_seed) * days_back).cast("int")
    df = df.withColumn(
        "transaction_date", F.date_sub(F.current_date(), day_offset)
    )

    # Random second-of-day lives in a temporary column so that it can be
    # seeded through the Column API and then fed to SQL timestampadd.
    df = (
        df.withColumn(
            "_seconds_into_day", (F.rand(time_seed) * 86400).cast("int")
        )
        .withColumn(
            "transaction_time",
            F.expr(
                "timestampadd(SECOND, _seconds_into_day, "
                "cast(transaction_date as timestamp))"
            ),
        )
        .withColumn(
            "amount",
            F.round(F.rand(amount_seed) * 1000, 2).cast("decimal(10,2)"),
        )
        .withColumn("tax", F.lit(0.05).cast("decimal(3,2)"))
        .withColumn("description", F.md5(F.col("shop_id").cast("string")))
        .drop("id", "_seconds_into_day")
    )

    return df
# ───────────────────────────────────────────────────────────────

# ─── CREATE TABLE IF NOT EXISTS ────────────────────────────────
# DDL for the target table. `id` is filled in by the table itself
# (GENERATED ALWAYS AS IDENTITY), which is why the generated DataFrame
# carries no `id` column. The table properties switch on the change data
# feed and mark the table append-only; CLUSTER BY AUTO leaves the choice
# of clustering keys to Databricks.
_transactions_ddl = """
CREATE TABLE IF NOT EXISTS default.transactions (
    id BIGINT GENERATED ALWAYS AS IDENTITY,
    shop_id INT,
    transaction_date DATE,
    transaction_time TIMESTAMP,
    amount DECIMAL(10,2),
    tax DECIMAL(3,2),
    description STRING
)
TBLPROPERTIES (
    delta.enableChangeDataFeed = true,
    delta.appendOnly = true
)
CLUSTER BY AUTO;
"""
# IF NOT EXISTS makes this a no-op when the table is already there.
spark.sql(_transactions_ddl)

# ─── INITIAL LOAD ──────────────────────────────────────────────

# ① Initial backfill: `init_rows` random transactions spread over the
#    last `days_back` days. The write mode is "append", so every re-run
#    of this cell adds another batch on top of what is already stored.
initial_batch = build_transactions_df(init_rows, days_back=days_back)
initial_batch.write.mode("append").saveAsTable("default.transactions")

In [0]:
%sql
-- Run OPTIMIZE on the table after the bulk append above.
-- NOTE(review): on a Delta table this rewrites data files (compaction and,
-- for clustered tables, clustering) — confirm it is wanted on every run.
OPTIMIZE default.transactions;

In [0]:
%sql
-- Point lookup of a single row by its identity value.
-- NOTE(review): identity values are unique but presumably not guaranteed
-- to be consecutive, so id = 50000 may return no row — confirm.
SELECT *
FROM default.transactions
WHERE id = 50000;

In [0]:
%sql
-- Same point lookup as the previous cell, repeated verbatim.
-- NOTE(review): presumably re-run on purpose to compare timings
-- (e.g. first vs. cached execution) — confirm, or remove the duplicate.
SELECT
  *
FROM
  default.transactions
WHERE
  id = 50000;