In [0]:
# Read the requested row count from the "sample_size" notebook widget.
# Widget values arrive as strings; surrounding whitespace is stripped because
# the raw text is later embedded in table names.
sample_size = dbutils.widgets.get("sample_size").strip()

# Accept integer-like and float-like spellings (e.g. "1000", "1e3") and
# truncate to a whole number of rows. Raises ValueError on non-numeric input.
num_records = int(float(sample_size))

# sample_size is interpolated verbatim into `transactions_{sample_size}` in SQL
# statements and saveAsTable calls. Fail fast, with a clear message, on any
# value that would not form a plain unquoted identifier suffix (e.g. "1000.0",
# "-5", "1e+3") instead of surfacing a confusing SQL parse error later -- or
# letting arbitrary SQL text through.
if not all(ch.isascii() and (ch.isalnum() or ch == "_") for ch in sample_size):
    raise ValueError(
        f"sample_size {sample_size!r} cannot be used as a table-name suffix; "
        "use only ASCII letters, digits, or underscores (e.g. '1000')."
    )

In [0]:
# Idempotently create the bronze transactions table for this sample size.
# transaction_id (identity) and transaction_date (derived from
# transaction_time) are both GENERATED ALWAYS, so writers provide only
# transaction_time, amount, tax and description.
# Table properties switch on append-only mode, the change data feed and
# row tracking.
create_table_sql = f"""CREATE TABLE IF NOT EXISTS tests.bronze_source.transactions_{sample_size} (
    transaction_id      BIGINT GENERATED ALWAYS AS IDENTITY NOT NULL PRIMARY KEY,
    transaction_date    DATE GENERATED ALWAYS AS (CAST(DATE(transaction_time) AS DATE)) NOT NULL,
    transaction_time    TIMESTAMP NOT NULL,
    amount              DECIMAL(10,2),
    tax                 DECIMAL(3,2),
    description         STRING NOT NULL
)
TBLPROPERTIES (
    'delta.appendOnly' = true,
    'delta.enableChangeDataFeed' = true,
    'delta.enableRowTracking' = true
);"""
spark.sql(create_table_sql)


In [0]:
from pyspark.sql.functions import (
    col, rand, round, current_date, date_sub, expr, lit, md5, concat
)

# Generate a range dataframe [0..(num_records - 1)]: one row per synthetic transaction.
df = spark.range(num_records)

df = (
    df
      # Draw ONE random calendar day per row, 1-10 days before today, and keep
      # it in a temporary column. Year, month and day must all come from the
      # same date: mixing the year/month of "yesterday" with the day-of-month
      # of an independently shifted date produces future dates near month
      # boundaries and impossible ones (e.g. Feb 31), which become NULL or an
      # error and violate the NOT NULL constraint on transaction_time.
      .withColumn(
          "_txn_day",
          expr("date_sub(current_date(), CAST(FLOOR(RAND() * 10) + 1 AS INT))")
      )
      # Attach a random time of day to that date.
      .withColumn(
          "transaction_time",
          expr("""
            make_timestamp(
              year(_txn_day),
              month(_txn_day),
              day(_txn_day),
              cast(rand() * 24 as int),  -- hour: 0-23
              cast(rand() * 60 as int),  -- minute: 0-59
              cast(rand() * 60 as int)   -- second: 0-59
            )
          """)
      )
      # Random value in [0, 1000) rounded to 2 places; feeds both the amount
      # and the description hash.
      .withColumn("random_num", round(rand() * 1000, 2))
      .withColumn("amount", col("random_num").cast("decimal(10,2)"))
      # Flat 5% tax for every row.
      .withColumn("tax", lit(0.05).cast("decimal(3,2)"))
      # Description is the MD5 hex digest of the random number's string form
      # (explicit cast instead of relying on implicit coercion via concat).
      .withColumn("description", md5(col("random_num").cast("string")))
      # Drop helper columns; transaction_id and transaction_date are
      # generated by the table itself.
      .drop("id", "_txn_day", "random_num")
)

# Append the generated rows to the per-sample-size bronze table.
df.write.mode("append").saveAsTable(f"tests.bronze_source.transactions_{sample_size}")

In [0]:
# Run Delta OPTIMIZE against the table that was just appended to.
optimize_sql = f"OPTIMIZE tests.bronze_source.transactions_{sample_size}"
spark.sql(optimize_sql)