# Labs Data Generator

In [0]:
%python
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("schema", "")
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")
spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}")
spark.sql(f"USE {catalog}.{schema}")

In [0]:
%pip install dbldatagen
# Installs the dbldatagen package that the data-generation cell imports as `dg`.
# NOTE(review): on Databricks a %pip install is documented to reset the
# notebook's Python state, so variables assigned in earlier cells may be lost
# after this cell runs — confirm, and consider moving this cell to the top.

In [0]:
import dbldatagen as dg
import pyspark.sql.functions as F

# Sizing of the synthetic data set: number of Spark partitions used for
# generation, total rows, and the number of distinct customer_id values.
partitions_requested = 32
data_rows = 10 * 1000 * 1000

uniqueCustomers = 10 * 1000000

# Declarative dbldatagen specification for a synthetic "customers" data set.
# Each withColumn call adds one generated column; nothing is materialised
# until build() is called at the end.
dataspec = (
    dg.DataGenerator(spark, rows=data_rows, partitions=partitions_requested)
      .withColumn("customer_id","long", uniqueValues=uniqueCustomers)
      # Template-generated text columns; percentNulls=0.01 requests 1% nulls.
      .withColumn("name", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
      .withColumn("alias", percentNulls=0.01, template=r'\\w \\w|\\w a. \\w')
      .withColumn("payment_instrument_type", values=['paypal', 'Visa', 'Mastercard',
                  'American Express', 'discover', 'branded visa', 'branded mastercard'],
                  random=True, distribution="normal")
      # Intermediate integer in [0, 9999] derived from customer_id with
      # baseColumnType="hash"; omit=True keeps it out of the final output.
      .withColumn("int_payment_instrument", "int",  minValue=0000, maxValue=9999,
                  baseColumn="customer_id", baseColumnType="hash", omit=True)
      .withColumn("payment_instrument",
                  expr="format_number(int_payment_instrument, '**** ****** *####')",
                  baseColumn="int_payment_instrument")
      .withColumn("email", template=r'\\w.\\w@\\w.com|\\w-\\w@\\w')
      .withColumn("email2", template=r'\\w.\\w@\\w.com')
      .withColumn("ip_address", template=r'\\n.\\n.\\n.\\n')
      # Uses `baseColumn` (not `base_column`) so the dependency on the two
      # payment columns is declared with the same documented option name that
      # the other columns in this spec use.
      .withColumn("md5_payment_instrument",
                  expr="md5(concat(payment_instrument_type, ':', payment_instrument))",
                  baseColumn=['payment_instrument_type', 'payment_instrument'])
      .withColumn("customer_notes", text=dg.ILText(words=(1,8)))
      # Both timestamps are evaluated with now() when the data is generated.
      .withColumn("created_ts", "timestamp", expr="now()")
      .withColumn("modified_ts", "timestamp", expr="now()")
      # Constant marker column carrying the literal string 'original data'.
      .withColumn("memo", expr="'original data'")
      )
# Materialise the specification as a Spark DataFrame.
df1 = dataspec.build()

In [0]:
# Re-read the widget values here instead of relying on the `catalog` / `schema`
# variables assigned in the first cell: the `%pip install` cell that runs in
# between is documented to reset the notebook's Python state on Databricks,
# which would leave those names undefined on a top-to-bottom run. Widget
# values are notebook-level and remain available.
catalog = dbutils.widgets.get("catalog")
schema = dbutils.widgets.get("schema")

# Volume that receives the generated customer data as JSON files.
volume_name = "customers_lz"
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog}.{schema}.{volume_name}")

# mode("overwrite") replaces any output left at this path by a previous run.
df1.write.mode("overwrite").json(f"/Volumes/{catalog}/{schema}/{volume_name}")