In [0]:
%pip install faker
# Faker is needed by the data-generation cell below (`from faker import Faker`).
# Update the `catalog` and `schema` values in the following cell to match the
# catalog and schema that you used for the pipeline in step 1.

In [0]:
# Target Unity Catalog location for the generated files.
# `dbName` and `db` are aliases bound to the same value as `schema`.
catalog = "workspace"
schema = dbName = db = "default"

# Select the catalog/schema for this session and make sure the `cdc` volume exists.
spark.sql(f'USE CATALOG `{catalog}`')
spark.sql(f'USE SCHEMA `{schema}`')
spark.sql(f'CREATE VOLUME IF NOT EXISTS `{catalog}`.`{db}`.`cdc`')
volume_folder = f"/Volumes/{catalog}/{db}/cdc"

# Generate the data set only when the `customers` folder cannot be listed.
try:
  dbutils.fs.ls(volume_folder+"/customers")
except Exception:
  # `Exception` instead of a bare `except:` so KeyboardInterrupt / SystemExit
  # still propagate.
  # NOTE(review): any other listing failure (not just "folder missing") also
  # lands here and triggers a regeneration with mode("overwrite") - narrow this
  # further if the exact dbutils error type is known.
  print(f"folder doesn't exist, generating the data under {volume_folder}...")
  import random
  import uuid
  from collections import OrderedDict

  from faker import Faker
  from pyspark.sql import functions as F

  fake = Faker()

  def _random_udf(fn):
    """Wrap `fn` in a Spark UDF flagged as non-deterministic.

    Spark treats UDFs as deterministic by default and may then eliminate or
    duplicate invocations; every generator below returns a new random value
    on each call, so they are marked accordingly.
    """
    return F.udf(fn).asNondeterministic()

  fake_firstname = _random_udf(fake.first_name)
  fake_lastname = _random_udf(fake.last_name)
  fake_email = _random_udf(fake.ascii_company_email)
  # Timestamps are emitted as strings formatted "%m-%d-%Y %H:%M:%S".
  fake_date = _random_udf(lambda:fake.date_time_this_month().strftime("%m-%d-%Y %H:%M:%S"))
  fake_address = _random_udf(fake.address)
  # Operation -> weight mapping handed to Faker's `random_elements`; the `None`
  # entry yields rows that carry no operation value.
  operations = OrderedDict([("APPEND", 0.5),("DELETE", 0.1),("UPDATE", 0.3),(None, 0.01)])
  fake_operation = _random_udf(lambda:fake.random_elements(elements=operations, length=1)[0])
  # Roughly 2% of the rows get a null id instead of a UUID string.
  fake_id = _random_udf(lambda: str(uuid.uuid4()) if random.uniform(0, 1) < 0.98 else None)

  # 100,000 rows spread over 100 partitions; the numeric `id` produced by
  # `spark.range` is replaced by the generated id.
  df = (
    spark.range(0, 100000).repartition(100)
    .withColumn("id", fake_id())
    .withColumn("firstname", fake_firstname())
    .withColumn("lastname", fake_lastname())
    .withColumn("email", fake_email())
    .withColumn("address", fake_address())
    .withColumn("operation", fake_operation())
  )
  df_customers = df.withColumn("operation_date", fake_date())
  # The frame is already split into 100 partitions above and only narrow
  # `withColumn` steps follow, so no second repartition (extra shuffle) is
  # needed before writing.
  df_customers.write.format("json").mode("overwrite").save(volume_folder+"/customers")

In [0]:
# Catalog and schema that hold the `cdc` volume.
catalog, schema = "workspace", "default"

# Load the JSON files from the customers folder and render them in the notebook.
display(
    spark.read.format("json").load(f"/Volumes/{catalog}/{schema}/cdc/customers")
)

In [0]:
from pyspark.sql import functions as F

# Location of the customers CDC folder. Built once from `catalog` / `schema`
# (same defaults as the other cells) and reused for both the read and the
# write, instead of repeating the hard-coded path literal.
catalog = "workspace"
schema = "default"
path = f"/Volumes/{catalog}/{schema}/cdc/customers"

# 1. Sample data: take a ~30% random sample of the existing records, capped at
#    2000 rows, keeping only the customer attribute columns.
existed = (
    spark.read.json(path)
    .sample(0.3)
    .limit(2000)
    .select("id", "firstname", "lastname", "email", "address")
)

# 2. Update data: turn each sampled row into a change record.
updates_df = (
    existed
    # Temporary random column used only to pick the operation.
    .withColumn("operation_type_rand", F.rand())
    # Values above 0.3 become UPDATE, the rest DELETE.
    .withColumn(
        "operation",
        F.expr("CASE WHEN operation_type_rand > 0.3 THEN 'UPDATE' ELSE 'DELETE' END"),
    )
    # Only UPDATE rows get a new first name; DELETE rows keep the original one.
    .withColumn(
        "firstname",
        F.expr("CASE WHEN operation = 'UPDATE' THEN 'Updated_Name' ELSE firstname END"),
    )
    # Address and email are overwritten on every row, whatever the operation.
    .withColumn("address", F.lit("Updated Address"))
    .withColumn("email", F.lit("test@email.com"))
    # NOTE(review): this is a timestamp column, whereas the generator cell
    # writes `operation_date` as a "%m-%d-%Y %H:%M:%S" string - confirm that the
    # downstream consumer handles both representations.
    .withColumn("operation_date", F.current_timestamp())
    .withColumn("_rescued_data", F.lit(None))
    .drop("operation_type_rand")
)

# 3. Save file: append the change records to the same folder they were sampled from.
updates_df.write.format("json").mode("append").save(path)