##### Complaint Generator

This notebook samples delivered orders and generates a synthetic customer complaint for each sampled order using Databricks AI SQL functions.

In [None]:
# Notebook parameters, read from the Databricks widgets of the same name.
_read_widget = dbutils.widgets.get

CATALOG = _read_widget("CATALOG")                       # catalog holding source and target tables
COMPLAINT_RATE = float(_read_widget("COMPLAINT_RATE"))  # compared against rand() when sampling orders
LLM_MODEL = _read_widget("LLM_MODEL")

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import random

# Complaint categories and their relative sampling weights, kept side by side
# so a category can never be listed without its weight (dict order is preserved).
_CATEGORY_WEIGHTS = {
    "delivery_delay": 0.40,
    "missing_items": 0.25,
    "food_quality": 0.20,
    "service_issue": 0.10,
    "other": 0.05,
}
CATEGORIES = list(_CATEGORY_WEIGHTS.keys())
WEIGHTS = list(_CATEGORY_WEIGHTS.values())

# Create a deterministic but varied category assignment.
# The seed comes from a cryptographic digest of order_id rather than Python's
# built-in hash(): str hashes are salted per interpreter process unless
# PYTHONHASHSEED is pinned, so hash() cannot guarantee that the same order
# gets the same category on every executor and every run.
@F.udf(returnType=StringType())
def assign_category(order_id: str) -> str:
    """Pick a complaint category for an order, stable for a given order_id.

    Args:
        order_id: Identifier of the order. Non-string values (including None)
            are converted with ``str()`` before hashing.

    Returns:
        One entry of ``CATEGORIES``, drawn according to ``WEIGHTS`` from a
        random generator seeded by the order_id digest.

    Note:
        The order_id -> category mapping differs from the one produced by the
        earlier ``hash()``-based seed.
    """
    # Imported here so the UDF does not rely on a notebook-level import.
    import hashlib

    digest = hashlib.sha256(str(order_id).encode("utf-8")).digest()
    # Fold the first 4 bytes into the same 31-bit seed range used previously.
    seed = int.from_bytes(digest[:4], "big") % (2**31)
    rng = random.Random(seed)
    return rng.choices(CATEGORIES, weights=WEIGHTS, k=1)[0]

In [None]:
# Ensure the complaints schema and the volume used for streaming checkpoints
# exist; both statements are no-ops when the object is already there.
for _ddl in (
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.complaints",
    f"CREATE VOLUME IF NOT EXISTS {CATALOG}.complaints.checkpoints",
):
    spark.sql(_ddl)

In [None]:
# Stream processing pipeline
# Samples COMPLAINT_RATE of both historical and new delivered orders, tags each
# sampled order with a complaint id, category and timestamp, and asks an LLM to
# write the complaint text.

# SQL expression that builds the per-row prompt from the category and order id.
_complaint_prompt_sql = """concat(
                'You are an upset customer writing a complaint about your food delivery. ',
                'Write a realistic 1-2 sentence complaint about: ', 
                complaint_category, 
                '. Order ID: ', 
                order_id, 
                '. Be specific and sound frustrated but realistic. Do not include greeting or signature.'
            )"""

# Honour the LLM_MODEL widget: previously it was read but never used, so the
# configured model was silently ignored. When it is empty, keep using ai_gen.
# NOTE(review): assumes LLM_MODEL names a model serving endpoint usable with
# ai_query — confirm against the job configuration.
_llm_endpoint = (LLM_MODEL or "").strip()
if "'" in _llm_endpoint or "\\" in _llm_endpoint:
    # The name is embedded in a SQL string literal below; refuse anything that
    # could terminate or escape that literal.
    raise ValueError(f"LLM_MODEL contains characters not allowed in an endpoint name: {LLM_MODEL!r}")

if _llm_endpoint:
    _complaint_text_sql = f"ai_query('{_llm_endpoint}', {_complaint_prompt_sql}) as complaint_text"
else:
    _complaint_text_sql = f"ai_gen({_complaint_prompt_sql}) as complaint_text"

complaints_stream = (
    spark.readStream
    .table(f"{CATALOG}.lakeflow.all_events")
    .filter("event_type = 'delivered'")
    .filter(F.rand() < COMPLAINT_RATE)  # Sample orders for complaints (same rate for historical and new)
    .withColumn("complaint_id", F.expr("uuid()"))
    .withColumn("complaint_category", assign_category(F.col("order_id")))
    .withColumn("ts", F.current_timestamp())  # time the complaint row is generated, not the delivery time
    .selectExpr(
        "complaint_id",
        "order_id",
        "ts",
        "complaint_category",
        _complaint_text_sql,
        "'llm_generator' as generated_by"
    )
)

In [None]:
# Make sure the destination table exists, with one column per field selected
# by the complaints stream.
_raw_complaints_ddl = f"""
CREATE TABLE IF NOT EXISTS {CATALOG}.complaints.raw_complaints (
  complaint_id STRING,
  order_id STRING,
  ts TIMESTAMP,
  complaint_category STRING,
  complaint_text STRING,
  generated_by STRING
)
"""
spark.sql(_raw_complaints_ddl)

In [None]:
# Write stream to raw_complaints table.
# The availableNow trigger processes the data available at start and then
# stops. Wait for the query so this cell only finishes once the run is done
# and any failure inside the stream is raised here instead of going unnoticed.
complaints_query = (
    complaints_stream.writeStream
    .option("checkpointLocation", f"/Volumes/{CATALOG}/complaints/checkpoints/complaint_generator")
    .trigger(availableNow=True)
    .toTable(f"{CATALOG}.complaints.raw_complaints")
)
complaints_query.awaitTermination()