In [0]:
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

@dlt.table(
    name="bronze_yelp_raw",
    comment="Raw data from Yelp volume including mixed files"
)
def bronze_yelp_raw():
    """Stream the raw Yelp JSON files from the landing volume.

    Files are read incrementally through the ``cloudFiles`` source with
    column-type inference enabled; no filtering or reshaping happens here.

    Returns:
        A streaming DataFrame over everything found under the volume path.
    """
    landing_path = "/Volumes/workspace/default/yelp-reviews/"

    # Configure the cloudFiles reader one option at a time.
    autoloader = spark.readStream.format("cloudFiles")
    autoloader = autoloader.option("cloudFiles.format", "json")
    autoloader = autoloader.option("cloudFiles.inferColumnTypes", "true")

    return autoloader.load(landing_path)



@dlt.table(name="silver_yelp_reviews")
@dlt.expect_or_drop("valid_review", "review_id IS NOT NULL")
def silver_yelp_reviews():
    """Select and type the review columns from ``bronze_yelp_raw``.

    This is the single definition of the table. Defining a second,
    undecorated function with the same name only rebinds the Python name
    and never registers with the pipeline, so the derived columns
    (``review_date``, ``processing_time``) are produced here instead. The
    original ``date`` column is kept alongside ``review_date`` so consumers
    that select it by name keep working.

    Returns:
        A streaming DataFrame with ``review_id``, ``user_id``,
        ``business_id``, ``stars`` (cast to double), ``text``, ``date``,
        ``review_date`` (``date`` parsed with ``to_timestamp``) and
        ``processing_time`` (``current_timestamp()`` at processing).
    """
    return (
        dlt.read_stream("bronze_yelp_raw")
        # Rows without a review_id are removed here, before the
        # "valid_review" expectation is evaluated, so the expectation acts
        # as a safeguard rather than the primary filter.
        # NOTE(review): presumably this is what separates review records
        # from the other record types in the mixed bronze feed - confirm.
        .filter(col("review_id").isNotNull())
        .select(
            col("review_id"),
            col("user_id"),
            col("business_id"),
            col("stars").cast("double"),
            col("text"),
            col("date"),
            to_timestamp(col("date")).alias("review_date"),
            current_timestamp().alias("processing_time"),
        )
    )


@dlt.table(
    name="gold_business_stats",
    comment="Business performance metrics"
)
def gold_business_stats():
    """Aggregate the silver reviews into rating metrics per business.

    Returns:
        A DataFrame grouped by ``business_id`` with ``avg_rating`` (average
        of ``stars``) and ``total_reviews`` (count of ``review_id``).
    """
    # NOTE(review): this aggregates over a streaming read with no watermark;
    # confirm that is intended rather than a batch read of the silver table.
    reviews = dlt.read_stream("silver_yelp_reviews")

    mean_stars = avg("stars").alias("avg_rating")
    review_count = count("review_id").alias("total_reviews")

    per_business = reviews.groupBy("business_id")
    return per_business.agg(mean_stars, review_count)