In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Explicit schema for the incoming review JSON records.
# Every field is declared nullable; `stars` is a double and `date` is kept as
# a raw string at read time.
streaming_schema = (
    StructType()
    .add("review_id", StringType(), True)
    .add("user_id", StringType(), True)
    .add("business_id", StringType(), True)
    .add("stars", DoubleType(), True)
    .add("date", StringType(), True)
    .add("text", StringType(), True)
)

In [0]:
# --- Ingest ------------------------------------------------------------------
# Incrementally read review JSON files from the volume, applying the explicit
# `streaming_schema` rather than inferring one.
streaming_reviews = (
    spark.readStream
    .schema(streaming_schema)
    .option("recursiveFileLookup", "true")  # also pick up files in nested sub-directories
    .json("/Volumes/workspace/default/yelp-reviews/")
)

# --- Transform ---------------------------------------------------------------
# Drop records without a review_id, stamp each row with the time it was
# processed, and replace the string `date` column with a parsed timestamp.
processed_stream = (
    streaming_reviews
    .filter(col("review_id").isNotNull())
    .withColumn("processing_time", current_timestamp())
    .withColumn("date", to_timestamp(col("date")))
)

# --- Load --------------------------------------------------------------------
# Append the processed rows to a Delta table. The availableNow trigger
# processes whatever input is currently available and then stops the query.
# NOTE(review): the checkpoint directory sits inside the source path that is
# read recursively; this appears to rely on the file source skipping
# underscore-prefixed paths -- confirm, or move the checkpoint outside the
# input directory (moving it would restart ingestion from scratch).
query = (
    processed_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/workspace/default/yelp-reviews/_checkpoints/reviews_only_v5")
    .trigger(availableNow=True)
    .toTable("workspace.default.streaming_reviews")
)

# toTable() starts the query in the background and returns immediately.
# Block until the availableNow run has finished so that later cells read a
# fully written table (and so that any streaming failure is raised here
# instead of being silently missed during "Run All").
query.awaitTermination()

In [0]:
# Show the contents of the Delta table that the streaming query writes to.
display(spark.read.table("workspace.default.streaming_reviews"))

In [0]:
# Per-business rating summary over the ingested reviews: mean star rating and
# number of reviews for each business_id, best-rated first.
# Ties on avg_rating are broken by review count and then business_id so the
# ranking is stable across runs; ordering by avg_rating alone leaves tied
# rows in arbitrary order.
display(spark.sql("""
  SELECT business_id, avg(stars) AS avg_rating, count(*) AS total_reviews
  FROM workspace.default.streaming_reviews
  GROUP BY business_id
  ORDER BY avg_rating DESC, total_reviews DESC, business_id
"""))