In [0]:
# Libraries
from pyspark.sql import DataFrame
from pyspark.sql.functions import sum

In [0]:
# Pipeline configuration
# Source: fully qualified name of the silver table that is read as a stream.
silver_table = "silver.training_io.rt_marketing"
# Sink: fully qualified name of the gold table (same name the %sql cell creates).
gold_table = "gold.training_io.rt_marketing"
# Location passed as `checkpointLocation` to the streaming write.
checkpoint_path = "/Volumes/gold/checkpoints/realtime"

In [0]:
%sql
-- Create the gold target table up front; IF NOT EXISTS makes re-running this cell a no-op.
-- The table name must stay in sync with the `gold_table` variable used by the Python cells.
CREATE TABLE IF NOT EXISTS gold.training_io.rt_marketing
(
  -- movie_id and title are the grouping columns of the aggregation
  movie_id STRING,
  title STRING,
  -- sum_duration receives sum(duration) per (movie_id, title)
  sum_duration DOUBLE
)
USING DELTA

In [0]:
def read_silver_table() -> DataFrame:
    """
    Opens the silver Delta table as a streaming source.

    Relies on two names that are not defined in this function: the `spark`
    session (not defined anywhere in this file) and the module-level
    `silver_table` table name.

    Returns:
        DataFrame: streaming DataFrame over `silver_table`.
    """
    delta_reader = spark.readStream.format("delta")
    # NOTE(review): Delta Lake documents `ignoreChanges` as letting the stream
    # continue past commits that rewrite existing files, at the cost of possibly
    # re-emitting rows from those files. Confirm that a downstream sum tolerates
    # re-emitted rows, or whether `skipChangeCommits` is the intended option.
    delta_reader = delta_reader.option("ignoreChanges", "true")
    return delta_reader.table(silver_table)

def aggregate_duration(df: DataFrame) -> DataFrame:
    """
    Computes the summed `duration` for every (movie_id, title) pair.

    Args:
        df: DataFrame providing the `movie_id`, `title` and `duration` columns.
    Returns:
        DataFrame with columns `movie_id`, `title` and `sum_duration`.
    """
    per_movie = df.groupBy("movie_id", "title")
    # `sum` is pyspark.sql.functions.sum (imported at the top of the file),
    # not the Python builtin.
    total_duration = sum("duration").alias("sum_duration")
    return per_movie.agg(total_duration)

def write_to_gold_table(df: DataFrame) -> None:
    """
    Starts a streaming write of the aggregated DataFrame into the gold Delta table.

    The query uses the "complete" output mode, checkpoints to the module-level
    `checkpoint_path`, and is triggered on a 1-minute processing-time interval.
    The target table name comes from the module-level `gold_table`.

    Args:
        df: Streaming DataFrame to write (the aggregated result).
    Returns:
        None. The StreamingQuery handle returned when the query starts is not
        kept, so callers cannot await or stop the query through this function.
    """
    (
        df.writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", checkpoint_path)
        .trigger(processingTime="1 minute")
        # toTable() is the documented DataStreamWriter method that starts the
        # query against a named table; `.table()` is the reader-side API and is
        # not part of the public PySpark DataStreamWriter interface.
        .toTable(gold_table)
    )


In [0]:
def main():
    """Runs the pipeline: stream from silver, aggregate, start the gold write."""
    # Each stage gets its own name so the streaming source and the aggregated
    # result are never confused.
    silver_stream = read_silver_table()
    duration_totals = aggregate_duration(silver_stream)
    write_to_gold_table(duration_totals)

# Start the pipeline only when this code runs as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()