In [0]:
#Libraries
from pyspark.sql.functions import col, from_json, row_number
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType, IntegerType, DateType
from pyspark.sql import DataFrame, Window
from delta.tables import DeltaTable

In [0]:
# Variables
# Source table; main() reads it as a stream.
bronze_table = "bronze.training_io.rt_marketing"
# Target Delta table; upsert() merges every micro-batch into it.
silver_table = "silver.training_io.rt_marketing"
# Handed to the stream writer as its checkpointLocation.
checkpoint_path = "/Volumes/silver/checkpoints/realtime"
# Columns identifying a record: used for deduplication and as the merge join keys.
key_columns = ["id"]
# Column compared between source and target to decide whether a matched row is updated.
timestamp_column = "kafka_ts"

In [0]:
# Functions
def flat_bronze_df(df: DataFrame) -> DataFrame:
    """
    Parse the JSON payload of bronze records and flatten it into columns.

    Args:
        df: DataFrame exposing a `timestamp` column and a `json_event` column
            (presumably the raw JSON text of the event).

    Returns:
        DataFrame with `kafka_ts` (the incoming `timestamp`) followed by one
        top-level column per field of the event schema, in schema order.
    """
    event_schema = StructType([
        StructField("id", IntegerType(), True),
        StructField("datetime", TimestampType(), True),
        StructField("duration", DoubleType(), True),
        StructField("title", StringType(), True),
        StructField("genres", StringType(), True),
        StructField("release_date", DateType(), True),
        StructField("movie_id", StringType(), True),
        StructField("user_id", StringType(), True),
    ])

    # Step 1: keep the source timestamp and parse the JSON into a struct column.
    parsed = df.select(
        col("timestamp").alias("kafka_ts"),
        from_json(col("json_event"), schema=event_schema).alias("parsed_json"),
    )

    # Step 2: promote every struct field to a top-level column, in schema order.
    payload_columns = [
        col(f"parsed_json.{field.name}") for field in event_schema.fields
    ]
    return parsed.select(col("kafka_ts"), *payload_columns)


def upsert(microBatchDF, batchId, silver_table: str, key_columns: list, timestamp_column: str) -> None:
    """
    Perform an SCD1 upsert (merge) of one micro-batch into a Delta table.

    Rows are matched on `key_columns`. A matched target row is overwritten with
    every column of the incoming row only when the incoming `timestamp_column`
    is strictly greater than the stored one; unmatched rows are inserted.

    Args:
        microBatchDF: Micro-batch DataFrame handed over by `foreachBatch`.
            NOTE(review): callers should pass at most one row per key; per the
            Delta Lake merge documentation, a merge can fail when several
            source rows match the same target row.
        batchId: Micro-batch id supplied by `foreachBatch` (not used here).
        silver_table: Name of the target Delta table.
        key_columns: Non-empty list of column names that identify a row.
        timestamp_column: Column compared to decide whether the source is newer.

    Raises:
        ValueError: If `key_columns` is empty, since no merge condition could
            be built from it.
    """
    if not key_columns:
        raise ValueError("key_columns must contain at least one column name")

    # Use the session attached to the micro-batch instead of a notebook-global
    # `spark`, so the function also works where that global is not in scope.
    silver_delta_table = DeltaTable.forName(microBatchDF.sparkSession, silver_table)

    join_condition = " AND ".join(
        f"target.{key} = source.{key}" for key in key_columns
    )
    update_set = {name: f"source.{name}" for name in microBatchDF.columns}

    (
        silver_delta_table.alias("target")
        .merge(microBatchDF.alias("source"), join_condition)
        .whenMatchedUpdate(
            # Only overwrite when the incoming record is strictly newer.
            condition=f"source.{timestamp_column} > target.{timestamp_column}",
            set=update_set,
        )
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
def _latest_per_key(batch_df, keys, ts_column):
    """Return `batch_df` reduced to the newest row (by `ts_column`) per key."""
    newest_first = Window.partitionBy(*keys).orderBy(col(ts_column).desc())
    return (
        batch_df.withColumn("_row_rank", row_number().over(newest_first))
        .filter(col("_row_rank") == 1)
        .drop("_row_rank")
    )


def main():
    """
    Stream the bronze table into the silver table with per-batch upserts.

    Reads `bronze_table` as a stream, flattens the JSON payload, and for every
    micro-batch keeps the newest row per key before merging it into
    `silver_table` through `upsert`.

    Returns:
        The started streaming query, so callers can monitor or await it.
    """
    # Read the data from the bronze table
    bronze_stream = spark.readStream.table(bronze_table)

    # Flatten the JSON data
    events = flat_bronze_df(bronze_stream)

    def upsert_batch(batch_df, batch_id):
        # Deduplicate inside the micro-batch rather than on the stream: a
        # streaming dropDuplicates(keys) without a watermark keeps state for
        # every key ever seen and discards all later events for that key, so
        # newer versions would never reach the merge.
        latest = _latest_per_key(batch_df, key_columns, timestamp_column)
        upsert(latest, batch_id, silver_table, key_columns, timestamp_column)

    # NOTE(review): the stream no longer contains a stateful deduplication
    # operator; a checkpoint written by the previous query shape may need to
    # be reset before restarting - confirm against the checkpoint in use.
    return (
        events.writeStream
        .foreachBatch(upsert_batch)
        .outputMode("append")
        .option("checkpointLocation", checkpoint_path)
        .trigger(processingTime="1 minute")
        .start()
    )


if __name__ == "__main__":
    main()
