In [None]:
# Create the Spark Session
import pyspark
from pyspark.sql import SparkSession

# Configure the session step by step, then create it (or reuse an existing one).
session_builder = SparkSession.builder.appName("Window Operations and Watermarks")
session_builder = session_builder.config(
    "spark.streaming.stopGracefullyOnShutdown", True
)
# Use 8 shuffle partitions for the aggregation stages of this job.
session_builder = session_builder.config("spark.sql.shuffle.partitions", 8)
# Run against the standalone cluster master rather than a local executor.
session_builder = session_builder.master("spark://spark-master:7077")
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary.
spark

In [None]:
# Streaming source: subscribe to the "words" topic on the Kafka broker.
kafka_reader = spark.readStream.format("kafka")
kafka_reader = kafka_reader.option("kafka.bootstrap.servers", "kafka:9092")
kafka_reader = kafka_reader.option("subscribe", "words")
# Start from the earliest available offsets when the query first starts.
kafka_reader = kafka_reader.option("startingOffsets", "earliest")
# Rate-limit each micro-batch to at most one offset.
kafka_reader = kafka_reader.option("maxOffsetsPerTrigger", "1")

kafka_df = kafka_reader.load()

In [None]:
# Convert binary to string value column
from pyspark.sql import functions as F

# Replace the `value` column in place with its string representation.
value_as_string = F.expr("cast(value as string)")
kafka_json_df = kafka_df.withColumn("value", value_as_string)

In [None]:
# DDL-style schema of the JSON payload carried in the Kafka `value` column.
# `event_time` is read as a plain string here.
json_schema = "event_time string, data string"

# Parse the JSON text in `value` into a struct column named `values_json`.
# `F.col` is used because only `functions as F` is imported in this notebook;
# a bare `col` is undefined and raises NameError on a fresh kernel.
# Per the from_json documentation, unparseable strings yield null.
json_df = kafka_json_df.withColumn(
    "values_json", F.from_json(F.col("value"), json_schema)
)

In [None]:
# Promote the two parsed struct fields to top-level columns and drop the rest.
required_fields = ["values_json.event_time", "values_json.data"]

flattened_df = json_df.select(*required_fields)

In [None]:
# Tokenise `data` on single spaces into an array column.
tokenized_df = flattened_df.withColumn("words", F.split("data", " "))

# Emit one output row per array element.
exploded_df = tokenized_df.withColumn("word", F.explode("words"))

# Cast `event_time` from string to timestamp.
words_df = exploded_df.withColumn(
    "event_time", F.col("event_time").cast("timestamp")
)

In [None]:
# Sanity check: print the schema of words_df to confirm the columns and types
# (e.g. that `event_time` is now a timestamp) before aggregating.
words_df.printSchema()

In [None]:
# Aggregate the words to generate count
from pyspark.sql.functions import count, lit, window

# Declare the event-time watermark with a 10-minute lateness threshold.
watermarked_df = words_df.withWatermark("event_time", "10 minutes")

# Sliding window: 10 minutes long, advancing every 5 minutes.
sliding_window = F.window("event_time", "10 minutes", "5 minutes")

# Count occurrences of each word within each window.
df_agg = watermarked_df.groupBy(sliding_window, "word").count()

In [None]:
# Flatten the `window` struct into explicit start/end columns for display.
output_exprs = [
    "window.start as start_time",
    "window.end as end_time",
    "word",
    "count",
]
df_final = df_agg.selectExpr(*output_exprs)

In [None]:
# Print the schema of the flattened result (start_time, end_time, word, count).
df_final.printSchema()

In [None]:
# Complete mode re-emits the entire result table on every trigger. Spark must
# therefore keep all aggregate state, so the watermark cannot be used to drop
# old windows in this mode.

# Keep a handle on the query so it can be stopped explicitly.
complete_query = (
    df_final.writeStream.format("console")
    .outputMode("complete")
    .trigger(processingTime="5 seconds")
    .start()
)

try:
    # Blocks until the query terminates or the cell is interrupted.
    complete_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the wait above; without an explicit
    # stop() the query would keep running on the cluster with no handle left
    # to stop it, alongside any query started by a later cell.
    complete_query.stop()

In [None]:
# Update mode writes only the rows whose aggregates changed since the last
# trigger.

# Keep a handle on the query so it can be stopped explicitly.
update_query = (
    df_final.writeStream.format("console")
    .outputMode("update")
    .trigger(processingTime="5 seconds")
    .start()
)

try:
    # Blocks until the query terminates or the cell is interrupted.
    update_query.awaitTermination()
finally:
    # Interrupting the cell only aborts the wait above; stop the query so it
    # does not keep running in the background after this cell ends.
    update_query.stop()

In [None]:
# Shut down the Spark session once the streaming demos are finished.
spark.stop()