In [None]:
# Create the Spark Session
import pyspark
from pyspark.sql import SparkSession

# Path of the Kafka SQL connector jar; the file name embeds the installed
# PySpark version.
kafka_connector_jar = (
    f"/opt/spark/jars/spark-sql-kafka-0-10_2.12-{pyspark.__version__}.jar"
)

session_builder = SparkSession.builder.appName("Triggers in Spark Streaming")
# Request a graceful streaming shutdown so in-flight work can finish when the
# job is stopped manually.
# NOTE(review): this key is documented for the legacy DStream API; confirm it
# has any effect on Structured Streaming queries.
session_builder = session_builder.config(
    "spark.streaming.stopGracefullyOnShutdown", True
)
# Make the Kafka source connector jar available to the application.
session_builder = session_builder.config("spark.jars", kafka_connector_jar)
# Use 8 partitions for shuffles.
session_builder = session_builder.config("spark.sql.shuffle.partitions", 8)
# Connect to the cluster master at spark-master:7077.
session_builder = session_builder.master("spark://spark-master:7077")

spark = session_builder.getOrCreate()

# Last expression of the cell: display the session.
spark

24/12/02 17:14:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# Streaming source: subscribe to the "device-data" Kafka topic on the broker
# at kafka:9092, starting from the earliest offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "device-data",
    "startingOffsets": "earliest",
    # "maxOffsetsPerTrigger": "1",  # uncomment to limit the number of records per micro-batch
}

kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [3]:
# flatten_df is a project-local helper imported from utils; its implementation
# is not visible in this notebook.
# NOTE(review): presumably it parses the Kafka payload into one column per
# field (the memory-sink output further down shows columns such as customerId,
# eventId and temperature) — confirm against utils.flatten_df.
from utils import flatten_df

flattened_df = flatten_df(kafka_df)

In [None]:
# AVAILABLE-NOW TRIGGER
# The query is started with trigger(availableNow=True) and the cell blocks on
# awaitTermination() until the query ends.

memory_sink_writer = flattened_df.writeStream.queryName("kafka_table")
# Write to the in-memory sink in append mode; the query name above is the
# table name used when querying the results with spark.sql.
memory_sink_writer = memory_sink_writer.format("memory")
memory_sink_writer = memory_sink_writer.outputMode("append")
memory_sink_writer = memory_sink_writer.trigger(availableNow=True)
# No checkpointLocation is configured here. To set one, uncomment:
# memory_sink_writer = memory_sink_writer.option("checkpointLocation", f"/home/jovyan/streaming_checkpoint_dir/{spark.sparkContext.appName.replace(' ', '_')}")

memory_sink_query = memory_sink_writer.start()
memory_sink_query.awaitTermination()

24/12/02 17:14:32 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-3b422c91-7fdd-413c-83ea-a08345e74b8b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/02 17:14:32 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/02 17:14:33 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

In [5]:
# Read back what the "kafka_table" streaming query wrote to the memory sink
# and print it with show().
memory_sink_rows = spark.sql("select * from kafka_table")
memory_sink_rows.show()

[Stage 1:>                                                          (0 + 1) / 1]

+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D002|      C|SUCCESS|         16|
|   CI00108|aa90011f-3967-496...|      10003|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|         16|
|   CI00106|804e8fa3-307b-482...|      10005|        device|2023-01-05 11:13:...|    D002|      C|  ERROR|         30|
|   CI00106|804e8fa3-307b-482...|      10005|        device|2023-01-05 11:13:...|    D001|      C|STANDBY|         10|
|   CI00106|804e8fa3-307b-482...|      10005|   

                                                                                

In [6]:
# Stop the Spark session created in the first cell.
spark.stop()