In [17]:
# Create the Spark Session
import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession 
    .builder 
    .appName("Streaming from Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config('spark.jars.packages', f'org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}')
    .config("spark.sql.shuffle.partitions", 4)
    .master("spark://spark-master:7077") 
    .getOrCreate()
)

spark

In [18]:
# Create the kafka_df to read from kafka

kafka_df = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "device-data")
    .option("startingOffsets", "earliest")
    .load()
)

kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [19]:
# Parse value from binay to string into kafka_json_df
from pyspark.sql import functions as F

kafka_json_df = kafka_df.withColumn("value", F.expr("cast(value as string)"))

In [20]:
# Schema of the Pyaload

from pyspark.sql.types import StringType, StructField, StructType, ArrayType, LongType

json_schema = (
    StructType(
    [StructField('customerId', StringType(), True), 
    StructField('data', StructType(
        [StructField('devices', 
                     ArrayType(StructType([ 
                        StructField('deviceId', StringType(), True), 
                        StructField('measure', StringType(), True), 
                        StructField('status', StringType(), True), 
                        StructField('temperature', LongType(), True)
                    ]), True), True)
        ]), True), 
    StructField('eventId', StringType(), True), 
    StructField('eventOffset', LongType(), True), 
    StructField('eventPublisher', StringType(), True), 
    StructField('eventTime', StringType(), True)
    ])
)

In [21]:
streaming_df = kafka_json_df.withColumn("values_json", F.from_json(F.col("value"), json_schema)).selectExpr("values_json.*")
streaming_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)



In [22]:
# Lets explode the data as devices contains list/array of device reading
exploded_df = streaming_df.withColumn("data_devices", F.explode("data.devices"))
exploded_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- devices: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- deviceId: string (nullable = true)
 |    |    |    |-- measure: string (nullable = true)
 |    |    |    |-- status: string (nullable = true)
 |    |    |    |-- temperature: long (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- data_devices: struct (nullable = true)
 |    |-- deviceId: string (nullable = true)
 |    |-- measure: string (nullable = true)
 |    |-- status: string (nullable = true)
 |    |-- temperature: long (nullable = true)



In [23]:
# Flatten the exploded df
flattened_df = (
    exploded_df
    .drop("data")
    .withColumn("deviceId", F.col("data_devices.deviceId"))
    .withColumn("measure", F.col("data_devices.measure"))
    .withColumn("status", F.col("data_devices.status"))
    .withColumn("temperature", F.col("data_devices.temperature"))
    .drop("data_devices")
)
flattened_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [24]:
# Write the output to console sink to check the output

(flattened_df
 .writeStream
 .format("console")
 .outputMode("append")
 .option("checkpointLocation", f"/home/jovyan/streaming_checkpoint_dir/{spark.sparkContext.appName.replace(' ', '_')}")
 .start()
 .awaitTermination())

24/11/16 12:48:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/11/16 12:48:12 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
|customerId|eventId|eventOffset|eventPublisher|eventTime|deviceId|measure|status|temperature|
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
|customerId|eventId|eventOffset|eventPublisher|eventTime|deviceId|measure|status|temperature|
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+

-------------------------------------------
Batch: 2
-------------------------------------------
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
|customerId|eventId|eventOffset|eventPublisher|eventTime|deviceId|measure|status|temperature|
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+
+----------+-------+-----------+--------------+---------+--------+-------+------+-----------+

-------------------------------------------
Batch: 3

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [25]:
spark.stop()