In [None]:
# Create the Spark Session
import pyspark
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.appName("Writing to Multiple Sinks")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .config(
        "spark.jars",
        f"/opt/spark/jars/spark-sql-kafka-0-10_2.12-{pyspark.__version__}.jar",
    )
    .config("spark.jars", "/opt/spark/jars/postgresql-42.7.4.jar")
    .config("spark.sql.shuffle.partitions", 8)
    .master("spark://spark-master:7077")
    .getOrCreate()
)

spark

24/12/02 17:25:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
# Create the kafka_df to read from kafka

kafka_df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "device-data")
    .option("startingOffsets", "earliest")
    .load()
)


In [3]:
from utils import flatten_df

flattened_df = flatten_df(kafka_df)

In [None]:
# Check the schema of the flattened_df, place a sample json file and change readStream to read
flattened_df.printSchema()

root
 |-- customerId: string (nullable = true)
 |-- eventId: string (nullable = true)
 |-- eventOffset: long (nullable = true)
 |-- eventPublisher: string (nullable = true)
 |-- eventTime: string (nullable = true)
 |-- deviceId: string (nullable = true)
 |-- measure: string (nullable = true)
 |-- status: string (nullable = true)
 |-- temperature: long (nullable = true)



In [None]:
# Python function to write to multiple sinks
def device_data_output(df, batch_id):
    print("Batch id: " + str(batch_id))

    # Write to parquet
    df.write.format("parquet").mode("append").save(
        "hdfs://namenode:9000/output/streaming/05/device_data"
    )

    # Write to JDBC Postgres
    (
        df.write.mode("append")
        .format("jdbc")
        .option("driver", "org.postgresql.Driver")
        .option("url", "jdbc:postgresql://postgres:5432/streaming_db")
        .option("dbtable", "device_data")
        .option("user", "postgres")
        .option("password", "postgres")
        .save()
    )

    # Diplay
    df.show()

In [None]:
# Running foreachBatch
# Write the output to Multiple Sinks

(
    flattened_df.writeStream.foreachBatch(device_data_output)
    # .trigger(processingTime='10 seconds')
    # .option("checkpointLocation", f"/home/jovyan/streaming_checkpoint_dir/{spark.sparkContext.appName.replace(' ', '_')}")
    .start()
    .awaitTermination()
)

24/12/02 17:25:35 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f1c18493-0d1d-4c6b-916e-03dfd933edf0. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/12/02 17:25:35 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/12/02 17:25:36 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


Batch id: 0


                                                                                

+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|customerId|             eventId|eventOffset|eventPublisher|           eventTime|deviceId|measure| status|temperature|
+----------+--------------------+-----------+--------------+--------------------+--------+-------+-------+-----------+
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D001|      C|  ERROR|         15|
|   CI00103|e3cb26d3-41b2-49a...|      10001|        device|2023-01-05 11:13:...|    D002|      C|SUCCESS|         16|
|   CI00108|aa90011f-3967-496...|      10003|        device|2023-01-05 11:13:...|    D004|      C|SUCCESS|         16|
|   CI00106|804e8fa3-307b-482...|      10005|        device|2023-01-05 11:13:...|    D002|      C|  ERROR|         30|
|   CI00106|804e8fa3-307b-482...|      10005|        device|2023-01-05 11:13:...|    D001|      C|STANDBY|         10|
|   CI00106|804e8fa3-307b-482...|      10005|   

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [7]:
spark.stop()