In [None]:
# Build (or reuse) the SparkSession that every later cell refers to as `spark`
from pyspark.sql import SparkSession

# Name the application and point it at the standalone cluster master
session_builder = SparkSession.builder.appName("Streaming Process Files").master(
    "spark://spark-master:7077"
)

# Streaming-related settings applied when the session is created
session_builder = session_builder.config(
    "spark.streaming.stopGracefullyOnShutdown", True
).config("spark.sql.streaming.forceDeleteTempCheckpointLocation", True)

spark = session_builder.getOrCreate()

# Last expression of the cell: display the session summary
spark

In [None]:
# Enable automatic schema inference for streaming reads
spark.conf.set("spark.sql.streaming.schemaInference", True)

# Streaming DataFrame over the JSON files in the device-data input directory
streaming_df = (
    spark.readStream.format("json")
    .options(
        # cleanSource can be "delete" or "archive"
        cleanSource="archive",
        # with cleanSource="archive", this is the archive directory
        sourceArchiveDir="hdfs://namenode:9000/archive_dir/",
        maxFilesPerTrigger=1,
    )
    .load("hdfs://namenode:9000/input/data/device_data/")
)

In [None]:
# To check the schema of the data, place a sample JSON file in the input
# directory and change readStream to read.
streaming_df.printSchema()
# NOTE(review): show() is left disabled; presumably it only works once
# readStream has been switched to read (batch DataFrame) -- confirm.
# streaming_df.show(truncate=False)

In [None]:
# `data.devices` contains a list/array of device readings, so explode it
# into a new `data_devices` column
from pyspark.sql import functions as F

device_reading = F.explode(F.col("data.devices"))
exploded_df = streaming_df.withColumn("data_devices", device_reading)
exploded_df.printSchema()

In [None]:
# Flatten the exploded DataFrame: lift the nested `data_devices` fields to
# top-level columns, then discard the nested columns
from pyspark.sql.functions import col

# (output column, nested source path), applied in this order
device_columns = [
    ("deviceId", "data_devices.deviceId"),
    ("measure", "data_devices.measure"),
    ("status", "data_devices.status"),
    ("temperature", "data_devices.temperature"),
]

flattened_df = exploded_df.drop("data")
for column_name, source_path in device_columns:
    flattened_df = flattened_df.withColumn(column_name, F.col(source_path))
flattened_df = flattened_df.drop("data_devices")

flattened_df.printSchema()

In [None]:
# Write the flattened stream to a CSV file sink on HDFS (append mode, with a
# header row) and block until the query terminates.

# Checkpoint directory is derived from the application name, with spaces
# replaced by underscores.
checkpoint_dir = (
    f"/home/jovyan/streaming_checkpoint_dir/{spark.sparkContext.appName.replace(' ', '_')}"
)

# Keep the StreamingQuery handle so the query can be monitored
# (device_data_query.status / .lastProgress) and stopped explicitly.
device_data_query = (
    flattened_df.writeStream.format("csv")
    .option("header", True)
    .outputMode("append")
    .option("path", "hdfs://namenode:9000/output/streaming/02/device_data")
    .option("checkpointLocation", checkpoint_dir)
    .start()
)

try:
    # Blocks this cell until the query is stopped or fails.
    device_data_query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel only unblocks the Python call above; stop the
    # query explicitly so it is no longer writing when the output is inspected.
    device_data_query.stop()

In [None]:
# Check the written files: batch-read the CSV output of the streaming query,
# treating the first row as a header and inferring column types

df = (
    spark.read.format("csv")
    .options(header=True, inferSchema=True)
    .load("hdfs://namenode:9000/output/streaming/02/device_data")
)
df.show()

In [None]:
# Stop the Spark session created in the first cell once the checks are done
spark.stop()