In [None]:
# Create the Spark Session
import pyspark
from pyspark.sql import SparkSession

# Location of the Kafka connector jar; the file name embeds the installed PySpark version.
kafka_connector_jar = f'/opt/spark/jars/spark-sql-kafka-0-10_2.12-{pyspark.__version__}.jar'

session_builder = SparkSession.builder.appName("Triggers in Spark Streaming")
# Request a graceful shutdown of streaming work when the job is stopped manually.
session_builder = session_builder.config("spark.streaming.stopGracefullyOnShutdown", True)
session_builder = session_builder.config('spark.jars', kafka_connector_jar)
session_builder = session_builder.config("spark.sql.shuffle.partitions", 8)
session_builder = session_builder.master("spark://spark-master:7077")

# Reuse an already-running session if there is one, otherwise create it.
spark = session_builder.getOrCreate()

# Last expression of the cell: display the session.
spark

In [None]:
# Streaming source: subscribe to the `device-data` Kafka topic, starting from the earliest offsets.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "device-data",
    "startingOffsets": "earliest",
    "maxOffsetsPerTrigger": "1",  # limits the number of records per micro-batch
}

kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

In [None]:
from utils import flatten_df

# Pass the raw Kafka stream through the project helper `utils.flatten_df`.
# NOTE(review): presumably this parses the Kafka payload into flat columns — confirm in utils.py.
flattened_df = flatten_df(kafka_df)

In [None]:
# PROCESSING TIME 3 SECONDS

# Checkpoint directory is derived from the application name, with spaces replaced by underscores.
checkpoint_dir = f"/home/jovyan/streaming_checkpoint_dir/{spark.sparkContext.appName.replace(' ', '_')}"

# Console sink in append mode, driven by a fixed processing-time trigger.
console_writer = flattened_df.writeStream.format("console").outputMode("append")
console_writer = console_writer.trigger(processingTime='3 seconds')
console_writer = console_writer.option("checkpointLocation", checkpoint_dir)

# start() launches the query; awaitTermination() then blocks this cell until the query ends.
streaming_query = console_writer.start()
streaming_query.awaitTermination()

In [None]:
# View data from Memory Sink
# `kafka_table` must be registered in this session before it can be queried, e.g. by a
# memory-sink streaming query started with queryName("kafka_table"). The streaming query
# in this notebook writes to the console sink, so the view may not exist; check the
# catalog first so the cell reports what is missing instead of raising.
if any(table.name == "kafka_table" for table in spark.catalog.listTables()):
    spark.sql("select * from kafka_table").show()
else:
    print("View 'kafka_table' is not registered; start a memory-sink query named 'kafka_table' first.")

In [None]:
# Stop the Spark session created at the top of the notebook.
spark.stop()