<h1> Data Design and Streaming</h1>

In [1]:
import os
# Pass --packages to PySpark's launcher so the Kafka connector artifacts
# (spark-streaming-kafka-0-10 and spark-sql-kafka-0-10, Scala 2.12 / 3.3.0 builds)
# are requested for the 'pyspark-shell' session.
# NOTE(review): presumably this has to run before the SparkSession is created — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

import pandas
from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, element_at, when, from_json
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType
from pyspark.sql.utils import StreamingQueryException



In [2]:
# Kafka topic names, one per camera feed.
topic_1, topic_2, topic_3 = 'Camera_A', 'Camera_B', 'Camera_C'

# Address of the Kafka broker host.
hostip = "192.168.0.102"  # change it to your IP


In [3]:
# Spark session with master 'local[*]'; getOrCreate() hands back an
# already-running session if there is one instead of building a second.
spark = SparkSession.builder.master('local[*]').appName(
    '[Demo] Spark Streaming from Kafka into MongoDB'
).getOrCreate()


In [4]:
# Streaming source reading from the Kafka broker at <hostip>:9092.
# The subscription list is assembled from topic_1..topic_3 so the topic names
# are defined once (in the configuration cell) rather than repeated here as a
# hand-typed literal that can drift out of sync or pick up stray whitespace.
topic_stream_df = (
    spark.readStream.format('kafka')
    .option('kafka.bootstrap.servers', f'{hostip}:9092')
    .option('subscribe', ','.join([topic_1, topic_2, topic_3]))  # comma-separated topic list
    .load()
)

In [5]:
# Show the columns of the raw Kafka stream; `value` (binary) is the column decoded further down.
topic_stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



<h2> Decode the information received from the broker </h2>

In [6]:
# Schema used to parse the JSON text carried in each Kafka message's `value`.
# Every field is declared nullable.
json_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("event_id", StringType()),
        ("batch_id", IntegerType()),
        ("car_plate", StringType()),
        ("camera_id", IntegerType()),
        ("timestamp", StringType()),
        ("speed_reading", DoubleType()),
    )
])

In [7]:
# Decode the binary `value` column as a string, parse it with json_schema into a
# struct named "data", then expand that struct into one top-level column per field.
output_stream_df = topic_stream_df.select(
    from_json(col("value").cast("string"), json_schema).alias("data")
).select("data.*")

In [8]:
# Confirm the parsed stream exposes the json_schema fields as top-level columns.
output_stream_df.printSchema()

root
 |-- event_id: string (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- car_plate: string (nullable = true)
 |-- camera_id: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- speed_reading: double (nullable = true)



In [None]:
# Sink definition: append-mode writer that prints the decoded rows to the console.
console_logger = (
    output_stream_df
    .writeStream
    .outputMode('append')
    .format('console')
)

writer = console_logger

# Bind `query` before the try block: if writer.start() itself raises, the
# `finally` clause must not fail with a NameError that hides the real error.
query = None
try:
    query = writer.start()
    # Blocks this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    # Requires StreamingQueryException to be imported (pyspark.sql.utils).
    print(exc)
finally:
    # Only stop a query that was actually started.
    if query is not None:
        query.stop()