<h1> Data Design and Streaming</h1>

In [13]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.types import  StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType
from pyspark.sql.functions import col, split, element_at, when,from_json, to_timestamp, coalesce,broadcast
import pandas as pd
from datetime import datetime


hostip = "192.168.0.116" 
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('[Demo] Spark Streaming from Kafka into MongoDB')
    .getOrCreate()
)

topic_1 = 'Camera_A'
topic_2 = 'Camera_B'
topic_3 = 'Camera_C'


<h2>Database setup</h2>

In [14]:

db_client = MongoClient(hostip, 27017) 

def insert_data(file_name, db_name, collection_name, client):
    file_df = pd.read_csv(file_name)

    db = client[db_name]
    collection = db[collection_name]  # This will auto-create the collection if it doesn't exist
    collection.drop()

    for _, row in file_df.iterrows():
        collection.insert_one(row.to_dict())

insert_data("camera.csv","fit3182_a2_db","camera",db_client)    


## insert camera information to database
insert camera information to mongdodb and create dataframe for the camera information to join with camera event stream

In [15]:
db_client = MongoClient(hostip, 27017) 
db = db_client.fit3182_a2_db
data = list(db.camera.find({}, { "_id": 0 }))

pd_camera = pd.DataFrame(data)

camera_df = spark.createDataFrame(pd_camera)

camera_df = camera_df.select(
    col("camera_id").cast(IntegerType()).alias("camera_id"),
    col("position"),
    col("speed_limit")
).cache()

print(pd_camera)
camera_df.printSchema()

   camera_id  latitude   longitude  position  speed_limit
0        1.0  2.157731  102.660100     152.5        110.0
1        2.0  2.162419  102.652455     153.5        110.0
2        3.0  2.167353  102.644914     154.5         90.0
root
 |-- camera_id: integer (nullable = true)
 |-- position: double (nullable = true)
 |-- speed_limit: double (nullable = true)



  for column, series in pdf.iteritems():
  for column, series in pdf.iteritems():


## setting up consumer

In [16]:

json_schema = StructType([
    StructField("event_id", StringType(), True),
    StructField("batch_id", IntegerType(), True),
    StructField("car_plate", StringType(), True),
    StructField("camera_id", IntegerType(), True),
    StructField("timestamp", StringType(), True),
    StructField("speed_reading", DoubleType(), True)
])

def create_consumer(topic):
    topic_stream = (
        spark.readStream.format('kafka')
        .option('kafka.bootstrap.servers', f'{hostip}:9092')
        .option('subscribe', topic) # Subscribe to the topics here (work with camera A and B first)
        .load()
    )
    
    modified_stream = (
        topic_stream
        .select(
            from_json(col("value").cast("string"), json_schema).alias("data")
        )
        .select("data.*")
        .withColumn("event_time", to_timestamp(col("timestamp")))
        .withColumnRenamed("camera_id", "cam_id")
        .join(broadcast(camera_df), col("cam_id") == col("camera_id"))
        .drop("cam_id")
    )
    
    return modified_stream

In [17]:
topic_stream_cam_a_df = create_consumer(topic_1)
topic_stream_cam_b_df = create_consumer(topic_2)
topic_stream_cam_c_df = create_consumer(topic_2)

## Join between stream a and b

In [18]:
from pyspark.sql.functions import expr,abs,unix_timestamp
cam_a_watermarked = (
    topic_stream_cam_a_df.withWatermark("event_time", "10 minutes")
    .select([col(c).alias(f"left_{c}") for c in topic_stream_cam_a_df.columns])
) 

cam_b_watermarked = (
    topic_stream_cam_b_df.withWatermark("event_time", "10 minutes")
    .select([col(c).alias(f"right_{c}") for c in topic_stream_cam_b_df.columns])
) 

joined_stream_a_b = cam_a_watermarked.join(
    cam_b_watermarked,
    expr("""
        left_car_plate = right_car_plate AND
        right_event_time > left_event_time AND
        right_event_time <= left_event_time + interval 2 minutes
    """),
    "full_outer"
)


stream_with_avg_speed_a_b = (
    joined_stream_a_b
    .filter("left_car_plate IS NOT NULL AND right_car_plate IS NOT NULL")
    .withColumn("distance_km", abs(col("left_position") - col("right_position")))
    .withColumn("time_diff_hrs", (unix_timestamp("right_event_time") - unix_timestamp("left_event_time")) / 3600)
    .withColumn("avg_speed", col("distance_km") / col("time_diff_hrs"))
)

unmatched_records_a_b = joined_stream_a_b.filter(
    "left_car_plate IS NULL OR right_car_plate IS NULL"
).select(
    coalesce(col("left_event_id"), col("right_event_id")).alias("event_id"),
    coalesce(col("left_batch_id"), col("right_batch_id")).alias("batch_id"),
    coalesce(col("left_car_plate"), col("right_car_plate")).alias("car_plate"),
    coalesce(col("left_camera_id"), col("right_camera_id")).alias("camera_id"),
    coalesce(col("left_speed_reading"), col("right_speed_reading")).alias("speed_reading"),
    coalesce(col("left_timestamp"), col("right_timestamp")).alias("timestamp"),
    coalesce(col("left_speed_limit"), col("right_speed_limit")).alias("speed_limit"),
    coalesce(col("left_position"), col("right_position")).alias("position")
)
    
stream_with_avg_speed_a_b.printSchema()
unmatched_records_a_b.printSchema()
    

root
 |-- left_event_id: string (nullable = true)
 |-- left_batch_id: integer (nullable = true)
 |-- left_car_plate: string (nullable = true)
 |-- left_timestamp: string (nullable = true)
 |-- left_speed_reading: double (nullable = true)
 |-- left_event_time: timestamp (nullable = true)
 |-- left_camera_id: integer (nullable = true)
 |-- left_position: double (nullable = true)
 |-- left_speed_limit: double (nullable = true)
 |-- right_event_id: string (nullable = true)
 |-- right_batch_id: integer (nullable = true)
 |-- right_car_plate: string (nullable = true)
 |-- right_timestamp: string (nullable = true)
 |-- right_speed_reading: double (nullable = true)
 |-- right_event_time: timestamp (nullable = true)
 |-- right_camera_id: integer (nullable = true)
 |-- right_position: double (nullable = true)
 |-- right_speed_limit: double (nullable = true)
 |-- distance_km: double (nullable = true)
 |-- time_diff_hrs: double (nullable = true)
 |-- avg_speed: double (nullable = true)

root
 |-- 

## Join stream b and c

In [19]:

cam_b_watermarked = (
    topic_stream_cam_b_df.withWatermark("event_time", "10 minutes")
    .select([col(c).alias(f"left_{c}") for c in topic_stream_cam_b_df.columns])
) 

cam_c_watermarked = (
    topic_stream_cam_c_df.withWatermark("event_time", "10 minutes")
    .select([col(c).alias(f"right_{c}") for c in topic_stream_cam_c_df.columns])
) 

joined_stream_b_c = cam_b_watermarked.join(
    cam_c_watermarked,
    expr("""
        left_car_plate = right_car_plate AND
        right_event_time > left_event_time AND
        right_event_time <= left_event_time + interval 2 minutes
    """),
    "full_outer"
)


stream_with_avg_speed_b_c = (
    joined_stream_b_c
    .filter("left_car_plate IS NOT NULL AND right_car_plate IS NOT NULL")
    .withColumn("distance_km", abs(col("left_position") - col("right_position")))
    .withColumn("time_diff_hrs", (unix_timestamp("right_event_time") - unix_timestamp("left_event_time")) / 3600)
    .withColumn("avg_speed", col("distance_km") / col("time_diff_hrs"))
)

unmatched_records_b_c = joined_stream_b_c.filter(
    "left_car_plate IS NULL OR right_car_plate IS NULL"
).select(
    coalesce(col("left_event_id"), col("right_event_id")).alias("event_id"),
    coalesce(col("left_batch_id"), col("right_batch_id")).alias("batch_id"),
    coalesce(col("left_car_plate"), col("right_car_plate")).alias("car_plate"),
    coalesce(col("left_camera_id"), col("right_camera_id")).alias("camera_id"),
    coalesce(col("left_speed_reading"), col("right_speed_reading")).alias("speed_reading"),
    coalesce(col("left_timestamp"), col("right_timestamp")).alias("timestamp"),
    coalesce(col("left_speed_limit"), col("right_speed_limit")).alias("speed_limit"),
    coalesce(col("left_position"), col("right_position")).alias("position")
)
    
stream_with_avg_speed_b_c.printSchema()
unmatched_records_b_c.printSchema()



root
 |-- left_event_id: string (nullable = true)
 |-- left_batch_id: integer (nullable = true)
 |-- left_car_plate: string (nullable = true)
 |-- left_timestamp: string (nullable = true)
 |-- left_speed_reading: double (nullable = true)
 |-- left_event_time: timestamp (nullable = true)
 |-- left_camera_id: integer (nullable = true)
 |-- left_position: double (nullable = true)
 |-- left_speed_limit: double (nullable = true)
 |-- right_event_id: string (nullable = true)
 |-- right_batch_id: integer (nullable = true)
 |-- right_car_plate: string (nullable = true)
 |-- right_timestamp: string (nullable = true)
 |-- right_speed_reading: double (nullable = true)
 |-- right_event_time: timestamp (nullable = true)
 |-- right_camera_id: integer (nullable = true)
 |-- right_position: double (nullable = true)
 |-- right_speed_limit: double (nullable = true)
 |-- distance_km: double (nullable = true)
 |-- time_diff_hrs: double (nullable = true)
 |-- avg_speed: double (nullable = true)

root
 |-- 

In [20]:
from pymongo import InsertOne
class DbWriterJoin:
    def open(self):
        self.client = MongoClient(host= hostip, port=27017)  # replace with actual host/port
        self.db = self.client['fit3182_db']
        self.buffer = []
        self.dropped_record = []
        return True

    def process(self, row):
        row_dict = row.asDict()
        print(f'row dict : {row_dict}')
     
        if row_dict["avg_speed"] > row_dict["right_speed_limit"] :
            
            # unique_id = f"{row_dict["left_event_id"]}_{row_dict["right_event_id"]}"

            record = {
                "car_plate": row_dict["left_car_plate"],
                "camera_id_end": row_dict["right_camera_id"],
                "camera_id_start": row_dict["left_camera_id"],
                "timestamp_start": row_dict["left_timestamp"],
                "timestamp_end": row_dict["right_timestamp"],
                "speed_reading": row_dict["avg_speed"],
            }

            self.buffer.append(InsertOne(record))
        else :
            self.dropped_record.append(row_dict)
      
        
    def close(self, error):
        if error is None and self.buffer:
            self.db["violations"].bulk_write(self.buffer)
            print(f'bufferx : {self.buffer}')
            print(f'pairs that does not violate average speed :{self.dropped_record}')
        self.client.close()

class DbWriterSingle:
    def open(self, partition_id, epoch_id):
        self.client = MongoClient(host = hostip, port=27017)  # replace with actual host/port
        self.db = self.client['fit3182_db']
        self.buffer = []
        self.dropped_record = []
        return True

    def process(self, row):
        row_dict = row.asDict()
        if row_dict["speed_reading"] > row_dict["speed_limit"] :
            record = {
                "car_plate": row_dict["car_plate"],
                "camera_id_end": row_dict["camera_id"],
                "camera_id_start": row_dict["camera_id"],
                "timestamp_start": row_dict["timestamp"],
                "timestamp_end": row_dict["timestamp"],
                "speed_reading": row_dict["speed_reading"],
            }

            self.buffer.append(
                InsertOne(record)
            )
            
        else :
            self.dropped_record.append(row_dict)


    def close(self, error):
        if error is None and self.buffer:
            self.db["violations"].bulk_write(self.buffer)
            print(f'dropped record (no violation) : {self.dropped_record}')
                  
        self.client.close()


In [21]:
stream_with_average = stream_with_avg_speed_a_b.union(stream_with_avg_speed_b_c) 

union_cam_a_b_c = topic_stream_cam_a_df.union(topic_stream_cam_b_df).union(topic_stream_cam_a_df) 

no_match_record = unmatched_records_b_c.union(unmatched_records_a_b)


# no matching record
drop_logger = (
    no_match_record
    .writeStream
    .outputMode('append')
    .format('console')
)


# violation of average speed
joined_pair_writer = ( 
    stream_with_average
    .writeStream
    .outputMode('append')
    .foreach(DbWriterJoin())
)

#instant violation
single_record_writer = (
    union_cam_a_b_c
    .writeStream
    .outputMode('append')
    .foreach(DbWriterSingle())
)



In [22]:
try:
    query_1 = joined_pair_writer.start()
    query_2 = single_record_writer.start()
    query_3 = drop_logger.start()
    
    query_1.awaitTermination()
    query_2.awaitTermination()
    query_3.awaitTermination()

    
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    print(exc)
finally:
    query_1.stop()
    query_2.stop()
    query_3.stop()


NameError: name 'StreamingQueryException' is not defined