<h1> Data Design and Streaming</h1>

In [3]:
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.types import  StructType, StructField, StringType, LongType, DoubleType, IntegerType, ArrayType
from pyspark.sql.functions import col, split, element_at, when,from_json
import pandas as pd
from datetime import datetime


# Address of the machine running both MongoDB (port 27017) and the Kafka broker (port 9092);
# every connection made in this notebook is built from this value.
hostip = "192.168.0.102" # CHANGE TO YOUR IP ADDRESS HERE

<h2>Database setup</h2>

In [4]:

class DBClient:
    """Small wrapper around a pymongo client bound to the ``fit3182_a2_db`` database."""

    def __init__(self):
        # Build the client for the configured host on port 27017, then keep
        # both the client and the database handle on the instance.
        client = MongoClient(host=f'{hostip}', port=27017)
        self.mongo_client = client
        self.database = client['fit3182_a2_db']

    def collection(self, collection_name):
        """Return the collection named ``collection_name`` from the database."""
        return self.database[collection_name]


# Shared driver-side client used by the cells below.
db_client = DBClient()
        
def insert_data(file_name, collection_name):
    """Load a CSV file and insert each of its rows as a document into a MongoDB collection.

    Used for the static camera, vehicle and historic camera-event data (the design
    assumes the collections purely reference each other, so rows are stored as-is).

    The rows are converted with ``DataFrame.to_dict("records")`` so every column keeps
    its own dtype (``iterrows()`` upcasts each row to one common dtype, which turns
    integer ids into floats when the other columns are floats), and they are written
    with a single ``insert_many`` call instead of one ``insert_one`` per row.

    Args:
        file_name: path of the CSV file, read with ``pandas.read_csv``.
        collection_name: name of the target collection in ``db_client``'s database.
    """
    records = pd.read_csv(file_name).to_dict("records")
    # insert_many rejects an empty document list, so a CSV without data rows is a no-op.
    if records:
        db_client.collection(collection_name).insert_many(records)
        

# Run these once to import the static data (running them again would insert every row a second time)
# insert_data("camera.csv","camera")    
# insert_data("vehicle.csv","vehicle")
# insert_data("camera_event_historic.csv","camera_event_historic") # I think we need to process this data to a violations table


    
    

In [2]:
# Database Writer
class DbWriter:
    """Spark ``foreach`` sink that records speed violations in MongoDB.

    For every streamed camera event it:
      * stores an "instantaneous" violation when the speed reading is above the
        speed limit stored for the reporting camera, and
      * pairs the event with an earlier event of the same car plate kept in the
        ``temporary`` collection, derives the average speed between the two
        cameras and stores an "average" violation when it is above the limit.

    The current code DOES NOT handle duplicate processing, e.g. a query that fails
    and restarts just before the current micro-batch was fully inserted.
    """

    # Fields that must be non-null before an event can be evaluated.
    REQUIRED_FIELDS = ("car_plate", "camera_id", "timestamp", "speed_reading")

    def open(self, partition_id, epoch_id):
        """Create the MongoDB client; called at the start of each partition of each micro-batch."""
        from pymongo import MongoClient
        self.mongo_client = MongoClient(host=hostip, port=27017)
        self.db = self.mongo_client["fit3182_a2_db"]
        return True

    def _find_camera(self, camera_id):
        """Return the camera document with this ``camera_id``, or None when there is none."""
        return self.db["camera"].find_one({"camera_id": camera_id})

    def process(self, row):
        """Evaluate one camera event; called once per row of the result dataframe.

        Events with a null required field or an unknown camera are reported and
        skipped instead of raising, so a single bad message cannot stop the query.
        """
        row_dict = row.asDict()

        # Every schema field is nullable, so a message that could not be parsed
        # arrives here with null values.
        if any(row_dict.get(field) is None for field in self.REQUIRED_FIELDS):
            print("Skipping incomplete event", row_dict)
            return

        camera = self._find_camera(row_dict["camera_id"])
        if camera is None:
            print("Skipping event from unknown camera", row_dict["camera_id"])
            return

        # instantaneous speed violation
        if row_dict["speed_reading"] > camera["speed_limit"]:
            print("instantaneous")
            self.db["violations"].insert_one({
                "car_plate": row_dict["car_plate"],
                "camera_id_start": row_dict["camera_id"],
                "camera_id_end": row_dict["camera_id"],
                "timestamp_start": row_dict["timestamp"],
                "timestamp_end": row_dict["timestamp"],
                "speed_reading": row_dict["speed_reading"],
                "violation_type": "instantaneous"
            })

        # average speed violation: needs an earlier event of the same car plate
        existing = self.db["temporary"].find_one({"car_plate": row_dict["car_plate"]})
        if existing:
            self._check_average_speed(existing, row_dict, camera)
        else:
            # first sighting of this plate: keep it until its partner event arrives
            # (events left in the temporary collection are not expired yet)
            self.db["temporary"].insert_one(row_dict)

    def _check_average_speed(self, existing, row_dict, row_camera):
        """Compare two events of one car and store an "average" violation if warranted.

        ``existing`` is the stored earlier-received event, ``row_dict`` the event being
        processed and ``row_camera`` the camera document of ``row_dict``.
        """
        existing_camera = self._find_camera(existing["camera_id"])

        # distance between the two cameras, from their stored positions
        distance = abs(row_camera["position"] - existing_camera["position"])

        start_camera_id, end_camera_id = existing["camera_id"], row_dict["camera_id"]
        start_time, end_time = existing["timestamp"], row_dict["timestamp"]
        time_object_start = datetime.fromisoformat(start_time)
        time_object_end = datetime.fromisoformat(end_time)

        # handle late arrivals: order the pair chronologically. The timestamp strings
        # are swapped together with the camera ids so the stored violation has
        # timestamp_start <= timestamp_end and each timestamp matches its camera.
        if time_object_start > time_object_end:
            time_object_start, time_object_end = time_object_end, time_object_start
            start_camera_id, end_camera_id = end_camera_id, start_camera_id
            start_time, end_time = end_time, start_time

        # elapsed time in hours (seconds / 3600)
        time_difference = (time_object_end - time_object_start).total_seconds() / 3600

        if time_difference <= 0:
            # identical timestamps: no speed can be derived; the stored event is kept
            print("Invalid time difference")
            print(row_dict["car_plate"])
            print("Between camera", existing["camera_id"],"and ",row_dict["camera_id"])
            return

        # calculate the average speed
        avg_speed = distance / time_difference
        print(row_dict["car_plate"])
        print("Between camera", existing["camera_id"],"and ",row_dict["camera_id"])
        print(avg_speed)

        # The limit of the stored event's camera is used; this assumes both cameras
        # share the same speed limit (logic might need to change otherwise).
        if avg_speed > existing_camera["speed_limit"]:
            self.db["violations"].insert_one({
                "car_plate": row_dict["car_plate"],
                "camera_id_start": start_camera_id,
                "camera_id_end": end_camera_id,
                "timestamp_start": start_time,
                "timestamp_end": end_time,
                "speed_reading": avg_speed,
                "violation_type": "average"
            })

        # the pair has been evaluated: remove the stored event from the temporary collection
        self.db["temporary"].delete_one({"car_plate": row_dict["car_plate"]})

    def close(self, err):
        """Close the MongoDB client; called once all rows have been processed (possibly with error)."""
        self.mongo_client.close()
    
    
    

<h2> Spark Streaming Setup</h2>

In [3]:
# Kafka topic names for cameras A, B and C.
topic_1, topic_2, topic_3 = 'Camera_A', 'Camera_B', 'Camera_C'



In [4]:
# Get (or create) the local Spark session, using all available cores.
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('[Demo] Spark Streaming from Kafka into MongoDB') \
    .getOrCreate()


In [5]:
# Streaming source reading from the Kafka broker on `hostip`.
# Only cameras A and B are consumed for now (work with camera A and B first).
# 'subscribe' takes a comma-separated list of topic names; it is built from the
# topic constants so each name is defined in one place only.
topic_stream_df = (
    spark.readStream.format('kafka')
    .option('kafka.bootstrap.servers', f'{hostip}:9092')
    .option('subscribe', ','.join([topic_1, topic_2]))
    .load()
)

In [6]:
# Show the schema of the raw Kafka source (key and value are still binary here).
topic_stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



<h2> Decode the information received from the broker </h2>

In [7]:
# Schema of the JSON payload carried in each Kafka message value; every field is nullable.
json_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("event_id", StringType()),
        ("batch_id", IntegerType()),
        ("car_plate", StringType()),
        ("camera_id", IntegerType()),
        ("timestamp", StringType()),
        ("speed_reading", DoubleType()),
    )
])

In [8]:
# Read the Kafka value bytes as a string, parse it as JSON with json_schema into a
# struct column named "data", then expand that struct into one column per field.
event_struct = from_json(col("value").cast("string"), json_schema).alias("data")
output_stream_df = topic_stream_df.select(event_struct).select("data.*")

In [5]:
# Static camera reference data: load the MongoDB "camera" collection into a cached
# Spark DataFrame. The Mongo-generated _id (an ObjectId) is projected out because
# Spark's schema inference does not support that type.
camera_files = db_client.collection("camera").find({}, {"_id": 0})
camera_static_df_pandas = pd.DataFrame(list(camera_files))
camera_collection = spark.createDataFrame(camera_static_df_pandas).cache()

# Parsed event stream with the timestamp string converted to a Spark timestamp.
# NOTE(review): the original withColumn("timestamp") call had no column expression;
# the cast below is the assumed intent — confirm.
base_stream_df = (
    topic_stream_df
    .select(
        from_json(col("value").cast("string"), json_schema).alias("data")
    )
    .select("data.*")
    .withColumn("timestamp", col("timestamp").cast("timestamp"))
)

NameError: name 'topic_stream_df' is not defined

In [9]:
# Show the schema of the parsed event stream.
output_stream_df.printSchema()

root
 |-- event_id: string (nullable = true)
 |-- batch_id: integer (nullable = true)
 |-- car_plate: string (nullable = true)
 |-- camera_id: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- speed_reading: double (nullable = true)



In [10]:
# DEBUGGING
# Imported here because the notebook's import cell does not provide it; without it
# the `except StreamingQueryException` clause below raises NameError when evaluated.
from pyspark.sql.utils import StreamingQueryException

# Sink that prints each micro-batch to the console.
console_logger = (
    output_stream_df
    .writeStream
    .outputMode('append')
    .format('console')
)

# Sink that passes every row to DbWriter, which writes to MongoDB.
db_writer = (
    output_stream_df
    .writeStream
    .outputMode('append')
    .foreach(DbWriter())
)

writer = db_writer # check if we can consume data being stored in the broker

# Stays None when start() itself fails, so the finally clause neither raises
# NameError nor stops a query left over from an earlier run of this cell.
query = None
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    print(exc)
finally:
    if query is not None:
        query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/conda/lib/python3.8/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Interrupted by CTRL-C. Stopped query


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 46618)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.8/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.8/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.8/socketserver.py", line 747, in __init__
    self.handle()
  File "/opt/conda/lib/python3.8/site-packages/pyspark/accumulators.py", line 281, in handle
    poll(accum_updates)
  File "/opt/conda/lib/python3.8/site-packages/pyspark/accumulators.py", line 253, in poll
    if func():
  File "/opt/conda/lib/python3.8/site-packages/pyspark/accumulators.py", line 257, in accum_updates
    num_updates = read_int(sel

ConnectionRefusedError: [Errno 111] Connection refused

<h1> Utility Class to process our streaming data </h1>
