In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, StringType


# Explicit schemas: three for the static lookup tables (models, colors, cars)
# and one for the JSON payload of the incoming sensor events.
# Every field is declared nullable.

# Car model lookup: model_id -> brand / model name.
model_schema = (
    StructType()
    .add("model_id", IntegerType(), True)
    .add("car_brand", StringType(), True)
    .add("car_model", StringType(), True)
)

# Color lookup: color_id -> color name.
color_schema = (
    StructType()
    .add("color_id", IntegerType(), True)
    .add("color_name", StringType(), True)
)

# Car table: links a car to its driver, model and color.
car_schema = (
    StructType()
    .add("car_id", IntegerType(), True)
    .add("driver_id", LongType(), True)
    .add("model_id", IntegerType(), True)
    .add("color_id", IntegerType(), True)
)

# Sensor event; event_id and event_time are kept as plain strings.
event_schema = (
    StructType()
    .add("event_id", StringType(), True)
    .add("event_time", StringType(), True)
    .add("car_id", IntegerType(), True)
    .add("speed", IntegerType(), True)
    .add("rpm", IntegerType(), True)
    .add("gear", IntegerType(), True)
)

# Build (or reuse) a Spark session in local mode ("local[*]") and request the
# Kafka connector package so the 'kafka' source and sink formats can be resolved.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('data_enrichment')
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2')
    .getOrCreate()
)
    

# Load the three static lookup tables from Parquet, each with its explicit schema.
cars = spark.read.schema(car_schema).parquet('s3a://spark/data/cars')

models = spark.read.schema(model_schema).parquet('s3a://spark/data/car_models')

colors = spark.read.schema(color_schema).parquet('s3a://spark/data/car_colors')


# Subscribe to the Kafka topic, starting from the latest offsets, and keep only
# the message payload decoded as a string.
streaming_df = (
    spark.readStream
    .format('kafka')
    .option("kafka.bootstrap.servers", "course-kafka:9092")
    .option("subscribe", "sensors-sample")
    .option('startingOffsets', 'latest')
    .load()
    .select(F.col('value').cast(T.StringType()))
)

# Parse the JSON payload with event_schema and flatten the resulting struct
# into one top-level column per event field.
parsed_df = (
    streaming_df
    .select(F.from_json(F.col('value'), event_schema).alias('parsed_json'))
    .select(F.col('parsed_json.*'))
)
    
# Enrich each event with car, model and color attributes. The lookup tables
# are marked for broadcast, and the joins are inner joins on the shared key.
joined_df = (
    parsed_df
    .join(F.broadcast(cars), on='car_id')
    .join(F.broadcast(models), on='model_id')
    .join(F.broadcast(colors), on='color_id')
    .select(
        'event_id', 'event_time', 'car_id', 'driver_id', 'car_brand',
        'car_model', 'color_name', 'speed', 'rpm', 'gear',
    )
)

# expected_gear = ceil(speed / 30), stored as an integer column.
enriched_df = joined_df.withColumn(
    'expected_gear',
    F.ceil(F.col('speed') / F.lit(30.0)).cast('int'),
)

# Debug sink (disabled): print joined_df to the console every 5 seconds.
# query = joined_df.writeStream \
#     .trigger(processingTime='5 seconds') \
#     .outputMode("update") \
#     .format("console") \
#     .option("truncate", "false") \
#     .start()


# Serialize every enriched row to a single JSON string column named `value`
# (the column the Kafka sink publishes as the message payload) and stream it
# to the output topic. The checkpoint location persists the query's progress.
query = enriched_df.selectExpr("to_json(struct(*)) AS value") \
    .writeStream \
    .format('kafka') \
    .option("kafka.bootstrap.servers", "course-kafka:9092") \
    .option("topic", "samples-enriched") \
    .option('checkpointLocation', 's3a://spark/checkpoints/project/samples-enriched2') \
    .outputMode('append') \
    .start()

try:
    # Block until the streaming query terminates (failure or external stop).
    query.awaitTermination()
finally:
    # Cleanup must also run when awaitTermination() is interrupted (e.g. a
    # KeyboardInterrupt from the notebook); otherwise the query is never
    # stopped and the Spark session is left open.
    try:
        if query.isActive:
            query.stop()
    finally:
        # Stop the Spark session even if stopping the query itself fails.
        spark.stop()


24/09/30 11:13:33 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-57eb18e2-1350-4cea-9fb4-5db0c643395a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/09/30 11:13:33 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
24/09/30 11:13:33 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
24/09/30 11:13:33 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
24/09/30 11:13:33 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
24/09/30 11:13:33 WARN AdminClientConfig: The configuration 'max.poll.records' was supplied but isn't a known con

-------------------------------------------
Batch: 0
-------------------------------------------
+--------+----------+------+---------+---------+---------+----------+-----+---+----+
|event_id|event_time|car_id|driver_id|car_brand|car_model|color_name|speed|rpm|gear|
+--------+----------+------+---------+---------+---------+----------+-----+---+----+
+--------+----------+------+---------+---------+---------+----------+-----+---+----+

-------------------------------------------
Batch: 1
-------------------------------------------
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|event_id         |event_time              |car_id |driver_id|car_brand|car_model|color_name|speed|rpm |gear|
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|75777671727694819|2024-09-30T11:13:39.460Z|7577767|745486029|Kia      |Rio      |White     |124  |7835|4   |
|17959641727694819|2024-0

                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|event_id         |event_time              |car_id |driver_id|car_brand|car_model|color_name|speed|rpm |gear|
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|75777671727694830|2024-09-30T11:13:50.620Z|7577767|745486029|Kia      |Rio      |White     |141  |3027|4   |
|17959641727694830|2024-09-30T11:13:50.620Z|1795964|538626499|Mazda    |3        |Gray      |160  |1816|1   |
|60908361727694830|2024-09-30T11:13:50.620Z|6090836|766428924|Toyota   |Corolla  |Gray      |126  |4378|2   |
|31240061727694830|2024-09-30T11:13:50.620Z|3124006|177119983|Toyota   |Corolla  |Green     |188  |4841|5   |
|55835031727694830|2024-09-30T11:13:50.620Z|5583503|911634586|Toyota   |Corolla  |Gray      |174  |5081|1   |
|44887701727694830|2024

                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|event_id         |event_time              |car_id |driver_id|car_brand|car_model|color_name|speed|rpm |gear|
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|75777671727694835|2024-09-30T11:13:55.123Z|7577767|745486029|Kia      |Rio      |White     |21   |250 |6   |
|17959641727694835|2024-09-30T11:13:55.123Z|1795964|538626499|Mazda    |3        |Gray      |51   |4012|5   |
|60908361727694835|2024-09-30T11:13:55.123Z|6090836|766428924|Toyota   |Corolla  |Gray      |119  |4764|6   |
|31240061727694835|2024-09-30T11:13:55.123Z|3124006|177119983|Toyota   |Corolla  |Green     |18   |670 |3   |
|55835031727694835|2024-09-30T11:13:55.123Z|5583503|911634586|Toyota   |Corolla  |Gray      |148  |2562|1   |
|44887701727694835|2024

                                                                                

-------------------------------------------
Batch: 6
-------------------------------------------
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|event_id         |event_time              |car_id |driver_id|car_brand|car_model|color_name|speed|rpm |gear|
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|75777671727694840|2024-09-30T11:14:00.724Z|7577767|745486029|Kia      |Rio      |White     |49   |4345|3   |
|17959641727694840|2024-09-30T11:14:00.724Z|1795964|538626499|Mazda    |3        |Gray      |16   |4122|5   |
|60908361727694840|2024-09-30T11:14:00.724Z|6090836|766428924|Toyota   |Corolla  |Gray      |148  |7932|6   |
|31240061727694840|2024-09-30T11:14:00.724Z|3124006|177119983|Toyota   |Corolla  |Green     |102  |7304|1   |
|55835031727694840|2024-09-30T11:14:00.724Z|5583503|911634586|Toyota   |Corolla  |Gray      |15   |5540|1   |
|44887701727694840|2024

                                                                                

-------------------------------------------
Batch: 8
-------------------------------------------
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|event_id         |event_time              |car_id |driver_id|car_brand|car_model|color_name|speed|rpm |gear|
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|75777671727694850|2024-09-30T11:14:10.787Z|7577767|745486029|Kia      |Rio      |White     |194  |5574|6   |
|17959641727694850|2024-09-30T11:14:10.787Z|1795964|538626499|Mazda    |3        |Gray      |52   |2398|6   |
|60908361727694850|2024-09-30T11:14:10.787Z|6090836|766428924|Toyota   |Corolla  |Gray      |26   |7547|5   |
|31240061727694850|2024-09-30T11:14:10.787Z|3124006|177119983|Toyota   |Corolla  |Green     |81   |4617|5   |
|55835031727694850|2024-09-30T11:14:10.787Z|5583503|911634586|Toyota   |Corolla  |Gray      |37   |4862|3   |
|44887701727694850|2024

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

-------------------------------------------
Batch: 9
-------------------------------------------
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|event_id         |event_time              |car_id |driver_id|car_brand|car_model|color_name|speed|rpm |gear|
+-----------------+------------------------+-------+---------+---------+---------+----------+-----+----+----+
|75777671727694855|2024-09-30T11:14:15.217Z|7577767|745486029|Kia      |Rio      |White     |163  |6545|5   |
|17959641727694855|2024-09-30T11:14:15.217Z|1795964|538626499|Mazda    |3        |Gray      |7    |577 |1   |
|60908361727694855|2024-09-30T11:14:15.217Z|6090836|766428924|Toyota   |Corolla  |Gray      |63   |1384|6   |
|31240061727694855|2024-09-30T11:14:15.217Z|3124006|177119983|Toyota   |Corolla  |Green     |191  |3241|2   |
|55835031727694855|2024-09-30T11:14:15.217Z|5583503|911634586|Toyota   |Corolla  |Gray      |72   |2004|5   |
|44887701727694855|2024