In [2]:
from time import sleep
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, LongType


car_schema = StructType([
    StructField("car_id", IntegerType(), True),
    StructField("driver_id", LongType(), True),
    StructField("model_id", IntegerType(), True),
    StructField("color_id", IntegerType(), True)
])

spark = SparkSession\
    .builder\
    .master("local[*]")\
    .appName('data_generator')\
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2') \
    .config("fs.s3a.endpoint", "http://minio:9000") \
    .config("fs.s3a.access.key", "minioadmin") \
    .config("fs.s3a.secret.key", "minioadmin") \
    .config("fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .getOrCreate()


streaming_cars = spark.read\
    .schema(car_schema)\
    .parquet('s3a://spark/data/cars')

streaming_cars.cache()
    
while True:
    
    event = streaming_cars.select(F.col('car_id'))\
        .withColumn('event_id' ,F.concat(F.col('car_id'), F.unix_timestamp()))\
        .withColumn('event_time' , F.current_timestamp())\
        .withColumn('speed' , F.round(F.rand() * 200).cast("int"))\
        .withColumn('rpm' , F.round(F.rand() * 8000).cast("int"))\
        .withColumn('gear' ,  1+ F.round(F.rand() * 6).cast("int"))

    # # Show the car DataFrame    
    event.show(truncate=False)
        
    kafka_writer = event.selectExpr("to_json(struct(*)) AS value")\
        .write\
        .format('kafka')\
        .option("kafka.bootstrap.servers", "course-kafka:9092")\
        .option("topic", "sensors-sample")\
        .option("checkpointLocation", "s3a://spark/checkpoints/final_ex/events")\
        .save()
        
    sleep(1)
                
     


24/09/30 11:13:39 WARN CacheManager: Asked to cache already cached data.


+-------+-----------------+--------------------------+-----+----+----+
|car_id |event_id         |event_time                |speed|rpm |gear|
+-------+-----------------+--------------------------+-----+----+----+
|7577767|75777671727694819|2024-09-30 11:13:39.414438|124  |7835|4   |
|1795964|17959641727694819|2024-09-30 11:13:39.414438|12   |7340|7   |
|6090836|60908361727694819|2024-09-30 11:13:39.414438|145  |568 |2   |
|3124006|31240061727694819|2024-09-30 11:13:39.414438|93   |3142|2   |
|5583503|55835031727694819|2024-09-30 11:13:39.414438|115  |4795|4   |
|4488770|44887701727694819|2024-09-30 11:13:39.414438|29   |2111|3   |
|5817020|58170201727694819|2024-09-30 11:13:39.414438|5    |1128|4   |
|6815898|68158981727694819|2024-09-30 11:13:39.414438|33   |6872|2   |
|4352035|43520351727694819|2024-09-30 11:13:39.414438|157  |7828|1   |
|4574332|45743321727694819|2024-09-30 11:13:39.414438|80   |6267|5   |
|3133822|31338221727694819|2024-09-30 11:13:39.414438|199  |509 |7   |
|60379

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
KeyboardInterrupt


KeyboardInterrupt: 