In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, TimestampType, IntegerType
from pyspark.sql.functions import from_json, col

# Directory on the EC2 instance that holds the jar files listed below.
spark_jars_path = "/home/ec2-user/stream-processing-template/jars"  # <-- Update this path

# Jar files handed to Spark through the `spark.jars` setting.
kafka_jar_names = [
    "commons-pool2-2.11.1.jar",
    "spark-sql-kafka-0-10_2.12-3.4.0.jar",
    "spark-streaming-kafka-0-10-assembly_2.12-3.4.0.jar",
]

# `spark.jars` receives one comma-separated string of full jar paths.
spark_jars = ",".join(f"{spark_jars_path}/{jar_name}" for jar_name in kafka_jar_names)

# Create the session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("retail_pysaprk_consumer")
    .config("spark.jars", spark_jars)
    .getOrCreate()
)


In [6]:
# Expected fields of each transaction record, as (name, Spark type) pairs.
schema_fields = [
    ("store_location", StringType()),
    ("time_of_purchase", TimestampType()),
    ("product_ID", StringType()),
    ("transaction_amount", IntegerType()),
]

# Every field is declared nullable.
schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in schema_fields]
)

# Kafka source settings: broker address and the topic to subscribe to.
kafka_bootstrap_servers = "b-1.monstercluster1.6xql65.c3.kafka.eu-west-2.amazonaws.com:9092"
kafka_topic = "retail_transactions"

# Streaming DataFrame backed by the Kafka topic.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", kafka_bootstrap_servers)
    .option("subscribe", kafka_topic)
    .load()
)

In [11]:
import time

# Cast the Kafka `value` column to a string, parse it as JSON against `schema`,
# then promote the parsed struct's fields to top-level columns.
transactions = (
    df.selectExpr("CAST(value AS STRING)")
    .withColumn("data", from_json(col("value"), schema))
    .select("data.*")
)

# Print each micro-batch of newly arrived rows to the console.
query = (
    transactions.writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# Let the stream run for at most 20 seconds. PySpark's awaitTermination takes
# its timeout in SECONDS and, unlike a plain sleep, re-raises the error if the
# query fails while we wait. The `finally` guarantees the query is stopped even
# when the wait is interrupted or raises, so no stream is left running.
try:
    query.awaitTermination(timeout=20)
finally:
    query.stop()

23/12/12 14:01:06 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b36c9205-3825-41e7-99a4-8ce2f0e71476. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/12/12 14:01:06 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/12/12 14:01:06 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


-------------------------------------------
Batch: 0
-------------------------------------------
+--------------+----------------+----------+------------------+
|store_location|time_of_purchase|product_ID|transaction_amount|
+--------------+----------------+----------+------------------+
+--------------+----------------+----------+------------------+

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------+-------------------+----------+------------------+
|store_location|   time_of_purchase|product_ID|transaction_amount|
+--------------+-------------------+----------+------------------+
|       Phoenix|2023-12-12 14:01:06|    P00042|               236|
+--------------+-------------------+----------+------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------+-------------------+----------+------------------+
|store_location|   time_of_purchase|product_ID|transaction_amount|
+--------------+-------------------+----------+------------------+
|  Philadelphia|2023-12-12 14:01:08|    P00090|               504|
+--------------+-------------------+----------+------------------+

-------------------------------------------
Batch: 3
-------------------------------------------
+--------------+-------------------+----------+------------------+
|store_location|   time_of_purchase|product_ID|transaction_amount|
+--------------+-------------------+----------+------------------+
|       Houston|2023-12-12 14:01:11|    P00079|                31|
+--------------+-------------------+----------+------------------+

-------------------------------------------
Batch: 4
-------------------------------------------
+--------------+-------------------+-

In [8]:
# Stop the streaming query started by the writeStream cell above.
query.stop()