In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, TimestampType, IntegerType
from pyspark.sql.functions import from_json, col, avg, sum, window, round
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import time
import matplotlib.pyplot as plt
import pandas as pd

In [2]:

# Directory on the EC2 instance that holds the Kafka connector jars
spark_jars_path = "/home/ec2-user/stream-processing-template/jars"  # <-- Update this path

# "spark.jars" expects a single comma-separated string of jar locations
kafka_connector_jars = ",".join([
    f"{spark_jars_path}/commons-pool2-2.11.1.jar",
    f"{spark_jars_path}/spark-sql-kafka-0-10_2.12-3.4.0.jar",
    f"{spark_jars_path}/spark-streaming-kafka-0-10-assembly_2.12-3.4.0.jar",
])

# NOTE(review): "pysaprk" looks like a typo for "pyspark"; the app name is a
# runtime string, so it is left unchanged here.
spark = (
    SparkSession.builder
    .appName("retail_pysaprk_consumer")
    .config("spark.jars", kafka_connector_jars)
    .getOrCreate()
)


23/12/13 15:54:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Schema of one transaction message: column name, data type, nullable flag
schema = (
    StructType()
    .add("store_location", StringType(), True)
    .add("time_of_purchase", TimestampType(), True)
    .add("product_ID", StringType(), True)
    .add("transaction_amount", IntegerType(), True)
)

# Stream from Kafka topic
#
# readStream returns a reader for a streaming source; here the source format
# is "kafka" and we subscribe to a single topic (a topic is a named feed to
# which messages are published). load() returns the streaming DataFrame.
KAFKA_BOOTSTRAP_SERVERS = "b-1.monstercluster1.6xql65.c3.kafka.eu-west-2.amazonaws.com:9092"
KAFKA_TOPIC = "retail_transactions"

df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", KAFKA_TOPIC)
    .load()
)

In [4]:
# Extract and parse the JSON payload of each Kafka message.
#
# selectExpr evaluates SQL expressions: CAST(value AS STRING) turns the Kafka
#   `value` column into a string column.
# withColumn adds (or replaces) a column; here from_json parses that string
#   against `schema` into a struct column named 'data'.
# select("data.*") expands the struct so every schema field is a top-level column.
transactions = (
    df.selectExpr("CAST(value AS STRING)")
    .withColumn("data", from_json(col("value"), schema))
    .select("data.*")
)

# Write the parsed stream to an in-memory table named temporary_view_four.
# The next cell queries that table, so this sink must be started for the
# notebook to survive Restart & Run All.
RAW_VIEW_NAME = "temporary_view_four"

# Reuse the query if this cell is re-run: Spark refuses to start a second
# active query with the same name.
raw_query = next(
    (active for active in spark.streams.active if active.name == RAW_VIEW_NAME),
    None,
)
if raw_query is None:
    raw_query = (
        transactions.writeStream   # writeStream writes a stream; readStream reads one
        .format("memory")          # memory sink: results are queryable as a table
        .queryName(RAW_VIEW_NAME)  # the query name is also the table name
        .start()
    )

# Let the stream run for up to 180 seconds so the table has rows to inspect.
# awaitTermination returns False when the timeout elapses with the query
# still running, which is the expected result here.
raw_query.awaitTermination(180)

23/12/13 15:54:53 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d370429f-7cd6-4928-a0bc-ab3763d70de2. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/12/13 15:54:53 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/12/13 15:54:56 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

False

In [5]:
# Snapshot of the in-memory table. The table only exists once a memory-sink
# streaming query named temporary_view_four has been started in this session.
raw_view_sql = "SELECT * FROM temporary_view_four"
processed_data = spark.sql(raw_view_sql)

# Aggregations or other transformations can now be applied to `processed_data`

In [None]:
import time
import matplotlib.pyplot as plt
from IPython import display

# Live bar chart of total transaction amount per store location.
#
# A streaming aggregation writes its running totals to an in-memory table;
# the loop below polls that table and redraws the chart until the cell is
# interrupted or the stream stops.
AGG_QUERY_NAME = "aggByLocation"
REFRESH_SECONDS = 10  # Seconds between redraws. Adjust this to your needs.


def _plot_totals_by_location(totals_pd):
    """Draw one bar chart of total_amount per store_location.

    Args:
        totals_pd: pandas DataFrame with 'store_location' and 'total_amount'
            columns, as returned by querying the aggregation table.
    """
    # Messages that fail JSON parsing yield a null store_location group;
    # matplotlib's categorical axis rejects non-string labels, so drop it.
    plottable = totals_pd.dropna(subset=['store_location'])

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(plottable['store_location'], plottable['total_amount'], color='skyblue')
    ax.set_title('Total Transaction Amounts by Store Location')
    ax.set_xlabel('Store Location')
    ax.set_ylabel('Total Amount')
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()
    # Release the figure; otherwise every refresh leaves one more open figure.
    plt.close(fig)


# Reuse the aggregation query if this cell is re-run: Spark refuses to start
# a second active query with the same name.
query = next(
    (active for active in spark.streams.active if active.name == AGG_QUERY_NAME),
    None,
)
if query is None:
    query = (
        transactions.groupBy("store_location")
        # F.sum is explicit about using Spark's aggregate, not Python's builtin sum
        .agg(F.sum("transaction_amount").alias("total_amount"))
        .writeStream
        .outputMode("complete")  # re-emit the full set of totals on every trigger
        .format("memory")
        .queryName(AGG_QUERY_NAME)
        .start()
    )

try:
    # Stop polling once the stream is no longer running instead of redrawing
    # stale data forever.
    while query.isActive:
        # Get the latest aggregated data
        agg_by_location_pd = spark.sql(f"SELECT * FROM {AGG_QUERY_NAME}").toPandas()

        # Clear the previous plot only when the replacement is ready
        display.clear_output(wait=True)
        _plot_totals_by_location(agg_by_location_pd)

        time.sleep(REFRESH_SECONDS)
except KeyboardInterrupt:
    # Interrupting the kernel is the intended way to end the live view.
    print("Live view stopped by user.")
finally:
    # Do not leave the streaming query running in the background.
    query.stop()

# If the stream ended because of a failure, surface it rather than exiting silently.
stream_error = query.exception()
if stream_error is not None:
    raise stream_error