In [1]:
import json
import uuid
import os
import json
from dotenv import load_dotenv
from pathlib import Path
from kafka import KafkaProducer
from faker import Faker
from time import sleep

In [2]:
from pyspark.sql import SparkSession

spark = (
    SparkSession 
    .builder 
    .appName("Dibimbing Spark-Kafka") 
    .config("spark.streaming.stopGracefullyOnShutdown", True) 
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2')
    .config("spark.sql.shuffle.partitions", 4)
    .master("local[*]") 
    .getOrCreate()
)

spark

In [3]:
dotenv_path = Path('/resources/.env')
load_dotenv(dotenv_path=dotenv_path)

True

In [4]:
kafka_host = os.getenv("KAFKA_HOST")
kafka_topic = "dibimbing_assignment_2"

# Batch Simulation

In [5]:
kafka_df = (
    spark
    .read
    .format("kafka")
    .option("kafka.bootstrap.servers", f'{kafka_host}:9092')
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
)

In [6]:
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
kafka_df.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 65 76 65 6...|dibimbing_assignm...|        0|     0|2025-01-26 14:33:...|            0|
|null|[7B 22 65 76 65 6...|dibimbing_assignm...|        0|     1|2025-01-26 14:33:...|            0|
|null|[7B 22 65 76 65 6...|dibimbing_assignm...|        0|     2|2025-01-26 14:33:...|            0|
|null|[7B 22 65 76 65 6...|dibimbing_assignm...|        0|     3|2025-01-26 14:33:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+



In [8]:
from pyspark.sql.functions import expr

kafka_json_df = kafka_df.withColumn("value", expr("cast(value as string)"))

In [9]:
kafka_json_df.show(5)

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|{"event_id": "59a...|dibimbing_assignm...|        0|     0|2025-01-26 14:33:...|            0|
|null|{"event_id": "f75...|dibimbing_assignm...|        0|     1|2025-01-26 14:33:...|            0|
|null|{"event_id": "63f...|dibimbing_assignm...|        0|     2|2025-01-26 14:33:...|            0|
|null|{"event_id": "78a...|dibimbing_assignm...|        0|     3|2025-01-26 14:33:...|            0|
+----+--------------------+--------------------+---------+------+--------------------+-------------+



In [10]:
(
    kafka_json_df
    .select('value')
    .limit(5)
    .collect()
)

[Row(value='{"event_id": "59a3ecd1-e4a3-4212-8cf7-8b15b1cca7a5", "user_id": "e723bd68-7b49-4698-be5b-fa8f5b932258", "item_id": "ebd6b464-c5e5-47ab-a3ed-0ec9c036b9b8", "item_name": "Follow", "category": "Books", "quantity": 4, "price": 23, "payment_method": "cash", "ts": 1737898526}'),
 Row(value='{"event_id": "f757f668-051f-483e-a45b-80babf8cdbd7", "user_id": "c8cfe2e3-637b-484a-8682-c5d2fc8974ee", "item_id": "66f5f2ce-b70d-4111-a9a7-fdbe938ab8d5", "item_name": "Writer", "category": "Home Goods", "quantity": 3, "price": 90, "payment_method": "cash", "ts": 1737901624}'),
 Row(value='{"event_id": "63fefc0e-f97e-40ec-a708-c7b8017fc941", "user_id": "5de85378-aee4-4341-a55c-cb62f0fd320d", "item_id": "f3ae3a3f-bb65-497f-8ec6-4574a9c81858", "item_name": "Floor", "category": "Electronics", "quantity": 1, "price": 22, "payment_method": "cash", "ts": 1737901167}'),
 Row(value='{"event_id": "78a53967-554b-4d0b-8b8e-833ea3a96f2a", "user_id": "fc9ba3eb-5111-4dee-8939-00c7f7358b7a", "item_id": "bdff

In [11]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

schema = StructType(
    [
        StructField("event_id", StringType(), True),
        StructField("user_id", StringType(), True),
        StructField("item_id", StringType(), True),
        StructField("item_name", StringType(), True),
        StructField("category", StringType(), True),
        StructField("quantity", IntegerType(), True),
        StructField("price", IntegerType(), True),
        StructField("payment_method", StringType(), True),
        StructField("ts", LongType(), True),
    ]
)

In [12]:
from pyspark.sql.functions import from_json, col

(
    kafka_json_df
    .select(
        from_json(col("value"), schema)
        .alias("data")
    )
    .select("data.*")
    .show()
)

+--------------------+--------------------+--------------------+---------+-----------+--------+-----+--------------+----------+
|            event_id|             user_id|             item_id|item_name|   category|quantity|price|payment_method|        ts|
+--------------------+--------------------+--------------------+---------+-----------+--------+-----+--------------+----------+
|59a3ecd1-e4a3-421...|e723bd68-7b49-469...|ebd6b464-c5e5-47a...|   Follow|      Books|       4|   23|          cash|1737898526|
|f757f668-051f-483...|c8cfe2e3-637b-484...|66f5f2ce-b70d-411...|   Writer| Home Goods|       3|   90|          cash|1737901624|
|63fefc0e-f97e-40e...|5de85378-aee4-434...|f3ae3a3f-bb65-497...|    Floor|Electronics|       1|   22|          cash|1737901167|
|78a53967-554b-4d0...|fc9ba3eb-5111-4de...|bdffacf6-689c-47a...|  Service|      Books|       5|   90|   credit_card|1737898595|
+--------------------+--------------------+--------------------+---------+-----------+--------+-----+---

# Stream Simulation

In [13]:
# read from kafka_topic
kafka_df = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", f'{kafka_host}:9092')
    .option("subscribe", kafka_topic)
    .option("startingOffsets", "earliest")
    .load()
)

In [14]:
from pyspark.sql.functions import from_json, col

parsed_df = (
    kafka_df
    .withColumn("value", expr("cast(value as string)"))
    .select(
        from_json(col("value"), schema)
        .alias("data")
    )
    .select("data.*")
)

In [15]:
from pyspark.sql.functions import col, from_unixtime
# change unix to timestamp
parsed_df_with_timestamp = (
    parsed_df
    .withColumn("event_time", from_unixtime(col("ts")).cast("timestamp"))
)

In [16]:
from pyspark.sql.functions import window, count
aggregated_df = (
    parsed_df_with_timestamp
    .withWatermark("event_time", "60 minutes") # data can be late up to 1 hour
    .groupBy(window(col("event_time"), "1 day", "1 day")) # to make it daily
    .agg(count("*").alias("total_records")) # count records
)

In [17]:
(
    aggregated_df
    .writeStream
    .outputMode("complete")
    .format("console")
    .trigger(processingTime='5 minutes')
    .option("checkpointLocation", "checkpoint_dir_3")
    .start()
    .awaitTermination()
)

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 