# Initialization

In [1]:
import json
import uuid
import os
from dotenv import load_dotenv
from pathlib import Path
from kafka import KafkaProducer
from faker import Faker
from time import sleep

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores.
# The Kafka SQL connector is fetched through `spark.jars.packages`.
# NOTE(review): the connector coordinates (Scala 2.12 / 3.3.2) presumably have
# to match the installed Spark build - confirm against the cluster image.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Dibimbing Spark-Kafka")
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2')
    .config("spark.sql.shuffle.partitions", 4)
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

# Spark - Kafka Streaming

In [3]:
# Load the Kafka connection settings from the shared .env file into the
# process environment; the cell output is load_dotenv's boolean return value.
dotenv_path = Path('/resources/.env')
load_dotenv(dotenv_path)

True

In [4]:
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here avoids
            a confusing `TypeError` on string concatenation, or a bootstrap
            server of "None:9092" further down the notebook.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set; "
            "check that the .env file was loaded."
        )
    return value


# Kafka connection settings, read from the environment.
kafka_host = _require_env('KAFKA_HOST')
kafka_topic = _require_env('KAFKA_TOPIC_NAME')
# Topic name with a "-1" suffix. Not referenced by the other visible cells.
kafka_topic_partition = f'{kafka_topic}-1'

In [5]:
# One-off batch read of the topic, starting from the earliest offsets, used to
# inspect the raw Kafka records. Note that `kafka_df` is rebound to a streaming
# DataFrame in the "Stream Simulation" section further down.
kafka_df = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": f'{kafka_host}:9092',
        "subscribe": kafka_topic,
        "startingOffsets": "earliest",
    })
    .load()
)

In [6]:
# Print the schema of the batch Kafka DataFrame to see which columns the source exposes.
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
# Preview rows from the batch read; `value` is still raw binary at this point.
kafka_df.show()

+----+--------------------+--------------------+---------+------+--------------------+-------------+
| key|               value|               topic|partition|offset|           timestamp|timestampType|
+----+--------------------+--------------------+---------+------+--------------------+-------------+
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     0|2025-01-26 08:21:...|            0|
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     1|2025-01-26 08:21:...|            0|
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     2|2025-01-26 08:21:...|            0|
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     3|2025-01-26 08:21:...|            0|
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     4|2025-01-26 08:21:...|            0|
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     5|2025-01-26 08:21:...|            0|
|null|[7B 22 74 72 61 6...|streaming-assignm...|        0|     6|2025-01-26 08:21:...|     

In [8]:
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    LongType,
    TimestampType,
)

# Schema applied to the JSON payload carried in the Kafka `value` column.
# Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("transaction_id", StringType()),
        ("customer_name", StringType()),
        ("store_name", StringType()),
        ("location", StringType()),
        ("product_type", StringType()),
        ("product_model", StringType()),
        ("payment", StringType()),
        ("merchant", StringType()),
        ("quantity", IntegerType()),
        ("unit_price", IntegerType()),
        ("total_price", IntegerType()),
        ("transaction_date", StringType()),
        ("ts", TimestampType()),
    ]
])


## Stream Simulation

In [9]:
# Streaming read of the same topic, starting from the earliest offsets.
# This rebinds `kafka_df`, which previously held the batch DataFrame.
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": f'{kafka_host}:9092',
        "subscribe": kafka_topic,
        "startingOffsets": "earliest",
    })
    .load()
)

In [10]:
from pyspark.sql.functions import from_json, col, expr

# Decode the binary Kafka payload to a string, parse it as JSON using `schema`,
# then flatten the resulting struct so each JSON field becomes its own column.
parsed_df = (
    kafka_df
    .select(expr("cast(value as string)").alias("value"))
    .select(from_json("value", schema).alias("data"))
    .select("data.*")
)

# Confirm the parsed columns and their types.
parsed_df.printSchema()

root
 |-- transaction_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- store_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- product_type: string (nullable = true)
 |-- product_model: string (nullable = true)
 |-- payment: string (nullable = true)
 |-- merchant: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- unit_price: integer (nullable = true)
 |-- total_price: integer (nullable = true)
 |-- transaction_date: string (nullable = true)
 |-- ts: timestamp (nullable = true)



In [11]:
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import to_timestamp, window
from pyspark.sql.window import Window

# Declare a 60-minute watermark on the event-time column `ts`.
parsed_df_with_watermark = parsed_df.withWatermark("ts", "60 minutes")

# Sum `total_price` per 5-minute event-time window and expose the window
# bounds as plain `start` / `end` columns next to the total.
aggregated_df = (
    parsed_df_with_watermark
    .groupBy(F.window("ts", "5 minute"))  # 5-minute window interval
    .agg(F.sum("total_price").alias("total_sales"))  # sum of total_price
    .select("window.start", "window.end", "total_sales")
)

In [12]:
# Directory passed as `checkpointLocation` to the streaming query below.
# NOTE(review): lives under /tmp, so presumably not durable across container
# restarts - confirm this is intended.
checkpoint_location = '/tmp/spark/checkpoints'

In [None]:
# Write the windowed totals to the console using "complete" output mode.
# NOTE(review): in complete mode Spark keeps the whole aggregate result, so the
# watermark declared upstream presumably does not bound the state - confirm
# whether "update" mode would be a better fit.
query = (
    aggregated_df.writeStream
    .outputMode("complete")  # emit the full aggregate table on every trigger
    .format("console")  # print results to the console
    .trigger(processingTime="5 minutes")  # fire a micro-batch every 5 minutes
    .option("checkpointLocation", checkpoint_location)  # checkpoint directory
    .start()
)

# Block until the stream terminates. The `finally` clause stops the query when
# the wait ends for any reason (including a kernel interrupt), so it does not
# keep running in the background and holding the checkpoint directory.
try:
    query.awaitTermination()
finally:
    query.stop()