In [1]:
from pyspark.sql import SparkSession

In [2]:
# App 2
spark = (SparkSession.builder
         .appName("KafkaStructuredStreaming")
         .config("spark.executor.cores", "4")
         .config("spark.cores.max", "4")
         .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/13 10:59:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/07/13 10:59:27 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/13 10:59:27 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "broker:29092") \
    .option("subscribe", "demo.ecomm.users") \
    .option("startingOffsets", "earliest") \
    .load()



In [4]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType

# 1. Define schema for Debezium JSON
debezium_schema = StructType([
    StructField("before", StructType([
        StructField("id", IntegerType()),
        StructField("username", StringType()),
        StructField("email", StringType()),
        StructField("password", StringType()),
        StructField("created_at", StringType())
    ]), True),
    StructField("after", StructType([
        StructField("id", IntegerType()),
        StructField("username", StringType()),
        StructField("email", StringType()),
        StructField("password", StringType()),
        StructField("created_at", StringType())
    ]), True),
    StructField("source", StructType([
        StructField("db", StringType()),
        StructField("table", StringType())
    ]), True),
    StructField("op", StringType())
])

# 2. Cast value to string and parse JSON
df_string = df.selectExpr("CAST(value AS STRING) as json_str")

parsed_df = df_string.select(from_json(col("json_str"), debezium_schema).alias("data"))

# 3. Flatten to select only 'after' data (i.e., inserted/updated row)
flattened = parsed_df.select("data.after.*", "data.op")

# 4. Show in console
query = flattened.writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", "false") \
    .start()


25/07/13 10:59:35 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-112db72a-e488-4df1-bf21-22a5a16331f5. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/07/13 10:59:35 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/07/13 10:59:35 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+--------+-----+--------+----------+----+
|id  |username|email|password|created_at|op  |
+----+--------+-----+--------+----------+----+
|null|null    |null |null    |null      |null|
|null|null    |null |null    |null      |null|
|null|null    |null |null    |null      |null|
+----+--------+-----+--------+----------+----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+----+--------+-----+--------+----------+----+
|id  |username|email|password|created_at|op  |
+----+--------+-----+--------+----------+----+
|null|null    |null |null    |null      |null|
|null|null    |null |null    |null      |null|
|null|null    |null |null    |null      |null|
+----+--------+-----+--------+----------+----+

