'org.apache.spark:spark-sql-kafka-0-10_2.13:3.2.0'

In [2]:
import os
import re

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

def _detect_scala_version(default):
    """Return the Scala binary version (e.g. ``'2.12'``) of the local Spark jars.

    Looks for ``scala-library-<version>.jar`` under ``$SPARK_HOME/jars`` or,
    when ``SPARK_HOME`` is unset, under the installed ``pyspark`` package.

    Args:
        default: Value returned when no ``scala-library`` jar can be found.

    Returns:
        The ``major.minor`` Scala version as a string, or ``default``.
    """
    spark_home = os.environ.get("SPARK_HOME") or os.path.dirname(pyspark.__file__)
    try:
        jar_names = os.listdir(os.path.join(spark_home, "jars"))
    except OSError:
        # No jars directory at the expected location: keep the caller's choice.
        return default
    for jar_name in jar_names:
        found = re.match(r"scala-library-(\d+\.\d+)\.", jar_name)
        if found:
            return found.group(1)
    return default


# The Kafka connector has to be built for the same Spark release and the same
# Scala binary version as the Spark runtime that executes it; a mismatch only
# shows up at runtime as class-loading errors. Derive both from the local
# installation instead of hard-coding them.
# NOTE(review): assumes the JVM side runs the same release as the Python
# `pyspark` package -- confirm if SPARK_HOME points at a separate install.
scala_version = _detect_scala_version(default='2.13')
spark_version = pyspark.__version__

# kafka-clients is intentionally not pinned here: the connector declares the
# kafka-clients version it was built against and it is resolved transitively.
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
]

spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("kafka-example-1")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

spark

In [3]:
# Kafka topic that the streaming query subscribes to.
topic_name = "user-topic"

In [5]:
# Streaming DataFrame over the Kafka topic, read from the earliest offsets on
# the broker at localhost:9092.
df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", topic_name)
    .option("startingOffsets", "earliest")
    # failOnDataLoss is an option of the Kafka *source*, so it must be set on
    # the reader to take effect. "false" keeps the query running when offsets
    # are no longer available (e.g. the topic was deleted and recreated).
    .option("failOnDataLoss", "false")
    .load()
)

In [6]:
from pyspark.sql.types import StructType, StringType

def _string_struct(*field_names):
    """Build a StructType whose fields are all StringType, in the given order."""
    struct = StructType()
    for field_name in field_names:
        struct.add(field_name, StringType())
    return struct


# Nested `location` object: a mix of plain string fields and sub-structs.
location_schema = (
    StructType()
    .add("street", _string_struct("number", "name"))
    .add("city", StringType())
    .add("state", StringType())
    .add("country", StringType())
    .add("postcode", StringType())
    .add("coordinates", _string_struct("latitude", "longitude"))
    .add("timezone", _string_struct("offset", "description"))
)

# Schema of one user record as carried in the Kafka message value.
# Every leaf field is declared as a string.
json_schema = (
    StructType()
    .add("gender", StringType())
    .add("name", _string_struct("title", "first", "last"))
    .add("location", location_schema)
    .add("email", StringType())
    .add(
        "login",
        _string_struct("uuid", "username", "password", "salt", "md5", "sha1", "sha256"),
    )
    .add("dob", _string_struct("date", "age"))
    .add("registered", _string_struct("date", "age"))
    .add("phone", StringType())
    .add("cell", StringType())
    .add("id", _string_struct("name", "value"))
    .add("picture", _string_struct("large", "medium", "thumbnail"))
    .add("nat", StringType())
)



In [7]:
# Cast the Kafka record's `value` column to a string, parse it with
# json_schema, and expand the parsed struct into top-level columns.
parsed_df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), json_schema).alias("parsed_value"))
    .select("parsed_value.*")
)

# Output column name -> source field path inside the parsed record.
flattened_fields = {
    "gender": "gender",
    "first_name": "name.first",
    "last_name": "name.last",
    "city": "location.city",
    "country": "location.country",
    "email": "email",
    "username": "login.username",
    "registered_date": "registered.date",
    "dob": "dob.date",
    "phone": "phone",
}

# Keep only the selected fields, flattened into a single level.
flattened_df = parsed_df.select(
    *[
        col(source_path).alias(output_name)
        for output_name, source_path in flattened_fields.items()
    ]
)

In [8]:
# Append the flattened records as JSON files under ./output, triggering a
# micro-batch every minute and tracking progress in ./checkpoint.
# NOTE(review): failOnDataLoss is documented as a Kafka source option; it
# presumably has no effect when set on this file sink -- confirm.
json_sink = (
    flattened_df.writeStream
    .format("json")
    .outputMode("append")
    .options(
        path="output",
        checkpointLocation="checkpoint",
        failOnDataLoss="false",
    )
    .trigger(processingTime="1 minute")
)
query = json_sink.start()

# Blocks this cell until the streaming query stops or fails.
query.awaitTermination()

Phần bên dưới dùng Python thông thường để chạy consumer thì hoạt động bình thường, nhưng PySpark ở phía trên thì không chạy được => lỗi không phải do Kafka.

In [1]:
import json 
from kafka import KafkaConsumer

# Plain kafka-python consumer used as a sanity check that the topic is
# reachable and contains JSON messages, independently of Spark.
consumer = KafkaConsumer(
    'user-topic',
    bootstrap_servers='localhost:9092',
    auto_offset_reset='earliest',
)
try:
    # Iterating the consumer keeps waiting for new messages; the recorded run
    # of this cell was stopped with a KeyboardInterrupt.
    for message in consumer:
        print(json.loads(message.value))
finally:
    # Always release the broker connection, including when the loop is
    # interrupted.
    consumer.close()

{'gender': 'male', 'name': {'title': 'Mr', 'first': 'Otfried', 'last': 'Sternberg'}, 'location': {'street': {'number': 6784, 'name': 'Kirchweg'}, 'city': 'Neustadt In Holstein', 'state': 'Thüringen', 'country': 'Germany', 'postcode': 11479, 'coordinates': {'latitude': '-52.8793', 'longitude': '-60.2213'}, 'timezone': {'offset': '+11:00', 'description': 'Magadan, Solomon Islands, New Caledonia'}}, 'email': 'otfried.sternberg@example.com', 'login': {'uuid': 'bdfa89f0-ccc5-4543-83af-e960a221a9eb', 'username': 'redzebra942', 'password': 'chandler', 'salt': 'WVWwvtnc', 'md5': '9160654250118b8258cd241ee7abd599', 'sha1': 'e65004e56a5300a2d64bde979ed387c826ebad64', 'sha256': 'c081d19197b46fa2f06faef1322f9215a135f52ea653b1f5f826f41fa61b87cf'}, 'dob': {'date': '1977-08-09T18:57:00.318Z', 'age': 46}, 'registered': {'date': '2010-10-31T05:15:25.088Z', 'age': 13}, 'phone': '0611-1659787', 'cell': '0173-7569607', 'id': {'name': 'SVNR', 'value': '45 090877 S 066'}, 'picture': {'large': 'https://rando

KeyboardInterrupt: 