'org.apache.spark:spark-sql-kafka-0-10_2.13:3.2.0'

In [1]:
import os
import re
from glob import glob

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

def _detect_scala_version(default='2.13'):
    """Return the Scala binary version (e.g. '2.12') of the local Spark install.

    Looks for ``scala-library-<version>.jar`` under ``$SPARK_HOME/jars``, or
    under the installed ``pyspark`` package when ``SPARK_HOME`` is unset.
    Falls back to ``default`` when no such jar can be found.
    """
    spark_home = os.environ.get('SPARK_HOME') or os.path.dirname(pyspark.__file__)
    jar_pattern = os.path.join(spark_home, 'jars', 'scala-library-*.jar')
    for jar_path in sorted(glob(jar_pattern)):
        found = re.match(r'scala-library-(\d+\.\d+)\.', os.path.basename(jar_path))
        if found:
            return found.group(1)
    return default


# The Kafka connector must match BOTH the Scala build and the exact version of
# the Spark runtime; a mismatch fails at runtime with class-loading errors.
# Derive both from the local install instead of hard-coding them.
scala_version = _detect_scala_version()
spark_version = pyspark.__version__

# Only the connector is requested: it pulls in the kafka-clients version it was
# built against transitively, so pinning kafka-clients separately is redundant
# and risks a version conflict.
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
]

# NOTE: `spark.jars.packages` is only honoured when the JVM is first started;
# restart the kernel if a SparkSession already exists in this process.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("kafka-example-1")
    .config("spark.jars.packages", ",".join(packages))
    .getOrCreate()
)

spark

In [2]:
# Name of the Kafka topic the streaming reader subscribes to.
topic_name = "user-topic"

In [3]:
# Flat user record: every column is a required (non-nullable) string.
_user_columns = (
    "id",
    "first_name",
    "last_name",
    "gender",
    "address",
    "post_code",
    "email",
    "username",
    "registered_date",
    "phone",
    "picture",
)

schema = StructType(
    [StructField(column, StringType(), False) for column in _user_columns]
)


In [4]:
# Streaming DataFrame over the Kafka topic, reading from the earliest
# available offsets.
df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": topic_name,
        "startingOffsets": "earliest",
    })
    .load()
)

In [7]:
from pyspark.sql.types import StructType, StringType

def _struct_from_spec(spec):
    """Build a StructType from a nested dict.

    Keys are field names in declaration order. A ``None`` value marks a
    nullable string leaf; a dict value is converted recursively into a
    nullable nested struct.
    """
    fields = []
    for field_name, nested in spec.items():
        data_type = StringType() if nested is None else _struct_from_spec(nested)
        fields.append(StructField(field_name, data_type))
    return StructType(fields)


# Layout of the JSON payload carried in the Kafka message value.
# Every leaf is read as a string.
_user_json_spec = {
    "gender": None,
    "name": {"title": None, "first": None, "last": None},
    "location": {
        "street": {"number": None, "name": None},
        "city": None,
        "state": None,
        "country": None,
        "postcode": None,
        "coordinates": {"latitude": None, "longitude": None},
        "timezone": {"offset": None, "description": None},
    },
    "email": None,
    "login": {
        "uuid": None,
        "username": None,
        "password": None,
        "salt": None,
        "md5": None,
        "sha1": None,
        "sha256": None,
    },
    "dob": {"date": None, "age": None},
    "registered": {"date": None, "age": None},
    "phone": None,
    "cell": None,
    "id": {"name": None, "value": None},
    "picture": {"large": None, "medium": None, "thumbnail": None},
    "nat": None,
}

json_schema = _struct_from_spec(_user_json_spec)


In [8]:
# Decode the Kafka value bytes as text, parse it with `json_schema`, and
# promote the parsed struct's fields to top-level columns.
parsed_df = (
    df.selectExpr("CAST(value AS STRING)")
    .select(from_json(col("value"), json_schema).alias("parsed_value"))
    .select("parsed_value.*")
)

# (source path, output column name); None keeps the source name unchanged.
_selected_fields = [
    ("gender", None),
    ("name.first", "first_name"),
    ("name.last", "last_name"),
    ("location.city", "city"),
    ("location.country", "country"),
    ("email", None),
    ("login.username", "username"),
    ("dob.date", "dob"),
    ("phone", None),
]

flattened_df = parsed_df.select(
    [
        col(path) if alias is None else col(path).alias(alias)
        for path, alias in _selected_fields
    ]
)

In [10]:
# Append the flattened records as JSON files under `output`, triggering a
# micro-batch once per minute. Progress is tracked in `checkpoint`.
query = (
    flattened_df.writeStream
    .format("json")
    .outputMode("append")
    .option("path", "output")
    .option("checkpointLocation", "checkpoint")
    # NOTE(review): failOnDataLoss is documented as a Kafka *source* option;
    # set here on the file sink it presumably has no effect - confirm and move
    # it to the readStream options if tolerating data loss is intended.
    .option("failOnDataLoss", "false")
    .trigger(processingTime="1 minute")
    .start()
)

# awaitTermination() blocks this cell until the query ends or fails. Without
# the `finally`, interrupting the kernel would only raise KeyboardInterrupt
# here and leave the streaming query running in the background.
try:
    query.awaitTermination()
finally:
    query.stop()

Phần bên dưới dùng Python thuần để chạy consumer thì hoạt động bình thường, nhưng PySpark ở phía trên thì không chạy được => lỗi không phải do Kafka.

In [3]:
# Reference check (disabled): a plain kafka-python consumer that prints each
# message decoded as JSON, reading from the earliest offset.
# NOTE(review): this subscribes to 'test_topic', whereas the Spark reader in
# this notebook subscribes to 'user-topic' - confirm both point at the same
# topic before comparing their results.
# import json 
# from kafka import KafkaConsumer

# consumer = KafkaConsumer(
#         'test_topic',
#         bootstrap_servers='localhost:9092',
#         auto_offset_reset='earliest'
#     )
# for message in consumer:
#     print(json.loads(message.value))

{'gender': 'male', 'name': {'title': 'Mr', 'first': 'Vir', 'last': 'Dragan'}, 'location': {'street': {'number': 9501, 'name': 'Holmogorskiy provulok'}, 'city': 'Balakliya', 'state': 'Zaporizka', 'country': 'Ukraine', 'postcode': 76335, 'coordinates': {'latitude': '75.2747', 'longitude': '-94.4065'}, 'timezone': {'offset': '+7:00', 'description': 'Bangkok, Hanoi, Jakarta'}}, 'email': 'vir.dragan@example.com', 'login': {'uuid': '24dfe04b-7f92-4f9c-b7b3-5c22b674c483', 'username': 'redduck427', 'password': 'homeboy', 'salt': 'E7Jk7LV7', 'md5': '69a26b14e67acca97e2984cbd7e4f246', 'sha1': 'f83593362a0af967c7ec7e03fc3b8a30d45ffaad', 'sha256': '258c9ddd6291d61ecde5834fb1b870dd83aa2ca7233c23a141c1c67789bb0706'}, 'dob': {'date': '1989-11-21T13:20:45.539Z', 'age': 34}, 'registered': {'date': '2005-02-11T04:32:25.610Z', 'age': 19}, 'phone': '(098) L43-7252', 'cell': '(068) S83-0135', 'id': {'name': '', 'value': None}, 'picture': {'large': 'https://randomuser.me/api/portraits/men/14.jpg', 'medium':

KeyboardInterrupt: 