In [1]:
from pyspark import SparkConf, __version__ as pyspark_version
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [2]:
# Derive the "major.minor" Spark line from the installed PySpark version.
# Taking the first two components keeps this correct for four-part versions
# such as "4.0.0.dev1", which would otherwise yield an invalid artifact
# version like "4.0.0.0" below.
main_pyspark_version = ".".join(pyspark_version.split(".")[:2])
# Kafka bootstrap server(s) in "host:port" form.
KAFKA_HOST = "localhost:9094"
# Maven coordinates of the Spark/Kafka connector (Scala 2.12 build), pinned to
# the ".0" patch release of the installed Spark major.minor line.
packages = f"org.apache.spark:spark-sql-kafka-0-10_2.12:{main_pyspark_version}.0"

In [3]:
def get_spark_session(app_name: str) -> SparkSession:
    """Build (or reuse) a Hive-enabled SparkSession.

    The session is configured with the module-level ``packages`` value as
    ``spark.jars.packages`` and a fixed warehouse directory.

    Args:
        app_name: Name passed to the builder's ``appName``.

    Returns:
        The session returned by ``getOrCreate()``.
    """
    # Insertion order is preserved, so options are applied in the order listed.
    session_options = {
        "spark.jars.packages": packages,
        "spark.sql.warehouse.dir": "file:///spark-warehouse",
    }

    session_builder = SparkSession.builder.appName(app_name)
    for option_name, option_value in session_options.items():
        session_builder = session_builder.config(option_name, option_value)

    return session_builder.enableHiveSupport().getOrCreate()

In [4]:
def read_data(topic_name):
    """Open a streaming DataFrame over a Kafka topic.

    Relies on the module-level ``spark`` session and ``KAFKA_HOST`` constant,
    both of which must be defined before this function is called.

    Args:
        topic_name: Name of the Kafka topic to subscribe to.

    Returns:
        The streaming DataFrame produced by the Kafka source's ``load()``.
    """
    kafka_options = {
        # Use the shared constant rather than repeating the broker address.
        "kafka.bootstrap.servers": KAFKA_HOST,
        "subscribe": f"{topic_name}",
        # Read the topic from the beginning; use "latest" for new records only.
        "startingOffsets": "earliest",
        # No key/value deserializer options here: the Spark Kafka source does
        # not accept them and always yields binary key/value columns. Cast the
        # columns with DataFrame operations instead.
    }
    return spark.readStream.format("kafka").options(**kafka_options).load()

In [5]:
def process_json(topic_name="test"):
    """Stream a Kafka topic's values, cast to strings, to the console sink.

    NOTE(review): despite the name, no JSON parsing happens here; the raw
    value is only cast to a string. Add ``from_json`` with an explicit schema
    if structured columns are needed.

    Args:
        topic_name: Kafka topic to read from. Defaults to ``"test"``.

    Returns:
        The started streaming query, so the caller can ``stop()`` it or call
        ``awaitTermination()``.
    """
    raw_stream = read_data(topic_name)

    # Kafka delivers the payload as binary; expose it as text.
    values = raw_stream.selectExpr("CAST(value AS STRING)")

    # Configure exactly one sink. Chaining foreach() after format("console")
    # replaces the console sink, so the format and "truncate" settings would
    # be silently ignored.
    return (
        values.writeStream
        .outputMode("append")
        .format("console")
        .option("truncate", "false")
        .start()
    )

    

In [6]:
# Create the session that read_data() picks up through the global name `spark`.
spark = get_spark_session(app_name="kafka-spark-example")

In [None]:
# Start the streaming query defined in process_json(); requires `spark` to exist.
process_json()