In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session with the two settings the streaming job needs:
#   * spark.jars.packages pulls in the Structured Streaming Kafka connector
#     (spark-sql-kafka-0-10); see the matching integration guide:
#     https://spark.apache.org/docs/3.0.1/structured-streaming-kafka-integration.html
#   * spark.sql.streaming.checkpointLocation sets a default checkpoint directory;
#     per the original author's note, streaming jobs do not work without one.
# NOTE(review): getOrCreate() may hand back an already-running session, in which
# case the packages setting might not be applied -- confirm on a fresh kernel.
spark = (
    SparkSession.builder
    .config("spark.sql.streaming.checkpointLocation", "checkpoints")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1")
    .getOrCreate()
)

In [2]:
# Subscribe to the Kafka topic "input_topic" on the local broker as a stream.
# Author's note: this is expected to work with any Kafka distribution, but
# against Confluent it hung at the "getting offsets" stage.
input_topic = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "input_topic",
    })
    .load()
)

In [6]:
from pyspark.sql.functions import col
from pyspark.sql.types import StringType

# Two example pipelines -- one without aggregation (commented out) and one with.
# The streaming source is transformed with the ordinary DataFrame API.
transformations = (
    # Without aggregation:
    #    input_topic.selectExpr("value || value AS value")
    # With aggregation: global row count, cast to a string column named "value".
    input_topic.groupBy().count()
    .select(col("count").cast(StringType()).alias("value"))
)

In [7]:
# Output mode depends on the pipeline: "append" for the non-aggregating example,
# "complete" for the aggregating one (used here).
# The sink is Kafka, but a file or the console can be used as well.
job = (
    transformations.writeStream
    .outputMode("complete")
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "topic": "output_topic",
    })
    .start()
)

In [8]:
# Stop the streaming query held in `job`.
job.stop()