In [1]:
import os

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session and teach it how to talk to Kafka.
spark = SparkSession.builder.config(
    # Pull the Structured Streaming Kafka connector from Maven at start-up, see
    # https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html
    # Coordinates are group:artifact_<scala version>:<spark version>.
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.2",
).config(
    # A checkpoint directory is mandatory: without it the streaming job
    # simply won't work. Setting it on the session provides a default location.
    "spark.sql.streaming.checkpointLocation",
    "checkpoints",
).getOrCreate()

:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e696dc1c-05f5-443d-8260-4477f1e784d9;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.luben#zstd-jni;1.4.8-1 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.2 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
:: resolution report :: resolve 532ms :: artifacts dl 10ms
	:: modules in use:
	com.github.luben#zstd-jni;1.4.8-1 from central in [default]
	org.apache.commons#commons-pool2;2.6.2 from central in [default]


In [2]:
# Subscribe to `input_topic` as a streaming DataFrame.
#
# The cluster credentials are read from the environment rather than written
# into the notebook: notebooks get shared and committed, and any secret typed
# into a cell leaks with them.
#   KAFKA_API_KEY    - API key of the cluster
#   KAFKA_API_SECRET - matching API secret
# A missing variable raises KeyError here, before any connection is attempted.
kafka_api_key = os.environ["KAFKA_API_KEY"]
kafka_api_secret = os.environ["KAFKA_API_SECRET"]

input_topic = (
    spark.readStream.format("kafka")
    .option("subscribe", "input_topic")
    # this should come from the cluster configuration page
    .option(
        "kafka.bootstrap.servers",
        "pkc-4r297.europe-west1.gcp.confluent.cloud:9092",
    )
    # these two lines are similar to configuring `kafka-python`
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.mechanism", "PLAIN")
    # JAAS login entry carrying the API key and secret of the cluster
    # see more in the documentation https://kafka.apache.org/documentation/#security_jaas_broker
    .option(
        "kafka.sasl.jaas.config",
        "org.apache.kafka.common.security.plain.PlainLoginModule required "
        f'username="{kafka_api_key}" password="{kafka_api_secret}";',
    )
    .load()
)

In [3]:
from pyspark.sql.functions import col

# The DataFrame API is nearly identical for streams and for files on disk.
transformations = input_topic.select(
    (
        # By default the streaming DataFrame has a `value` column holding raw
        # bytes: decode it to a string, parse it as a float, then double it.
        col("value").cast("string").cast("float") * 2
    )
    # To save the data, convert it back to a string (Spark will turn it into
    # bytes for us) and name it `value`: only `value` and the other
    # Kafka-related columns are written back.
    .cast("string")
    .alias("value")
)

In [6]:
# Describe how the transformed stream is written back to Kafka.
#
# The cluster credentials are read from the environment rather than written
# into the notebook: notebooks get shared and committed, and any secret typed
# into a cell leaks with them.
#   KAFKA_API_KEY    - API key of the cluster
#   KAFKA_API_SECRET - matching API secret
# A missing variable raises KeyError here, before any connection is attempted.
kafka_api_key = os.environ["KAFKA_API_KEY"]
kafka_api_secret = os.environ["KAFKA_API_SECRET"]

output_topic = (
    transformations
    .writeStream
    .format("kafka")
    .option("topic", "output_topic")
    # connection and authentication options mirror the ones used for reading
    .option(
        "kafka.bootstrap.servers",
        "pkc-4r297.europe-west1.gcp.confluent.cloud:9092",
    )
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.mechanism", "PLAIN")
    # JAAS login entry carrying the API key and secret of the cluster
    .option(
        "kafka.sasl.jaas.config",
        "org.apache.kafka.common.security.plain.PlainLoginModule required "
        f'username="{kafka_api_key}" password="{kafka_api_secret}";',
    )
)

In [7]:
# The most important part: `start()` is the action that launches the query.
# Everything defined before this point was only a transformation.
job = output_topic.start()

21/11/10 21:06:49 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.

In [8]:
# The job runs asynchronously: simply changing its definition and starting
# it again won't kill the query that is already running.
# Stop your jobs gracefully. Spark may log "aborting" / "TaskKilled" messages
# for the batch that was in flight when the query is stopped.
job.stop()

21/11/10 21:08:53 ERROR org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@503a9df2 is aborting.
21/11/10 21:08:53 ERROR org.apache.spark.sql.execution.datasources.v2.WriteToDataSourceV2Exec: Data source write support org.apache.spark.sql.execution.streaming.sources.MicroBatchWrite@503a9df2 aborted.
21/11/10 21:08:53 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 158.0 in stage 97.0 (TID 9827) (cluster-b2c2-w-0.europe-west1-c.c.aiqdsc22.internal executor 1): TaskKilled (Stage cancelled)


# Do it Yourself

* compute the average value and the number of values
* inside a 10-second window
* output the window description and the computation results to Kafka