In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Spark configuration for this notebook: pulls in the Kafka connector for
# Structured Streaming and sets a default checkpoint location for streaming queries.
config = (
    SparkConf()
    .setAppName('Spark Streaming')
    .set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.0")
    .set("spark.sql.streaming.checkpointLocation", "hdfs:///user/root/checkpoint")
)

# getOrCreate() keeps this cell re-runnable: constructing SparkContext directly
# raises when a context is already active in the kernel.
# NOTE: when an existing context is reused, launch-time settings such as
# spark.jars.packages are not re-applied; restart the kernel to change them.
sc = SparkContext.getOrCreate(conf=config)
spark = SparkSession.builder.config(conf=config).getOrCreate()

# Display the session summary as the cell output.
spark

2021-10-24 01:00:35,983 WARN util.Utils: Your hostname, localhost resolves to a loopback address: 127.0.0.1; using 172.22.0.5 instead (on interface eth0)
2021-10-24 01:00:35,983 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-68595fb9-9853-4183-9dc4-09da75ac10d5;1.0
	confs: [default]


:: loading settings :: url = jar:file:/opt/spark-3.2.0-bin-without-hadoop/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.2.0 in central
	found org.apache.kafka#kafka-clients;2.8.0 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.8.4 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.1 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.hadoop#hadoop-client-api;3.3.1 in central
	found org.apache.htrace#htrace-core4;4.1.0-incubating in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/3.2.0/spark-sql-kafka-0-10_2.12-3.2.0.jar ...
	[SUCCESSFUL ] org.apache.spark#spark-sql-kafka-0-10_2.12;3.2.0!spark-sql-kafka-0-10_2.12.jar (507ms)
downloadin

### Import some Spark functions

In [2]:
from pyspark.sql.functions import lit, col

## Structured Streaming

### Streaming Queries

In [3]:
# Streaming source: subscribe to a single Kafka topic and include record headers.
df = (
    spark.readStream.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka-broker:29092",
        "subscribe": "quickstart-events",
        "startingOffsets": "latest",
        "minPartitions": "10",
        "failOnDataLoss": "true",
        "includeHeaders": "true",
    })
    .load()
)

In [None]:
# Streaming source: subscribe to every topic whose name matches a pattern.
df = (
    spark.readStream.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka-broker:29092",
        "subscribePattern": "topic.*",
    })
    .load()
)

#### Sink

In [4]:
# Write key-value data from a DataFrame to a specific Kafka topic specified in an option.
#
# start() returns a query handle, not a DataFrame. It is bound to its own name
# so that `df` keeps referring to the streaming DataFrame; rebinding `df` to the
# handle made any later `df.selectExpr(...)` fail and left no clear way to stop
# the stream. Call `events_query.stop()` to end it.
events_query = (
    df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "headers")
    # Replace every record's payload with a constant before publishing.
    .withColumn("value", lit("changing events value"))
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .option("topic", "another-topic")
    .start()
)

                                                                                

In [None]:
# Write key-value data from a DataFrame to Kafka using a topic specified in the data.
#
# start() returns a query handle, not a DataFrame, so it is bound to its own
# name instead of overwriting `df`. Call `topic_query.stop()` to end the stream.
#
# NOTE(review): the `topic` column selected here is the topic each record was
# read from, so records are presumably published back to their source topic.
# If the source subscription covers that topic this may feed on its own
# output; confirm this is intended.
topic_query = (
    df
    .selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    # Replace every record's payload with a constant before publishing.
    .withColumn("value", lit("outro valor"))
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .start()
)

### Batch Queries

In [5]:
# Batch source: read one Kafka topic (headers included) from the earliest
# to the latest available offsets.
df = (
    spark.read.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka-broker:29092",
        "subscribe": "quickstart-events",
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
        "includeHeaders": "true",
    })
    .load()
)

# Cast key and value to strings so they are readable in the printed table.
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "headers").show()

                                                                                

+----+--------------------+-------+
| key|               value|headers|
+----+--------------------+-------+
|null|            testando|   null|
|null|          novo texto|   null|
|null|       aqui vai dado|   null|
|null|                  oi|   null|
|null|                bora|   null|
|null|                 vai|   null|
|null|          tudo novo?|   null|
|null|           ola,mundo|   null|
|null|             eu,jose|   null|
|null|         aqui,denovo|   null|
|null|E la vamos nos de...|   null|
|null|    testando mais um|   null|
|null|                e ai|   null|
|null|                 foi|   null|
|null|            e agora?|   null|
|null|            so isto?|   null|
+----+--------------------+-------+



In [None]:
# Batch source: read several topics with explicit per-partition offsets.
# Per the Kafka integration guide, -2 stands for "earliest" and -1 for "latest"
# in these JSON offset specifications.
df = (
    spark.read.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka-broker:29092",
        "subscribe": "topic1,topic2",
        "startingOffsets": '{"topic1":{"0":23,"1":-2},"topic2":{"0":-2}}',
        "endingOffsets": '{"topic1":{"0":50,"1":-1},"topic2":{"0":-1}}',
    })
    .load()
)

# Cast key and value to strings so they are readable in the printed table.
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").show()

In [None]:
# Batch source: read every topic matching a pattern, from the earliest
# to the latest available offsets.
df = (
    spark.read.format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "kafka-broker:29092",
        "subscribePattern": "topic.*",
        "startingOffsets": "earliest",
        "endingOffsets": "latest",
    })
    .load()
)

# Cast key and value to strings so they are readable in the printed table.
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").show()

In [6]:
# Print the schema of the current `df` as a tree (column names, types, nullability).
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)
 |-- headers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: binary (nullable = true)



In [None]:
# Replace the `value` column with a constant string; the sink cells that
# follow use this updated `df`.
df = df.withColumn(colName="value", col=lit("some text"))

# Preview the result.
df.show()

#### Sink

In [8]:
# Batch sink: publish key/value pairs to a Kafka topic named in the `topic` option.
(
    df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .option("topic", "another-topic")
    .save()
)

In [None]:
# Batch sink: publish key/value pairs to Kafka, taking the destination topic
# from each row's `topic` column instead of a writer option.
(
    df.selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    .write
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .save()
)

## Spark Streaming (DStreams)