In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Session-wide settings: the application name, the Kafka connector package
# for Structured Streaming, and the default checkpoint directory used by
# streaming queries that do not configure their own.
config = (
    SparkConf()
    .setAppName("Spark Streaming")
    .set("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1")
    .set("spark.sql.streaming.checkpointLocation", "hdfs:///user/root/checkpoint")
)

# getOrCreate() reuses an already-running session/context, so re-executing
# this cell in a live kernel no longer fails on a second SparkContext.
spark = SparkSession.builder.config(conf=config).getOrCreate()

# Keep `sc` available for any cell that works with the SparkContext directly.
sc = spark.sparkContext

spark

### Import some Spark functions

In [2]:
from pyspark.sql.functions import lit, col

## Structured Streaming

### Streaming Queries

In [3]:
# Streaming source: follow a single Kafka topic and request record headers too.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .option("subscribe", "quickstart-events")
    .option("startingOffsets", "latest")
    .option("minPartitions", "10")
    .option("failOnDataLoss", "true")
    .option("includeHeaders", "true")
    .load()
)

In [None]:
# Streaming source: follow every Kafka topic whose name matches a pattern.
# Note that this rebinds `df`, replacing whichever source was loaded before.
df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .option("subscribePattern", "topic.*")
    .load()
)

#### Sink

In [4]:
# Write key-value data from a DataFrame to a specific Kafka topic specified in an option.
#
# start() returns a StreamingQuery handle, not a DataFrame, so the result is
# bound to its own name. Rebinding `df` here would leave `df` without
# DataFrame methods, breaking later cells and any re-run of this one.
query_fixed_topic = (
    df
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "headers")
    # Replace every payload with a constant marker string before publishing.
    .withColumn("value", lit("changing events value"))
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .option("topic", "another-topic")
    .start()
)

                                                                                

In [None]:
# Write key-value data from a DataFrame to Kafka using a topic specified in the data.
#
# start() returns a StreamingQuery handle, not a DataFrame, so the result is
# bound to its own name instead of overwriting `df`; `df` must remain a
# DataFrame for this cell to be runnable (and re-runnable).
#
# NOTE(review): no "topic" option is set, so the destination comes from the
# `topic` column. When `df` was read from Kafka that column is the record's
# source topic, so this appears to publish records back to the topic they
# were consumed from - confirm that is intended for a continuously running query.
query_topic_from_data = (
    df
    .selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)")
    # Replace every payload with a constant string before publishing.
    .withColumn("value", lit("outro valor"))
    .writeStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka-broker:29092")
    .start()
)

### Batch Queries

In [5]:
# Subscribe to 1 topic, with headers
df = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka-broker:29092") \
  .option("subscribe", "quickstart-events") \
  .option("startingOffsets", "earliest") \
  .option("endingOffsets", "latest") \
  .option("includeHeaders", "true") \
  .load()
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)", "headers").show()

                                                                                

+----+--------------------+-------+
| key|               value|headers|
+----+--------------------+-------+
|null|            testando|   null|
|null|          novo texto|   null|
|null|       aqui vai dado|   null|
|null|                  oi|   null|
|null|                bora|   null|
|null|                 vai|   null|
|null|          tudo novo?|   null|
|null|           ola,mundo|   null|
|null|             eu,jose|   null|
|null|         aqui,denovo|   null|
|null|E la vamos nos de...|   null|
|null|    testando mais um|   null|
|null|                e ai|   null|
|null|                 foi|   null|
|null|            e agora?|   null|
|null|            so isto?|   null|
+----+--------------------+-------+



In [None]:
# Subscribe to multiple topics, specifying explicit Kafka offsets
df = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka-broker:29092") \
  .option("subscribe", "topic1,topic2") \
  .option("startingOffsets", """{"topic1":{"0":23,"1":-2},"topic2":{"0":-2}}""") \
  .option("endingOffsets", """{"topic1":{"0":50,"1":-1},"topic2":{"0":-1}}""") \
  .load()
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").show()

In [None]:
# Subscribe to a pattern, at the earliest and latest offsets
df = spark \
  .read \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka-broker:29092") \
  .option("subscribePattern", "topic.*") \
  .option("startingOffsets", "earliest") \
  .option("endingOffsets", "latest") \
  .load()
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").show()

In [6]:
# Print the column names and types of the DataFrame currently bound to `df`.
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)
 |-- headers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: binary (nullable = true)



In [None]:
# Overwrite the `value` column with a constant string, then preview the rows.
# `df` is rebound, so later cells see the replaced values.
constant_value = lit("some text")
df = df.withColumn("value", constant_value)
df.show()

#### Sink

In [8]:
# Write key-value data from a DataFrame to a specific Kafka topic specified in an option
df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") \
  .write \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka-broker:29092") \
  .option("topic", "another-topic") \
  .save()

In [None]:
# Write key-value data from a DataFrame to Kafka using a topic specified in the data
df.selectExpr("topic", "CAST(key AS STRING)", "CAST(value AS STRING)") \
  .write \
  .format("kafka") \
  .option("kafka.bootstrap.servers", "kafka-broker:29092") \
  .save()

## Spark Streaming (DStreams)