In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql.functions import explode 
from pyspark.sql.functions import split

In [2]:
# Build the SparkSession used by every cell below; getOrCreate() hands back
# an already-active session when one exists instead of creating a new one.
spark = (
    SparkSession.builder
    .appName("StructuredKafkaWordCount")
    .getOrCreate()
)

In [3]:
# Streaming DataFrame of text lines consumed from Kafka.
# The source subscribes to the topic "wordcount_topic" on the broker at
# localhost:9092, and each record's `value` is cast to a string so that the
# next cell can tokenise it via the `value` column.
# NOTE(review): the "kafka" format presumably needs the spark-sql-kafka
# connector on the classpath -- confirm it is configured for this kernel.
lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "wordcount_topic")
    .load()
    .selectExpr("CAST(value AS STRING)")
)

In [4]:
# Tokenise each line and emit one row per token.
# split() breaks the `value` string on a single space into an array column;
# explode() then turns every array element into its own row, exposed here as
# the column `word`.
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

In [5]:
# Running tally: group the stream by `word` and count the rows in each group.
wordCounts = (
    words
    .groupBy('word')
    .count()
)

In [None]:
# Start the streaming query that prints the running counts to the console.
# 'complete' output mode writes the whole updated counts table on every
# trigger rather than only the rows that changed.
query = (
    wordCounts
    .writeStream
    .outputMode('complete')
    .format('console')
    .start()
)

# Block this cell until the query terminates. If the wait is cut short (for
# example by interrupting the kernel) or the query fails, stop the query
# explicitly: otherwise it keeps running in the background and re-running
# this cell would start a second, concurrent query.
try:
    query.awaitTermination()
finally:
    query.stop()