## Spark Structured Streaming - Read from Socket

In [None]:
# Build the Spark session used by every later cell.
# getOrCreate() hands back an already-active session if one exists.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Streaming Socket Word Count")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# Leave the session as the cell's final expression so the notebook shows it
spark

In [None]:
# Define a streaming DataFrame that reads text from a TCP socket on
# localhost:9999.
# The socket source is intended for debugging and testing only; it is not
# recommended for production applications.
streaming_df = (
    spark.readStream
    .format("socket")
    .option("host", "localhost")
    .option("port", "9999")
    .load()
)

In [None]:
# Inspect the column layout of the socket stream before transforming it
streaming_df.printSchema()

In [None]:
# Split each incoming line into words and explode the resulting array so that
# every word becomes its own row in a single `word` column.
#   - trim() removes leading/trailing spaces so they cannot create empty tokens
#   - ' +' is a regex, so a run of several spaces counts as one separator
#   - the filter drops the single empty token that a blank line still produces
# Without these steps, empty strings would be counted as words downstream.
words_df = (
    streaming_df
    .selectExpr("explode(split(trim(value), ' +')) as word")
    .where("word != ''")
)

# Check the schema
words_df.printSchema()

In [None]:
# Aggregate words_df into a running count per word
from pyspark.sql.functions import count

# The groupBy below triggers a shuffle; use 4 shuffle partitions for this
# small job rather than running through 200 partitions
spark.conf.set("spark.sql.shuffle.partitions", 4)

# One row per distinct word, with its number of occurrences in `count`
agg_words_df = (
    words_df
    .groupBy("word")
    .agg(count("word").alias("count"))
)

# Validate the schema of the aggregated result
agg_words_df.printSchema()

In [None]:
# Start the streaming query, writing the word counts to the console sink in
# "update" output mode.
# Despite its name, writing_df is the query handle returned by start(),
# not a DataFrame.
writing_df = (
    agg_words_df.writeStream
    .format("console")
    .outputMode("update")
    .start()
)

# Block until the streaming query ends, which happens on:
# 1. Exception in the running program
# 2. Manual interruption
# Interrupting the Python side may leave the query itself still running, so
# always stop it on the way out instead of leaking it on the cluster.
try:
    writing_df.awaitTermination()
finally:
    writing_df.stop()