In [30]:
# Generate Spark Session
from pyspark.sql import SparkSession

# Build a local-mode session, or reuse one if it already exists in this kernel.
# NOTE(review): the app name mentions sockets, but this notebook reads a
# batch text file — confirm the name is intentional.
spark = SparkSession.builder.appName("Reading from Sockets").master("local[*]").getOrCreate()

# Leave the session as the cell's last expression so the notebook displays it.
spark

In [31]:
# Load the input file as a batch DataFrame using the "text" source; per the
# schema printed below, it exposes a single string column named `value`.
df_raw = spark.read.format("text").load("sample.txt")

In [9]:
# Print the schema of the raw DataFrame to confirm which columns were read.
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [10]:
# Preview the raw rows as a text table.
df_raw.show()

+------------------+
|             value|
+------------------+
|simon had a dog...|
+------------------+



In [21]:
# Split each line into an array of words
from pyspark.sql.functions import array_remove, split

# `split` interprets its pattern as a regular expression, so "\s+" breaks on
# any run of whitespace (repeated spaces, tabs) rather than on every single
# space. Leading/trailing whitespace and blank lines can still produce ""
# tokens; array_remove drops them so they are never counted as words later.
df_words = df_raw.withColumn(
    "words",
    array_remove(split(df_raw["value"], r"\s+"), ""),
)

In [22]:
# Preview the original line next to its array of words (long cells are
# abbreviated with "..." in the rendered table).
df_words.show() 

+------------------+--------------------+
|             value|               words|
+------------------+--------------------+
|simon had a dog...|[simon, had, a, d...|
+------------------+--------------------+



In [23]:
# Explode the list of words
from pyspark.sql.functions import explode

# Emit one row per element of `words` and keep only that element, as `word`.
df_explode = df_words.select(explode(df_words["words"]).alias("word"))

In [24]:
# Preview the exploded result: one word per row.
df_explode.show()

+------+
|  word|
+------+
| simon|
|   had|
|     a|
|dog...|
+------+



In [28]:
# Aggregate the words to generate count
from pyspark.sql.functions import count, lit

# Count occurrences per distinct word: counting a constant literal tallies
# every row in the group; the result column is named "count".
df_agg = (
    df_explode
    .groupBy("word")
    .agg(count(lit(1)).alias("count"))
)

In [29]:
# Display the per-word counts.
df_agg.show()

+------+-----+
|  word|count|
+------+-----+
| simon|    1|
|   had|    1|
|     a|    1|
|dog...|    1|
+------+-----+

