In [29]:
# Build (or reuse) a Spark session that runs locally on every available core.
from pyspark.sql import SparkSession

# NOTE(review): the application name mentions sockets, yet the cells visible
# here read a text file in batch mode -- confirm the name is intentional.
session_builder = SparkSession.builder.appName("Reading from Sockets").master("local[*]")
spark = session_builder.getOrCreate()

# Leave the session as the last expression so the notebook renders its summary.
spark

In [2]:
# Load the input file as plain text; the resulting frame has a single
# string column named `value`.
df_raw = spark.read.text("datasets/input/example.txt")

In [4]:
# Inspect the schema of the raw frame (a single nullable string column, `value`).
df_raw.printSchema()

root
 |-- value: string (nullable = true)



In [5]:
# Preview the raw rows; long strings are cut off in the rendered table
# (note the trailing "..." in the output).
df_raw.show()

+--------------------+
|               value|
+--------------------+
|Simon had a dog a...|
+--------------------+



In [8]:
# Split each line into an array of words.
#
# The split pattern is a regular expression. Splitting on a single literal
# space would emit empty-string tokens for consecutive/leading/trailing spaces
# and would not split on tabs, so we split on runs of whitespace instead and
# strip any empty tokens that remain (e.g. from blank lines or leading
# whitespace) so they are never counted as a "word" downstream.
from pyspark.sql.functions import array_remove, split

df_words = df_raw.withColumn(
    "words",
    array_remove(split("value", r"\s+"), ""),
)
df_words.show()

+--------------------+--------------------+
|               value|               words|
+--------------------+--------------------+
|Simon had a dog a...|[Simon, had, a, d...|
+--------------------+--------------------+



In [11]:
# Turn every element of the `words` array into its own row.
from pyspark.sql.functions import explode

# df_words only carries `value` and `words`, so selecting just the exploded
# column yields the same single-column (`word`) frame as adding the column
# and then dropping the other two.
df_explode = df_words.select(explode("words").alias("word"))

In [16]:
# Count how many times each distinct word occurs.
from pyspark.sql.functions import count, lit

# Counting a constant literal tallies every row in the group.
occurrences = count(lit(1)).alias("cnt")

# NOTE: grouping is case-sensitive, so "Simon" and "simon" end up as
# separate rows in the result.
df_agg = df_explode.groupBy("word").agg(occurrences)
df_agg.show()

+-----+---+
| word|cnt|
+-----+---+
| used|  1|
|simon|  1|
|  dog|  2|
| love|  1|
|  had|  1|
|  cat|  2|
|Simon|  1|
|  the|  1|
|  and|  2|
|    a|  2|
|   to|  1|
+-----+---+



In [28]:
# Persist the word counts as CSV (with a header row), replacing any output
# left behind by a previous run.
(
    df_agg.write
    .mode("overwrite")
    .option("header", "true")
    .csv("datasets/output")
)