## Import libraries

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

## Create a SparkSession object

In [2]:
# Build (or reuse) the SparkSession for this notebook. It connects to the
# master at spark://spark-master:7077 and sets spark.executor.memory to 512m.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("config-streaming")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

# Restrict Spark's log output to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/11 11:12:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Create a streaming DataFrame that represents the input data from the source

### Using the `readStream` method

In [3]:
# Streaming DataFrame backed by the socket source listening on localhost:9999.
lines = (
    spark.readStream
    .format("socket")
    .options(host="localhost", port=9999)
    .load()
)

## Apply transformations

### Split the lines into words

In [5]:
# Turn every incoming line into one row per word.
# Splitting on the regex \s+ (instead of a single literal space) treats runs of
# spaces and tabs as one separator. Leading/trailing whitespace or a blank line
# can still yield an empty token, so those are filtered out to keep "" from
# being counted as a word downstream.
words = (
    lines
    .select(explode(split(lines.value, r"\s+")).alias("word"))
    .filter("word != ''")
)

## Aggregation on a streaming DataFrame using groupBy

### Generate a running word count

In [6]:
# Running count of rows per distinct value of the "word" column.
wordCounts = (
    words
    .groupBy("word")
    .count()
)

## Output the running counts to the console

In [7]:
# Start the streaming query; in "complete" mode the whole result table is
# written to the console sink on each batch.
query = wordCounts.writeStream.start(
    format="console",
    outputMode="complete",
)

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+----+-----+
|word|count|
+----+-----+
+----+-----+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|        Data|    2|
|    overview|    1|
|Fundamentals|    1|
|      stream|    1|
|          by|    2|
|       solve|    1|
|         you|    1|
|   landscape|    1|
|    systems.|    1|
|replication,|    1|
|         for|    1|
|         Joe|    1|
|  tolerance,|    1|
|    provides|    1|
|        Reis|    1|
|      topics|    1|
|   practices|    1|
|       model|    1|
|     concise|    1|
| distributed|    1|
+------------+-----+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|   Dynamical|    1|
|        Data|    2|
|     complex|    1|
|    overview|    1|
|     Science|    1|
|Fundamentals|    1|
|      stream|    1|
|      Nathan|    1|
|          by|    3|
|       solve|    2|
|         you|    2|
|   landscape|    1|
|          L.|    1|
|    systems.|    1|
|       apply|    1|
|replication,|    1|
|         for|    1|
|         Joe|    1|
|         how|    1|
|  reduction,|    1|
+------------+-----+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 3
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|   Dynamical|    1|
|        Data|    2|
|     complex|    1|
|    overview|    1|
|     Science|    1|
|Fundamentals|    1|
|      stream|    1|
|      Nathan|    1|
|          by|    3|
|       solve|    2|
|         you|    2|
|   landscape|    1|
|          L.|    1|
|    systems.|    1|
|       apply|    1|
|replication,|    1|
|         for|    1|
|         Joe|    1|
|         how|    1|
|  tolerance,|    1|
+------------+-----+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 4
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|   Dynamical|    1|
|        Data|    3|
|     complex|    1|
|    overview|    1|
|     Science|    1|
|Fundamentals|    1|
|      stream|    1|
|      Nathan|    1|
|          by|    3|
|       solve|    2|
|         new|    1|
|         you|    2|
|   landscape|    1|
|          L.|    1|
|    systems.|    1|
|       apply|    1|
|replication,|    1|
|         for|    1|
|         Joe|    1|
|         how|    1|
+------------+-----+
only showing top 20 rows



                                                                                

-------------------------------------------
Batch: 5
-------------------------------------------
+------------+-----+
|        word|count|
+------------+-----+
|   Dynamical|    1|
|        Data|    4|
|     complex|    1|
|    overview|    1|
|      demand|    1|
|     Science|    1|
|Fundamentals|    1|
|      stream|    1|
|      Nathan|    1|
|          by|    3|
|       solve|    2|
|         you|    2|
|         new|    1|
|   landscape|    1|
|        more|    1|
|          L.|    1|
|    systems.|    1|
|       apply|    1|
|replication,|    1|
|         for|    1|
+------------+-----+
only showing top 20 rows



## Stop the query

In [8]:
# Stop the streaming query started above so no further batches are printed.
query.stop()

## Stop the Spark session

In [9]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()