# Window Operations Demo

### Demo

In [None]:
import findspark
# TODO: this install location is machine-specific (note the 'matthew' home
# directory); edit it so that it points at your own Spark installation.
spark_home = '/home/matthew/spark-2.2.1-bin-hadoop2.7'
findspark.init(spark_home)

In [None]:
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
from pyspark.sql.functions import window

In [None]:
# Command-line arguments:
#   <hostname> <port> <window duration in seconds> [<slide duration in seconds>]
if len(sys.argv) < 4:
    # Report the expected arguments instead of failing with a bare IndexError.
    print("Usage: <hostname> <port> <window duration in seconds> "
          "[<slide duration in seconds>]", file=sys.stderr)
    sys.exit(1)

host = sys.argv[1]
port = int(sys.argv[2])
windowSize = int(sys.argv[3])
# The slide duration is optional; without it the slide equals the window size.
slideSize = int(sys.argv[4]) if (len(sys.argv) == 5) else windowSize
if slideSize > windowSize:
    print("<slide duration> must be less than or equal to <window duration>", file=sys.stderr)
    # Stop here: carrying on would pass an invalid slide duration to window().
    sys.exit(1)
# Durations are handed to window() as interval strings such as '10 seconds'.
windowDuration = '{} seconds'.format(windowSize)
slideDuration = '{} seconds'.format(slideSize)

In [None]:
# Build the SparkSession used by the rest of the demo.
spark = (
    SparkSession.builder
    .appName("StructuredNetworkWordCountWindowed")
    .getOrCreate()
)

In [None]:
# Streaming DataFrame of the text lines received from the socket at host:port.
# The 'includeTimestamp' option is switched on so each line carries a timestamp.
lines = (
    spark.readStream
    .format('socket')
    .option('host', host)
    .option('port', port)
    .option('includeTimestamp', 'true')
    .load()
)

In [None]:
# Break every line into words while keeping the line's timestamp:
# split() yields an array of tokens per line and explode() emits one row per token.
words = lines.select(
    explode(split(lines.value, ' ')).alias('word'),
    lines.timestamp,
)

In [None]:
# Count the occurrences of each word within each time window,
# then sort the result by window.
windowedCounts = (
    words
    .groupBy(
        window(words.timestamp, windowDuration, slideDuration),
        words.word,
    )
    .count()
    .orderBy('window')
)

In [None]:
# Launch the streaming query: the windowed word counts are written to the
# console in 'complete' output mode, with column truncation turned off.
query = (
    windowedCounts.writeStream
    .outputMode('complete')
    .format('console')
    .option('truncate', 'false')
    .start()
)

# Block until the streaming query terminates.
query.awaitTermination()

## References
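1. Apache Spark, *Structured Streaming Programming Guide* — "Window Operations on Event Time": https://spark.apache.org/docs/2.2.1/structured-streaming-programming-guide.html#window-operations-on-event-time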