# Window Operations Exercise

1. What are window operations? (Better explained with some graphs.)
2. Explain parameters (window length and sliding interval)
3. Some of the popular Window operations
    * window
    * countByWindow
    * reduceByKeyAndWindow
    * countByValueAndWindow


### Exercise

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
import sys
import random
from apache_log_parser import ApacheAccessLog

# Seed Python's random module with a fixed value.
random.seed(15)

# The script expects exactly one command-line argument: the log directory.
# Otherwise print the usage text and exit with status 2.
if len(sys.argv) != 2:
    for usage_line in (
        'Please provide the path to Apache log file',
        '10_10.py <path_to_log_directory>',
    ):
        print(usage_line)
    sys.exit(2)

# Build the Spark configuration one setting at a time.
conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("log processor")
conf.set("spark.executor.memory", "2g")

sc = SparkContext(conf=conf)

# Streaming context with a batch duration of 2 (seconds), checkpointing
# into the "checkpoint" directory.
ssc = StreamingContext(sc, batchDuration=2)
ssc.checkpoint("checkpoint")
 
# The directory to watch is the first command-line argument.
directory = sys.argv[1]
print(directory)

# Create a DStream from text files in `directory`.
# Note: Spark Streaming watches this directory for updates, so start this
# program first and only then copy logs/access_log.log into `directory`.
log_data = ssc.textFileStream(directory)

# Parse every line and drop the ones for which the parser returned None.
parsed_lines = log_data.map(ApacheAccessLog.parse_from_log_line)
access_log_dstream = parsed_lines.filter(lambda entry: entry is not None)

# Number of requests per IP.
ip_dstream = access_log_dstream.map(lambda entry: (entry.ip, 1))
ip_count = ip_dstream.reduceByKey(lambda left, right: left + right)
ip_count.pprint(num=30)

# Sum of content sizes per IP, joined with the request count per IP.
ip_bytes_dstream = access_log_dstream.map(
    lambda entry: (entry.ip, entry.content_size)
)
ip_bytes_sum_dstream = ip_bytes_dstream.reduceByKey(
    lambda left, right: left + right
)
ip_bytes_request_count_dstream = ip_count.join(ip_bytes_sum_dstream)
ip_bytes_request_count_dstream.pprint(num=30)

####### TODO: use window() to count data over a window #########################



####### Exercise End ##########################################################

# Start the streaming computation, then block until it terminates.
ssc.start()
ssc.awaitTermination()


## References
1. https://spark.apache.org/docs/latest/streaming-programming-guide.html#discretized-streams-dstreams