# Basics of Spark Steaming: Section Final Exercise

### Exercise
Fill in the "TODO" sections in the code according to the specifications

In [1]:
import findspark
# TODO: your path will likely not have 'matthew' in it. Change it to reflect your path.
findspark.init('/home/matthew/spark-2.1.0-bin-hadoop2.7')

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
import sys
import random
from apache_log_parser import ApacheAccessLog

In [3]:
conf = (SparkConf().setMaster("local[4]").setAppName("log processor").set("spark.executor.memory", "2g"))

sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 2)
ssc.checkpoint("checkpoint")


In [4]:
# create DStream from text file
# Note: the spark streaming checks for any updates to this directory.
# So first, start this program, and then copy the log file logs/access_log.log to 'directory' location
log_data = ssc.textFileStream('logs')

In [5]:
# Parses each line using a utility class
# This is the DStream that will be used for most of the problems below
access_log_dstream = log_data.map(ApacheAccessLog.parse_from_log_line).filter(lambda parsed_line: parsed_line is not None)

In [6]:
# maps each ip with value 1. So the stream becomes (ip, 1)
ip_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, 1))
ip_count = ip_dstream.reduceByKey(lambda x,y: x+y)
ip_count.pprint(num = 30)


ip_bytes_dstream = access_log_dstream.map(lambda parsed_line: (parsed_line.ip, parsed_line.content_size))
ip_bytes_sum_dstream = ip_bytes_dstream.reduceByKey(lambda x,y: x+y)

In [7]:
##### TODO: the ip_bytes_request_count_dstream & ip_bytes_sum_dstream DStreams
ip_bytes_request_count_dstream = ip_count.join(ip_bytes_sum_dstream)
ip_bytes_request_count_dstream.pprint(num = 30)

##########################


In [8]:
# Write Dummy function `internalFunction` for transform() function
def internalFunction(rdd):
    return rdd

####### TODO: Define transformed_access_log_dstream by applying the transform() function 
####### to the access_log_dstream 

transformed_access_log_dstream = access_log_dstream.transform(internalFunction)
transformed_access_log_dstream.pprint()

###############################

In [9]:
####### TODO: Use the window() function to count data over the access_log_stream
access_logs_window = access_log_dstream.window(windowDuration = 6, slideDuration=4) 
window_counts = access_logs_window.count()
print( " Window count: ")
window_counts.pprint()
###############################

 Window count: 


In [10]:
######## TODO: define ip_count_dstream of Ip counts per window using reduceByKeyAndWindow()
ip_count_dstream = ip_dstream.reduceByKeyAndWindow(func = lambda x,y: x+y, invFunc = lambda x,y: x-y, windowDuration = 6, slideDuration=4)
#############################

ip_count_dstream.pprint(num=30)

In [11]:
ip_dstream = access_log_dstream.map(lambda entry: entry.ip)
######### TODO: Use the countByValueAndWindow() function on the ip_dstream
######### to define ip_address_request_count
ip_address_request_count = ip_dstream.countByValueAndWindow(windowDuration = 6, slideDuration=4)
ip_address_request_count.pprint()

#############################

In [12]:
######### TODO: Use the countByWindow() function on the ip_dstream
######### To define ip_address_request_count
request_count = access_log_dstream.countByWindow(windowDuration = 6, slideDuration=4)
request_count.pprint()

#############################

In [13]:
# This section runs a Running count of response codes using updateStateByKey()
# This basically maintains a running sum , rather than sum in windows

def state_full_sum(new_values, global_sum):
    return sum(new_values) + (global_sum or 0)

response_code_dstream = access_log_dstream.map(lambda entry: (entry.response_code, 1))
response_code_count_dstream = response_code_dstream.updateStateByKey(state_full_sum)
response_code_count_dstream.pprint()

In [14]:
###### Save the IP address request count using the saveAsTextFiles() function #####
ip_address_request_count.saveAsTextFiles(prefix = "output", suffix = "txt")

################################

In [15]:
ssc.start() 
# ssc.awaitTermination()

-------------------------------------------
Time: 2018-03-01 21:30:42
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:42
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:42
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:42
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:44
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:44
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:44
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:44
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:30:44
----------

-------------------------------------------
Time: 2018-03-01 21:31:04
-------------------------------------------
('lj1027.inktomisearch.com', (2, ''))
('lj1153.inktomisearch.com', (2, ''))
('ns.wtbts.org', (12, ''))
('lj1216.inktomisearch.com', (1, ''))
('user-0c8hdkf.cable.mindspring.com', (5, ''))
('ladybug.cns.vt.edu', (5, ''))
('h24-70-69-74.ca.shawcable.net', (32, ''))
('osdlab.eic.nctu.edu.tw', (1, ''))
('lj1052.inktomisearch.com', (1, ''))
('fw.kcm.org', (2, ''))
('cr020r01-3.sac.overture.com', (44, ''))
('200.222.33.33', (1, ''))
('ic8234.upco.es', (4, ''))
('ipcorp-c8b07af1.terraempresas.com.br', (1, ''))
('145.253.208.9', (7, ''))
('fw1.millardref.com', (7, ''))
('lj1089.inktomisearch.com', (1, ''))
('cacher2-ext.wise.edt.ericsson.se', (1, ''))
('spot.nnacorp.com', (5, ''))
('h24-71-236-129.ca.shawcable.net', (51, ''))
('h24-70-56-49.ca.shawcable.net', (7, ''))
('acbf6930.ipt.aol.com', (2, ''))
('212.21.228.26', (1, ''))
('mmscrm07-2.sac.overture.com', (3, ''))
('pool-68-160

-------------------------------------------
Time: 2018-03-01 21:31:12
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:12
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:12
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:12
-------------------------------------------
('304', 137)
('200', 1274)
('401', 123)
('302', 6)
('404', 5)

-------------------------------------------
Time: 2018-03-01 21:31:14
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:14
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:14
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:14
-------------------------------------------
('304', 137)
('200'

-------------------------------------------
Time: 2018-03-01 21:31:34
-------------------------------------------
('304', 137)
('200', 1274)
('401', 123)
('302', 6)
('404', 5)

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

-------------------------------------------
Time: 2018-03-01 21:31:36
-------------------------------------------

------------------

In [None]:
ssc.stop(stopSparkContext=True, stopGraceFully=False)