In [None]:
# Make the local Spark installation importable; this has to run before
# `import pyspark`.
import findspark
findspark.init()
import pyspark

# Reuse the already-running SparkContext when this cell is executed again.
# Constructing a second SparkContext in the same kernel raises a ValueError,
# so getOrCreate() keeps the cell safe to re-run.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("Lab1"))

### Lab1 : Spark Word Count

### Topics : 

* RDD Creation
* RDD Transformations and Actions

### Example objective :

Given an input file, compute the number of occurrences of a particular word inside the file

### Reference :

* SPARK Reference Documentation: https://spark.apache.org/docs/2.3.1/programming-guide.html#rdd-operations


In [None]:
 # Report which Python interpreter, and which version of it, the kernel runs on.
 import sys
 print(sys.executable, sys.version, sys.version_info, sep="\n")

In [None]:
# List the names currently defined in the notebook's namespace.
dir()

In [None]:
# List the names provided by Python's built-in namespace.
dir(__builtins__)

In [None]:
# Display the SparkContext created in the first cell.
sc

In [None]:
# Display the SparkContext once more (same object as in the previous cell).
sc

In [None]:
# The integers 1..99: the end bound of range() is exclusive.
range(1,100)

In [None]:
# RDD creation: distribute a local Python collection (the integers 1..99)
# through the SparkContext, then show the type of the resulting object.
numbers1_99 = sc.parallelize(range(1,100))
type(numbers1_99)

In [None]:
# sum() adds up every element of the RDD; for 1..99 the expected result is 4950.
print(numbers1_99.sum())

In [None]:
import timeit
from operator import add
from pyspark.sql import SparkSession

In [None]:
# Path of the log file to analyse, relative to the notebook's working directory.
inputFile="data/appl.log"

In [None]:
# RDD creation from a file: textFile() yields one RDD element per line of text.
# Then show the type of the resulting object.
lines = sc.textFile(inputFile)
type(lines)

In [None]:
# Get the number of lines in the file.
# count() is an action, so it runs a Spark job; %time reports how long the call took.
%time lines.count()

In [None]:
# Get the number of partitions the `lines` RDD is split into.
lines.getNumPartitions()

In [None]:
# Specify the word you want to search for.
# The filter step further down keeps every token that contains this string.
search_word='error'

### Apply Transformations and actions to compute the result

Transformations : 
    
1. flatMap() transformation : split each line into the words that form it, splitting on the space character
2. filter() transformation : keep only those words that contain the search word (a substring match, so e.g. "[error]" matches "error")
3. map() transformation : turn each remaining word into a (word, 1) tuple, i.e. the word and a counter
4. reduceByKey() transformation : aggregate by key (= distinct word) with a sum function (add) over all lines

Action : 
    
1. collect() : return all elements from the computed RDD

Lazy Evaluation :

* Until the collect() action is called nothing actually happens

In [None]:
# Build the word-count pipeline for the current search word:
#   split each line on spaces -> keep tokens containing the search word
#   -> pair each token with 1 -> sum the 1s per distinct token.
counts_rdd = (
    lines
    .flatMap(lambda line: line.split(' '))
    .filter(lambda token: search_word in token)
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)

type(counts_rdd)

### Inspect Job Execution

In [None]:
# See the RDD lineage.
# toDebugString() is decoded from UTF-8 bytes here so that it prints as readable text.
print(counts_rdd.toDebugString().decode("utf-8"))

In [None]:
# The lineage tells us that this Spark job will have 2 stages with 2 tasks each
# Important points : 
# 1. There is a one-to-one correspondence between tasks and partitions
# 2. A shuffle of data is involved because reduceByKey 
#    needs all items belonging to the same key to be placed on the same partition;
#    the shuffle operation marks the boundary between stages

In [None]:
# collect() is the action: it returns all (word, count) pairs of the computed RDD.
errors = counts_rdd.collect()


In [None]:
# Print each matching word together with its number of occurrences.
for matched_word, occurrences in errors:
    print("%s: %i" % (matched_word, occurrences))

### Room for optimization

In [None]:
# Now, imagine we want to search for a whole set of words...
# Without caching, every search would load the input file again.
# cache() asks Spark to keep the `lines` RDD in memory once it has been computed,
# so later jobs can reuse it instead of re-reading the file.
# Note: only the loaded lines are cached here; the split into words (flatMap)
# is still recomputed for each search.
cached_lines = lines.cache()

In [None]:
# Run the same pipeline for another word, this time starting from the cached RDD.
search_word = 'info'
counts_rdd = (
    cached_lines
    .flatMap(lambda line: line.split(' '))
    .filter(lambda token: search_word in token)
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)

In [None]:
# Run the job and bring the (word, count) pairs for the 'info' search back as a list.
infos = counts_rdd.collect()

In [None]:
# Print each matching word together with its number of occurrences.
for matched_word, occurrences in infos:
    print("%s: %i" % (matched_word, occurrences))

### Further analysis

* In the Spark Web UI Inspect the storage tab.
* You should see that the RDD has been cached , saved directly in memory
* Now run an action again, such as count()

In [None]:
# Time count() on the cached RDD; compare with the timing of the earlier lines.count().
%time cached_lines.count()

In [None]:
# Run the same pipeline for yet another word, again starting from the cached RDD.
search_word = 'notice'
counts_rdd = (
    cached_lines
    .flatMap(lambda line: line.split(' '))
    .filter(lambda token: search_word in token)
    .map(lambda token: (token, 1))
    .reduceByKey(add)
)

In [None]:
# Run the job and bring the (word, count) pairs for the 'notice' search back as a list.
notices = counts_rdd.collect()

In [None]:
# Print each matching word together with its number of occurrences.
for matched_word, occurrences in notices:
    print("%s: %i" % (matched_word, occurrences))