# Word Count in Spark
We can find a cleaned-up text file with all of Shakespeare's works in the HDFS directory `/data/shakespeare`:

In [None]:
%%sh
# List the HDFS directory that holds the Shakespeare corpus.
hdfs dfs -ls /data/shakespeare

In [1]:
# HDFS directory that contains the Shakespeare corpus.
DATADIR = '/data/shakespeare'

In [None]:
## To shut down Spark manually, uncomment: sc.stop(); del(sc)

In [2]:
# %load pyspark_init_arc.py
#
# Spark bootstrap for the cluster installation under /usr/hdp
# (jobs are submitted through YARN), not a local macOS/homebrew Spark.
#
import os
import sys

# Tell PySpark where the Spark distribution lives.
os.environ["SPARK_HOME"] = '/usr/hdp/2.4.2.0-258/spark'

# Put Spark's bundled Python sources first on the module search path so that
# `import pyspark` resolves to the copy shipped with this Spark install.
sys.path.insert(0, os.path.join(os.environ["SPARK_HOME"], 'python'))

# import package
import pyspark
from pyspark.context import SparkContext, SparkConf

import atexit
def stop_my_spark():
    """Stop the notebook-wide SparkContext ``sc`` and remove the name.

    Intended to be registered with ``atexit``.  Does nothing when no global
    ``sc`` exists (for example if Spark never started, or it was already
    stopped and deleted by hand).
    """
    # Without this declaration the `del sc` below would make `sc` a *local*
    # name, and `sc.stop()` would fail with UnboundLocalError.
    global sc
    if 'sc' not in globals():
        return
    try:
        sc.stop()
    finally:
        # Drop the name even if stop() fails, so that re-running the init
        # cell creates a fresh context instead of reusing a dead one.
        del sc

# Register the cleanup hook to run at interpreter exit.
atexit.register(stop_my_spark)

# Create the SparkContext only if the notebook namespace does not already
# hold one; re-running this cell then reuses the existing context.
if 'sc' not in globals():
    conf = (SparkConf()
            .setAppName('MyFirstSpark')   ## you may want to change this
            .setMaster('yarn-client'))    ## or 'local[2]' to run locally
    sc = SparkContext(conf=conf)
    print("Launched Spark version %s with ID %s" % (sc.version, sc.applicationId))

Launched Spark version 1.6.1 with ID application_1508160140652_0006


In [None]:
# Link to this application's page on the cluster web UI.
print("http://arc.insight.gsu.edu:8088/cluster/app/%s" % sc.applicationId)

# Load Data

In [3]:
# Location of the corpus inside HDFS.
shakespeare_path = os.path.join(DATADIR, 'shakespeare.txt')

# Work on a small random sample of the lines (without replacement) to keep the
# walkthrough fast.  The fixed seed makes the sample repeatable between runs,
# provided the input file is split into the same partitions.
SAMPLE_FRACTION = 0.001
SAMPLE_SEED = 42
rdd = sc.textFile(shakespeare_path).sample(False, SAMPLE_FRACTION, SAMPLE_SEED)

In [4]:
# Peek at the first 10 sampled lines to check that the file was read.
rdd.take(10)

[u"  Or else receiv'st with pleasure thine annoy?",
 u'    And put you in the catalogue of those',
 u'    Come, sirrah.                                         Exeunt',
 u'    Unless her prayers, whom heaven delights to hear',
 u"  KING. Come, come, to th' purpose. Did he love this woman?",
 u'    In time we hate that which we often fear.',
 u'  OLIVER, son of Sir Rowland de Boys',
 u'  CELIA. Dear sovereign, hear me speak.',
 u'    in it, it goes much against my stomach. Hast any philosophy in',
 u'    shallow, inconstant, full of tears, full of smiles; for every']

# Cleaning-up

The `mapper.sh` script runs some character replacements:
<pre>
tr -d '.,:?"' \
| tr '[]{}-' '     ' \
| tr 'A-Z' 'a-z' \
| tr ' ' '\n' \
| grep -v -e '^[[:space:]]*$'

</pre>

We're going to use the regular expression package, `re`, to replace punctuation characters with spaces.
<pre>
regex = re.compile('[%s]' % re.escape(string.punctuation))
regex.sub(' ', s)
</pre>


In [5]:
import re
import string

# Character class matching any single ASCII punctuation character;
# re.escape keeps characters such as ']' and '\' literal inside the class.
regex = re.compile('[{0}]'.format(re.escape(string.punctuation)))

In [6]:
# Show which characters the regex will replace.
print("Special characters: %s" % string.punctuation)

Special characters: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [7]:
# Sanity check: every punctuation character is replaced by one space.
regex.sub(' ', 'Hello! (Xyz)?')

'Hello   Xyz  '

In [11]:
# hint: regex.sub(' ', s)
def get_rid_of_punct(x):
    """Return `x` with every punctuation character replaced by a space."""
    cleaned = regex.sub(' ', x)
    return cleaned

# Equivalent inline form of the first step: rdd.map(lambda x: regex.sub(' ', x))
# Strip punctuation first, then lower-case every line.
rdd2 = (rdd
        .map(get_rid_of_punct)
        .map(lambda line: line.lower()))
rdd2.take(4)

[u'  or else receiv st with pleasure thine annoy ',
 u'    and put you in the catalogue of those',
 u'    come  sirrah                                          exeunt',
 u'    unless her prayers  whom heaven delights to hear']

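Let's convert to lower case, so that e.g. `The` and `the` are counted as the same word. (The cell above already applies this step with a second `map` that calls `lower()`.)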

In [None]:
# hint: lower()


`map()` vs `flatMap()`
- `map` produces exactly one output row per input row, even if that output is itself a collection
- `flatMap` flattens the result: if the function returns a collection for a row, each of its elements is emitted as a separate row

In [14]:
# hint: s.split(' ')
# Splitting on a single space yields empty strings for runs of spaces;
# they are still present at this stage.
rdd2.flatMap(lambda line: line.split(' ')).take(4)

[u'', u'', u'or', u'else']

Now that we have the words extracted, we still need to pair each word with a count of 1 for the reduce step: $x \rightarrow (x, 1)$

In [17]:
# One (word, 1) pair per token, ready to be summed per word.
rdd3 = (rdd2
        .flatMap(lambda line: line.split(' '))
        .map(lambda word: (word, 1)))
rdd3.take(4)

[(u'', 1), (u'', 1), (u'or', 1), (u'else', 1)]

#  Counting
Now, our data set should be in the proper format, and we can count the words

In [25]:
# hint: +
# Discard the empty-string tokens, then add up the 1s for each word.
cnt_rdd = (rdd3
           .filter(lambda pair: len(pair[0]) > 0)
           .reduceByKey(lambda left, right: left + right))
cnt_rdd.take(10)

[(u'all', 3),
 (u'particularly', 1),
 (u'pardon', 1),
 (u'being', 1),
 (u'able', 1),
 (u'toby', 1),
 (u'cease', 1),
 (u'burgundy', 1),
 (u'go', 2),
 (u'fear', 3)]

Let's also sort them by count in descending order. Since `sortByKey` sorts on the first element of each pair, we first swap `(word, count)` to `(count, word)`.

In [28]:
# hint: sortByKey
# Swap each pair to (count, word) so the count becomes the sort key,
# then list the 20 largest counts.
(cnt_rdd
 .map(lambda pair: (pair[1], pair[0]))
 .sortByKey(ascending=False)
 .take(20))

[(26, u'the'),
 (20, u'and'),
 (20, u'a'),
 (20, u'of'),
 (19, u'to'),
 (17, u'i'),
 (15, u'my'),
 (14, u'not'),
 (13, u'as'),
 (12, u'in'),
 (11, u'you'),
 (11, u'that'),
 (11, u'be'),
 (10, u'me'),
 (9, u'we'),
 (9, u's'),
 (9, u'it'),
 (8, u'is'),
 (8, u'for'),
 (7, u'thou')]