DS ESA 13 Spark Basics (Polczynski)
-----------------------------------

Load the complete works of Shakespeare from here, clean the file (there is some legal text at the beginning and within the file; you can remove it by hand if needed), and find the 24th most frequently used word in his writings.

In [1]:
# set up
from pyspark import SparkContext, SparkConf
from itertools import islice
import re

# Stop any SparkContext left over from a previous run of this cell so the
# cell can be re-executed in the same kernel.
try:
    if sc is not None:
        sc.stop()
except NameError:
    # First execution in a fresh kernel: `sc` has not been defined yet.
    pass
except Exception as err:
    # Cleanup stays best-effort, but a failure is reported instead of being
    # silently swallowed (and KeyboardInterrupt/SystemExit now propagate).
    print("Ignoring error while stopping previous SparkContext: {}".format(err))

# Reset the handles; they stay None unless the context is created below.
conf = None
sc = None

# Create a SparkContext with a local master. `sc` was just reset to None
# above, so only the __main__ check is needed here.
if __name__ == "__main__":
    conf = SparkConf().setAppName("DS ESA13").setMaster("local[*]")
    sc = SparkContext(conf=conf)
    
# License notice embedded in the text, stored as a list with one entry per
# line of the notice; used later to remove unwanted data.
removeLicense = """<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM
SHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS
PROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE
WITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE
DISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS
PERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED
COMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY
SERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>""".splitlines()

# show the resulting list of license lines
print(removeLicense)

['<<THIS ELECTRONIC VERSION OF THE COMPLETE WORKS OF WILLIAM', 'SHAKESPEARE IS COPYRIGHT 1990-1993 BY WORLD LIBRARY, INC., AND IS', 'PROVIDED BY PROJECT GUTENBERG ETEXT OF ILLINOIS BENEDICTINE COLLEGE', 'WITH PERMISSION.  ELECTRONIC AND MACHINE READABLE COPIES MAY BE', 'DISTRIBUTED SO LONG AS SUCH COPIES (1) ARE FOR YOUR OR OTHERS', 'PERSONAL USE ONLY, AND (2) ARE NOT DISTRIBUTED OR USED', 'COMMERCIALLY.  PROHIBITED COMMERCIAL DISTRIBUTION INCLUDES BY ANY', 'SERVICE THAT CHARGES FOR DOWNLOAD TIME OR FOR MEMBERSHIP.>>']


In [2]:
def skipLines(data, n):
    """Drop the first ``n`` records of the first partition of ``data``.

    :param data: object exposing ``mapPartitionsWithIndex`` (e.g. an RDD)
    :param n: number of leading records to discard
    :return: whatever ``data.mapPartitionsWithIndex`` returns

    Only the partition with index 0 is trimmed; all other partitions pass
    through untouched. NOTE(review): this presumably relies on the first
    ``n`` lines all living in partition 0 -- verify for inputs that are
    split into many small partitions.
    """
    def drop_head(partition_index, records):
        if partition_index == 0:
            return islice(records, n, None)
        return records

    return data.mapPartitionsWithIndex(drop_head)

def removeLines(data, removelines):
    """Keep only the lines that contain none of the given fragments.

    :param data: object exposing ``filter`` (e.g. an RDD of strings)
    :param removelines: iterable of substrings; a line is dropped as soon
        as it contains any one of them
    :return: whatever ``data.filter`` returns
    """
    def is_clean(line):
        # substring test, not an equality test
        return all(fragment not in line for fragment in removelines)

    return data.filter(is_clean)

def removeWordsLen(data, n):
    """Keep only the words that are strictly longer than ``n`` characters.

    :param data: object exposing ``filter`` (e.g. an RDD of strings)
    :param n: length threshold; words of length ``n`` or less are dropped
    :return: whatever ``data.filter`` returns
    """
    def long_enough(word):
        return n < len(word)

    return data.filter(long_enough)

# run spark job (only when a SparkContext is available)
if sc is not None:
    # read the text file, drop its first 246 lines, then filter out every
    # line that contains part of the inline license notice
    lines = removeLines(
        skipLines(sc.textFile("ds_esa13_t8.shakespeare.txt"), 246),
        removeLicense,
    )

    # lowercase each line and split it on whitespace and punctuation;
    # keeping only tokens longer than one character drops words such as
    # "I" and "a" as well as the empty strings produced by the split
    words = removeWordsLen(
        lines.flatMap(lambda line: re.split(r'[\s`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?]', line.lower())),
        1,
    )

    # count how often each word occurs (countByValue returns a local mapping)
    wordCounts = words.countByValue()

    # turn the (word, count) pairs back into an RDD ordered by descending count
    sortedItemCounters = sc.parallelize(wordCounts.items()).sortBy(lambda wc: wc[1], ascending=False)

    # output first 24
    print("The 24 most used words in the shakespeak dataset:")
    for word, count in sortedItemCounters.take(24):
        print("{}: {}".format(word, count))


The 24 most used words in the shakespeak dataset:
the: 27379
and: 26084
to: 19771
of: 17484
you: 13826
my: 12489
that: 11318
in: 11112
is: 9319
not: 8512
with: 7791
me: 7777
it: 7725
for: 7655
be: 6897
his: 6859
he: 6679
your: 6657
this: 6609
but: 6277
have: 5902
as: 5749
thou: 5549
him: 5205


In [3]:
# Shut down the SparkContext. Guarded because `sc` stays None when the
# context was never created, in which case there is nothing to stop.
if sc is not None:
    sc.stop()