In [1]:
import pyspark
sc = pyspark.SparkContext('local[*]')

In [2]:
# load text of macbeth from s3
!wget https://dsr-notebooks.s3.amazonaws.com/macbeth.txt
shakespeare = sc.textFile("macbeth.txt")

--2018-07-19 08:35:44--  https://dsr-notebooks.s3.amazonaws.com/macbeth.txt
Resolving dsr-notebooks.s3.amazonaws.com (dsr-notebooks.s3.amazonaws.com)... 54.231.40.203
Connecting to dsr-notebooks.s3.amazonaws.com (dsr-notebooks.s3.amazonaws.com)|54.231.40.203|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5589889 (5.3M) [text/plain]
Saving to: ‘macbeth.txt.4’


2018-07-19 08:35:48 (1.73 MB/s) - ‘macbeth.txt.4’ saved [5589889/5589889]



In [3]:
lines = shakespeare.map(lambda line: line.lower())

In [4]:
lines.cache()

PythonRDD[2] at RDD at PythonRDD.scala:48

In [5]:
macbeth = lines.filter(lambda l: l.find("macbeth") >= 0)
import sys
print(macbeth)

PythonRDD[3] at RDD at PythonRDD.scala:48


In [6]:
macbeth.count()

283

In [7]:
macbeth.take(5)

['the tragedy of macbeth',
 "  macbeth, thane of glamis and cawdor, a general in the king's army",
 '  lady macbeth, his wife',
 '  seyton, attendant to macbeth',
 '  gentlewoman attending on lady macbeth']

In [8]:
macbeth_and_macduff = macbeth.filter(lambda l: l.find("macduff") >= 0)

In [9]:
print(macbeth_and_macduff)
macbeth_and_macduff.collect()

PythonRDD[7] at RDD at PythonRDD.scala:48


['  macduff. those that macbeth hath slain.',
 "  macbeth. how say'st thou, that macduff denies his person",
 '  first apparition. macbeth! macbeth! macbeth! beware macduff,',
 '  macbeth. then live, macduff. what need i fear of thee?',
 "             re-enter macduff, with macbeth's head."]

In [10]:
print(macbeth_and_macduff.toDebugString())

b'(2) PythonRDD[7] at RDD at PythonRDD.scala:48 []\n |  PythonRDD[2] at RDD at PythonRDD.scala:48 []\n |      CachedPartitions: 2; MemorySize: 4.0 MB; ExternalBlockStoreSize: 0.0 B; DiskSize: 0.0 B\n |  macbeth.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 []\n |  macbeth.txt HadoopRDD[0] at textFile at NativeMethodAccessorImpl.java:0 []'


In [11]:
import re
words = lines.flatMap(lambda l: re.split(r'\W+', l))

In [12]:
groupedWords = words.map(lambda w: (w, 1))
unsorted = groupedWords.reduceByKey(lambda a, b: a + b)
unsorted.collect()

[('project', 331),
 ('gutenberg', 326),
 ('ebook', 17),
 ('of', 18307),
 ('shakespeare', 272),
 ('', 197060),
 ('this', 6900),
 ('is', 9808),
 ('use', 562),
 ('anyone', 7),
 ('anywhere', 8),
 ('at', 2536),
 ('no', 3814),
 ('restrictions', 2),
 ('whatsoever', 17),
 ('may', 1891),
 ('give', 1353),
 ('away', 864),
 ('online', 4),
 ('www', 11),
 ('org', 17),
 ('details', 1),
 ('below', 58),
 ('copyright', 244),
 ('guidelines', 1),
 ('in', 11183),
 ('title', 92),
 ('posting', 5),
 ('1', 357),
 ('2011', 1),
 ('100', 5),
 ('january', 3),
 ('1994', 1),
 ('language', 37),
 ('start', 37),
 ('produced', 4),
 ('world', 904),
 ('library', 233),
 ('inc', 224),
 ('presented', 18),
 ('cooperation', 1),
 ('cdroms', 1),
 ('are', 3917),
 ('placed', 11),
 ('public', 66),
 ('domain', 12),
 ('certain', 178),
 ('implications', 1),
 ('read', 210),
 ('version', 222),
 ('1990', 221),
 ('1993', 221),
 ('college', 225),
 ('machine', 224),
 ('long', 704),
 ('as', 5988),
 ('others', 422),
 ('only', 535),
 ('used', 

In [13]:
unsorted.map(lambda t: (t[1], t[0])).sortByKey(False).collect()

[(197060, ''),
 (27843, 'the'),
 (26847, 'and'),
 (22538, 'i'),
 (19882, 'to'),
 (18307, 'of'),
 (14800, 'a'),
 (13928, 'you'),
 (12490, 'my'),
 (11563, 'that'),
 (11183, 'in'),
 (9808, 'is'),
 (8961, 'd'),
 (8760, 'not'),
 (8358, 'for'),
 (8066, 'with'),
 (7778, 'me'),
 (7750, 'it'),
 (7734, 's'),
 (7146, 'be'),
 (6900, 'this'),
 (6891, 'your'),
 (6859, 'his'),
 (6682, 'he'),
 (6287, 'but'),
 (5988, 'as'),
 (5910, 'have'),
 (5549, 'thou'),
 (5282, 'so'),
 (5205, 'him'),
 (5017, 'will'),
 (4810, 'what'),
 (4490, 'by'),
 (4034, 'thy'),
 (3983, 'all'),
 (3917, 'are'),
 (3850, 'her'),
 (3847, 'do'),
 (3814, 'no'),
 (3623, 'we'),
 (3602, 'shall'),
 (3539, 'if'),
 (3207, 'on'),
 (3199, 'or'),
 (3181, 'thee'),
 (3094, 'lord'),
 (3066, 'our'),
 (3053, 'o'),
 (3041, 'king'),
 (2834, 'good'),
 (2792, 'now'),
 (2764, 'sir'),
 (2670, 'from'),
 (2536, 'at'),
 (2534, 'they'),
 (2520, 'come'),
 (2410, 'she'),
 (2409, 'll'),
 (2369, 'let'),
 (2357, 'enter'),
 (2331, 'here'),
 (2329, 'which'),
 (2301,

In [14]:
sc.stop()