In [1]:
from pyspark import SparkContext

In [2]:
sc = SparkContext().getOrCreate()

In [3]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.getOrCreate()

In [6]:
!wget https://norvig.com/ngrams/shakespeare.txt

--2019-06-25 12:03:00--  https://norvig.com/ngrams/shakespeare.txt
Resolving norvig.com (norvig.com)... 158.106.138.13
Connecting to norvig.com (norvig.com)|158.106.138.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4538523 (4.3M) [text/plain]
Saving to: ‘shakespeare.txt’


2019-06-25 12:03:01 (4.81 MB/s) - ‘shakespeare.txt’ saved [4538523/4538523]



In [15]:
text = sc.textFile('/home/nate/Dropbox/regis/MSDS/courses/MSDS610/week7/shakespeare.txt')

In [16]:
type(text)

pyspark.rdd.RDD

In [18]:
text_df = spark.read.csv('/home/nate/Dropbox/regis/MSDS/courses/MSDS610/week7/shakespeare.txt')

In [19]:
type(text_df)

pyspark.sql.dataframe.DataFrame

In [20]:
type(text_df.rdd)

pyspark.rdd.RDD

In [21]:
text.take(5)

["A MIDSUMMER-NIGHT'S DREAM",
 '',
 'Now , fair Hippolyta , our nuptial hour ',
 'Draws on apace : four happy days bring in ',
 'Another moon ; but O ! methinks how slow ']

In [23]:
text_df.rdd.take(5)

[Row(_c0="A MIDSUMMER-NIGHT'S DREAM"),
 Row(_c0='Now '),
 Row(_c0='Draws on apace : four happy days bring in '),
 Row(_c0='Another moon ; but O ! methinks how slow '),
 Row(_c0='This old moon wanes ; she lingers my desires ')]

In [24]:
text_df.show(5)

+--------------------+
|                 _c0|
+--------------------+
|A MIDSUMMER-NIGHT...|
|                Now |
|Draws on apace : ...|
|Another moon ; bu...|
|This old moon wan...|
+--------------------+
only showing top 5 rows



In [16]:
text_header = ["A MIDSUMMER-NIGHT'S DREAM",
 '',
 'Now , fair Hippolyta , our nuptial hour ',
 'Draws on apace : four happy days bring in ',
 'Another moon ; but O ! methinks how slow ']

In [21]:
words_list = []
for line in text_header:
    split_line = line.split()
    words_list.extend(split_line)

In [24]:
words_list = []
for line in text_header:
    split_line = split_words(line)
    words_list.extend(split_line)

In [27]:
list(map(split_words, text_header))

[['A', "MIDSUMMER-NIGHT'S", 'DREAM'],
 [],
 ['Now', ',', 'fair', 'Hippolyta', ',', 'our', 'nuptial', 'hour'],
 ['Draws', 'on', 'apace', ':', 'four', 'happy', 'days', 'bring', 'in'],
 ['Another', 'moon', ';', 'but', 'O', '!', 'methinks', 'how', 'slow']]

In [23]:
def split_words(line):
    split_line = line.split()
    return split_line

In [28]:
list(map(lambda line: line.split(), text_header))

[['A', "MIDSUMMER-NIGHT'S", 'DREAM'],
 [],
 ['Now', ',', 'fair', 'Hippolyta', ',', 'our', 'nuptial', 'hour'],
 ['Draws', 'on', 'apace', ':', 'four', 'happy', 'days', 'bring', 'in'],
 ['Another', 'moon', ';', 'but', 'O', '!', 'methinks', 'how', 'slow']]

These two approaches are identical:

In [48]:
words = text.flatMap(lambda x: x.split())

In [49]:
words = text.flatMap(split_words)

In [47]:
words.take(5)

['A', "MIDSUMMER-NIGHT'S", 'DREAM', 'Now', ',']

In [37]:
wordCounts = words.map(lambda x: (x, 1))

In [38]:
wordCounts.take(5)

[('A', 1), ("MIDSUMMER-NIGHT'S", 1), ('DREAM', 1), ('Now', 1), (',', 1)]

In [53]:
# same as in the documentation: 
from operator import add
word_sums = wordCounts.reduceByKey(add)

We can see from the docs (https://docs.python.org/3/library/operator.html#operator.add) that the add function is basically this:

`lambda x, y: x + y`

In [54]:
word_sums.take(5)

[("MIDSUMMER-NIGHT'S", 1),
 ('Now', 741),
 ('Hippolyta', 6),
 ('nuptial', 21),
 ('apace', 25)]

In [55]:
# Another equivalent way:
word_sums = wordCounts.reduceByKey(lambda x, y: x + y)

In [56]:
word_sums.take(5)

[("MIDSUMMER-NIGHT'S", 1),
 ('Now', 741),
 ('Hippolyta', 6),
 ('nuptial', 21),
 ('apace', 25)]

It seems weird that this works at first, but since it's reducing, it's combining all the common keys.  So the lambda function is taking the existing count of a word, then adding 1 to it each time it encounters that word again.

Here is how to do the same thing with groupByKey.  Notice it returns this pyspark iterable class, which we can convert to a python list with `list()`.

In [59]:
wordCounts.groupByKey().collect()

[("MIDSUMMER-NIGHT'S",
  <pyspark.resultiterable.ResultIterable at 0x7fb87133bc88>),
 ('Now', <pyspark.resultiterable.ResultIterable at 0x7fb87133bd30>),
 ('Hippolyta', <pyspark.resultiterable.ResultIterable at 0x7fb87133be48>),
 ('nuptial', <pyspark.resultiterable.ResultIterable at 0x7fb87133bda0>),
 ('apace', <pyspark.resultiterable.ResultIterable at 0x7fb87133bf28>),
 ('four', <pyspark.resultiterable.ResultIterable at 0x7fb87133bf98>),
 ('days', <pyspark.resultiterable.ResultIterable at 0x7fb871374048>),
 ('bring', <pyspark.resultiterable.ResultIterable at 0x7fb87133be10>),
 ('in', <pyspark.resultiterable.ResultIterable at 0x7fb871374128>),
 ('Another', <pyspark.resultiterable.ResultIterable at 0x7fb871374198>),
 (';', <pyspark.resultiterable.ResultIterable at 0x7fb8713741d0>),
 ('but', <pyspark.resultiterable.ResultIterable at 0x7fb871374240>),
 ('O', <pyspark.resultiterable.ResultIterable at 0x7fb871374278>),
 ('!', <pyspark.resultiterable.ResultIterable at 0x7fb8713742b0>),
 ('me

In [62]:
wordCounts.groupByKey().map(lambda x: (x[0], list(x[1]))).take(5)

[("MIDSUMMER-NIGHT'S", [1]),
 ('Now',
  [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
  

In [63]:
wordCounts.groupByKey().map(lambda x: (x[0], sum(list(x[1])))).take(5)

[("MIDSUMMER-NIGHT'S", 1),
 ('Now', 741),
 ('Hippolyta', 6),
 ('nuptial', 21),
 ('apace', 25)]

The pyspark documentation may also be helpful: https://spark.apache.org/docs/latest/api/python/pyspark.html?highlight=reducebykey