In [0]:
# The goal of this manipulation is to get familiar with PySpark RDDs , if you have ever heard of "Hello world" in web development , "Word Count"
# is the same in distributed computing 
# In this notebook, I will setup a pipeline that will count the words in a document in a distributed manner.

In [0]:
#  Defining the spark context to play with RDDs
sc = spark.sparkContext

In [0]:
# S3 filepath
FILENAME = 's3://full-stack-bigdata-datasets/Big_Data/purple_rain.txt'

In [0]:
#Loading the filepath to a Spark RDD 
text_file = sc.textFile(FILENAME)

In [0]:
text_file

Out[8]: s3://full-stack-bigdata-datasets/Big_Data/purple_rain.txt MapPartitionsRDD[4] at textFile at NativeMethodAccessorImpl.java:0

In [0]:
# The last cell doesn't tell us much, let's check the first 3 elements of this RDD
text_file.take(3)


Out[9]: ['I never meant to cause you any sorrow',
 'I never meant to cause you any pain',
 'I only wanted one time to see you laughing']

In [0]:
# List of tokens
tokens = text_file.flatMap(lambda line: line.split(' '))


In [0]:
tokens.take(10)


Out[11]: ['I', 'never', 'meant', 'to', 'cause', 'you', 'any', 'sorrow', 'I', 'never']

In [0]:
# Now that we have our list of words (well, not exactly a list of words, it is still a RDD), we can start counting things.we need to map each word to an initial count
# Writing a function that takes a token as input and returns a tuple then mapping the tokens to this function to create the variable partial_count
def token_to_tuple(token):
    return (token, 1)
partial_count = tokens.map(token_to_tuple)
partial_count
partial_count.take(10)


Out[12]: [('I', 1),
 ('never', 1),
 ('meant', 1),
 ('to', 1),
 ('cause', 1),
 ('you', 1),
 ('any', 1),
 ('sorrow', 1),
 ('I', 1),
 ('never', 1)]

In [0]:
grouped_by_key = partial_count.groupByKey()
grouped_by_key.take(3)

Out[13]: [('never', <pyspark.resultiterable.ResultIterable at 0x7fa8cb72eb20>),
 ('cause', <pyspark.resultiterable.ResultIterable at 0x7fa8cb72ee50>),
 ('pain', <pyspark.resultiterable.ResultIterable at 0x7fa8cb72edc0>)]

In [0]:
# Each element of grouped_by_key is a tuple, and inside a tuple there is an iterable we can iterate over.

first_item = grouped_by_key.take(1)[0]
first_item

Out[14]: ('never', <pyspark.resultiterable.ResultIterable at 0x7fa8cbc7d2b0>)

In [0]:
def print_item(item_as_tuple):
    token_name, occurences = item_as_tuple
    occurences_as_list = list(occurences)
    print(f"{token_name}: {occurences_as_list}")

In [0]:
# Taking the first 10 items from grouped_by_key and then iterating over them.
for item in grouped_by_key.take(10):
    print_item(item)

never: [1, 1, 1, 1]
cause: [1, 1]
pain: [1]
only: [1, 1, 1, 1, 1, 1, 1]
in: [1, 1]
rain: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Purple: [1, 1, 1, 1, 1, 1, 1, 1, 1]
rain,: [1, 1, 1, 1, 1, 1, 1, 1, 1]
bathing: [1]


In [0]:
# When you take the first 10 elements of grouped_by_key, it returns a list of Tuple[str, ResultIterable].
# What I want instead is a list of Tuple[str, int] where the second element is the total number of occurence for the fist element.
[(token, sum(list(occurences))) for token, occurences in grouped_by_key.take(10)]

Out[17]: [('never', 4),
 ('cause', 2),
 ('pain', 1),
 ('only', 7),
 ('in', 2),
 ('rain', 14),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('bathing', 1)]

In [0]:
def reduce_function(a, b):
    return a + b

In [0]:
reduced = partial_count.reduceByKey(reduce_function)
reduced.take(10)

Out[20]: [('never', 4),
 ('cause', 2),
 ('pain', 1),
 ('only', 7),
 ('in', 2),
 ('rain', 14),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('bathing', 1)]

In [0]:
# I've got a list of tuples, where the key is the token, and the value is its count within the text, but... they're not ordered. Which is inconvenient if I want to have the 10 most popular tokens within the text.
sorted_counts = reduced.sortBy(lambda t: t[1])
sorted_counts.take(10)
desc_sorted_counts = reduced.sortBy(lambda t: -t[1])# descending sort
desc_sorted_counts.take(10)

Out[21]: [('rain', 14),
 ('I', 14),
 ('you', 14),
 ('purple', 14),
 ('to', 13),
 ('', 10),
 ('Purple', 9),
 ('rain,', 9),
 ('only', 7),
 ('see', 6)]