In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Spark configuration: master "local" (run Spark in-process on this machine)
# and the application name "RatingsHistogram".
conf = SparkConf().setMaster("local").setAppName("RatingsHistogram")
# getOrCreate() reuses an already-running SparkContext instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed.
# Note: if a context already exists it is returned as-is and `conf` is not
# re-applied to it; restart the kernel to pick up configuration changes.
sc = SparkContext.getOrCreate(conf=conf)

In [10]:
# RDD with one element per line of the input file
# (presumably the MovieLens 100k ratings file, going by the path).
ratings_path = "ml-100k/u.data"
lines = sc.textFile(ratings_path)

In [15]:
def extract_rating(record):
    """Return the third whitespace-separated field of one input line.

    Judging by the counts printed further down ('1'..'5'), that field is the
    rating; it is kept as a string, not converted to int.
    """
    return record.split()[2]


ratings = lines.map(extract_rating)

In [20]:
# `ratings` holds plain rating strings, not (key, value) pairs, so the correct
# action is countByValue(). countByKey() keys on element[0], which only
# produced the right histogram because every rating is a single character.
print(ratings.countByValue())

defaultdict(<class 'int'>, {'3': 27145, '1': 6110, '2': 11370, '4': 34174, '5': 21201})


In [21]:
# Define an RDD of text lines backed by big.txt. This only declares the RDD;
# nothing is computed until an action is called on it.
corpus_path = "big.txt"
textFile = sc.textFile(corpus_path)

**NOTE:** Nothing actually happens in Spark until an *action* is called. *Transformations* do not trigger any processing; the driver program only starts executing when an *action* is called on an RDD.

In [22]:
# count() is an action, so this is the point where Spark actually processes
# the file; it returns the number of elements (here: lines) in the RDD.
textFile.count()

128457

How do we convert these lines into a list of words and then create an RDD containing just the words?
**Answer: use `flatMap`.**

In [36]:
# flatMap may emit any number of output elements per input element; the
# per-line word lists are flattened into a single RDD of words.
# str.split() with no arguments already ignores leading/trailing whitespace.
words = textFile.flatMap(lambda line: line.split())

In [37]:
# take(10) is an action returning the first 10 elements as a Python list:
# a cheap sanity check that flatMap produced individual words.
words.take(10)

['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'The',
 'Adventures',
 'of',
 'Sherlock',
 'Holmes']

In [38]:
# Total number of whitespace-separated tokens across the whole file.
words.count()

1095695

In [39]:
# countByValue() is an action: it returns a dict-like object on the driver
# mapping each distinct token to its number of occurrences.
# Tokens are compared verbatim, so case and attached punctuation matter
# (e.g. 'The' vs 'the', 'world.').
freq_count = words.countByValue()

In [40]:
# Display the word -> count mapping.
# NOTE(review): this dumps every distinct token into the cell output; for a
# large vocabulary consider showing only a slice (e.g. the most frequent words).
freq_count

defaultdict(int,
            {'The': 6149,
             'Project': 205,
             'Gutenberg': 78,
             'EBook': 5,
             'of': 39169,
             'Adventures': 2,
             'Sherlock': 95,
             'Holmes': 198,
             'by': 6384,
             'Sir': 30,
             'Arthur': 18,
             'Conan': 3,
             'Doyle': 2,
             '(#15': 1,
             'in': 19515,
             'our': 969,
             'series': 88,
             'Doyle)': 1,
             'Copyright': 6,
             'laws': 166,
             'are': 3418,
             'changing': 40,
             'all': 3349,
             'over': 1141,
             'the': 71744,
             'world.': 48,
             'Be': 19,
             'sure': 103,
             'to': 27895,
             'check': 35,
             'copyright': 42,
             'for': 6358,
             'your': 1116,
             'country': 231,
             'before': 1039,
             'downloading': 3,
             'or