In [1]:
# findspark locates the local Spark installation so that the `pyspark`
# package can be imported by the cells that follow.
import findspark

findspark.init()

In [2]:
from pyspark import SparkConf, SparkContext

# Only one SparkContext may be active per JVM/kernel. Calling the constructor
# a second time (e.g. when this cell is re-run) raises
# "ValueError: Cannot run multiple SparkContexts at once".
# getOrCreate returns the already-running context if there is one and only
# builds a new one (local master, app name "WordCount") otherwise, so this
# cell can be executed repeatedly.
sparkConf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext.getOrCreate(sparkConf)

22/05/05 23:41:39 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/05 23:41:39 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/05 23:41:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# textFile is lazy: it only records where the data lives. Nothing is read
# from HDFS until an action is applied to the RDD.
readmePath = "hdfs://localhost:9000/ml-latest-small/README.txt"
fileRdd = sc.textFile(readmePath)


In [7]:
# count() is an action, so it triggers a job: the executors read the file
# from HDFS into partitions, count the lines, and the total is returned
# to the driver.
lineCount = fileRdd.count()
lineCount

                                                                                

153

In [9]:
# Peek at the first five lines of the raw file.
firstLines = fileRdd.take(5)
firstLines

['Summary',
 '',
 'This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.',
 '']

In [10]:
# RDD lineage: fileRdd -> lowerCaseRdd
# map is a transformation, so it is evaluated lazily; no job is created here.
def normalizeLine(line):
    """Return the line with surrounding whitespace removed, in lower case."""
    return line.strip().lower()


lowerCaseRdd = fileRdd.map(normalizeLine)

In [11]:
# collect() would return every line to the driver; take(5) fetches only the
# first five, which is enough to check the normalisation.
lowerCaseRdd.take(5)

['summary',
 '',
 'this dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [movielens](http://movielens.org), a movie recommendation service. it contains 100836 ratings and 3683 tag applications across 9742 movies. these data were created by 610 users between march 29, 1996 and september 24, 2018. this dataset was generated on september 26, 2018.',
 '']

In [13]:
# Split each line on single spaces: one list of words per line.
# An empty line becomes [''], so empty strings still have to be removed later.
wordListRdd = lowerCaseRdd.map(lambda text: text.split(" "))
wordListRdd.take(5)

[['summary'],
 [''],
 ['this',
  'dataset',
  '(ml-latest-small)',
  'describes',
  '5-star',
  'rating',
  'and',
  'free-text',
  'tagging',
  'activity',
  'from',
  '[movielens](http://movielens.org),',
  'a',
  'movie',
  'recommendation',
  'service.',
  'it',
  'contains',
  '100836',
  'ratings',
  'and',
  '3683',
  'tag',
  'applications',
  'across',
  '9742',
  'movies.',
  'these',
  'data',
  'were',
  'created',
  'by',
  '610',
  'users',
  'between',
  'march',
  '29,',
  '1996',
  'and',
  'september',
  '24,',
  '2018.',
  'this',
  'dataset',
  'was',
  'generated',
  'on',
  'september',
  '26,',
  '2018.'],
 ['']]

In [14]:
# flatMap unpacks each per-line list, so every word becomes its own record.
wordRdd = wordListRdd.flatMap(lambda words: words)

In [15]:
# Number of records after flattening (empty strings are still included).
wordTotal = wordRdd.count()
print(wordTotal)

1278


In [17]:
# Drop the empty strings, then recount.
# This rebinds wordRdd; re-running the cell is harmless because filtering an
# already-filtered RDD yields the same records.
wordRdd = wordRdd.filter(lambda token: len(token) > 0)
nonEmptyTotal = wordRdd.count()
print(nonEmptyTotal)

1216


In [18]:
# Turn every word into a (key, value) pair such as ("spark", 1), which is the
# shape reduceByKey expects.
pairRdd = wordRdd.map(lambda token: (token, 1))
pairRdd.take(5)

[('summary', 1),
 ('this', 1),
 ('dataset', 1),
 ('(ml-latest-small)', 1)]

In [20]:
# reduceByKey is a transformation: it adds up the values of each word.
# The take(5) below is the action that actually runs the job.
wordCountRdd = pairRdd.reduceByKey(lambda left, right: left + right)
wordCountRdd.take(5)

                                                                                

[('summary', 1),
 ('this', 13),
 ('dataset', 6),
 ('(ml-latest-small)', 1)]

In [21]:
# Write the word counts to HDFS as text.
# saveAsTextFile is an action. The target is a directory, and the partition
# files (part-NNNNN) are written inside it.
# NOTE(review): the write is expected to fail if the directory already exists,
# so remove it (hdfs dfs -rm -r /word-count-results) before re-running.
outputDir = "hdfs://localhost:9000/word-count-results"
wordCountRdd.saveAsTextFile(outputDir)

In [22]:
# hdfs dfs -ls /word-count-results
#  _SUCCESS (0 bytes) marks that the last write completed successfully
# part-00000 - partition file
# note the partition file name (part-00000 or another part-NNNNN name)
# hdfs dfs -cat /word-count-results/part-00000

# use hdfs web ui  http://localhost:50070/

In [23]:
# Redistribute the counts over two partitions before saving, so the output
# directory receives two part files (part-00000 and part-00001).
twoPartitionRdd = wordCountRdd.repartition(2)
twoPartitionRdd.saveAsTextFile("hdfs://localhost:9000/word-count-results2")

                                                                                

In [24]:
# hdfs dfs -ls /word-count-results2
# hdfs dfs -cat /word-count-results2/part-00000
# hdfs dfs -cat /word-count-results2/part-00001