In [2]:
import findspark
# Make the local Spark installation importable as `pyspark`; must run before pyspark is imported
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext

# Reuse the active context when this cell is re-run: constructing a second
# SparkContext in the same kernel raises "Cannot run multiple SparkContexts at once".
# Master "local" runs Spark inside this process with a single worker thread;
# "WordCount" is the application name shown in the Spark UI.
sc = SparkContext.getOrCreate(
    SparkConf().setMaster("local").setAppName("WordCount")
)

22/05/05 23:44:05 WARN Utils: Your hostname, ubuntu-virtual-machine resolves to a loopback address: 127.0.1.1; using 192.168.174.129 instead (on interface ens33)
22/05/05 23:44:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/05/05 23:44:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Lazy evaluation: textFile only records where the data lives (one record per line);
# the file is not read from HDFS until an action is applied to the RDD
fileRdd = sc.textFile("hdfs://localhost:9000/ml-latest-small/README.txt")

In [5]:
# count() is an action, so this is the point where the file is actually read.
# Left as the cell's last expression so Jupyter displays the value itself.
fileRdd.count()

[Stage 0:>                                                          (0 + 1) / 1]

153


                                                                                

In [6]:
# take(5) is an action: returns at most the first 5 lines to the driver as a Python list
fileRdd.take(5)

['Summary',
 '',
 'This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.',
 '']

In [7]:
# count() is an action: it triggers a job in which the executors read the file from HDFS,
# load its lines into partitions, and report the number of records back to the driver
fileRdd.count()

153

In [8]:
# collect() is an action as well: it creates a job, reads the data from HDFS and
# returns every record to the driver as a Python list.
# NOTE: acceptable here because the file has only 153 lines; prefer take(n) on large RDDs.
fileRdd.collect()

['Summary',
 '',
 'This dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [MovieLens](http://movielens.org), a movie recommendation service. It contains 100836 ratings and 3683 tag applications across 9742 movies. These data were created by 610 users between March 29, 1996 and September 24, 2018. This dataset was generated on September 26, 2018.',
 '',
 'Users were selected at random for inclusion. All selected users had rated at least 20 movies. No demographic information is included. Each user is represented by an id, and no other information is provided.',
 '',
 'The data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. More details about the contents and use of all these files follows.',
 '',
 'This is a *development* dataset. As such, it may change over time and is not an appropriate dataset for shared research results. See available *benchmark* datasets if that is your intent.',
 '',
 'This and other GroupLens

In [9]:
# RDD lineage: fileRdd -> lowerCaseRdd
# map() is a lazy transformation, so defining it does not create a job
lowerCaseRdd = fileRdd.map(lambda raw_line: raw_line.strip().lower())

In [10]:
# Action: preview at most the first 5 stripped, lower-cased lines
lowerCaseRdd.take(5)

['summary',
 '',
 'this dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [movielens](http://movielens.org), a movie recommendation service. it contains 100836 ratings and 3683 tag applications across 9742 movies. these data were created by 610 users between march 29, 1996 and september 24, 2018. this dataset was generated on september 26, 2018.',
 '']

In [11]:
# RDD lineage: fileRdd -> lowerCaseRdd
# map() is a lazy transformation, so no job is created here.
# NOTE(review): this repeats the earlier lowerCaseRdd definition verbatim; it rebinds the
# name to an equivalent RDD and looks redundant.
lowerCaseRdd = fileRdd.map (lambda line: line.strip().lower())

In [12]:
# collect() is an action: it creates a job that loads and reads the file, strips and
# lower-cases every line, and returns all of the results to the driver
lowerCaseRdd.collect()

['summary',
 '',
 'this dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from [movielens](http://movielens.org), a movie recommendation service. it contains 100836 ratings and 3683 tag applications across 9742 movies. these data were created by 610 users between march 29, 1996 and september 24, 2018. this dataset was generated on september 26, 2018.',
 '',
 'users were selected at random for inclusion. all selected users had rated at least 20 movies. no demographic information is included. each user is represented by an id, and no other information is provided.',
 '',
 'the data are contained in the files `links.csv`, `movies.csv`, `ratings.csv` and `tags.csv`. more details about the contents and use of all these files follows.',
 '',
 'this is a *development* dataset. as such, it may change over time and is not an appropriate dataset for shared research results. see available *benchmark* datasets if that is your intent.',
 '',
 'this and other grouplens

In [13]:
# Transformation: turn each line into a list of words by splitting on single spaces
wordListRdd = lowerCaseRdd.map(lambda text: text.split(" "))


In [14]:
# Action: returns every line's word list to the driver (one inner list per line)
wordListRdd.collect()


[['summary'],
 [''],
 ['this',
  'dataset',
  '(ml-latest-small)',
  'describes',
  '5-star',
  'rating',
  'and',
  'free-text',
  'tagging',
  'activity',
  'from',
  '[movielens](http://movielens.org),',
  'a',
  'movie',
  'recommendation',
  'service.',
  'it',
  'contains',
  '100836',
  'ratings',
  'and',
  '3683',
  'tag',
  'applications',
  'across',
  '9742',
  'movies.',
  'these',
  'data',
  'were',
  'created',
  'by',
  '610',
  'users',
  'between',
  'march',
  '29,',
  '1996',
  'and',
  'september',
  '24,',
  '2018.',
  'this',
  'dataset',
  'was',
  'generated',
  'on',
  'september',
  '26,',
  '2018.'],
 [''],
 ['users',
  'were',
  'selected',
  'at',
  'random',
  'for',
  'inclusion.',
  'all',
  'selected',
  'users',
  'had',
  'rated',
  'at',
  'least',
  '20',
  'movies.',
  'no',
  'demographic',
  'information',
  'is',
  'included.',
  'each',
  'user',
  'is',
  'represented',
  'by',
  'an',
  'id,',
  'and',
  'no',
  'other',
  'information',
  

In [15]:
# Action: preview at most the first 5 records; each record is still a list of words
wordListRdd.take(5)

[['summary'],
 [''],
 ['this',
  'dataset',
  '(ml-latest-small)',
  'describes',
  '5-star',
  'rating',
  'and',
  'free-text',
  'tagging',
  'activity',
  'from',
  '[movielens](http://movielens.org),',
  'a',
  'movie',
  'recommendation',
  'service.',
  'it',
  'contains',
  '100836',
  'ratings',
  'and',
  '3683',
  'tag',
  'applications',
  'across',
  '9742',
  'movies.',
  'these',
  'data',
  'were',
  'created',
  'by',
  '610',
  'users',
  'between',
  'march',
  '29,',
  '1996',
  'and',
  'september',
  '24,',
  '2018.',
  'this',
  'dataset',
  'was',
  'generated',
  'on',
  'september',
  '26,',
  '2018.'],
 ['']]

In [16]:
# flatMap unwraps each per-line list so that every word becomes its own record
wordRdd = wordListRdd.flatMap(lambda words: words)

In [17]:
# Action: total number of records after flattening (empty strings are still included here)
wordRdd.count()

1278

In [18]:
# Drop the empty strings left behind by blank lines and repeated spaces
wordRdd = wordRdd.filter(lambda token: len(token) > 0)


In [19]:
# Action: bring all remaining (non-empty) words back to the driver for inspection
wordRdd.collect()


['summary',
 'this',
 'dataset',
 '(ml-latest-small)',
 'describes',
 '5-star',
 'rating',
 'and',
 'free-text',
 'tagging',
 'activity',
 'from',
 '[movielens](http://movielens.org),',
 'a',
 'movie',
 'recommendation',
 'service.',
 'it',
 'contains',
 '100836',
 'ratings',
 'and',
 '3683',
 'tag',
 'applications',
 'across',
 '9742',
 'movies.',
 'these',
 'data',
 'were',
 'created',
 'by',
 '610',
 'users',
 'between',
 'march',
 '29,',
 '1996',
 'and',
 'september',
 '24,',
 '2018.',
 'this',
 'dataset',
 'was',
 'generated',
 'on',
 'september',
 '26,',
 '2018.',
 'users',
 'were',
 'selected',
 'at',
 'random',
 'for',
 'inclusion.',
 'all',
 'selected',
 'users',
 'had',
 'rated',
 'at',
 'least',
 '20',
 'movies.',
 'no',
 'demographic',
 'information',
 'is',
 'included.',
 'each',
 'user',
 'is',
 'represented',
 'by',
 'an',
 'id,',
 'and',
 'no',
 'other',
 'information',
 'is',
 'provided.',
 'the',
 'data',
 'are',
 'contained',
 'in',
 'the',
 'files',
 '`links.csv`,',

In [20]:
# Build a pair RDD: each word becomes (word, 1), e.g. ("spark", 1), ready for reduceByKey
pairRdd = wordRdd.map(lambda token: (token, 1))

In [21]:
# Action: peek at up to 5 of the (word, 1) pairs
pairRdd.take(5)


[('summary', 1),
 ('this', 1),
 ('dataset', 1),
 ('(ml-latest-small)', 1)]

In [22]:
# Word count via reduceByKey: for each distinct word, add up its 1s into a total.
# reduceByKey is a transformation, so nothing runs until an action is called.
wordCountRdd = pairRdd.reduceByKey(lambda total, increment: total + increment)

In [23]:
# Action: runs the reduceByKey job and returns every (word, count) pair to the driver
wordCountRdd.collect()


[('summary', 1),
 ('this', 13),
 ('dataset', 6),
 ('(ml-latest-small)', 1),
 ('describes', 1),
 ('5-star', 2),
 ('rating', 3),
 ('and', 27),
 ('free-text', 1),
 ('tagging', 1),
 ('activity', 1),
 ('from', 6),
 ('[movielens](http://movielens.org),', 1),
 ('a', 14),
 ('movie', 15),
 ('recommendation', 1),
 ('service.', 1),
 ('it', 4),
 ('contains', 1),
 ('100836', 1),
 ('ratings', 4),
 ('3683', 1),
 ('tag', 6),
 ('applications', 1),
 ('across', 3),
 ('9742', 1),
 ('movies.', 3),
 ('these', 10),
 ('data', 16),
 ('were', 3),
 ('created', 1),
 ('by', 12),
 ('610', 1),
 ('users', 4),
 ('between', 3),
 ('march', 1),
 ('29,', 1),
 ('1996', 1),
 ('september', 2),
 ('24,', 1),
 ('2018.', 2),
 ('was', 1),
 ('generated', 1),
 ('on', 7),
 ('26,', 1),
 ('selected', 4),
 ('at', 8),
 ('random', 2),
 ('for', 16),
 ('inclusion.', 2),
 ('all', 5),
 ('had', 1),
 ('rated', 1),
 ('least', 2),
 ('20', 1),
 ('no', 4),
 ('demographic', 1),
 ('information', 6),
 ('is', 18),
 ('included.', 1),
 ('each', 8),
 ('u

In [24]:
# Action: peek at up to 5 of the (word, count) pairs
wordCountRdd.take(5)


[('summary', 1),
 ('this', 13),
 ('dataset', 6),
 ('(ml-latest-small)', 1)]

In [25]:
# How many partitions wordCountRdd is split into (1 in the recorded run)
wordCountRdd.getNumPartitions()

1

In [26]:
# Write the result into text files in HDFS.
# saveAsTextFile is an ACTION method.
# word-count-results3 is a folder; inside it are the partition files (part-00000, ...)
# NOTE(review): presumably the target folder must not exist yet, which would explain the
# numbered suffix — confirm before re-running this cell.

wordCountRdd.saveAsTextFile ("hdfs://localhost:9000/word-count-results3")

In [27]:
# hdfs dfs -ls /word-count-results3
#  _SUCCESS 0 bytes, marks that the last operation was stored successfully
# part-00000 - partition files
# note the partition file name, part-00000 or other file name
# hdfs dfs -cat /word-count-results3/part-00000

# use hdfs web ui  http://localhost:50070/

In [28]:
# hdfs dfs -ls /word-count-results3
#  _SUCCESS 0 bytes, marks that the last operation was stored successfully
# part-00000 - partition files
# note the partition file name, part-00000 or other file name
# hdfs dfs -cat /word-count-results3/part-00000

# use hdfs web ui  http://localhost:50070/

In [29]:
# Redistribute the counts over two partitions, then save them with saveAsTextFile
(
    wordCountRdd
    .repartition(2)
    .saveAsTextFile("hdfs://localhost:9000/word-count-results2")
)