In [12]:
from pyspark.sql import SparkSession
from operator import add
import re
import json
import time 

In [13]:
# New API
spark_session = SparkSession\
        .builder\
        .master("spark://master:7077") \
        .appName("haodong_zhao_wordcount_test")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","300s")\
        .config("spark.executor.cores",2)\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("WARN")

22/03/14 20:38:45 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [14]:
# df_rc = spark_session\
#         .read \
#         .option("header", "true")\
#         .json('hdfs://master:9000/dataset/RC_2011-07')\
#         .cache() # in-memory
# df_rc.printSchema()
# print(df_rc.count())
# print(data_frame.rdd.getNumPartitions())

In [15]:
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/RC_2011-07')

rc_num = rc_lines.count()
print(rc_num)



10557466


                                                                                

In [16]:
rc_lines.take(5)

['{"distinguished":null,"downs":0,"created_utc":"1309478400","controversiality":0,"edited":false,"gilded":0,"author_flair_css_class":"mordekaiser","id":"c22x4aq","author":"adomorn","retrieved_on":1427302516,"score_hidden":false,"subreddit_id":"t5_2rfxx","score":1,"name":"t1_c22x4aq","author_flair_text":"[adomorn] (NA)","link_id":"t3_id1nc","archived":true,"ups":1,"parent_id":"t3_id1nc","subreddit":"leagueoflegends","body":"Good lord.  Yes."}',
 '{"distinguished":null,"created_utc":"1309478401","downs":0,"controversiality":0,"gilded":0,"edited":false,"author":"[deleted]","id":"c22x4at","author_flair_css_class":null,"retrieved_on":1427302516,"score_hidden":false,"score":-3,"subreddit_id":"t5_2qwxl","author_flair_text":null,"archived":true,"link_id":"t3_idpys","name":"t1_c22x4at","ups":-3,"parent_id":"t1_c22wz9g","body":"I don\'t know about that...","subreddit":"runescape"}',
 '{"edited":false,"gilded":0,"author":"matts2","author_flair_css_class":null,"id":"c22x4au","retrieved_on":1427302

# Preprocessing
In the preprocessing, we try to split every string into words.
1. since the file is a JSON file, each line represents a json object, we load the lines as JSON object;
2. the contents of the comments are located in 'body' field, so we extract data in that field;
3. convert all the strings into lowercase
4. remove all the chars other than alphanumeric chars
5. split each line into words

In [17]:
def preprocess(rdd):
    rdd = rdd.map(lambda line: json.loads(line))\
             .map(lambda line: line['body']) \
             .map(lambda line: line.strip()) \
             .map(lambda line: line.lower()) 

    pattern = r'[^\w\s]' # matches chars other than alphanumeric chars and whitespace chars
    rdd = rdd.map(lambda line: re.sub(pattern,'',line)) \
             .map(lambda line: line.split(' ')) 

    return rdd

In [18]:
rc_tokenize = preprocess(rc_lines)

In [19]:
rc_tokenize.take(10)

[['good', 'lord', '', 'yes'],
 ['i', 'dont', 'know', 'about', 'that'],
 ['explain',
  'something',
  'about',
  'israel',
  'in',
  'simple',
  'terms',
  '\n\nno'],
 ['i',
  'would',
  'add',
  'that',
  'the',
  'more',
  'exercise',
  'part',
  'shouldnt',
  'be',
  'more',
  'squats',
  'instead',
  'do',
  'some',
  'cardio',
  'that',
  'requires',
  'all',
  'the',
  'muscles',
  'to',
  'work',
  'swimming',
  'is',
  'great',
  'to',
  'just',
  'get',
  'the',
  'muscles',
  'some',
  'air',
  'running',
  'kayaking',
  'jumping',
  'jacks',
  'ect',
  'nothing',
  'that',
  'is',
  'going',
  'to',
  'be',
  'too',
  'harsh',
  'on',
  'the',
  'quads',
  '\n\nwhen',
  'the',
  'muscles',
  'get',
  'sore',
  'it',
  'is',
  'an',
  'easy',
  'excuse',
  'to',
  'rest',
  'however',
  'exercise',
  'is',
  'the',
  'better',
  'option',
  'and',
  'actually',
  'helps',
  'the',
  'soreness',
  'go',
  'away',
  'faster'],
 ['care',
  'to',
  'explain',
  '4',
  '',
  'is',


# Wordcount
In the wordcounting step, we count the words and find the most frequently used words in the dataset.

In [20]:
def word_count(rdd):
    rdd = rdd.flatMap(lambda line: line)\
             .map(lambda word: word.strip()) \
             .filter(lambda x:x!='')\
             .map(lambda word: (word, 1))\
             .reduceByKey(lambda v1, v2 : v1+v2)\
             .map(lambda kv: (kv[1], kv[0])) \
             .sortByKey(False)\
             .map(lambda vk: (vk[1], vk[0]))
    return rdd

## Get top-10 frequently used words.

In [21]:
word_count(rc_tokenize).take(10)

                                                                                

[('the', 12056693),
 ('to', 8183438),
 ('a', 7682373),
 ('i', 7091109),
 ('and', 6359431),
 ('of', 5508949),
 ('you', 4887249),
 ('that', 4467293),
 ('it', 4365313),
 ('is', 4319805)]

In [22]:
spark_session.stop()

---