In [18]:
from pyspark.sql import SparkSession
from operator import add
import re
import json
import time 

In [19]:
# New API
spark_session = SparkSession\
        .builder\
        .master("spark://master:7077") \
        .appName("haodong_zhao_wordcount")\
        .config("spark.executor.cores",2)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9998)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("WARN")

2022-03-13 17:21:40,719 WARN spark.ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [20]:
# df_rc = spark_session\
#         .read \
#         .option("header", "true")\
#         .json('hdfs://master:9000/dataset/RC_2011-07')\
#         .cache() # in-memory
# df_rc.printSchema()
# print(df_rc.count())
# print(data_frame.rdd.getNumPartitions())

In [21]:
# in the debug mode, the results of the steps would be printed.
is_debug = False


In [22]:
# Start Timing
start = time.time()

In [23]:
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/RC_2011-07')

if is_debug:
    rc_num = rc_lines.count()
    print(rc_num)

In [24]:
if is_debug:
    rc_lines.take(5)

# Preprocessing
In the preprocessing, we try to split every string into words.
1. since the file is a JSON file, each line represents a json object, we load the lines as JSON object;
2. the contents of the comments are located in 'body' field, so we extract data in that field;
3. convert all the strings into lowercase
4. remove all the chars other than alphanumeric chars
5. split each line into words

In [25]:
def preprocess(rdd):

    rdd = rdd.map(lambda line: json.loads(line))\
             .map(lambda line: line['body']) \
             .map(lambda line: line.strip()) \
             .map(lambda line: line.lower()) 

    pattern = r'[^\w\s]' # matches chars other than alphanumeric chars and whitespace chars
    rdd = rdd.map(lambda line: re.sub(pattern,'',line)) \
             .map(lambda line: line.split(' ')) 
    return rdd

In [26]:
rc_tokenize = preprocess(rc_lines)

In [27]:
if is_debug:
    rc_tokenize.take(10)

# Wordcount
In the wordcounting step, we count the words and find the most frequently used words in the dataset.

In [28]:
def word_count(rdd):
    rdd = rdd.flatMap(lambda line: line)\
            .map(lambda word: word.strip()) \
            .map(lambda word: (word, 1))\
            .reduceByKey(lambda v1, v2 : v1+v2)\
            .map(lambda kv: (kv[1], kv[0])) \
            .sortByKey(False)\
            .map(lambda vk: (vk[1], vk[0]))
    return rdd

## Get top-10 frequently used words.

In [29]:
word_count(rc_tokenize).take(10)

                                                                                

[('the', 12056693),
 ('to', 8183438),
 ('a', 7682373),
 ('i', 7091109),
 ('and', 6359431),
 ('of', 5508949),
 ('', 5143264),
 ('you', 4887249),
 ('that', 4467293),
 ('it', 4365313)]

---

# Timing

In [30]:
end = time.time()
elasped = end - start

In [31]:
print("Elasped time: {}s".format(elasped))

Elasped time: 135.34742069244385s


---

In [32]:
spark_session.stop()