In [1]:
from pyspark.sql import SparkSession
from operator import add
import re
import json
import time 

def preprocess(rdd):
    """Parse JSON-lines records and tokenize the text of each record's ``body``.

    Every element is decoded with ``json.loads``; the value under its
    ``'body'`` key is stripped, lower-cased, cleared of every character that
    is neither a word character nor whitespace, and finally split into tokens.

    Tokens are split on any run of whitespace (spaces, tabs, newlines), so a
    multi-line body never produces a token with an embedded line break and no
    empty tokens are emitted.

    Args:
        rdd: RDD (or any object with a compatible ``map`` method) holding one
            JSON document per element, each with a string ``'body'`` field.

    Returns:
        The result of ``rdd.map``: one list of lower-case tokens per record.

    Note:
        ``json.loads`` errors and a missing ``'body'`` key (``KeyError``) are
        not handled here; they surface when the mapping is evaluated.
    """
    # Matches every char that is neither a word char (letter, digit,
    # underscore) nor whitespace.
    pattern = r'[^\w\s]'

    def tokenize(line):
        body = json.loads(line)['body']
        cleaned = re.sub(pattern, '', body.strip().lower())
        # split() without a separator breaks on ANY whitespace run; the
        # previous split(' ') left tabs/newlines inside tokens.
        return cleaned.split()

    return rdd.map(tokenize)


def word_count(rdd):
    """Count word occurrences and order them from most to least frequent.

    Args:
        rdd: RDD whose elements are iterables of string tokens.

    Returns:
        An RDD of ``(word, count)`` pairs sorted by ``count`` in descending
        order. Tokens are stripped first and empty ones are discarded.
    """
    tokens = rdd.flatMap(lambda token_list: token_list)
    non_empty = tokens.map(lambda token: token.strip()) \
                      .filter(lambda token: token != '')
    counts = non_empty.map(lambda token: (token, 1)).reduceByKey(add)

    # sortByKey orders on the first tuple element, so flip each pair to
    # (count, word) for the sort and flip it back afterwards.
    ranked = counts.map(lambda pair: (pair[1], pair[0])).sortByKey(False)
    return ranked.map(lambda pair: (pair[1], pair[0]))



# Build (or reuse) the session for the standalone master. The chain is wrapped
# in parentheses so no backslash line continuations are needed.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_wordcount_timing")
    # Dynamic allocation with shuffle tracking; external shuffle service off.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.cores.max", 2)
    # Driver and block-manager ports are pinned to fixed values
    # (presumably to match firewall rules -- confirm).
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Old API (RDD)
spark_context = spark_session.sparkContext
spark_context.setLogLevel("WARN")
# RDD transformations are lazy: nothing is read from HDFS until the take()
# action below runs, so the file read is part of the timed interval.
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/RC_2011-07')

# Start Timing
# perf_counter() is monotonic, so the measurement is not skewed if the system
# clock is adjusted while the job is running.
start = time.perf_counter()

rc_tokenize = preprocess(rc_lines)
# Keep the ten most frequent (word, count) pairs instead of discarding them,
# so they can be inspected after the timing run.
top_words = word_count(rc_tokenize).take(10)

end = time.perf_counter()
elapsed = end - start

print("Elapsed time: {}s".format(elapsed))

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/14 20:47:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/14 20:47:36 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.

Elapsed time: 494.26002979278564s


                                                                                

In [2]:
# Stop the Spark session created in the first cell now that the run is done.
spark_session.stop()