# Strong Scaling

In [1]:
import json
import time

from pyspark.sql import SparkSession
from nltk.sentiment import SentimentIntensityAnalyzer


In [2]:
def preprocess(rdd):
    """Turn raw JSON lines into stripped comment bodies.

    Every element of ``rdd`` is parsed with ``json.loads``, its ``'body'``
    entry is looked up, and leading/trailing whitespace is stripped from it.

    Args:
        rdd: Object exposing ``map``. Each element must be a JSON document
            that decodes to a mapping with a ``'body'`` key holding a string;
            anything else raises inside the mapper.

    Returns:
        The result of ``rdd.map`` yielding one stripped body per input line.
    """
    def _body_text(raw_line):
        record = json.loads(raw_line)
        return record['body'].strip()

    return rdd.map(_body_text)


def classify_comments(rdd):
    """Pair every comment with a sentiment label.

    A single ``SentimentIntensityAnalyzer`` is created per call and captured
    by the mapping closure. The label is derived from the ``'compound'``
    entry of ``polarity_scores``:

    * ``compound >= 0.05``  -> ``'positive'``
    * ``compound <= -0.05`` -> ``'negative'``
    * otherwise             -> ``'neutral'``

    Args:
        rdd: Object exposing ``map`` whose elements are comment strings.

    Returns:
        The result of ``rdd.map`` yielding ``(comment, label)`` tuples.
    """
    analyzer = SentimentIntensityAnalyzer()

    def _label(text):
        compound = analyzer.polarity_scores(text)['compound']
        if compound >= 0.05:
            return 'positive'
        if compound <= -0.05:
            return 'negative'
        return 'neutral'

    def _with_label(text):
        return (text, _label(text))

    return rdd.map(_with_label)


def analyze_comments(rdd):
    """Count comments per sentiment label, largest count first.

    Args:
        rdd: Object exposing ``map``, ``reduceByKey`` and ``sortByKey`` whose
            elements are pairs carrying the label at index 1 (the shape
            returned by ``classify_comments``).

    Returns:
        The resulting dataset of ``(label, count)`` tuples, ordered by
        descending count.
    """
    def _add(left, right):
        return left + right

    def _swap(pair):
        return (pair[1], pair[0])

    # One (label, 1) record per comment, then sum the ones per label.
    label_ones = rdd.map(lambda pair: (pair[1], 1))
    label_counts = label_ones.reduceByKey(_add)

    # sortByKey orders on the first tuple slot, so move the count there for
    # the descending sort and flip back to (label, count) afterwards.
    counts_desc = label_counts.map(_swap).sortByKey(False)
    return counts_desc.map(_swap)


In [3]:
# Dataset file name; appended to the HDFS dataset directory when the input
# is loaded with textFile().
data_path = "RC_2011-07"
# Passed to "spark.executor.memory" when each session is built.
executor_memory = "2g"

---

In [4]:
# Core cap for this run; passed to "spark.cores.max" when the session below
# is built.
max_cores = 1

In [5]:
# Build the session through the SparkSession (new) API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_timing")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Per-run core cap and executor memory come from the config cells.
    .config("spark.cores.max", max_cores)
    .config("spark.executor.memory", executor_memory)
    # Driver and block-manager ports are pinned (presumably to match
    # firewall rules on the cluster -- confirm).
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The RDD (old) API is reached through the session's SparkContext.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

# NOTE(review): in the cells visible here rc_lines feeds a single action
# before the session is stopped, so the cached partitions are never re-read
# -- confirm that cache() is intended inside a timed run.
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/' + data_path).cache()


Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/17 17:21:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/17 17:21:26 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [6]:
# Time the whole pipeline. collect() is the action that triggers the lazy
# read / classify / aggregate stages, so it has to sit inside the timed
# region. perf_counter() is monotonic, so a wall-clock adjustment (e.g. NTP)
# during a long run cannot skew the measurement the way time.time() could.
start = time.perf_counter()

results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

end = time.perf_counter()
elapsed = end - start  # seconds

                                                                                

In [7]:
# Report the elapsed seconds from the timing cell and the (label, count)
# pairs returned by collect().
print("Elapsed time: {}s".format(elapsed))
print("Results: ", results)

Elapsed time: 4521.225337266922s
Results:  [('positive', 4386834), ('neutral', 3517926), ('negative', 2652706)]


In [8]:
# Stop the session so the next run's getOrCreate() builds a fresh one with
# its own core cap instead of reusing this one.
spark_session.stop()


---

In [9]:
# Core cap for this run; passed to "spark.cores.max" when the session below
# is built.
max_cores = 2

In [10]:
# Build the session through the SparkSession (new) API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_timing")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Per-run core cap and executor memory come from the config cells.
    .config("spark.cores.max", max_cores)
    .config("spark.executor.memory", executor_memory)
    # Driver and block-manager ports are pinned (presumably to match
    # firewall rules on the cluster -- confirm).
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The RDD (old) API is reached through the session's SparkContext.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

# NOTE(review): in the cells visible here rc_lines feeds a single action
# before the session is stopped, so the cached partitions are never re-read
# -- confirm that cache() is intended inside a timed run.
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/' + data_path).cache()


In [11]:
# Time the whole pipeline. collect() is the action that triggers the lazy
# read / classify / aggregate stages, so it has to sit inside the timed
# region. perf_counter() is monotonic, so a wall-clock adjustment (e.g. NTP)
# during a long run cannot skew the measurement the way time.time() could.
start = time.perf_counter()

results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

end = time.perf_counter()
elapsed = end - start  # seconds

                                                                                

In [12]:
# Report the elapsed seconds from the timing cell and the (label, count)
# pairs returned by collect().
print("Elapsed time: {}s".format(elapsed))
print("Results: ", results)


Elapsed time: 2321.3200421333313s
Results:  [('positive', 4386834), ('neutral', 3517926), ('negative', 2652706)]


In [13]:
# Stop the session so the next run's getOrCreate() builds a fresh one with
# its own core cap instead of reusing this one.
spark_session.stop()


---

In [14]:
# Core cap for this run; passed to "spark.cores.max" when the session below
# is built.
max_cores = 4


In [15]:
# Build the session through the SparkSession (new) API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_timing")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Per-run core cap and executor memory come from the config cells.
    .config("spark.cores.max", max_cores)
    .config("spark.executor.memory", executor_memory)
    # Driver and block-manager ports are pinned (presumably to match
    # firewall rules on the cluster -- confirm).
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The RDD (old) API is reached through the session's SparkContext.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

# NOTE(review): in the cells visible here rc_lines feeds a single action
# before the session is stopped, so the cached partitions are never re-read
# -- confirm that cache() is intended inside a timed run.
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/' + data_path).cache()



In [16]:
# Time the whole pipeline. collect() is the action that triggers the lazy
# read / classify / aggregate stages, so it has to sit inside the timed
# region. perf_counter() is monotonic, so a wall-clock adjustment (e.g. NTP)
# during a long run cannot skew the measurement the way time.time() could.
start = time.perf_counter()

results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

end = time.perf_counter()
elapsed = end - start  # seconds

                                                                                

In [17]:
# Report the elapsed seconds from the timing cell and the (label, count)
# pairs returned by collect().
print("Elapsed time: {}s".format(elapsed))
print("Results: ", results)

Elapsed time: 1211.1807935237885s
Results:  [('positive', 4386834), ('neutral', 3517926), ('negative', 2652706)]


In [18]:
# Stop the session so the next run's getOrCreate() builds a fresh one with
# its own core cap instead of reusing this one.
spark_session.stop()

---

In [19]:
# Core cap for this run; passed to "spark.cores.max" when the session below
# is built.
max_cores = 8


In [20]:
# Build the session through the SparkSession (new) API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_timing")
    # Dynamic allocation with shuffle tracking; the external shuffle
    # service is explicitly disabled.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Per-run core cap and executor memory come from the config cells.
    .config("spark.cores.max", max_cores)
    .config("spark.executor.memory", executor_memory)
    # Driver and block-manager ports are pinned (presumably to match
    # firewall rules on the cluster -- confirm).
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The RDD (old) API is reached through the session's SparkContext.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

# NOTE(review): in the cells visible here rc_lines feeds a single action
# before the session is stopped, so the cached partitions are never re-read
# -- confirm that cache() is intended inside a timed run.
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/' + data_path).cache()

In [21]:
# Time the whole pipeline. collect() is the action that triggers the lazy
# read / classify / aggregate stages, so it has to sit inside the timed
# region. perf_counter() is monotonic, so a wall-clock adjustment (e.g. NTP)
# during a long run cannot skew the measurement the way time.time() could.
start = time.perf_counter()

results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

end = time.perf_counter()
elapsed = end - start  # seconds

                                                                                

In [22]:
# Report the elapsed seconds from the timing cell and the (label, count)
# pairs returned by collect().
print("Elapsed time: {}s".format(elapsed))
print("Results: ", results)

Elapsed time: 632.7771925926208s
Results:  [('positive', 4386834), ('neutral', 3517926), ('negative', 2652706)]


In [23]:
# Shut the session down now that this run's results have been reported.
spark_session.stop()

---