# Strong Scaling

In [1]:
import json
import time

from pyspark.sql import SparkSession
from nltk.sentiment import SentimentIntensityAnalyzer


In [2]:
def preprocess(rdd):
    """Turn an RDD of JSON lines into an RDD of stripped comment bodies.

    Every element is decoded with ``json.loads``, its ``'body'`` entry is
    selected, and leading/trailing whitespace is removed with ``strip()``.

    Args:
        rdd: object exposing ``map``; each element is a JSON document (as a
            string) that contains a ``'body'`` key.

    Returns:
        The result of ``rdd.map`` whose elements are the stripped bodies.
    """
    def _extract_body(raw_line):
        record = json.loads(raw_line)
        return record['body'].strip()

    return rdd.map(_extract_body)


def classify_comments(rdd):
    """Attach a sentiment label to every comment in ``rdd``.

    A single ``SentimentIntensityAnalyzer`` is created here and captured by
    the function handed to ``rdd.map``. The label is derived from the
    ``'compound'`` entry of ``polarity_scores``:

    * ``>= 0.05``  -> ``'positive'``
    * ``<= -0.05`` -> ``'negative'``
    * otherwise    -> ``'neutral'``

    Args:
        rdd: object exposing ``map`` whose elements are comment texts.

    Returns:
        The result of ``rdd.map`` with ``(comment, label)`` tuples.
    """
    analyzer = SentimentIntensityAnalyzer()

    def _label_for(text):
        compound_score = analyzer.polarity_scores(text)['compound']
        if compound_score >= 0.05:
            return 'positive'
        if compound_score <= -0.05:
            return 'negative'
        return 'neutral'

    def _with_label(text):
        return (text, _label_for(text))

    return rdd.map(_with_label)


def analyze_comments(rdd):
    """Count comments per sentiment label, largest count first.

    Args:
        rdd: object exposing ``map``/``reduceByKey``/``sortByKey`` whose
            elements are pairs with the label at index 1.

    Returns:
        RDD of ``(label, count)`` tuples ordered by descending count.
    """
    def _add(left, right):
        return left + right

    # One (label, 1) record per comment, summed per label.
    label_counts = rdd.map(lambda pair: (pair[1], 1)).reduceByKey(_add)

    # sortByKey orders on the first tuple element, so put the count first,
    # sort descending, then flip back to (label, count).
    count_first = label_counts.map(lambda label_count: (label_count[1], label_count[0]))
    ranked = count_first.sortByKey(False)
    return ranked.map(lambda count_label: (count_label[1], count_label[0]))


def rdd_slice(rdd, start, end):
    """Keep only the elements whose ``zipWithIndex`` position is in range.

    Both bounds are inclusive: an element is kept when
    ``start <= position <= end``.

    Args:
        rdd: object exposing ``zipWithIndex``/``filter``/``map``.
        start: lowest position to keep.
        end: highest position to keep.

    Returns:
        RDD with the selected elements, without their positions.
    """
    def _in_range(item_with_index):
        position = item_with_index[1]
        return start <= position <= end

    indexed = rdd.zipWithIndex()
    return indexed.filter(_in_range).map(lambda item_with_index: item_with_index[0])



---

In [3]:
# Number of leading input lines to keep (rdd_slice is called with 0..data_size-1).
data_size = 1000000
# Value handed to the spark.executor.memory setting.
executor_memory = "3g"
# Dataset file name, appended to the HDFS dataset directory when loading.
data_path = "RC_2011-07"

In [4]:
# Build (or reuse) the session through the SparkSession builder API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_horizontal_strong_worker1")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.executor.memory", executor_memory)
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# The RDD API is reached through the session's SparkContext.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("ERROR")

# Read the dataset from HDFS and mark it for caching, then keep only the
# first data_size lines (rdd_slice bounds are inclusive, hence data_size-1).
dataset_uri = 'hdfs://master:9000/dataset/' + data_path
rc_lines = spark_context.textFile(dataset_uri).cache()
rc_lines = rdd_slice(rc_lines, 0, data_size-1)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/18 23:19:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/18 23:19:58 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.
                                                                                

In [5]:
# Time the full pipeline, including collect(), which returns the per-label
# counts to the driver.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval is not distorted if the system wall clock is adjusted during
# this multi-minute run (time.time() offers no such guarantee).
start = time.perf_counter()

results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

end = time.perf_counter()

# Duration in seconds (fractional).
elapsed = end - start

                                                                                

In [6]:
# Report the measured duration first, then the per-label counts.
elapsed_message = "Elapsed time: {}s".format(elapsed)
print(elapsed_message)
print("Results: ", results)

Elapsed time: 251.2565562725067s
Results:  [('positive', 414697), ('neutral', 338313), ('negative', 246990)]


In [7]:
# Shut this run's session down; the next run calls getOrCreate() again with
# the same fixed driver/blockManager ports.
spark_session.stop()

---

In [8]:
# Number of leading input lines to keep (rdd_slice is called with 0..data_size-1).
data_size = 2000000
# Value handed to the spark.executor.memory setting.
executor_memory = "3g"
# Dataset file name, appended to the HDFS dataset directory when loading.
data_path = "RC_2011-07"

In [9]:
# Build (or reuse) the session through the SparkSession builder API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_horizontal_strong_worker1")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.executor.memory", executor_memory)
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

try:
    # The RDD API is reached through the session's SparkContext.
    spark_context = spark_session.sparkContext
    spark_context.setLogLevel("ERROR")

    # Read the dataset from HDFS and mark it for caching, then keep only the
    # first data_size lines (rdd_slice bounds are inclusive, hence data_size-1).
    rc_lines = spark_context.textFile(
        'hdfs://master:9000/dataset/' + data_path).cache()
    rc_lines = rdd_slice(rc_lines, 0, data_size-1)

    # Time the full pipeline, including collect(). perf_counter() is
    # monotonic, so wall-clock adjustments during the run cannot skew the
    # measured interval.
    start = time.perf_counter()

    results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

    end = time.perf_counter()
    elapsed = end - start

    print("Elapsed time: {}s".format(elapsed))
    print("Results: ", results)
finally:
    # Stop the session even when the job fails, so a failed run does not
    # leave a live session (and its fixed ports) behind for the next cell.
    spark_session.stop()

                                                                                

Elapsed time: 464.3368036746979s
Results:  [('positive', 832168), ('neutral', 667892), ('negative', 499940)]


---

In [10]:
# Number of leading input lines to keep (rdd_slice is called with 0..data_size-1).
data_size = 4000000
# Value handed to the spark.executor.memory setting.
executor_memory = "3g"
# Dataset file name, appended to the HDFS dataset directory when loading.
data_path = "RC_2011-07"

In [11]:
# Build (or reuse) the session through the SparkSession builder API.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_horizontal_strong_worker1")
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.executor.memory", executor_memory)
    .config("spark.executor.cores", 2)
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

try:
    # The RDD API is reached through the session's SparkContext.
    spark_context = spark_session.sparkContext
    spark_context.setLogLevel("ERROR")

    # Read the dataset from HDFS and mark it for caching, then keep only the
    # first data_size lines (rdd_slice bounds are inclusive, hence data_size-1).
    rc_lines = spark_context.textFile(
        'hdfs://master:9000/dataset/' + data_path).cache()
    rc_lines = rdd_slice(rc_lines, 0, data_size-1)

    # Time the full pipeline, including collect(). perf_counter() is
    # monotonic, so wall-clock adjustments during the run cannot skew the
    # measured interval.
    start = time.perf_counter()

    results = analyze_comments(classify_comments(preprocess(rc_lines))).collect()

    end = time.perf_counter()
    elapsed = end - start

    print("Elapsed time: {}s".format(elapsed))
    print("Results: ", results)
finally:
    # Stop the session even when the job fails, so a failed run does not
    # leave a live session (and its fixed ports) behind for the next cell.
    spark_session.stop()

                                                                                

Elapsed time: 896.0941910743713s
Results:  [('positive', 1665944), ('neutral', 1334207), ('negative', 999849)]


---