In [1]:
# Strong scaling test 1 (Also used as Weak scaling test 1)
# 1 node
# 9 GB of data


from pyspark.sql import SparkSession, SQLContext 
import json
import timeit
import pyspark.sql.functions as f 

spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.225:7077") \
        .appName("Strong_scaling_1")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.shuffle.service.enabled", True)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.cores.max",1)\
        .getOrCreate()

In [2]:
#In this cell, the Reddit data is fetched from HDFS, read as a JSON-file and saved to the variable ’df’. The wall clock time of this operation is measured and printed using timeit.default_timer()

start_time_0 = timeit.default_timer()

df = spark_session.read.json('hdfs://192.168.2.225:9000/reddit/RC_2012-02')

elapsed_0 = timeit.default_timer() - start_time_0

print("Time elapsed: ", elapsed_0, "s")

Time elapsed:  355.51083533093333 s
CPU times: user 76.7 ms, sys: 43.5 ms, total: 120 ms
Wall time: 5min 55s


In [4]:
### PRE-PROCESSING ###
#In this cell the data is preprocessed. Most columns are dropped and comments that have been deleted are removed.

initial_start_time = timeit.default_timer()

if 'author_cakeday' in df.columns:
        current_df = df.drop('author_cakeday')
        
df = df.drop('author_flair_css_class','author_flair_text','can_gild','distinguished','edited','id','is_submitter','link_id','parent_id','permalink','retrieved_on','stickied','subreddit_id')

df = df.filter((df.body != '[deleted]'))


elapsed = timeit.default_timer() - initial_start_time



print("Time elapsed: ", elapsed, "s")

Time elapsed:  1.0999257609946653 s


In [5]:
### MAIN PROCESSING - CELL 1 ###

#In this cell the data is processed to show the most frequently occurring subreddits that reddit comments are posted in, in descending order.

start_time_1 = timeit.default_timer()

df.groupBy('subreddit').count().sort("count", ascending = False).show(10)

elapsed_1 = timeit.default_timer() - start_time_1

print("Time elapsed: ", elapsed_1,"s")

+-------------------+-------+
|          subreddit|  count|
+-------------------+-------+
|          AskReddit|1974313|
|              funny| 767566|
|               pics| 601277|
|            atheism| 482672|
|             gaming| 477368|
|                WTF| 391188|
|           politics| 363712|
|              trees| 296983|
|fffffffuuuuuuuuuuuu| 294680|
|               IAmA| 267945|
+-------------------+-------+
only showing top 10 rows

Time elapsed:  389.4198994738981 s
CPU times: user 111 ms, sys: 32 ms, total: 143 ms
Wall time: 6min 29s


In [6]:
### MAIN PROCESSING - CELL 2 ###
#In this cell, the number of words in each comment is counted and saved in the new column ’wordCount’. 

start_time_2 = timeit.default_timer()

df = df.withColumn('wordCount', f.size(f.split(f.col('body'), ' ')))


df.groupBy('wordCount').count().sort("count", ascending = False).show(10)   

elapsed_2 = timeit.default_timer() - start_time_2


print("Time elapsed: ", elapsed_2,"s")

+---------+------+
|wordCount| count|
+---------+------+
|        5|578020|
|        6|576405|
|        4|557292|
|        7|554545|
|        8|527944|
|        1|509309|
|        3|508649|
|        9|495282|
|       10|465635|
|        2|454971|
+---------+------+
only showing top 10 rows

Time elapsed:  449.60605269798543 s
CPU times: user 134 ms, sys: 24.1 ms, total: 158 ms
Wall time: 7min 29s


In [7]:
### MAIN PROCESSING - CELL 3 ###
#In this cell, the mean of all values in the new column ’wordCount’ is calculated and printed to show the average word count in a comment.

start_time_3 = timeit.default_timer()

df_stats = df.select(f.mean(f.col('wordCount')).alias('mean')).collect()  

mean = df_stats[0]['mean']

print("Average wordcount in comment: ", mean, " words.")


elapsed_3 = timeit.default_timer() - start_time_3
print("Time elapsed: ", elapsed_3,"s")

Average wordcount in comment:  32.56660286612964  words.
Time elapsed:  340.74685820401646 s
CPU times: user 116 ms, sys: 36.7 ms, total: 153 ms
Wall time: 5min 40s


In [8]:
### TOTAL TIME ELAPSED (PROCESSING) ###
#Here the total time elapsed since the preprocessing of the data is printed.

total_elapsed = timeit.default_timer() - initial_start_time

print("Total time elapsed: ", total_elapsed, "s")

Total time elapsed:  1180.9862001820002 s


In [9]:
spark_session.stop()