In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as functions

spark = SparkSession.builder\
    .master("spark://192.168.2.59:7077")\
    .appName("test_app")\
    .config("spark.driver.memory","4096m")\
    .config("spark.executor.memory", "2048m")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
rc = spark.read.format('json').load('hdfs:///reddit_comments')

                                                                                

In [5]:
df_subreddits = rc.groupBy('subreddit')
df_subreddits_count = df_subreddits.count().orderBy("count", ascending=False)
df_subreddits_count.show(5)



+----------+------+
| subreddit| count|
+----------+------+
| AskReddit|875973|
|      pics|487514|
|reddit.com|325653|
|    gaming|244828|
|  politics|243931|
+----------+------+
only showing top 5 rows



                                                                                

In [6]:
# Defines function that counts number of bad words

# Bad words
profanity= ["asshole","bitch","bloody","bollocks","bugger","bullshit","bitch","cock","cocksucker","coonass","cornhole","cracker"
            ,"cunt","dick","dickhead","faggot","fuck","motherfucker","nigga","nigger","paki","pussy","shit","slut","tranny"
            "twat","wanker"]

# Function to count nr profanity in a single comment
def count_profanity(comment):
    count = 0
    for word in comment.split():
        if word.lower() in profanity:
            count += 1
    return count
    
# User defined function boilerplate code
udf_count_profanity = functions.udf(lambda x: count_profanity(x))


In [8]:
# Removes unecessary columns
remove_cols = ["archived","author_flair_css_class","author", "author_flair_text", "controversiality", "created_utc", \
              "distinguished", "downs", "edited", "gilded", "id", "link_id", "name", "parent_id", "removal_reason", \
              "retrieved_on", "score", "score_hidden", "subreddit_id", "ups"]


df_slim = rc.drop(*remove_cols)
df_slim.printSchema()


root
 |-- body: string (nullable = true)
 |-- subreddit: string (nullable = true)



In [9]:
# Count nr bad words for all comments and convert column to int
from pyspark.sql.types import IntegerType

d1 = df_slim.withColumn("bad_words", udf_count_profanity(functions.col("body")))
d2 = d1.drop("body")
d3 = d2.selectExpr("subreddit", "cast(bad_words as int) bad_words")
d3.printSchema()


root
 |-- subreddit: string (nullable = true)
 |-- bad_words: integer (nullable = true)



In [10]:
# Sort by each subreddit and add up bad words 
d4 = d3.groupBy("subreddit").sum("bad_words").sort('sum(bad_words)', ascending=False).show(5)



+----------+--------------+
| subreddit|sum(bad_words)|
+----------+--------------+
| AskReddit|         39532|
|      pics|         17017|
|reddit.com|         14970|
|  politics|         12873|
|       WTF|         10052|
+----------+--------------+
only showing top 5 rows



                                                                                