In [1]:
from pyspark.sql import SparkSession
from operator import add
import json
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
# Session (DataFrame-era) entry point. Dynamic allocation is switched on with
# shuffle tracking, while the external shuffle service is switched off.
spark_session = (
    SparkSession.builder
    .master("spark://master:7077")
    .appName("haodong_zhao_comment_classification_test")
    # Executor scaling.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Per-executor resources.
    .config("spark.executor.cores", 2)
    .config("spark.executor.memory", "3g")
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# RDD-era entry point, taken from the session created above.
spark_context = spark_session.sparkContext
spark_context.setLogLevel("WARN")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/16 14:38:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/16 14:38:59 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


In [3]:
# Raw input as an RDD of text lines read from the HDFS path below.
rc_lines = spark_context.textFile('hdfs://master:9000/dataset/RC_2011-07')

In [4]:
# Total number of lines in the input, kept in rc_num and printed.
rc_num = rc_lines.count()
print(rc_num)



10557466


                                                                                

In [5]:
# Peek at the first five raw lines to see what each record looks like.
rc_lines.take(5)

['{"distinguished":null,"downs":0,"created_utc":"1309478400","controversiality":0,"edited":false,"gilded":0,"author_flair_css_class":"mordekaiser","id":"c22x4aq","author":"adomorn","retrieved_on":1427302516,"score_hidden":false,"subreddit_id":"t5_2rfxx","score":1,"name":"t1_c22x4aq","author_flair_text":"[adomorn] (NA)","link_id":"t3_id1nc","archived":true,"ups":1,"parent_id":"t3_id1nc","subreddit":"leagueoflegends","body":"Good lord.  Yes."}',
 '{"distinguished":null,"created_utc":"1309478401","downs":0,"controversiality":0,"gilded":0,"edited":false,"author":"[deleted]","id":"c22x4at","author_flair_css_class":null,"retrieved_on":1427302516,"score_hidden":false,"score":-3,"subreddit_id":"t5_2qwxl","author_flair_text":null,"archived":true,"link_id":"t3_idpys","name":"t1_c22x4at","ups":-3,"parent_id":"t1_c22wz9g","body":"I don\'t know about that...","subreddit":"runescape"}',
 '{"edited":false,"gilded":0,"author":"matts2","author_flair_css_class":null,"id":"c22x4au","retrieved_on":1427302

# Preprocessing

In [7]:
def preprocess(rdd):
    """Extract the stripped comment text from an RDD of JSON lines.

    Every element of ``rdd`` is parsed with ``json.loads``, its ``'body'``
    entry is taken, and leading/trailing whitespace is stripped from it.

    Args:
        rdd: RDD (or any object with a compatible ``map``) whose elements are
            JSON-encoded strings, each containing a string ``'body'`` field.

    Returns:
        The result of mapping the extraction over ``rdd``: one stripped body
        string per input element.
    """
    def _body_of(raw_line):
        record = json.loads(raw_line)
        return record['body'].strip()

    return rdd.map(_body_of)

In [8]:
# NOTE: this rebinds rc_lines from raw JSON lines to comment bodies, so the
# cell is not idempotent: re-running it alone would pass the already-extracted
# bodies back through json.loads inside preprocess. Re-run the textFile cell
# first if this cell has to be executed again.
rc_lines = preprocess(rc_lines)

In [9]:
# Peek at the first five elements after preprocessing.
rc_lines.take(5)

['Good lord.  Yes.',
 "I don't know about that...",
 'Explain something about Israel in simple terms? \n\nNo.',
 "I would add that the 'more exercise' part shouldn't be more squats. Instead, do some cardio that requires all the muscles to work. Swimming is great to just get the muscles some air. Running, kayaking, jumping jacks, ect. Nothing that is going to be too harsh on the quads. \n\nWhen the muscles get sore, it is an easy excuse to 'rest.' However exercise is the better option and actually helps the soreness go away faster.",
 'care to explain #4 ? is it a dead end? no longer actively developed?']

# Classify comments

In [10]:
def classify_comments(rdd, threshold=0.05):
    """Pair every comment with a sentiment label.

    Each comment is scored with ``SentimentIntensityAnalyzer.polarity_scores``
    and the ``'compound'`` score is bucketed:

    * ``compound >= threshold``  -> ``'positive'``
    * ``compound <= -threshold`` -> ``'negative'``
    * anything in between        -> ``'neutral'``

    The positive test is evaluated first, so with ``threshold=0`` a compound
    score of exactly 0 is labelled ``'positive'``.

    Args:
        rdd: RDD (or any object with a compatible ``map``) of comment strings.
        threshold: Non-negative cut-off applied symmetrically around zero.
            The default of 0.05 reproduces the previously hard-coded
            boundaries.

    Returns:
        The result of mapping over ``rdd``: ``(comment, label)`` pairs.

    Raises:
        ValueError: If ``threshold`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    # Built once per call and captured by the function handed to rdd.map.
    sia = SentimentIntensityAnalyzer()

    def _classify(comment):
        compound = sia.polarity_scores(comment)['compound']
        if compound >= threshold:
            return 'positive'
        if compound <= -threshold:
            return 'negative'
        return 'neutral'

    return rdd.map(lambda comment: (comment, _classify(comment)))

In [11]:
# Attach a sentiment label to every element of rc_lines.
classified = classify_comments(rc_lines)

In [12]:
# Inspect the first five (comment, label) pairs.
classified.take(5)

[('Good lord.  Yes.', 'positive'),
 ("I don't know about that...", 'neutral'),
 ('Explain something about Israel in simple terms? \n\nNo.', 'negative'),
 ("I would add that the 'more exercise' part shouldn't be more squats. Instead, do some cardio that requires all the muscles to work. Swimming is great to just get the muscles some air. Running, kayaking, jumping jacks, ect. Nothing that is going to be too harsh on the quads. \n\nWhen the muscles get sore, it is an easy excuse to 'rest.' However exercise is the better option and actually helps the soreness go away faster.",
  'positive'),
 ('care to explain #4 ? is it a dead end? no longer actively developed?',
  'negative')]

In [13]:
def analyze_comments(rdd):
    """Count comments per sentiment label, largest count first.

    Args:
        rdd: RDD of pairs whose second element is the label; the first
            element of each pair is ignored.

    Returns:
        An RDD of ``(label, count)`` pairs sorted by ``count`` in descending
        order.
    """
    label_counts = rdd.map(lambda pair: (pair[1], 1)).reduceByKey(add)
    # Sort on the count directly rather than swapping each pair to
    # (count, label), calling sortByKey, and swapping back.
    return label_counts.sortBy(lambda label_count: label_count[1], ascending=False)

In [14]:
# Per-label comment counts derived from the classified pairs.
results = analyze_comments(classified)

                                                                                

In [15]:
# Fetch the per-label counts as a Python list for display.
results.collect()

[('positive', 4452504), ('neutral', 3392970), ('negative', 2711992)]

In [16]:
# Stop the Spark session now that the analysis is finished.
spark_session.stop()