In [55]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
# packages that need to be installed on all nodes:
from textblob import TextBlob 
import nltk
from nltk.corpus import stopwords


# New API: build (or reuse) the SparkSession for this notebook.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.197:7077")
    .appName("jayant_uc_1")
    # Executor sizing and dynamic allocation (shuffle tracking on,
    # external shuffle service off).
    .config("spark.executor.cores", 2)
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "60s")
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    # MongoDB connector package and its input/output URIs.
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/Reddit.comments")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/Reddit.comments")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .getOrCreate()
)

# Old API (RDD): the SparkContext that backs the session.
spark_context = spark_session.sparkContext

# Only log messages at ERROR level.
spark_context.setLogLevel("ERROR")

In [56]:
# Two HDFS locations are defined; only path2 is read in this cell.
path1 = "hdfs://130.238.28.151:9000/user/ubuntu/Reddits_Comments"
path2 = 'hdfs://130.238.28.151:9000/user/hadoop/RC_2010-01'

# Read path2 as JSON and mark the resulting DataFrame for caching.
# NOTE(review): "header" looks like a CSV reader option and is presumably
# ignored by the JSON source -- confirm before removing it.
rc_df = (
    spark_session.read
    .option("header", "true")
    .json(path2)
    .cache()
)

# Keep only the comment text column and preview the first two rows.
rcb_df = rc_df.select('body')
rcb_df.show(n=2)


[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+
|                body|
+--------------------+
|Good rant, stop l...|
|  Sounds good to me.|
+--------------------+
only showing top 2 rows



                                                                                

In [57]:
# Cleaning text before analysis: drop rows that contain nulls, then drop
# comments whose body is exactly the '[deleted]' placeholder.
rcb_df = rcb_df.dropna().filter(col('body') != '[deleted]')

# Removing stopwords: fetch the NLTK stopword corpus and load the English
# list that remove_stopwords_fnc checks each word against.
nltk.download('stopwords')
stop_words = stopwords.words("english")

def remove_stopwords_fnc(x, stopword_vocab=None):
    """Return ``x`` with stopwords removed.

    The text is split on single spaces and every token whose lower-cased form
    is in the stopword vocabulary is dropped. Each kept token is followed by
    one space, so a non-empty result always ends with a trailing space; this
    matches the output the notebook already produces.

    Args:
        x: The text to clean, or ``None`` for a null column value.
        stopword_vocab: Optional container of lower-case stopwords. When it is
            omitted, the module-level ``stop_words`` list is used. Passing a
            ``set`` gives constant-time membership tests.

    Returns:
        The cleaned text, or ``None`` when ``x`` is ``None`` so that a null
        input yields a null output instead of failing the Spark task.
    """
    if x is None:
        return None

    vocabulary = stop_words if stopword_vocab is None else stopword_vocab

    # Build the result in one pass; a distinct loop name avoids shadowing
    # the ``x`` parameter.
    return ''.join(
        token + ' '
        for token in x.split(' ')
        if token.lower() not in vocabulary
    )

# Wrap the Python cleaner as a Spark UDF and register it under the same name
# so it can also be called from SQL.
remove_stopwords_udf = udf(remove_stopwords_fnc)
spark_session.udf.register("remove_stopwords_udf", remove_stopwords_udf)

# Overwrite 'body' with its stopword-free version and preview the result.
rcb_df = rcb_df.withColumn('body', remove_stopwords_udf(rcb_df['body']))
rcb_df.show()

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+
|                body|
+--------------------+
|Good rant, stop l...|
|    Sounds good me. |
|Ok people donate ...|
|               red? |
|really want give ...|
|school, depends p...|
|they?  know recen...|
|I'll add voice. b...|
|        worry 2012. |
|[George Carlin sa...|
|No, like that. He...|
|sad her.  been. k...|
|realize 'assclown...|
|sure wants back. ...|
|cat consider dog ...|
|hate uninformed r...|
|No, found sack po...|
|Feds take fall fi...|
|   to, omgomgomgomg |
|moved quite bit a...|
+--------------------+
only showing top 20 rows



                                                                                

In [58]:


def sentiment_fnc(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The text to score, or ``None`` for a null column value.

    Returns:
        The polarity of the sentiment as a float in [-1.0, 1.0], or ``None``
        when ``text`` is ``None`` so that a null input yields a null score
        instead of raising inside the Spark task.
    """
    if text is None:
        return None
    # Coerce to a builtin float so the value maps cleanly onto a double column.
    return float(TextBlob(text).sentiment.polarity)
    

# Declare the return type explicitly. Without it the UDF defaults to a string
# return type, so the float score would be turned into a string and then cast
# back to double.
sentiment_udf = udf(sentiment_fnc, DoubleType())
spark_session.udf.register("sentiment_udf", sentiment_udf)

# Add the polarity of each comment body as a double column.
rcb_df = rcb_df.withColumn('sentiment_score', sentiment_udf('body'))


In [59]:
# Preview the first 30 comments together with their sentiment score.
rcb_df.show(n=30)

# Optional views, ordered by score (most negative first / most positive first):
# rcb_df.orderBy(asc("sentiment_score")).show()

# rcb_df.orderBy(desc("sentiment_score")).show()

[Stage 3:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|                body|     sentiment_score|
+--------------------+--------------------+
|Good rant, stop l...| 0.11613636363636362|
|    Sounds good me. |                 0.7|
|Ok people donate ...| 0.04999999999999999|
|               red? |                 0.0|
|really want give ...| 0.02938311688311688|
|school, depends p...| 0.13333333333333333|
|they?  know recen...| 0.16666666666666666|
|I'll add voice. b...|                -0.5|
|        worry 2012. |                 0.0|
|[George Carlin sa...|              0.1875|
|No, like that. He...|                 0.0|
|sad her.  been. k...|-0.28214285714285714|
|realize 'assclown...|                -0.1|
|sure wants back. ...| 0.06643518518518518|
|cat consider dog ...|                 0.0|
|hate uninformed r...|-0.05510204081632656|
|No, found sack po...|               -0.75|
|Feds take fall fi...|0.024999999999999994|
|   to, omgomgomgomg |                 0.0|
|moved quite bit a...| 0.1166666

                                                                                

In [54]:
# Stop through the session rather than the bare context: this stops the
# underlying SparkContext and also clears the session's default/active
# registration, so a later getOrCreate() in the same kernel starts clean.
spark_session.stop()