## Data size = 10.55 GB (RC_2010-01 + RC_2010-02 + RC_2010-03 + RC_2010-04 + RC_2010-05 + RC_2010-07), Nodes = 4 (1 master + 3 workers)

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import time
# Packages that need to be installed on all nodes:
from textblob import TextBlob 
import nltk
from nltk.corpus import stopwords

In [2]:
# Build (or reuse) a session against the standalone cluster master.
spark_session = (
    SparkSession.builder
    .master("spark://192.168.2.197:7077")
    .appName("project_code_uc")
    # Dynamic allocation is enabled with shuffle tracking; the external
    # shuffle service is switched off.
    .config("spark.dynamicAllocation.enabled", True)
    .config("spark.dynamicAllocation.shuffleTracking.enabled", True)
    .config("spark.shuffle.service.enabled", False)
    .config("spark.dynamicAllocation.executorIdleTimeout", "30s")
    .config("spark.executor.cores", 2)
    # Fixed ports for the driver and the block manager.
    .config("spark.driver.port", 9998)
    .config("spark.blockManager.port", 10005)
    .getOrCreate()
)

# Handle to the older RDD-level API.
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/18 23:36:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/03/18 23:36:32 WARN ExecutorAllocationManager: Dynamic allocation without a shuffle service is an experimental feature.


## Start time count

In [3]:
# Wall-clock reference point; the final cell subtracts it from time.time()
# to report the notebook's total execution time.
start_time = time.time()

In [4]:
# Load every comment dump matching the glob into a single DataFrame.
# No "header" option is passed: that setting belongs to the CSV reader and has
# no effect on JSON input.
# cache() marks the frame for reuse by the analyses below; it is materialised
# lazily by the first action that touches it.
data_frame = (
    spark_session.read
    .json('hdfs://192.168.2.197:9000/user/hadoop/RC_2010-0*')
    .cache()
)

                                                                                

## Sentiment analysis

In [5]:
# Cleaning text before analysis

# Keep only the comment text, then drop missing bodies and deleted comments.
body_only = data_frame.select('body').dropna()
rcb_df = body_only.filter(body_only['body'] != '[deleted]')

# Removing stopwords

nltk.download('stopwords')

# A frozenset gives constant-time membership tests; the stop-word filter below
# checks every token of every comment against this collection.
stop_words = frozenset(stopwords.words("english"))

def remove_stopwords_fnc(x):
    """Return ``x`` with stop words removed.

    The text is split on single spaces and each token is compared,
    lower-cased, against the module-level ``stop_words`` collection.
    Every kept token is followed by one space, so a non-None result always
    ends with a trailing space, and runs of spaces in the input survive as
    empty tokens.

    Args:
        x: The comment text, or ``None``.

    Returns:
        The filtered text, or ``None`` when ``x`` is ``None``.
    """
    # A null column value arrives as None; pass it through instead of raising.
    if x is None:
        return None
    # The loop name is distinct from the parameter, and a single join replaces
    # repeated string concatenation.
    return ''.join(
        token + ' '
        for token in x.split(' ')
        if token.lower() not in stop_words
    )

# Wrap the cleaner as a Spark UDF with a string result, register it for
# Spark SQL under the same name, and overwrite the body column with the
# cleaned text.
remove_stopwords_udf = udf(remove_stopwords_fnc, StringType())
spark_session.udf.register("remove_stopwords_udf", remove_stopwords_udf)
rcb_df = rcb_df.withColumn('body', remove_stopwords_udf('body'))

# rcb_df.show()

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def sentiment_fnc(text):
    """Return TextBlob's sentiment polarity for ``text``, in [-1.0, 1.0]."""
    blob = TextBlob(text)
    return blob.sentiment.polarity
    

# Declare the UDF's return type as double so the polarity scores come back as
# numeric values directly, instead of going through the default string return
# type and being cast back to double afterwards.
sentiment_udf = udf(sentiment_fnc, DoubleType())
spark_session.udf.register("sentiment_udf", sentiment_udf)
rcb_df = rcb_df.withColumn('sentiment_score', sentiment_udf('body'))

rcb_df.show(20)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|                body|     sentiment_score|
+--------------------+--------------------+
|Good rant, stop l...| 0.11613636363636362|
|    Sounds good me. |                 0.7|
|Ok people donate ...| 0.04999999999999999|
|               red? |                 0.0|
|really want give ...| 0.02938311688311688|
|school, depends p...| 0.13333333333333333|
|they?  know recen...| 0.16666666666666666|
|I'll add voice. b...|                -0.5|
|        worry 2012. |                 0.0|
|[George Carlin sa...|              0.1875|
|No, like that. He...|                 0.0|
|sad her.  been. k...|-0.28214285714285714|
|realize 'assclown...|                -0.1|
|sure wants back. ...| 0.06643518518518518|
|cat consider dog ...|                 0.0|
|hate uninformed r...|-0.05510204081632656|
|No, found sack po...|               -0.75|
|Feds take fall fi...|0.024999999999999994|
|   to, omgomgomgomg |                 0.0|
|moved quite bit a...| 0.1166666

                                                                                

### Which subreddits have the highest average comment score?

In [7]:
# Mean comment score per subreddit, ordered from highest to lowest.
comments_by_subreddit = data_frame.groupBy("subreddit")
avg_subreddit_score = comments_by_subreddit.agg({'score': 'avg'})
avg_subreddit_score_sorted = avg_subreddit_score.sort('avg(score)', ascending=False)
print("The top 20 kindest subreddits based on average comment score are:")
avg_subreddit_score_sorted.show()

The top 20 kindest subreddits based on average comment score are:




+-----------------+------------------+
|        subreddit|        avg(score)|
+-----------------+------------------+
|         DateRape|107.67021276595744|
|           raerth| 52.63157894736842|
|       bestof2009|41.943885653785074|
|arboriculturalist|              27.0|
|            apath|              12.0|
|     aqua_aqua_bh|              12.0|
|           USPE08|10.533333333333333|
|    announcements|  9.75173047804456|
|           treees|              9.25|
|              gnu| 8.150697674418605|
|             blog|  8.06564002212107|
|         moonhoax|               8.0|
|        YourMoney|               8.0|
|            gamin|               7.4|
|     coderegister|            7.3125|
|        introvert| 6.684210526315789|
|            funny| 6.283154650297554|
|       Starcraft3|               6.0|
|       notetoself|               6.0|
|        Youngluck|              5.88|
+-----------------+------------------+
only showing top 20 rows



                                                                                

### What are the most active subreddits?

In [8]:
# Count comments per subreddit, ignoring rows without a subreddit value.
df_subreddit = data_frame.select('subreddit').dropna()
df_subreddit_frequency = df_subreddit.groupBy("subreddit").count()

In [9]:
# show() returns None, so keep the sorted frame in the variable and display it
# as a separate step.
df_subreddit_f = df_subreddit_frequency.sort('count', ascending=False)
df_subreddit_f.show(20)



+-------------------+-------+
|          subreddit|  count|
+-------------------+-------+
|          AskReddit|3943127|
|         reddit.com|1747183|
|               pics|1564607|
|           politics| 946155|
|             gaming| 831562|
|               IAmA| 769074|
|                WTF| 720079|
|              funny| 576533|
|            atheism| 449763|
|            science| 393642|
|          worldnews| 393579|
|        programming| 359091|
|         technology| 262316|
|              trees| 227040|
|    DoesAnybodyElse| 193797|
|              Music| 169804|
|fffffffuuuuuuuuuuuu| 166694|
|relationship_advice| 163461|
|    TwoXChromosomes| 160584|
|             videos| 141620|
+-------------------+-------+
only showing top 20 rows



                                                                                

## Execution time

In [10]:
# Seconds elapsed since start_time was recorded.
elapsed_seconds = time.time() - start_time
print(f"Execution time: {elapsed_seconds}")

Execution time: 189.0602788925171
