In [0]:
%pip install textblob

In [0]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [0]:
# Fetch the VADER lexicon used by SentimentIntensityAnalyzer further down.
nltk.download(["vader_lexicon"])

In [0]:
# Both inputs are read as tab-separated files whose first row is the header.
tsv_reader = spark.read.option("header", True).option("sep", "\t")
tweets = tsv_reader.csv("/mnt/herhackathon/processed/tweets.csv")
reddits = tsv_reader.csv("/mnt/herhackathon/processed/reddits.csv")

### Sentiment analysis

In [0]:
def preprocess_posts(posts, column_name):
  """Strip social-media noise from a text column before sentiment scoring.

  Args:
    posts: Spark DataFrame holding the posts.
    column_name: Name of the text column to clean; the column is replaced
      under the same name in the returned frame.

  Returns:
    The DataFrame with URLs, user tags, hash and colon characters, and a
    leading retweet marker removed from ``column_name``.
  """
  # Raw strings hand backslash escapes such as \S and \w to the regex engine
  # untouched and avoid Python's invalid-escape-sequence warning.
  # The patterns are applied in this order; each one sees the previous result.
  cleanup_patterns = (
    r"http\S+",        # URLs
    r"@\w+",           # user tags
    r"#|:",            # hashes and colons
    # retweet sign and login of the post author
    # NOTE(review): user tags were already stripped above, so the optional
    # login group will usually have nothing left to match.
    r"^RT (@[^ ]*)?",
  )
  for pattern in cleanup_patterns:
    posts = posts.withColumn(column_name, F.regexp_replace(column_name, pattern, ""))

  return posts

In [0]:
# Module-level analyzer instance shared by sentiment_detection and
# sentiment_score_detection below.
sia = SentimentIntensityAnalyzer()

def sentiment_detection(text):
    """Return the name of the highest-scoring polarity component for ``text``.

    The ``compound`` entry of the analyzer's scores is ignored; among the
    remaining entries the key with the largest value is returned (the first
    such key on ties).

    NOTE(review): in the sample output of this notebook every row is labelled
    ``neu`` even when the compound score is strongly positive or negative —
    confirm this labelling rule is what is wanted.
    """
    component_scores = {
        label: value
        for label, value in sia.polarity_scores(text).items()
        if label != "compound"
    }
    return max(component_scores, key=component_scores.get)

def sentiment_score_detection(text):
    """Return the ``compound`` value of the analyzer's polarity scores for ``text``."""
    polarity = sia.polarity_scores(text)
    return polarity["compound"]

def subjectivity_detection(text):
    """Return the TextBlob subjectivity value computed for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

# replaced by sentiment from the nltk library
# def polarity_detection(text):
#     return TextBlob(text).sentiment.polarity

def classify_posts(post, column_name):
    """Add sentiment and subjectivity columns derived from a text column.

    Args:
        post: Spark DataFrame containing ``id``, ``date`` and ``column_name``.
        column_name: Name of the text column handed to the scoring UDFs.

    Returns:
        DataFrame with the columns ``id``, ``date``, ``sentiment``,
        ``sentiment_score`` and ``subjectivity_score``. The three derived
        columns are declared as strings.
    """
    # Go through F.udf: the bare name ``udf`` is not imported in the visible
    # import cell and only resolves if the runtime happens to inject it.
    # NOTE(review): the two score columns carry numbers but keep the original
    # StringType declaration so the output schema stays unchanged — consider
    # DoubleType if downstream consumers can accept it.
    scorers = (
        ("sentiment", sentiment_detection),
        ("sentiment_score", sentiment_score_detection),
        ("subjectivity_score", subjectivity_detection),
    )
    for output_column, scorer in scorers:
        scorer_udf = F.udf(scorer, T.StringType())
        post = post.withColumn(output_column, scorer_udf(column_name))

    return post.select("id", "date", "sentiment", "sentiment_score", "subjectivity_score")

In [0]:
# Clean the text, drop rows without text and de-duplicate. The null filter is
# applied to both sources because the scoring functions pass the value
# straight to the analyzers without a None check.
tweets_preprocessed = (
    preprocess_posts(tweets, "body")
    .filter(F.col("body").isNotNull())
    .select("id", "body", "date")
    .drop_duplicates()
)
reddits_preprocessed = (
    preprocess_posts(reddits, "content")
    .filter(F.col("content").isNotNull())
    .select("id", "content", "date")
    .drop_duplicates()
)

# classify text sentiment and subjectivity
tweets_classified = classify_posts(tweets_preprocessed, "body")
reddits_classified = classify_posts(reddits_preprocessed, "content")

In [0]:
# Stack the classified tweets and reddit posts into one frame; unionByName
# matches columns by name rather than by position.
sentiment_categorization = tweets_classified.unionByName(reddits_classified)

display(sentiment_categorization)

id,date,sentiment,sentiment_score,subjectivity_score
t_1407484308651053067,2021-06-23,neu,0.0,0.1
t_1405457726180843523,2021-06-17,neu,0.9186,0.7833333333333332
t_1406624514092437507,2021-06-20,neu,0.4019,0.6666666666666666
t_1403766821556195330,2021-06-12,neu,0.8825,0.61875
t_1408412621263486976,2021-06-25,neu,0.3612,0.0
t_1404230954994921474,2021-06-14,neu,0.0772,0.5
t_1403719683560382466,2021-06-12,neu,0.6369,0.4
t_1404831909222432770,2021-06-15,neu,0.0,0.0
t_1400936933514125313,2021-06-05,neu,-0.5994,0.25
t_1408172286541139970,2021-06-24,neu,0.34,0.6722222222222223


### Write output dataframes

In [0]:
def save_dataframe(df, name):
  """Write ``df`` as tab-separated CSV with a header row.

  The frame is coalesced to one partition and saved to
  ``/mnt/herhackathon/processed/<name>.csv`` in overwrite mode.
  """
  destination = f"/mnt/herhackathon/processed/{name}.csv"
  single_partition = df.coalesce(1)
  writer = single_partition.write.format('csv').mode("overwrite")
  writer = writer.option("header", True).option("sep", "\t")
  writer.save(destination)

In [0]:
# Persist the combined sentiment table under the processed-data folder.
save_dataframe(sentiment_categorization, "sentiment_categorization")