In [41]:
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import CountVectorizer
from pyspark.sql import SparkSession

In [2]:
# Build the Reddit API client. Credentials are read from environment
# variables so they never have to be typed into (or saved with) the notebook.
# Each value falls back to the empty string when its variable is unset.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", ""),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", ""),
    password=os.environ.get("REDDIT_PASSWORD", ""),
    user_agent=os.environ.get("REDDIT_USER_AGENT", ""),
    username=os.environ.get("REDDIT_USERNAME", ""),
)

In [5]:
# Initialize Spark session
spark = SparkSession.builder.appName('RedditData').getOrCreate()

subreddit_name = 'nyc'  # Change this to your target subreddit
search_query = 'event'  # Modify this based on how events are typically posted

# Approximation of 6 months ago. Built from an aware "now" because
# datetime.utcnow() is deprecated, then made naive again so the value keeps
# the type it always had. Only the disabled recency filter below uses it.
six_months_ago = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=6*30)

# Lists to store event and comments data
events = []
comments_data = []

# Search the subreddit for posts matching the query
for submission in reddit.subreddit(subreddit_name).search(search_query, limit=10):  # Adjust the limit as needed
    # Aware UTC timestamp of the post (replaces the deprecated utcfromtimestamp)
    created_at = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

    # Optional recency filter, currently disabled:
    # if created_at.replace(tzinfo=None) <= six_months_ago:
    #     continue

    submission.comments.replace_more(limit=None)  # Load all comments
    event_info = (
        submission.title,
        submission.url,
        created_at.strftime('%Y-%m-%d %H:%M:%S'),
        # NOTE(review): len() of the comment forest looks like it counts
        # top-level comments only, while the rows collected below come from the
        # flattened list. Confirm which number `num_comments` should hold.
        len(submission.comments)
    )
    events.append(event_info)

    # Extract comments and add to comments_data
    for comment in submission.comments.list():
        comment_info = (submission.title, submission.url, comment.body)
        comments_data.append(comment_info)

In [6]:
# Turn the scraped tuples into pandas frames: one row per post, one row per comment
events_df = pd.DataFrame(
    data=events,
    columns=['title', 'url', 'created', 'num_comments'],
)
comments_df = pd.DataFrame(
    data=comments_data,
    columns=['title', 'url', 'comment'],
)

In [7]:
# Convert Pandas DataFrames to Spark DataFrames
spark_events_df = spark.createDataFrame(events_df)
spark_comments_df = spark.createDataFrame(comments_df)

In [17]:
# Preview up to ten rows of each Spark DataFrame
spark_events_df.show(n=10)
spark_comments_df.show(n=10)

+--------------------+--------------------+-------------------+------------+
|               title|                 url|            created|num_comments|
+--------------------+--------------------+-------------------+------------+
|Rockefeller Cente...|https://www.reddi...|2023-11-30 05:50:02|          58|
|Drama at a drag q...|https://v.redd.it...|2022-12-12 07:28:34|          73|
|NYC Drag Story Ho...|https://www.nbcne...|2022-12-19 14:52:13|          47|
|Eric Adams attend...|https://www.polit...|2023-11-17 13:41:16|          27|
|NYC Mayor Adams i...|https://www.nydai...|2023-05-12 18:19:26|          28|
|NYC's Newest Park...|https://i.redd.it...|2021-05-21 17:03:37|          40|
|Republican Jewish...|https://www.haare...|2022-12-27 17:07:57|          35|
|Trump Attends UFC...|https://www.theda...|2019-11-03 12:41:48|          30|
|Ahead of potentia...|https://www.polit...|2023-02-19 17:28:21|          29|
|'Change in percep...|https://www.nbcne...|2021-12-18 16:10:25|          30|

In [51]:
# Keep only the comment body, exposed under the column name `text`
comments = spark_comments_df.select(F.col('comment').alias('text'))

In [52]:
# Lower-case every comment so later matching is case-insensitive
comments = comments.withColumn('text', F.lower(comments['text']))

#### Remove stopwords from text

In [53]:
import nltk

# Fetch the NLTK corpora this notebook relies on: WordNet, which
# nltk.WordNetLemmatizer loads when lemmatizing, and the stop-word lists.
# Without the WordNet download a fresh environment fails with LookupError.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/priyangshupal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [54]:
from nltk.corpus import stopwords

In [55]:
# Peek at the first ten English stop words to sanity-check the corpus
stopwords.words('english')[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [56]:
# Per-process cache so the NLTK stop-word list is loaded once instead of once
# per word, which is what the original list comprehension did.
_STOPWORDS_CACHE = {}


def removeStopwords(text, stop_words=None):
    """Return ``text`` with stop words dropped.

    The text is split on whitespace, words found in ``stop_words`` are
    discarded, and the remaining words are joined with single spaces.

    Args:
        text: String to filter, or ``None`` (e.g. a null Spark value).
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK English stop-word list is used and cached after first load.

    Returns:
        The filtered string, or ``None`` when ``text`` is ``None``.
    """
    if text is None:
        return None
    if stop_words is None:
        if 'english' not in _STOPWORDS_CACHE:
            # frozenset gives constant-time membership tests
            _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORDS_CACHE['english']
    return " ".join(word for word in text.split() if word not in stop_words)

# Spark UDF wrapper around removeStopwords; it yields a string column
removeStopwordsUDF = F.udf(removeStopwords, T.StringType())

In [57]:
# Overwrite `text` with its stop-word-free version
comments = comments.withColumn('text', removeStopwordsUDF(comments['text']))

#### Remove punctuation

In [58]:
import string

In [59]:
# Show the exact set of characters that will be stripped
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [60]:
def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A table mapping each punctuation character to None deletes it on translate
    drop_table = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = text.translate(drop_table)
    return cleaned

# Spark UDF wrapper around remove_punctuations; it yields a string column
removePunctuationsUDF = F.udf(remove_punctuations, T.StringType())

In [61]:
# Overwrite `text` with its punctuation-free version
comments = comments.withColumn('text', removePunctuationsUDF(comments['text']))

#### Remove URLs, @usernames, and emojis

In [62]:
import emoji
import re

In [63]:
# Raw strings keep the regex escapes (\. and \s) from being read as
# (invalid) Python string escapes; the pattern text itself is unchanged.
# Matches "www.<non-space>" or "http(s)://<non-space>".
url_regex = r'((www\.[^\s]+)|(https?://[^\s]+))'
# Matches "@" followed by one or more non-space characters.
username_regex = r'@[^\s]+'

In [64]:
def remove_urls(text):
    """Delete every substring of ``text`` that matches ``url_regex``."""
    url_pattern = re.compile(url_regex)
    return url_pattern.sub('', text)

def remove_usernames(text):
    """Delete every substring of ``text`` that matches ``username_regex``."""
    mention_pattern = re.compile(username_regex)
    return mention_pattern.sub('', text)

def remove_emojis(text):
    """Return the result of passing ``text`` through ``emoji.demojize``.

    NOTE(review): ``demojize`` appears to replace each emoji with a textual
    alias rather than delete it. Confirm this matches the "remove" intent.
    """
    converted = emoji.demojize(text)
    return converted

# Spark UDF wrappers for the three cleaners above; each yields a string column
remove_urlsUDF = F.udf(remove_urls, T.StringType())
remove_usernamesUDF = F.udf(remove_usernames, T.StringType())
remove_emojisUDF = F.udf(remove_emojis, T.StringType())

In [65]:
# Apply the URL, username and emoji cleaners to `text`, in that order.
# NOTE(review): string.punctuation (which includes '.', ':', '/' and '@') was
# already stripped from `text` in an earlier cell, so the URL and username
# patterns may no longer match anything here. Consider running this step
# before punctuation removal.
comments = (
    comments
    .withColumn('text', remove_urlsUDF(F.col('text')))
    .withColumn('text', remove_usernamesUDF(F.col('text')))
    .withColumn('text', remove_emojisUDF(F.col('text')))
)

#### Tokenizing, stemming, and lemmatizing the text

In [66]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer

In [67]:
def tokenize_stem_lemmatize(text):
    """Tokenize ``text``, stem each token, lemmatize each stem, and re-join.

    Args:
        text: The string to normalise.

    Returns:
        The processed tokens joined with single spaces.
    """
    # Raw string so \w reaches the regex engine instead of being treated as an
    # (invalid) Python string escape. Tokens are runs of word characters.
    tokenizer = RegexpTokenizer(r'\w+')
    tokenized_words = tokenizer.tokenize(text)

    # Stemming logic
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in tokenized_words]

    # Lemmatizing logic (runs on the stems, not on the original tokens).
    # NOTE(review): pos='a' presumably selects adjective lemmatization;
    # confirm that this is the intended part of speech.
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='a') for word in stemmed_words]

    return ' '.join(lemmatized_words)

# Spark UDF wrapper around tokenize_stem_lemmatize; it yields a string column
tokenize_stem_lemmatizeUDF = F.udf(tokenize_stem_lemmatize, T.StringType())

In [68]:
# Overwrite `text` with its tokenized, stemmed and lemmatized form
comments = comments.withColumn('text', tokenize_stem_lemmatizeUDF(comments['text']))

In [69]:
# Preview up to ten cleaned comments
comments.show(n=10)

[Stage 31:>                                                         (0 + 1) / 1]

+--------------------+
|                text|
+--------------------+
|swear feel like e...|
|deal propalestini...|
|protest organ wit...|
|palestinian prote...|
|god forbid peopl ...|
|even protest righ...|
|god forbid enjoy ...|
|jesu christ feel ...|
|jew sorri guy did...|
|   there disgust see|
+--------------------+
only showing top 10 rows



                                                                                

#### Preparing bag-of-words count vectors (stored in the `embeddings` column)

In [70]:
# Turn each cleaned string into an array of tokens by splitting on single spaces
comments = comments.withColumn('text', F.split(comments['text'], ' '))

In [71]:
# Count-vectorizer over the token arrays; the vocabulary is capped at 3216 terms
cv = CountVectorizer(
    vocabSize=3216,
    inputCol='text',
    outputCol='embeddings',
)

In [72]:
# Learn the vocabulary from the tokenized comments
model = cv.fit(dataset=comments)

                                                                                

In [73]:
# Add the `embeddings` column holding each comment's term-count vector
comments = model.transform(dataset=comments)

In [74]:
# Preview up to ten rows of tokens alongside their count vectors
comments.show(n=10)

+--------------------+--------------------+
|                text|          embeddings|
+--------------------+--------------------+
|[swear, feel, lik...|(3216,[2,103,112,...|
|[deal, propalesti...|(3216,[33,87,101,...|
|[protest, organ, ...|(3216,[0,13,21,48...|
|[palestinian, pro...|(3216,[8,13,111,1...|
|[god, forbid, peo...|(3216,[0,6,41,62,...|
|[even, protest, r...|(3216,[1,6,13,17,...|
|[god, forbid, enj...|(3216,[12,90,216,...|
|[jesu, christ, fe...|(3216,[4,8,13,15,...|
|[jew, sorri, guy,...|(3216,[4,15,37,96...|
|[there, disgust, ...|(3216,[19,39,1313...|
+--------------------+--------------------+
only showing top 10 rows

