## Data Cleaning

In [None]:
# Load the cleaned comment data from CSV.
# pandas is imported in this cell so that it runs on a fresh kernel, before the
# main import cell further down has been executed.
import pandas as pd

df = pd.read_csv('cleaned_data_2223.csv')

In [None]:
# 1. Keep only comments with more than 5 whitespace-separated words
df = df.loc[df['text'].str.split().str.len().gt(5)]

## NLP using spaCy for classifying negative comments

In [None]:
#topic modelling attempt for year 2022-2023 using LDA model
# Import necessary libraries
import re
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob
import sklearn
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import gensim
from gensim.models import CoherenceModel
from gensim.corpora import Dictionary

In [None]:
# Import necessary libraries
import pandas as pd
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

In [None]:
# Initialise the NLP pipeline and add the spacytextblob step to the pipeline
nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')

# Texts to score: the 'text' column of the filtered comments
texts = df['text']

# This will take about 20-30 seconds to run.
# All listed pipeline components are disabled; only the spacytextblob scores
# (polarity and subjectivity) are read from each document.
sentiment_results = [
    {
        'Polarity': doc._.blob.polarity,
        'Subjectivity': doc._.blob.subjectivity,
    }
    for doc in nlp.pipe(
        texts,
        disable=["tok2vec", "tagger", "ner", "parser", "attribute_ruler", "lemmatizer"],
    )
]

# Convert sentiment results into a DataFrame that carries df's own index.
# df was row-filtered earlier, so its index has gaps; a default 0..n-1 index here
# would make the column-wise concat below pair scores with the wrong comments
# and add all-NaN rows for the labels that do not match.
sentiment_results_df = pd.DataFrame(sentiment_results, index=df.index)

# Append sentiment results to the original dataset (rows are matched on index)
grouped_df = pd.concat([df, sentiment_results_df], axis=1)

In [None]:
# Select the comments whose parent_id (with any leading "t<digits>_" prefix removed)
# equals their link_id. This parent-comment-only view was explored for topic
# modelling but left too few comments to give accurate topics.
parents_raw = grouped_df.loc[
    grouped_df['parent_id']
    .str.replace(r"^t\d+_", "", regex=True)
    .eq(grouped_df['link_id'])
]

In [None]:
# Keep only the comments with negative sentiment (polarity below -0.2)
negative_comments_df = grouped_df.loc[grouped_df['Polarity'].lt(-0.2)]

In [None]:
# Preview the first five negative comments
negative_comments_df.head(n=5)

## NLP pipeline that filters out irrelevant words: swear words, useless words, toxic SG words that hold no semantic meaning besides toxicity, and words associated with comments posted by Reddit moderator bots

In [None]:
# Words and phrases that flag a comment as likely posted by a moderator bot.
# filter_comments matches these as substrings of the lowercased comment text.
bot_words = {
    'bot action',
    'performed automatically',
    'action performed',
    'submission',
    'automatically',
    'moderator',
    'link',
    'bot',
    'concern',
    'please',
}

In [None]:
# Filtering function
def filter_comments(df, bot_words):
    """Return the rows of ``df`` whose ``'text'`` contains none of ``bot_words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``'text'`` column. Rows whose text is missing are kept.
    bot_words : iterable of str
        Words or phrases matched literally, case-insensitively, as substrings
        of the text.

    Returns
    -------
    pandas.DataFrame
        A new frame with the matching rows removed; the index is not reset.
    """
    # Escape each phrase so regex metacharacters (e.g. "c++", "what?") are
    # matched literally, and lowercase it because the text is lowercased too.
    phrases = [re.escape(str(word).lower()) for word in bot_words if str(word)]
    if not phrases:
        # Joining nothing gives the empty pattern, which matches every string
        # and would drop every row; with nothing to filter, keep everything.
        return df.copy()
    pattern = '|'.join(phrases)
    contains_bot_phrase = df['text'].str.lower().str.contains(pattern, na=False)
    return df[~contains_bot_phrase]

# Drop likely bot comments from the negative comments.
# Note that this rebinds `df`: from here on it holds the filtered negative comments.
df = filter_comments(df=negative_comments_df, bot_words=bot_words)

In [None]:
# Initialise the stop-word set: NLTK's English stop words plus the project's
# swear-word, SG-context toxic-word and useless-word lists.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def _load_word_list(path):
    """Read one entry per line from ``path`` into a set of lowercase strings.

    Blank lines are skipped so the empty string never becomes a stop word, and
    entries are lowercased because preprocess_text lowercases the text before
    comparing tokens against the stop words. 'utf-8-sig' decodes UTF-8 with or
    without a leading byte-order mark.
    """
    with open(path, 'r', encoding='utf-8-sig') as f:
        return {line.strip().lower() for line in f if line.strip()}


# Read the extra word lists from their text files
swear_words = _load_word_list('swear_words.txt')
toxic_words = _load_word_list('sg_toxic.txt')
useless_words = _load_word_list('useless_words.txt')

# Merge swear words, toxic words and useless words into the stop-word set
stop_words.update(swear_words, toxic_words, useless_words)

In [None]:
from nltk.tokenize import word_tokenize

# WordNetLemmatizer needs the WordNet corpus.
nltk.download('wordnet')
# word_tokenize needs the Punkt sentence-tokenizer data, which is not downloaded
# anywhere else in this notebook. Newer NLTK releases look for 'punkt_tab'
# instead of 'punkt', so both are requested; an id unknown to the installed
# version is reported by nltk.download without raising.
nltk.download('punkt')
nltk.download('punkt_tab')
lemmatizer = WordNetLemmatizer()

# Define text preprocessing function
def preprocess_text(text):
    """Lowercase, tokenise, lemmatise and remove stop words from one comment.

    Parameters
    ----------
    text : str
        Raw comment text. Any non-string value (e.g. NaN) yields ``''``.

    Returns
    -------
    str
        Space-joined lemmas that are purely alphabetic and are not in
        ``stop_words``, either as the original token or as its lemma.
    """
    if not isinstance(text, str):  # e.g. NaN from a missing comment
        return ''
    kept = []
    for token in word_tokenize(text.lower()):
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Check the stop words again after lemmatisation: an inflected form that
        # is not itself listed can still lemmatise to a listed stop word.
        # isalpha() drops punctuation and tokens containing digits or symbols.
        if lemma.isalpha() and lemma not in stop_words:
            kept.append(lemma)
    return ' '.join(kept)

In [None]:
# 4. Vectorize text using Tf-idf
# The 'processed_comment' column is not created anywhere earlier in the notebook,
# so build it here from the raw text unless the data already provides it.
# assign() returns a new frame, which avoids writing into a filtered slice.
if 'processed_comment' not in df.columns:
    df = df.assign(processed_comment=df['text'].apply(preprocess_text))

# Unigrams and bigrams; ignore terms in more than 95% of comments or in fewer than 2
vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, ngram_range=(1, 2))
X = vectorizer.fit_transform(df['processed_comment'])

## LDA for topic modelling

In [None]:
# 5. Fit an LDA topic model on the Tf-idf matrix
n_topics = 10  # Number of topics
# fit() returns the fitted estimator, so construction and fitting can be chained
lda_model = LatentDirichletAllocation(random_state=42, n_components=n_topics).fit(X)
lda_model

In [None]:
# Function to print topics
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model
        Fitted topic model exposing ``components_`` (one row of word weights
        per topic).
    vectorizer
        Either a fitted vectorizer exposing ``get_feature_names_out()`` or the
        sequence of feature names itself.
    top_n : int, default 10
        Number of words to print per topic; values <= 0 print an empty list.

    Words are listed from highest to lowest weight, the same order used when
    topic words are extracted for the coherence score.
    """
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    words = get_names() if callable(get_names) else vectorizer
    for idx, topic in enumerate(model.components_):
        # Reverse the ascending argsort, then slice: a negative-index slice
        # such as [-top_n:] would return the entire vocabulary when top_n is 0.
        top_indices = topic.argsort()[::-1][:max(top_n, 0)]
        # str() keeps the printed list free of numpy scalar wrappers.
        print(f"Topic {idx + 1}: ", [str(words[i]) for i in top_indices])

In [None]:
# Show the top words of each topic found in the negative comments
print_topics(model=lda_model, vectorizer=vectorizer)

In [None]:
#look at coherence scores
texts_preprocessed = df['processed_comment']

# Step 1: Create a Gensim Dictionary and Corpus
# Tokenise with the vectorizer's own analyzer so each token list contains the
# same unigrams AND bigrams that make up the feature names. Splitting on
# whitespace yields unigrams only, so bigram topic words would be missing from
# the dictionary and from the reference texts used for coherence.
analyzer = vectorizer.build_analyzer()
texts_tokenized = [analyzer(text) for text in texts_preprocessed]
dictionary = Dictionary(texts_tokenized)
corpus = [dictionary.doc2bow(text) for text in texts_tokenized]

lda = lda_model
# Step 2: Get the topics from the LDA model
# Look the feature names up once instead of once per word.
feature_names = vectorizer.get_feature_names_out()
lda_topics = lda.components_
# argsort()[:-10 - 1:-1] gives the indices of the 10 highest-weighted words, best first
lda_topics_words = [[str(feature_names[i]) for i in topic.argsort()[:-10 - 1:-1]] for topic in lda_topics]

# Step 3: Calculate Coherence Score
coherence_model_lda = CoherenceModel(topics = lda_topics_words,
                                     texts = texts_tokenized,
                                     dictionary = dictionary,
                                     coherence = 'c_v')

coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score for LDA Model: {coherence_lda}')

## Tuning the LDA model by choosing the number of topics that gives the highest coherence score

In [None]:
#hyperparameter tuning using grid search
n_topics_list = [3, 5, 10, 15, 20, 25]
coherence_scores = []

# Tokenise with the vectorizer's analyzer (unigrams + bigrams) and build the
# dictionary from those tokens, so every topic word taken from the feature
# names, bigrams included, exists in both the dictionary and the reference texts.
analyzer = vectorizer.build_analyzer()
texts_tokenized = [analyzer(text) for text in texts_preprocessed]
dictionary = Dictionary(texts_tokenized)

# The vocabulary does not change between iterations, so look it up once.
feature_names = vectorizer.get_feature_names_out()

# It should take around 15-30 seconds for each iteration
for n_topics in tqdm(n_topics_list):

    lda = LatentDirichletAllocation(n_components = n_topics, random_state = 2024)
    lda.fit(X)
    lda_topics = lda.components_
    # Top 10 words per topic, highest weight first
    lda_topics_words = [[str(feature_names[i]) for i in topic.argsort()[:-10 - 1:-1]] for topic in lda_topics]
    coherence_model_lda = CoherenceModel(topics = lda_topics_words,
                                         texts = texts_tokenized,
                                         dictionary = dictionary,
                                         coherence = 'c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print(f"Number of topics: {n_topics} | Coherence Score: {coherence_lda}")
    coherence_scores.append(coherence_lda)

In [None]:
#plot coherence scores
import matplotlib.pyplot as plt

# Label the line on the plot call itself. A bare parenthesised string passed to
# legend() is iterated character by character, so the line would be labelled "c".
plt.plot(n_topics_list, coherence_scores, label="coherence_values")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(loc='best')
plt.show()

In [None]:
#fit the model with the optimal number of topics (highest coherence score)
# NOTE(review): 15 is hard-coded — confirm it matches the best score from the grid search.
lda = LatentDirichletAllocation(n_components = 15, random_state = 2024)
lda.fit(X)
no_top_words = 10
tf_feature_names = vectorizer.get_feature_names_out()
# Print the topics of the model fitted in this cell (`lda`), not the earlier
# 10-topic `lda_model`.
print_topics(lda, vectorizer, no_top_words)

In [None]:
# Group by year and month, then perform topic modeling
# NOTE(review): the 'year', 'month' and 'cleaned_text' columns are not created in
# this notebook — presumably they come from the loaded CSV; confirm.
# The shared `vectorizer` and `lda` are refitted on every group, so after this
# loop they describe only the last group.
for (year, month), group in df.groupby(['year', 'month']):
    print(f"\nYear: {year}, Month: {month}")
    try:
        monthly_matrix = vectorizer.fit_transform(group['cleaned_text'])
    except ValueError as err:
        # Raised when a group is too small for the min_df/max_df pruning to
        # leave any terms; report it and move on to the next month.
        print(f"Skipped: {err}")
        continue
    lda.fit(monthly_matrix)
    # print_topics takes the vectorizer itself and reads the feature names from it
    print_topics(lda, vectorizer, 10)


## NLP Using VADER for classifying negative comments



In [None]:
 data_2022 = df[(df['timestamp'] >= '2022-01-01') & (df['timestamp'] < '2023-01-01')]

In [None]:
# SentimentIntensityAnalyzer is not imported in any earlier cell; NLTK (already
# used by this notebook) provides it and needs the 'vader_lexicon' resource.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

# Keep only the 'neg' component of the VADER scores for each comment.
# assign() returns a new frame instead of writing a column into a filtered slice.
data_2022 = data_2022.assign(
    sentiment_vader=data_2022['text'].apply(lambda x: analyzer.polarity_scores(x)['neg'])
)

In [None]:
# Histogram of the VADER 'neg' scores
data_2022['sentiment_vader'].plot(kind='hist')

In [None]:
# Keep the comments whose VADER 'neg' score exceeds 0.5, then show (rows, columns)
negative_comments = data_2022.loc[data_2022['sentiment_vader'].gt(0.5)]
negative_comments.shape

## BERTopic for topic modelling

In [None]:
# NOTE(review): `BERTopic` is not imported in any cell shown in this notebook —
# confirm that `from bertopic import BERTopic` runs before this cell.
topic_model = BERTopic()
# Pass the documents as a plain list of strings rather than a pandas Series,
# whose index has gaps after the earlier row filters.
topics, probs = topic_model.fit_transform(negative_comments['text'].tolist())

In [None]:
# Keep the per-topic summary table from the fitted model and display it
topics_overview = topic_model.get_topic_info()
topics_overview

In [None]:
# First 7 rows of the topic summary, indexed by topic id, showing only the
# count, name and representation columns
(
    topic_model.get_topic_info()
    .head(7)
    .set_index('Topic')
    .loc[:, ['Count', 'Name', 'Representation']]
)

In [None]:
# Bar charts for up to 16 topics, 10 words each
topic_model.visualize_barchart(
    n_words=10,
    top_n_topics=16,
)