## Storyline Data Preprocessing

### Install Library

In [1]:
%pip install numpy better-profanity emoji gensim pyLDAvis scipy bertopic sentence-transformers hdbscan umap-learn stanza
%pip install --upgrade gensim numpy scipy
!conda install gensim -y

# Restart kernel manually or add this to force reload
# import os
# os._exit(00)

Note: you may need to restart the kernel to use updated packages.
Collecting numpy
  Using cached numpy-2.2.5-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting scipy
  Using cached scipy-1.15.2-cp312-cp312-win_amd64.whl.metadata (60 kB)
Note: you may need to restart the kernel to use updated packages.
Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



### Import Library

In [1]:
import pandas as pd
import numpy as np
import emoji
from umap import UMAP
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

#Data Preprocessing and Feature Engineering
import re
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

import stanza
from better_profanity import profanity

from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore")

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel, pipeline
import torch
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\babym\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\babym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\babym\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\babym\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\babym\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\babym\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_pe

### Constant

In [4]:
filename = 'reddit_comments.csv'
output_filename = 'reddit_storyline_output.csv'
vis_output = 'lda_visualization_storyline.html'
vis_output_bert = 'lda_visualization_storyline_bert.html'
num_topics = 50
num_words=10

stop_words = set(stopwords.words('english'))
lem = WordNetLemmatizer()

# Load Stanza model for English
stanza.download("en")
nlp = stanza.Pipeline(lang="en", processors="tokenize,pos,lemma")

# Add custom stop words for generic nouns
custom_stop_words = set(["thing", "stuff", "person", "people"])
auxiliary_verbs = {'be', 'have', 'do', 'will', 'shall', 'would', 'should', 'can', 'could', 'may', 'might', 'must'}

# Load Topic Modelling BERT Model and Tokenizer
bert_topic_modelling_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_topic_modelling_model = AutoModel.from_pretrained("bert-base-uncased")

# Load RoBERTa Sentiment Analysis tokenizer and model
sentiment_model_name = "cardiffnlp/twitter-roberta-base-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)

# Create sentiment pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

label_map = {
    "LABEL_0": "Negative",
    "LABEL_1": "Neutral",
    "LABEL_2": "Positive"
}

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-01 18:26:02 INFO: Downloaded file to C:\Users\babym\stanza_resources\resources.json
2025-05-01 18:26:02 INFO: Downloading default packages for language: en (English) ...
2025-05-01 18:26:04 INFO: File exists: C:\Users\babym\stanza_resources\en\default.zip
2025-05-01 18:26:12 INFO: Finished downloading models and saved to C:\Users\babym\stanza_resources
2025-05-01 18:26:12 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-05-01 18:26:12 INFO: Downloaded file to C:\Users\babym\stanza_resources\resources.json
2025-05-01 18:26:13 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2025-05-01 18:26:13 INFO: Using device: cpu
2025-05-01 18:26:13 INFO: Loading: tokenize
2025-05-01 18:26:13 INFO: Loading: mwt
2025-05-01 18:26:13 INFO: Loading: pos
2025-05-01 18:26:18 INFO: Loading: lemma
2025-05-01 18:26:20 INFO: Done loading processors!


In [5]:
df = pd.read_csv(filename, encoding='utf-8')
df.fillna('', inplace=True)
df.head(3)

Unnamed: 0,Post Title,Post URL,Comment ID,Parent ID,Author,Timestamp,Comment,Score,Reddit Name
0,Making Friends Monday! Share your game tags here!,https://www.reddit.com/r/gaming/comments/1jyrv...,mn0q5r5,,telking777,2025-04-14 06:06:31,EverestSparrow\n\nPlayStation,2,gaming
1,Making Friends Monday! Share your game tags here!,https://www.reddit.com/r/gaming/comments/1jyrv...,mn1pw56,,Midnight_Starligt,2025-04-14 11:57:18,"I play a lot of BG3, cyberpunk and Monster Hun...",2,gaming
2,Making Friends Monday! Share your game tags here!,https://www.reddit.com/r/gaming/comments/1jyrv...,mn2kaqv,mn1pw56,Tiny-Oven6944,2025-04-14 14:57:51,Good choice,1,gaming


### Text Preprocessing

In [5]:
#Removing stopwords and words with unusual symbols
def text_processing(text):
    # Handle non-string inputs
    if not isinstance(text, str):
        return ""

    # Convert emoji to words
    text = emoji.demojize(text)

    #Generating the list of words in the message (hastags and other punctuations removed) and convert to lowercase
    text = text.lower()

    # Replace profanity with asterisks
    text = profanity.censor(text)

    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\W\d]*$', '', text)

    doc = nlp(text)
    normalized_message = []
    
    for sent in doc.sentences:
        for word in sent.words:
            # Map Stanza's UPOS tags to SpaCy-like tags
            if word.upos == "VERB" and word.lemma not in auxiliary_verbs:
                normalized_message.append(word.lemma)
            elif word.upos in ["NOUN", "PROPN"] and word.lemma not in custom_stop_words:
                normalized_message.append(word.lemma)

    # Filter out stop words and short tokens
    tokens = [token for token in normalized_message if token not in stop_words and len(token) > 2]

    # Generate string output
    string = ' '.join(tokens) if tokens else ""

    return (tokens,string)

In [6]:
df[['processed_text_lda', 'processed_text_bertopic']] = pd.DataFrame(
    df['Comment'].apply(text_processing).tolist(),
    index=df.index,
    columns=['processed_text_lda', 'processed_text_bertopic']
)

In [7]:
df.head(2)

Unnamed: 0,Post Title,Post URL,Comment ID,Parent ID,Author,Timestamp,Comment,Score,Reddit Name,processed_text_lda,processed_text_bertopic
0,Making Friends Monday! Share your game tags here!,https://www.reddit.com/r/gaming/comments/1jyrv...,mn0q5r5,,telking777,2025-04-14 06:06:31,EverestSparrow\n\nPlayStation,2,gaming,[],
1,Making Friends Monday! Share your game tags here!,https://www.reddit.com/r/gaming/comments/1jyrv...,mn1pw56,,Midnight_Starligt,2025-04-14 11:57:18,"I play a lot of BG3, cyberpunk and Monster Hun...",2,gaming,"[play, lot, cyberpunk, monster, hunter, want, ...",play lot cyberpunk monster hunter want campaig...


In [8]:
df.to_csv(output_filename, index=False, encoding='utf_8_sig')

### BERT

In [5]:
# Filter out empty documents
valid_docs = df['processed_text_bertopic'].str.len() > 0
texts = df.loc[valid_docs, 'processed_text_bertopic'].tolist()
valid_indices = df.index[valid_docs].tolist()

# Function to get BERT embeddings
def get_bert_embeddings(texts, batch_size=16, max_seq_length=128):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        inputs = bert_topic_modelling_tokenizer(batch_texts, return_tensors="pt", max_length=max_seq_length, truncation=True, padding=True)
        with torch.no_grad():
            outputs = bert_topic_modelling_model(**inputs)
        # Use [CLS] token embedding or mean of token embeddings
        batch_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # [CLS] token
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

# Generate embeddings
print("Generating BERT embeddings...")
embeddings = get_bert_embeddings(texts)

# Step 4: Cluster Embeddings to Identify Topics
kmeans = KMeans(n_clusters=num_topics, random_state=42)
topic_labels = kmeans.fit_predict(embeddings)

bert_topic_words = []
# Step 5: Extract Representative Words for Each Topic
def get_topic_words(texts, labels, num_words=num_words):
    vectorizer = CountVectorizer(stop_words='english', max_features=1000)
    doc_term_matrix = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out()
    topic_words = []

    for topic_idx in range(num_topics):
        topic_docs = [i for i, label in enumerate(labels) if label == topic_idx]
        if not topic_docs:
            topic_words.append(f"Topic {topic_idx + 1}: (empty topic)")
            continue
        topic_doc_term = doc_term_matrix[topic_docs].sum(axis=0).A1
        top_word_indices = topic_doc_term.argsort()[-num_words:][::-1]
        wordlist = [feature_names[i] for i in top_word_indices]
        bert_topic_words.extend(wordlist)
        words = ", ".join(wordlist)
        topic_words.append(f"Topic {topic_idx + 1}: {words}")

    return topic_words

# Extract topic words
topic_report = get_topic_words(texts, topic_labels)

# Save topic report
with open("topic_modeling_report_bertopic_reddit.txt", "w") as f:
    f.write("BERTopic Modeling Results\n")
    f.write("=====================\n")
    for line in topic_report:
        f.write(line + "\n")

# Step 7: Assign Topics to Documents
dominant_topics = [None] * len(df)
for i, (idx, label) in enumerate(zip(valid_indices, topic_labels)):
    dominant_topics[idx] = label
df['dominant_topic_bert'] = dominant_topics
df.to_csv(output_filename, index=False, encoding='utf_8_sig')

# Step 8: Visualize Topics with UMAP
print("Generating topic visualization...")
umap_model = UMAP(n_components=2, random_state=42)
umap_embeddings = umap_model.fit_transform(embeddings)

# Plot
plt.figure(figsize=(10, 8))
sns.scatterplot(x=umap_embeddings[:, 0], y=umap_embeddings[:, 1], hue=topic_labels, palette='deep', s=50)
plt.title("BERT Topic Clusters (UMAP)")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.legend(title="Topic")
plt.savefig("bert_topic_visualization_reddit.png")
plt.show()
plt.close()

print("BERT topic modeling completed. Results saved to 'bert_topic_report.txt', 'posts_with_bert_topics.csv', and 'bert_topic_visualization.png'.")

KeyError: 'processed_text_bertopic'

#### Extract Top 50 words from Top 50 topics

In [10]:
word_counts = Counter(bert_topic_words)

# Get the top 50 words by frequency
top_50_words = word_counts.most_common(50)

# Print or save the results
print("Top 50 Words Across All Topics:")
for rank, (word, score) in enumerate(top_50_words, 1):
    print(f"{rank:2}. {word:<15} {score}")

# Save to CSV (optional)
top_words_df = pd.DataFrame(top_50_words, columns=["Word", "Frequency"])
top_words_df.to_csv("top_50_words_from_bert_reddit.csv", index=False, encoding='utf_8_sig')

Top 50 Words Across All Topics:
 1. make            38
 2. think           32
 3. time            32
 4. game            26
 5. say             23
 6. know            22
 7. want            20
 8. feel            17
 9. movie           13
10. read            12
11. play            12
12. use             11
13. love            11
14. look            10
15. year            10
16. come            8
17. like            7
18. world           7
19. watch           7
20. character       6
21. way             5
22. series          5
23. book            5
24. story           5
25. need            5
26. mean            5
27. man             4
28. film            4
29. post            4
30. harry           3
31. nintendo        3
32. point           3
33. country         3
34. start           3
35. end             3
36. king            3
37. season          3
38. work            3
39. comment         3
40. guy             3
41. potter          2
42. war             2
43. try             2
44. let

In [11]:
df.to_csv(output_filename, index=False, encoding='utf_8_sig')

### LDA [Not In use]

In [None]:
# Create Document-Term Matrix
# Filter out empty documents and keep track of valid indices
valid_docs_lda = df['processed_text_lda'].str.strip() != ''
processed_docs_lda = df.loc[valid_docs_lda, 'processed_text_lda'].tolist()
valid_indices_lda = df.index[valid_docs_lda].tolist()

# Create dictionary
dictionary = corpora.Dictionary(processed_docs_lda)

# Create corpus (bag-of-words representation)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs_lda]

# Train LDA model
lda_model = LdaModel(corpus=corpus, # Create LDA model
                     id2word=dictionary, # Dictionary for the model
                     num_topics=num_topics, # Number of topics
                     random_state=42, # Random state for reproducibility
                     passes=10, # Number of passes through the corpus
                     alpha='auto', # Hyperparameter for document-topic density
                     eta='auto') # Hyperparameter for topic-word density

# Extract topics and their words
topics_lda = lda_model.print_topics(num_words=10)
topic_report_lda = []
lda_topic_words = []
for idx, topic in topics_lda:
    topic_words_processed = [word.split("*")[1].strip('" ') for word in topic[1].split(" + ")]
    lda_topic_words.extend(topic_words_processed)
    topic_words = ", ".join(topic_words_processed)
    topic_report_lda.append(f"Topic {idx + 1}: {topic_words}")

# Get dominate topic
def get_dominant_topic(doc_bow):
    topic_dist = lda_model[doc_bow]
    if topic_dist:
        return max(topic_dist, key=lambda x: x[1])[0]
    return None

with open("topic_modeling_report_lda_reddit.txt", "w") as f:
    f.write("Topic Modeling Results\n")
    f.write("=====================\n")
    for line in topic_report_lda:
        f.write(line + "\n")

dominant_topics_lda = [None] * len(df)
for i, (idx, bow) in enumerate(zip(valid_indices_lda, corpus)):
    dominant_topics_lda[idx] = get_dominant_topic(bow)

df['dominant_topic_lda'] = dominant_topics_lda
df.to_csv(output_filename, index=False, encoding='utf_8_sig')

# Visualize Topics with pyLDAvis
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
pyLDAvis.save_html(vis, vis_output) # Save the visualization to an HTML file

print("Topic modeling completed.")

0.132*"magic" + 0.096*"style" + 0.083*"horror" + 0.060*"attack" + 0.057*"sword" + 0.045*"setting" + 0.043*"spot" + 0.038*"weapon" + 0.032*"trope" + 0.028*"rip"
0.207*"fan" + 0.100*"parent" + 0.095*"top" + 0.092*"class" + 0.046*"roll" + 0.035*"court" + 0.026*"paint" + 0.025*"stomach" + 0.025*"would" + 0.025*"means"
0.110*"anime" + 0.075*"argue" + 0.067*"involve" + 0.049*"discuss" + 0.048*"recommendation" + 0.048*"prove" + 0.046*"format" + 0.041*"suck" + 0.035*"twist" + 0.032*"middle"
0.190*"agree" + 0.130*"human" + 0.102*"sort" + 0.097*"figure" + 0.078*"value" + 0.037*"battle" + 0.029*"sale" + 0.026*"replace" + 0.025*"represent" + 0.023*"nonsense"
0.156*"mind" + 0.108*"deal" + 0.070*"reminder" + 0.067*"recommend" + 0.061*"interest" + 0.051*"struggle" + 0.041*"tend" + 0.032*"app" + 0.031*"deck" + 0.024*"affect"
0.119*"side" + 0.083*"body" + 0.082*"send" + 0.075*"discussion" + 0.061*"taste" + 0.045*"wipe" + 0.038*"rise" + 0.034*"area" + 0.031*"manager" + 0.031*"article"
0.199*"reason" + 0

In [16]:
df.to_csv(output_filename, index=False, encoding='utf_8_sig')

#### Extract Top 50 words from Top 50 topics

In [17]:
word_counts = Counter(lda_topic_words)

# Get the top 50 words by frequency
top_50_words = word_counts.most_common(50)

# Print or save the results
print("Top 50 Words Across All Topics:")
for rank, (word, score) in enumerate(top_50_words, 1):
    print(f"{rank:2}. {word:<15} {score}")

# Save to CSV (optional)
top_words_df = pd.DataFrame(top_50_words, columns=["Word", "Frequency"])
top_words_df.to_csv("top_50_words_from_lda_reddit.csv", index=False, encoding='utf_8_sig')

Top 50 Words Across All Topics:


### Sentiment analysis

In [None]:
# text = "The story was amazing and emotional!"
# result = sentiment_pipeline(text)[0]
# label = label_map[result['label']]
# score = result['score']
# print(f"Sentiment: {label}, Score: {score:.3f}")

In [6]:
def get_sentiment(text):
    if not isinstance(text, str) or text.strip() == "":
        return pd.Series([None, None])

    # Add truncation=True
    result = sentiment_pipeline(text, truncation=True, max_length=512)[0]

    return pd.Series([label_map[result['label']], result['score']])

# Add sentiment results to DataFrame
df[['sentiment_label', 'sentiment_score']] = df['Comment'].apply(get_sentiment)


In [7]:
df.to_csv(output_filename, index=False, encoding='utf_8_sig')

In [None]:
word_counts = Counter(lda_topic_words)

# Get the top 50 words by frequency
top_50_words = word_counts.most_common(50)

# Print or save the results
print("Top 50 Words Across All Topics:")
for rank, (word, score) in enumerate(top_50_words, 1):
    print(f"{rank:2}. {word:<15} {score}")