In [3]:
import pandas as pd

In [4]:
# Load the dataset from a CSV located next to the notebook.
df = pd.read_csv("reddit_with_emotions_full.csv")
# Fill missing posts with "" before casting: a bare astype(str) would turn NaN
# into the literal string "nan", which would then be treated as real text.
df["text"] = df["text"].fillna("").astype(str)

In [5]:
# Split the posts by their finance label.
# .copy() gives each subset its own data, so adding columns to it later
# (e.g. "clean_text") does not write into a slice of df and does not raise
# pandas' SettingWithCopyWarning.
df_fin = df[df["finance_label"] == "finance"].copy()
df_non = df[df["finance_label"] == "not finance"].copy()


In [18]:
import re

import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally; when it is already
# present NLTK just reports that the package is up-to-date.
nltk.download('stopwords')

# English stop words, held in a set for fast membership tests during cleaning.
stop_words = set(stopwords.words("english"))

def clean_text_basic(text, stop_word_set=None):
    """Normalize a raw post into a space-separated string of content words.

    Steps: lowercase, strip URLs, replace every non-letter character with a
    space, then drop stop words and tokens of one or two characters.

    Args:
        text: Raw text. Non-string values (e.g. None or NaN) yield "".
        stop_word_set: Optional collection of words to drop. Defaults to the
            module-level ``stop_words`` set.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    # Missing values would otherwise crash on .lower() or, if stringified
    # upstream, leak tokens such as "nan" / "none" into the corpus.
    if not isinstance(text, str):
        return ""
    if stop_word_set is None:
        stop_word_set = stop_words

    text = text.lower()
    # Remove URLs. The \b anchor keeps the pattern from eating the tail of
    # ordinary words such as "awwww"; "http\S+" already covers "https...".
    text = re.sub(r"\b(?:http|www)\S+", "", text)
    # Replace punctuation, digits and any other non-letter with a space.
    text = re.sub(r"[^a-z\s]", " ", text)

    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, so no separate normalization pass
    # is needed.
    words = [w for w in text.split() if len(w) > 2 and w not in stop_word_set]

    return " ".join(words)

# Add the cleaned text as a new column. assign() returns a new DataFrame, so
# this never writes into a filtered slice of df (no SettingWithCopyWarning)
# and the cell gives the same result when re-run.
df_fin = df_fin.assign(
    clean_text=df_fin["text"].astype(str).apply(clean_text_basic)
)


[nltk_data] Downloading package stopwords to /Users/jeet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic

# Bag-of-words settings handed to BERTopic for building topic representations.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),    # unigrams and bigrams, e.g. "job loss", "rent increase"
    min_df=10,             # drop rare terms
    stop_words="english",
)

# Topic model for the finance-labelled posts.
# NOTE(review): no random seed is configured here, so topic ids and counts may
# differ between runs — confirm whether a seeded model should be passed in.
topic_model_fin = BERTopic(
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    language="english",
    verbose=True,
)


In [23]:
# Documents to model: the cleaned finance posts as a plain Python list.
texts = df_fin["clean_text"].tolist()

# Fit the topic model on the documents. Returns one topic assignment per
# document plus the probabilities requested via calculate_probabilities=True.
topics_fin, probs_fin = topic_model_fin.fit_transform(texts)


2025-11-17 02:47:28,201 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/233 [00:00<?, ?it/s]

2025-11-17 02:48:01,559 - BERTopic - Embedding - Completed ✓
2025-11-17 02:48:01,560 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-17 02:48:04,295 - BERTopic - Dimensionality - Completed ✓
2025-11-17 02:48:04,296 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-17 02:48:05,079 - BERTopic - Cluster - Completed ✓
2025-11-17 02:48:05,081 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-17 02:48:05,896 - BERTopic - Representation - Completed ✓


In [24]:
# Overview table of the fitted topics: id, document count, generated name,
# top terms and representative documents.
# NOTE(review): topic -1 is presumably BERTopic's outlier bucket — confirm
# before interpreting its size.
topic_model_fin.get_topic_info()


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,4346,-1_like_feel_life_know,"[like, feel, life, know, want, time, people, r...",[hey everyone want describe bunch things relat...
1,0,937,0_mom_dad_sister_brother,"[mom, dad, sister, brother, mother, parents, f...",[year old transgender man whole list diagnoses...
2,1,237,1_insurance_therapy_health_afford,"[insurance, therapy, health, afford, therapist...",[going treatment acting like taking insurance ...
3,2,199,2_feel_feel like_like_life,"[feel, feel like, like, life, work, happy, fee...",[honestly feel like need knowif people feel gr...
4,3,155,3_relationship_love_told_said,"[relationship, love, told, said, like, time, r...",[tiniest bit context small chance stumbled upo...
...,...,...,...,...,...
58,57,10,57_life_live_anymore_celebrate,"[life, live, anymore, celebrate, step, hate li...",[feel like everytime win step forward steps ba...
59,58,10,58_church_think_people_person,"[church, think, people, person, forgive, god, ...",[keep thinking back certain things life led ce...
60,59,10,59_class_exam_final_grades,"[class, exam, final, grades, results, bit, goi...",[recently received midterm results back much d...
61,60,10,60_able_industry_work_tho,"[able, industry, work, tho, feel, getting wors...",[anyone resonates feelings going please share ...


In [25]:
# Top terms and their weights for topic 3 as (term, score) pairs.
# NOTE(review): the id 3 is hard-coded; topic numbering may change if the
# model is re-fitted — confirm it still points at the intended topic.
topic_model_fin.get_topic(3)


[('relationship', 0.01867199409539816),
 ('love', 0.012427244568107954),
 ('told', 0.01179461580290307),
 ('said', 0.011165218406205683),
 ('like', 0.010794282508646679),
 ('time', 0.010570907237441475),
 ('really', 0.010378812521797245),
 ('felt', 0.010055279683444709),
 ('wanted', 0.009753849431913377),
 ('friend', 0.009730525436453304)]

In [27]:
# Plot bar charts of the highest-scoring terms per topic, using the library's
# default selection of topics and number of terms.
topic_model_fin.visualize_barchart()
