In [5]:
import pandas as pd
import numpy as np
import matplotlib as plt
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


In [6]:
import nltk
from nltk.corpus import stopwords

In [7]:
# Download the NLTK "stopwords" corpus; stopwords.words("english") in a later
# cell reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kartikayluthra/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Load the processed training split.
df = pd.read_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train.csv")

# Draw a reproducible sample (fixed random_state). The size is capped at the
# number of available rows so a smaller input file does not make sample() raise.
sample_df = df.sample(n=min(10000, len(df)), random_state=42)

# `index` must be the boolean False: a non-empty string such as "False" is
# truthy and would still write the index as an extra unnamed column.
sample_df.to_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train_sample.csv", index=False)

# Texts to be topic-modelled. Rows with a missing "assistant" value are dropped
# here, so `answers` can be shorter than `sample_df`.
answers = sample_df["assistant"].dropna().tolist()

# English stop words as a set for O(1) membership checks during preprocessing.
stop_words = set(stopwords.words("english"))

In [17]:
import re

# Patterns are compiled once because the function runs on every document.
_DIGITS_RE = re.compile(r'\d+')            # runs of digits
_PUNCT_RE = re.compile(r'[^\w\s]')         # any non-word, non-space character
_EDGE_PUNCT_RE = re.compile(r'^\W+|\W+$')  # punctuation at either end of a token


def preprocess_text(answer, stop_word_set=None):
    """Normalize one answer string for topic modelling.

    Steps: lower-case, trim, remove digits, drop stop words, strip punctuation,
    and re-join the surviving tokens with single spaces.

    Stop words are matched both before and after punctuation is stripped, so
    contractions such as "don't" are removed instead of surviving as the
    noise token "dont".

    Args:
        answer: Text to clean (a ``str``).
        stop_word_set: Optional collection of lower-case stop words. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    if stop_word_set is None:
        stop_word_set = stop_words

    # Treat the typographic apostrophe like the ASCII one so contractions match
    # the stop-word list; it would be stripped as punctuation otherwise anyway.
    text = answer.lower().strip().replace("\u2019", "'")
    text = _DIGITS_RE.sub('', text)

    kept = []
    for raw in text.split():
        # Check the token with inner apostrophes intact ("don't," -> "don't").
        if _EDGE_PUNCT_RE.sub('', raw) in stop_word_set:
            continue
        cleaned = _PUNCT_RE.sub('', raw)
        # Skip tokens that were pure punctuation or are plain stop words.
        if cleaned and cleaned not in stop_word_set:
            kept.append(cleaned)

    return " ".join(kept)

# Clean every answer with preprocess_text, preserving the order of `answers`.
processed_texts = list(map(preprocess_text, answers))

In [20]:
# Sentence-embedding backbone used by BERTopic to vectorize the documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# calculate_probabilities=True is what makes fit_transform return `probs`,
# which the next cell uses to pick each document's top topics.
topic_model = BERTopic(
    embedding_model=embedding_model,
    calculate_probabilities=True,
    verbose=True,
)

# Fit on the cleaned texts; results are in the same order as `processed_texts`.
topics, probs = topic_model.fit_transform(processed_texts)

2025-09-17 12:45:00,360 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2025-09-17 12:45:26,237 - BERTopic - Embedding - Completed ✓
2025-09-17 12:45:26,237 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-09-17 12:45:30,732 - BERTopic - Dimensionality - Completed ✓
2025-09-17 12:45:30,734 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-17 12:45:36,125 - BERTopic - Cluster - Completed ✓
2025-09-17 12:45:36,132 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-17 12:45:36,296 - BERTopic - Representation - Completed ✓


In [21]:
# Per-topic probabilities are required below; fail early with a clear message.
if probs is None:
    raise ValueError("probs is None. Make sure calculate_probabilities=True when initializing BERTopic.")

top_n = 3
top_cols = [f"topic_{i+1}" for i in range(top_n)]

# For every document, collect the positions of its top_n highest probabilities
# (best first).
top_topics = []
for prob in probs:
    if prob is None or len(prob) == 0:
        top_topics.append([None] * top_n)
        continue
    top_idx = np.argsort(prob)[::-1][:top_n].tolist()
    # Pad when fewer than top_n topics exist, so every row has top_n entries
    # and the DataFrame constructor does not fail on a column-count mismatch.
    top_idx.extend([None] * (top_n - len(top_idx)))
    top_topics.append(top_idx)

# The model was fitted only on rows with a non-null "assistant" value, so the
# probability rows map onto exactly those rows of sample_df, in order.
# Concatenating by position would shift topics onto the wrong rows whenever an
# answer was missing.
answered_index = sample_df.index[sample_df["assistant"].notna()]
if len(answered_index) != len(top_topics):
    raise ValueError(
        f"Got {len(top_topics)} probability rows for {len(answered_index)} non-null answers; "
        "re-run the sampling and topic-model cells so they are in sync."
    )

# Nullable integers keep topic ids as ints even when some cells are missing.
df_top = pd.DataFrame(top_topics, columns=top_cols, index=answered_index).astype("Int64")

# Left join on the original index: rows without an answer keep empty topic cells.
sample_df_with_topics = sample_df.join(df_top).reset_index(drop=True)

sample_df_with_topics.to_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train_sample_with_top3_topics.csv", index=False)
print("Saved CSV with top 3 topics")

Saved CSV with top 3 topics
