In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer


In [3]:
import nltk
from nltk.corpus import stopwords

In [4]:
# Fetch the NLTK "stopwords" corpus so that stopwords.words("english") can be
# loaded later. The call's return value is the cell's displayed output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kartikayluthra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the processed training split and draw a reproducible working sample.
df = pd.read_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train.csv")

# Cap the sample size at the number of available rows so that a smaller input
# file does not make DataFrame.sample raise; random_state keeps the draw stable.
sample_df = df.sample(n=min(10000, len(df)), random_state=42)

# `index` must be the boolean False. The original passed the string "False",
# which is truthy, so pandas still wrote the row index as an extra column.
sample_df.to_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train_sample.csv", index=False)

# Documents for topic modelling: the non-null assistant answers, in sample order.
# NOTE: rows with a missing "assistant" value are skipped here, so `answers`
# can be shorter than `sample_df`.
answers = sample_df["assistant"].dropna().tolist()

# English stop words as a set for O(1) membership tests during preprocessing.
stop_words = set(stopwords.words("english"))

In [6]:
import re

# Pre-compiled patterns used by preprocess_text.
_DIGITS_RE = re.compile(r'\d+')
_PUNCT_RE = re.compile(r'[^\w\s]')
_PUNCT_EXCEPT_APOSTROPHE_RE = re.compile(r"[^\w\s']")


def preprocess_text(answer, stopword_set=None):
    """Lower-case, strip digits/punctuation and drop stop words from one text.

    Args:
        answer: The raw text to clean (must be a ``str``).
        stopword_set: Optional collection of stop words to remove. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The surviving tokens joined by single spaces (possibly ``""``).

    Notes:
        Stop words are matched both against the fully punctuation-stripped
        token and against the token with its apostrophes kept. Without the
        second check, contractions such as "don't" become "dont" before the
        lookup and are never recognised as stop words.
    """
    blocked = stop_words if stopword_set is None else stopword_set

    text = _DIGITS_RE.sub('', answer.lower().strip())

    kept = []
    for raw in text.split():
        token = _PUNCT_RE.sub('', raw)
        if not token:
            # The chunk was pure punctuation (e.g. "%" or "--").
            continue

        # Contraction form: normalise the typographic apostrophe, keep inner
        # apostrophes, drop all other punctuation and any surrounding quotes.
        contraction = _PUNCT_EXCEPT_APOSTROPHE_RE.sub(
            '', raw.replace('\u2019', "'")
        ).strip("'")

        if token in blocked or contraction in blocked:
            continue
        kept.append(token)

    return " ".join(kept)

# Clean every answer, preserving the order of `answers`.
processed_texts = list(map(preprocess_text, answers))

In [7]:
# Sentence-embedding model that BERTopic uses to embed the documents.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# calculate_probabilities=True is what the later top-3 step relies on: it
# iterates over `probs` row by row and ranks the values in each row.
# NOTE(review): no random seed is set anywhere in this cell, so topic ids may
# not be reproducible between runs — confirm against the BERTopic docs.
topic_model = BERTopic(
    embedding_model=embedding_model,
    verbose=True,
    calculate_probabilities=True,
)

# Fit on the cleaned answers; `topics` and `probs` are the two returned values.
topics, probs = topic_model.fit_transform(processed_texts)

2025-09-17 19:03:21,378 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2025-09-17 19:03:52,671 - BERTopic - Embedding - Completed ✓
2025-09-17 19:03:52,672 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2025-09-17 19:04:04,511 - BERTopic - Dimensionality - Completed ✓
2025-09-17 19:04:04,517 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-09-17 19:04:10,909 - BERTopic - Cluster - Completed ✓
2025-09-17 19:04:10,930 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-09-17 19:04:11,128 - BERTopic - Representation - Completed ✓


In [8]:
# Attach the three highest-probability topic ids to every modelled document
# and save the result.
if probs is None:
    raise ValueError("probs is None. Make sure calculate_probabilities=True when initializing BERTopic.")

top_n = 3
top_cols = [f"topic_{i+1}" for i in range(top_n)]

top_topics = []
for prob in probs:
    if prob is None or len(prob) == 0:
        top_topics.append([None] * top_n)
        continue
    # Indices of the largest probabilities, highest first.
    top_idx = np.argsort(prob)[::-1][:top_n].tolist()
    # Pad when the model produced fewer than `top_n` topics so every row has
    # exactly `top_n` entries and matches `top_cols`.
    top_idx += [None] * (top_n - len(top_idx))
    top_topics.append(top_idx)

df_top = pd.DataFrame(top_topics, columns=top_cols)

# The documents given to BERTopic were the NON-NULL "assistant" values of
# sample_df, so `probs` has one row per non-null answer. Join against exactly
# those rows; concatenating against the full sample would shift every topic
# after the first missing answer onto the wrong record.
answered_df = sample_df.dropna(subset=["assistant"]).reset_index(drop=True)
if len(answered_df) != len(df_top):
    raise ValueError(
        f"Row mismatch: {len(answered_df)} answered rows vs {len(df_top)} topic rows."
    )

sample_df_with_topics = pd.concat([answered_df, df_top], axis=1)

sample_df_with_topics.to_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train_sample_with_top3_topics.csv", index=False)
print("Saved CSV with top 3 topics")

Saved CSV with top 3 topics


In [9]:
# Summary of all topics: fetch the fitted model's topic table and print it.
# `topic_info` is kept because a later cell builds its id -> name lookup from
# the "Topic" and "Name" columns.
# NOTE(review): the Topic == -1 row is presumably BERTopic's outlier bucket — confirm.
topic_info = topic_model.get_topic_info()
print(topic_info)


     Topic  Count                                               Name  \
0       -1   2451                           -1_market_price_may_also   
1        0    765                                  0_ha_afọ_woof_unk   
2        1    428                     1_neutral_cordoned_pictet_nerd   
3        2    400  2_答案_使用_adyenadyen是一家总部位于荷兰的全球支付解决方案提供商为企业提供全球...   
4        3    380                            3_xbrl_associated_data_   
..     ...    ...                                                ...   
134    133     11                  133_layer_memory_computer_network   
135    134     11                        134_nine_seven_eight_oclock   
136    135     11                    135_privacy_secure_data_protect   
137    136     11              136_percent_unanimously_eighty_nearly   
138    137     10                        137_color_colors_mixing_red   

                                        Representation  \
0    [market, price, may, also, profit, could, numb...   
1    [ha, afọ, woof

In [12]:
# Lookup table: topic id -> topic name, built from the topic summary table.
topic_map = {
    topic_id: topic_name
    for topic_id, topic_name in zip(topic_info["Topic"], topic_info["Name"])
}

# Reload the sample that was saved with its top-3 topic ids.
# Note: this rebinds `df`, which previously held the full training split.
df = pd.read_csv(
    "/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train_sample_with_top3_topics.csv"
)

In [15]:
# Sanity check: list the columns of the reloaded frame before mapping topics.
print(df.columns)

Index(['system', 'user', 'assistant', 'topic_1', 'topic_2', 'topic_3'], dtype='object')


In [16]:
# Replace the numeric topic ids with the topic names from `topic_map`.
# The dtype guard makes the cell safe to re-run: after the first pass the
# columns hold name strings, and mapping those through `topic_map` again
# would turn every value into NaN.
for col in ["topic_1", "topic_2", "topic_3"]:
    if pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map(topic_map)

In [17]:
# Preview the first rows to check that topic ids were replaced by topic names.
df.head()

Unnamed: 0,system,user,assistant,topic_1,topic_2,topic_3
0,\n,"Emily purchased 8 bundles, each containing 24 ...","To find Emily's total profit, let's first calc...",34_revenue_total_profit_cost,109_tuesday_monday_saturday_made,54_book_profit_books_revenue
1,\n,A bakery sells a total of 480 muffins and cake...,"To find the total profit made by the bakery, w...",81_cakes_muffins_sold_profit,123_cakes_muffins_fracc_cake,34_revenue_total_profit_cost
2,You are a financial analyst categorizing tweet...,DigitalBridge to Acquire Majority Stake in GD ...,M&A | Investments,85_financials_investments_followed_islamic,79_markets_conditions_favor_market,88_movement_stock_premarket_shares
3,"As a finance expert, your role is to provide i...",Request:\nConvert this sentence into a valid f...,for element in list:\n element = element * 2,49_bond_bonds_chemical_energy,119_snake_oil_steam_energy,129_oil_engine_fuel_dipstick
4,\n,欧洲联盟的经济模型是怎样的？,<欧洲联盟的经济模型是一种社会市场经济模型。它强调自由市场和竞争，同时也强调社会福利和公平。...,2_答案_使用_adyenadyen是一家总部位于荷兰的全球支付解决方案提供商为企业提供全球...,9_document_type_domain_unique_identifier_techn...,76_ontario_canada_normalized_house_number


In [18]:
# Save the sample with its named topics; index=False keeps the row index out of the file.
df.to_csv("/Users/kartikayluthra/Desktop/finance-qa-/data/processed/train_sample_topics.csv", index= False)