# BERTopic Training

## Dependencies

In [None]:
import pandas as pd
from bertopic import BERTopic
import os
import datetime
import warnings
from gensim import corpora, models
from sentence_transformers import SentenceTransformer
import logging
import itertools
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from src.utils.topic_diversity import topic_diversity
from collections import Counter
import nltk
from nltk.corpus import stopwords

# Silence the run first: ignore Python warnings and disable every logging
# call of severity CRITICAL and below.
warnings.filterwarnings("ignore")
logging.disable(logging.CRITICAL)

# Presumably read by the Hugging Face tokenizers backend to turn off its own
# parallelism -- TODO confirm against the installed version.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Add the project-local NLTK data directory to NLTK's resource search path.
nltk_data_path = "../data/libs/nltk_data"
nltk.data.path.append(nltk_data_path)

## Load Dataset

In [None]:
# Read the processed tweets CSV into `df` and print its column/dtype summary.
processed_tweets_csv = "../data/processed/20250515_1207_minimal_clean_merged_tweets.csv"
df = pd.read_csv(processed_tweets_csv)
df.info()

## Preparation / Config

In [None]:
# CONFIGURATION FOR SAVING
model_name = 'BERTOPIC'

# Timestamp (YYYYMMDD_HHMM) used in the results directory and output file names
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")

# Saved dir path
results_dir = f"../results/{date_today}_{model_name}"

# Create the directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)

# Number of top words per topic used for diversity export / coherence scoring
TOP_DIVERSITY_WORDS_N = 30
TOP_COHERENCE_WORDS_N = 10

# Rows without text would otherwise turn into the literal document "nan"
# once astype(str) is applied, so drop them before building the corpus.
df_text = df.dropna(subset=['final_text'])

# Sampling (set SAMPLE_SIZE >= len(df_text) to use every row)
SAMPLE_SIZE = 400000
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (replace=False), so cap the request at what is actually available.
sample_n = min(SAMPLE_SIZE, len(df_text))
df_sample = df_text.sample(n=sample_n, random_state=42).copy()
documents = df_sample['final_text'].astype(str).tolist()
print(f"Sampled {len(df_sample)} rows from full dataset.")

# Whitespace-tokenised corpus and gensim dictionary used by CoherenceModel
tokenized_texts = [doc.split() for doc in documents]
dictionary = corpora.Dictionary(tokenized_texts)

## Hyperparameter Embedding Models

In [None]:
# Candidate sentence-transformer checkpoints to compare
embedding_models = [
    'all-MiniLM-L6-v2',
    'paraphrase-MiniLM-L12-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'paraphrase-MPNet-base-v2'
]

# Store results
embedding_model_results = []
best_coherence = -1
best_embedding_model = None

# Grid search over embedding models: fit a default BERTopic per checkpoint and
# score it with c_v coherence. The loop variable is deliberately NOT called
# `model_name`, so the run-level `model_name` config constant is not clobbered.
for embedding_name in embedding_models:
    print(f"\nTesting embedding model: {embedding_name}")

    embedding_model = SentenceTransformer(embedding_name)

    # Initialize BERTopic with this embedding
    topic_model = BERTopic(embedding_model=embedding_model, verbose=False)

    topics, probs = topic_model.fit_transform(documents)
    topic_words = topic_model.get_topics()

    # Words per topic, ignoring the outlier topic (-1). Empty strings are
    # dropped: BERTopic may pad topics that have fewer words than requested
    # with "" placeholders, which are not real tokens.
    topic_words_list = []
    for topic_id, word_weights in topic_words.items():
        if topic_id == -1:
            continue
        words = [word for word, _ in word_weights if word]
        if words:
            topic_words_list.append(words)

    # Compute coherence; with no usable topics there is nothing to score.
    if topic_words_list:
        coherence_model = models.CoherenceModel(
            topics=topic_words_list,
            texts=tokenized_texts,
            dictionary=dictionary,
            coherence='c_v',
            topn=TOP_COHERENCE_WORDS_N,
        )
        coherence_score = coherence_model.get_coherence()
    else:
        coherence_score = float("nan")

    # Topic stats (outlier topic excluded)
    topic_info = topic_model.get_topic_info()
    valid_topic_info = topic_info[topic_info.Topic != -1]
    num_topics = len(valid_topic_info)
    avg_topic_size = valid_topic_info['Count'].mean()

    # Store results
    embedding_model_results.append({
        "embedding_model": embedding_name,
        "num_topics": num_topics,
        "avg_topic_size": avg_topic_size,
        "coherence_c_v": coherence_score
    })

    print(f"embedding_model={embedding_name}, num_topics={num_topics}, avg_topic_size={avg_topic_size:.2f} coherence={coherence_score:.4f}")

    # Track best (a NaN score never compares greater, so it is never selected)
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_embedding_model = embedding_name

# Summary
print("\nBest Embedding Model:")
print(f"Best Embedding Model: {best_embedding_model}")

# Save Results
df_embedding_model_hyperparameter = pd.DataFrame(embedding_model_results)
df_embedding_model_hyperparameter.to_csv(os.path.join(results_dir, f"embedding_model_hyperparameter_{date_today}.csv"), index=False)

## Hyperparameter Vectorizer

In [None]:
BEST_EMBEDDING = best_embedding_model
embedding_model = SentenceTransformer(BEST_EMBEDDING)

# Prepare stopwords: NLTK English stop words plus a project-specific list.
# sorted() gives a deterministic order, so the value written to the results
# CSV is reproducible between runs.
stopwords_list = set(stopwords.words("english"))
custom_stopwords = sorted(stopwords_list.union({
     "actually", "ago", "agree", "also", "answer", "anyone", "around", "article", "ask", "away", "back", "bad", "bit", "could", "come", "covid", "covid-19", "covid_19", "day", "damn", "disagree", "due", "else", "ever", "everyone", "example", "finally", "find", "follow", "fuck", "get", "give", "go", "good", "hah", "haha", "happen", "hear", "hell", "info", "join", "kinda", "kind", "know", "later", "leave", "less", "link", "live", "lol", "lolol", "long", "long-covid", "long_covid", "longcovid", "look", "lot", "make", "many", "may", "maybe", "much", "must", "need", "never", "new", "next", "news", "omg", "one", "ones", "people",  "ppl", "please", "post", "probably", "pretty", "quite", "read", "really", "right", "say", "see", "share", "shit", "show", "speak", "sorry", "sort", "sort-of", "still", "suck", "sure", "take", "talk", "tell", "thank", "thank-you", "thanks", "think", "thing", "time", "today", "try", "tweet", "twitter", "type", "uh", "uh-huh", "um", "update", "use", "vid", "via", "want", "way", "well", "would", "wrong", "yeah", "yep","even" ,"keep", "yet", "thread", "story", "watch", "listen", "write", "video", "comment", "piece", "start", "stop", "let", "put", "become", "seem", "great", "amazing", "interesting", "clear", "big", "huge", "point", "amp", "rt", "the", "to", "is", "are", "was", "were", "has", "have", "had", "do", "does", "did", "can", "will", "just", "going", "gonna",  "covid", "long", "you", "we", "your", "i", "he", "she", "they", "me", "us", "our", "their", "my", "his", "her", "them", "should", "this", "that", "these", "those", "some", "any", "each", "other", "another", "most", "something", "anything", "everything", "nothing", "way"
}))

# Search space: three n-gram/df settings, each with and without stop words
vectorizer_args = [
    {"ngram_range": (1, 1), "min_df": 5, "max_df": 0.95, "stop_words": custom_stopwords},
    {"ngram_range": (1, 2), "min_df": 10, "max_df": 0.85, "stop_words": custom_stopwords},
    {"ngram_range": (1, 3), "min_df": 15, "max_df": 0.75, "stop_words": custom_stopwords},
    {"ngram_range": (1, 1), "min_df": 5, "max_df": 0.95},
    {"ngram_range": (1, 2), "min_df": 10, "max_df": 0.85},
    {"ngram_range": (1, 3), "min_df": 15, "max_df": 0.75},
]

# Store results
vectorizer_nr_topics_results = []
best_coherence_vn = -1
best_vectorizer = None

# Run grid search over the vectorizer settings
for vect_arg in vectorizer_args:
    print(f"\nTesting VECTORIZER={vect_arg}")
    vectorizer_model = CountVectorizer(**vect_arg)

    topic_model = BERTopic(
        embedding_model=embedding_model,
        vectorizer_model=vectorizer_model,
        verbose=False
    )

    topics, probs = topic_model.fit_transform(documents)

    # Re-extract the representation with TOP_DIVERSITY_WORDS_N words per topic
    topic_model.update_topics(
        documents,
        vectorizer_model=vectorizer_model,
        top_n_words=TOP_DIVERSITY_WORDS_N
    )

    topic_info = topic_model.get_topic_info()
    topic_labels = topic_info.set_index("Topic")["Name"].to_dict()

    # Reset per configuration so the message below reports this run only
    # instead of a running total over all configurations.
    skipped = 0
    topic_word_data = []  # per-word rows; kept for inspection, not saved here
    topic_words_for_coherence = []

    for topic_id, word_list in topic_model.get_topics().items():
        if topic_id == -1:
            continue

        topic_label = topic_labels.get(topic_id, f"Topic {topic_id}")

        # For diversity export, save top 30
        for rank, (word, weight) in enumerate(word_list[:TOP_DIVERSITY_WORDS_N], start=1):
            topic_word_data.append({
                "topic": topic_id,
                "topic_label": topic_label,
                "word_rank": rank,
                "word": word,
                "weight": weight
            })

        # For coherence, split multi-word expressions into single tokens;
        # str.split() already discards empty/whitespace-only pieces.
        cleaned = [
            token
            for word, _ in word_list[:TOP_COHERENCE_WORDS_N]
            for token in word.split()
        ]
        if len(cleaned) >= 2:
            topic_words_for_coherence.append(cleaned)
        else:
            skipped += 1

    print(f"Skipped {skipped} topic(s) due to insufficient valid words.")

    # Compute coherence; with no usable topics there is nothing to score.
    if topic_words_for_coherence:
        coherence_model = models.CoherenceModel(
            topics=topic_words_for_coherence,
            texts=tokenized_texts,
            dictionary=dictionary,
            coherence='c_v',
            topn=TOP_COHERENCE_WORDS_N,
        )
        coherence_score = coherence_model.get_coherence()
    else:
        coherence_score = float("nan")

    valid_topic_info = topic_info[topic_info.Topic != -1]

    if not valid_topic_info.empty and "Count" in valid_topic_info.columns:
        num_topics = len(valid_topic_info)
        avg_topic_size = valid_topic_info["Count"].mean()
    else:
        num_topics = 0
        avg_topic_size = 0.0

    vectorizer_nr_topics_results.append({
        "embedding_model": BEST_EMBEDDING,
        "vectorizer_arg": vect_arg,
        "num_topics": num_topics,
        "avg_topic_size": avg_topic_size,
        "coherence_c_v": coherence_score
    })

    print(f"num_topics={num_topics}, avg_topic_size={avg_topic_size:.2f} coherence={coherence_score:.4f}")

    # Track best (a NaN score never compares greater, so it is never selected)
    if coherence_score > best_coherence_vn:
        best_coherence_vn = coherence_score
        best_vectorizer = vect_arg


# Summary
print("\nBest Vectorizer:")
print(f"Best Vectorizer Args: {best_vectorizer}")

# Save results
df_vect_nr_topics_hyperparameter = pd.DataFrame(vectorizer_nr_topics_results)
df_vect_nr_topics_hyperparameter.to_csv(
    os.path.join(results_dir, f"vectorizer_hyperparameter_{date_today}.csv"), index=False
)

## Hyperparameter UMAP & HDBSCAN

In [None]:
BEST_EMBEDDING = best_embedding_model
BEST_VECT_ARG = best_vectorizer

# Load embedding model
embedding_model = SentenceTransformer(BEST_EMBEDDING)

# Define hyperparameter search space (None = let BERTopic use its default)
umap_args = [
    None,
    {"n_neighbors": 15, "min_dist": 0.1, "n_components": 5, "metric": "cosine"},
    {"n_neighbors": 30, "min_dist": 0.0, "n_components": 10, "metric": "cosine"},
    {"n_neighbors": 10, "min_dist": 0.25, "n_components": 5, "metric": "cosine"},
]

hdbscan_args = [
    None,
    {"min_cluster_size": 30, "min_samples": 10, "metric": "euclidean", "cluster_selection_method": "eom"},
    {"min_cluster_size": 15, "min_samples": 5, "metric": "euclidean", "cluster_selection_method": "eom"},
    {"min_cluster_size": 50, "min_samples": 15, "metric": "euclidean", "cluster_selection_method": "leaf"},
]


def _args_slug(args):
    """Return a short file-name-safe tag for a hyperparameter dict.

    ``None``/empty maps to ``"default"``; otherwise ``key-value`` pairs are
    joined with underscores. This avoids putting a raw dict repr (braces,
    quotes, colons) into file names, which is invalid on Windows.
    """
    if not args:
        return "default"
    return "_".join(f"{key}-{value}" for key, value in args.items())


# Store results
umap_hdbscan_results = []
best_coherence = -1
best_umap_hdbscan = None

# Fall back to CountVectorizer defaults if the vectorizer stage found no winner
vectorizer_kwargs = BEST_VECT_ARG or {}

# Run UMAP+HDBSCAN grid search
for umap_arg, hdbscan_arg in itertools.product(umap_args, hdbscan_args):
    print(f"\nTesting UMAP={umap_arg} | HDBSCAN={hdbscan_arg}")
    run_tag = f"umap_{_args_slug(umap_arg)}_hdbscan_{_args_slug(hdbscan_arg)}"

    umap_model = UMAP(**umap_arg, random_state=42) if umap_arg else None
    hdbscan_model = HDBSCAN(**hdbscan_arg) if hdbscan_arg else None
    vectorizer_model = CountVectorizer(**vectorizer_kwargs)

    # Initialize BERTopic with fixed embedding and tuned clustering pipeline
    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        verbose=False,
    )

    # Fit model
    topics, probs = topic_model.fit_transform(documents)

    # Re-extract the representation with TOP_DIVERSITY_WORDS_N words per topic
    topic_model.update_topics(
        documents,
        vectorizer_model=vectorizer_model,
        top_n_words=TOP_DIVERSITY_WORDS_N
    )

    topic_info = topic_model.get_topic_info()
    topic_labels = topic_info.set_index("Topic")["Name"].to_dict()

    # Reset per combination so the message below reports this run only
    # instead of a running total over the whole grid.
    skipped = 0
    topic_word_data = []            # per-word rows; kept for inspection
    topic_words_for_coherence = []  # For CoherenceModel

    for topic_id, word_list in topic_model.get_topics().items():
        if topic_id == -1:
            continue

        topic_label = topic_labels.get(topic_id, f"Topic {topic_id}")

        # For diversity export save top 30
        for rank, (word, weight) in enumerate(word_list[:TOP_DIVERSITY_WORDS_N], start=1):
            topic_word_data.append({
                "topic": topic_id,
                "topic_label": topic_label,
                "word_rank": rank,
                "word": word,
                "weight": weight
            })

        # For coherence split multi-word expressions into single tokens;
        # str.split() already discards empty/whitespace-only pieces.
        cleaned = [
            token
            for word, _ in word_list[:TOP_COHERENCE_WORDS_N]
            for token in word.split()
        ]
        if len(cleaned) >= 2:
            topic_words_for_coherence.append(cleaned)
        else:
            skipped += 1

    print(f"Skipped {skipped} topic(s) due to insufficient valid words.")

    # Coherence score; with no usable topics there is nothing to score.
    if topic_words_for_coherence:
        coherence_model = models.CoherenceModel(
            topics=topic_words_for_coherence,
            texts=tokenized_texts,
            dictionary=dictionary,
            coherence='c_v',
            topn=TOP_COHERENCE_WORDS_N,
        )
        coherence_score = coherence_model.get_coherence()
    else:
        coherence_score = float("nan")

    # Topic statistics (outlier topic excluded)
    valid_topic_info = topic_info[topic_info.Topic != -1]
    num_topics = len(valid_topic_info)
    avg_topic_size = valid_topic_info['Count'].mean()

    umap_hdbscan_results.append({
        "embedding_model": BEST_EMBEDDING,
        "umap_arg": umap_arg,
        "hdbscan_arg": hdbscan_arg,
        "num_topics": num_topics,
        "avg_topic_size": avg_topic_size,
        "coherence_c_v": coherence_score
    })

    print(f"num_topics={num_topics}, avg_topic_size={avg_topic_size:.2f} coherence={coherence_score:.4f}")

    # Track best (a NaN score never compares greater, so it is never selected)
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_umap_hdbscan = (umap_arg, hdbscan_arg)

    # Checkpoint: results gathered so far, written after every combination
    temp_df_umap_hdbscan_hyperparameter = pd.DataFrame(umap_hdbscan_results)
    temp_df_umap_hdbscan_hyperparameter.to_csv(
        os.path.join(results_dir, f"hyperparameter_{run_tag}_{date_today}.csv"), index=False
    )

    # Documents per topic, largest first, with each topic's share in percent
    topic_counts = Counter(topic_model.topics_)
    sorted_counts = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
    topic_docs_distribution_df = pd.DataFrame(sorted_counts, columns=["topic", "count"])

    total_docs = sum(topic_counts.values())
    topic_docs_distribution_df["percentage"] = (
        topic_docs_distribution_df["count"] / total_docs * 100
    ).round(2)

    topic_docs_distribution_df.to_csv(
        os.path.join(results_dir, f"dist_{run_tag}_{date_today}.csv"), index=False
    )

# Summary
print("\nBest UMAP + HDBSCAN Combination (Stage 2):")
if best_umap_hdbscan is None:
    # Every combination produced a NaN/invalid coherence; nothing to report.
    print("No UMAP/HDBSCAN combination produced a valid coherence score.")
else:
    print(f"Best UMAP Args: {best_umap_hdbscan[0]}")
    print(f"Best HDBSCAN Args: {best_umap_hdbscan[1]}")

# Save results
df_umap_hdbscan_hyperparameter = pd.DataFrame(umap_hdbscan_results)
df_umap_hdbscan_hyperparameter.to_csv(os.path.join(results_dir, f"umap_hdbscan_hyperparameter_{date_today}.csv"), index=False)

### View Topic Distributions

In [None]:
# Coherence word lists for the model currently in memory. They are read from
# `topic_model` itself; `topic_words_list` is a leftover from the embedding
# stage and describes a different model.
fixed_topic_words_list = []
for topic_id, word_list in topic_model.get_topics().items():
    if topic_id == -1:
        continue  # skip outlier topic
    tokens = [token for word, _ in word_list for token in word.split()]
    # dict.fromkeys de-duplicates while keeping rank order, so the slice
    # below keeps the highest-ranked words (a set would pick arbitrary ones).
    unique_words = list(dict.fromkeys(tokens))
    if len(unique_words) >= 2:
        fixed_topic_words_list.append(unique_words[:TOP_COHERENCE_WORDS_N])

# Coherence score; with no usable topics there is nothing to score.
if fixed_topic_words_list:
    coherence_model = models.CoherenceModel(
        topics=fixed_topic_words_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()
else:
    coherence_score = float("nan")

# Topic statistics (outlier topic excluded)
topic_info = topic_model.get_topic_info()
valid_topic_info = topic_info[topic_info.Topic != -1]
num_topics = len(valid_topic_info)
avg_topic_size = valid_topic_info['Count'].mean()

print(f"num_topics={num_topics}, avg_topic_size={avg_topic_size:.2f} coherence={coherence_score:.4f}")

# NOTE: this cell only inspects the current model. It no longer appends to
# `umap_hdbscan_results`, updates `best_umap_hdbscan`, or rewrites the CSVs:
# the grid-search cell already recorded and saved this combination, so doing
# it again added a duplicate row on every run.

# Documents per topic, largest first, with each topic's share in percent
topic_counts = Counter(topic_model.topics_)
sorted_counts = sorted(topic_counts.items(), key=lambda x: x[1], reverse=True)
topic_docs_distribution_df = pd.DataFrame(sorted_counts, columns=["topic", "count"])

total_docs = sum(topic_counts.values())
topic_docs_distribution_df["percentage"] = (
    topic_docs_distribution_df["count"] / total_docs * 100
).round(2)

# View
topic_docs_distribution_df

## Reducing Outliers

In [None]:
# Build the final model from the best configuration found by the grid
# searches. After the search loop `topic_model` holds whichever combination
# ran last, which is not necessarily the best one reported in the summary.
best_umap_arg, best_hdbscan_arg = best_umap_hdbscan if best_umap_hdbscan else (None, None)

topic_model = BERTopic(
    embedding_model=embedding_model,
    # None lets BERTopic fall back to its default UMAP / HDBSCAN models
    umap_model=UMAP(**best_umap_arg, random_state=42) if best_umap_arg else None,
    hdbscan_model=HDBSCAN(**best_hdbscan_arg) if best_hdbscan_arg else None,
    vectorizer_model=CountVectorizer(**(BEST_VECT_ARG or {})),
    verbose=False,
)

# Encode once and reuse the vectors for both fitting and outlier reduction,
# instead of letting fit_transform embed the corpus and then encoding again.
embeddings = embedding_model.encode(documents, show_progress_bar=True)

topics, probs = topic_model.fit_transform(documents, embeddings=embeddings)

# Reassign outlier documents (topic -1) using the "embeddings" strategy
new_topics = topic_model.reduce_outliers(
    documents=documents,
    topics=topics,
    strategy="embeddings",
    embeddings=embeddings,
)

### View the New Document Distribution

In [None]:
# Tally how many documents each topic holds after outlier reduction
topic_counts = Counter(new_topics)
total_docs = sum(topic_counts.values())

# (topic, count) pairs ordered from the largest topic to the smallest
sorted_counts = topic_counts.most_common()

# Tabulate the counts and add each topic's share of all documents in percent
topic_docs_distribution_df = pd.DataFrame(sorted_counts, columns=["topic", "count"])
share = topic_docs_distribution_df["count"].div(total_docs).mul(100)
topic_docs_distribution_df["percentage"] = share.round(2)

# View or save
topic_docs_distribution_df

## Update the Model to Use the New Topics

In [None]:
# Vectorizer rebuilt from the winning settings of the vectorizer grid search
final_vectorizer = CountVectorizer(**BEST_VECT_ARG)

# Store the outlier-reduced assignments on the model, then recompute the topic
# representations from those assignments.
topic_model.topics_ = new_topics
topic_model.update_topics(documents, topics=new_topics, vectorizer_model=final_vectorizer)

## Get Coherence and Top Words

In [None]:
# Coherence word lists for the updated model. They are read from `topic_model`
# itself; `topic_words_list` is a leftover from the embedding stage and does
# not describe the model after outlier reduction.
fixed_topic_words_list = []
for topic_id, word_list in topic_model.get_topics().items():
    if topic_id == -1:
        continue  # skip outlier topic
    tokens = [token for word, _ in word_list for token in word.split()]
    # dict.fromkeys de-duplicates while keeping rank order, so the slice
    # below keeps the highest-ranked words (a set would pick arbitrary ones).
    unique_words = list(dict.fromkeys(tokens))
    if len(unique_words) >= 2:
        fixed_topic_words_list.append(unique_words[:TOP_COHERENCE_WORDS_N])

# Coherence score; with no usable topics there is nothing to score.
if fixed_topic_words_list:
    coherence_model = models.CoherenceModel(
        topics=fixed_topic_words_list,
        texts=tokenized_texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()
else:
    coherence_score = float("nan")

coherence_score

## Update to 30 words per topic

In [None]:
# Re-extract the topic representations with TOP_DIVERSITY_WORDS_N words each.
# The tuned vectorizer is passed explicitly: per BERTopic's update_topics
# documentation, omitting `vectorizer_model` falls back to a default
# CountVectorizer, which would discard the selected stop words / n-gram range.
topic_model.update_topics(
    documents,
    top_n_words=TOP_DIVERSITY_WORDS_N,
    vectorizer_model=CountVectorizer(**(BEST_VECT_ARG or {})),
)

# Persist the fitted model in the results directory
topic_model.save(os.path.join(results_dir, f"bertopic_model_{date_today}_30_words"))

## Diversity Score

In [None]:
# Score topic diversity at several top-word cut-offs
top_n_values = [5, 10, 20, 30]
diversity_score_results = []
for top_n in top_n_values:
    diversity_score = topic_diversity(topic_model, top_n=top_n, model_type='bertopic')
    print(f"top_n: {top_n} topic_diversity: {diversity_score}")
    diversity_score_results.append(dict(top_n=top_n, topic_diversity=diversity_score))

# One row per cut-off, written next to the other results of this run
diversity_csv_path = os.path.join(results_dir, f"bertopic_topic_diversity_scores_{date_today}.csv")
df_diversity = pd.DataFrame(diversity_score_results)
df_diversity.to_csv(diversity_csv_path, index=False)

## Save All Model Outputs and Config

In [None]:
# Get top N words and weights per topic from BERTopic
topic_info_df = topic_model.get_topic_info()
topic_labels = topic_info_df.set_index('Topic')['Name'].to_dict()

topic_word_data = []
for topic_num, word_list in topic_model.get_topics().items():
    if topic_num == -1:
        continue  # Skip outlier topic
    topic_label = topic_labels.get(topic_num, f"Topic {topic_num}")
    for rank, (word, weight) in enumerate(word_list[:TOP_DIVERSITY_WORDS_N], start=1):
        topic_word_data.append({
            "topic": topic_num,
            "topic_label": topic_label,
            "word_rank": rank,
            "word": word,
            "weight": weight
        })

# Convert to DataFrame; explicit columns keep the frame well-formed (and the
# preview below working) even when there are no non-outlier topics.
df_topic_words = pd.DataFrame(
    topic_word_data,
    columns=["topic", "topic_label", "word_rank", "word", "weight"],
)

# Save to CSV
topic_words_filename = os.path.join(results_dir, f"bertopic_top{TOP_DIVERSITY_WORDS_N}_words_per_topic_{date_today}_real30.csv")
df_topic_words.to_csv(topic_words_filename, index=False)

print(f"Top {TOP_DIVERSITY_WORDS_N} words per topic saved to: {topic_words_filename}")

# Print Sample Preview (one pass via groupby; groups come out in topic order)
print(f"\nTop {TOP_DIVERSITY_WORDS_N} Words per Topic (BERTopic):")
for topic_num, topic_rows in df_topic_words.groupby('topic', sort=True):
    topic_label = topic_rows['topic_label'].iloc[0]
    words = topic_rows['word'].tolist()
    print(f"{topic_label} (Topic {topic_num}): {', '.join(words)}")


# Summary statistics come from the topic info fetched at the top of this cell,
# i.e. the final model, not from a `topic_info` left behind by an earlier cell.
valid_topic_info = topic_info_df[topic_info_df.Topic != -1]
num_topics = len(valid_topic_info)
avg_topic_size = valid_topic_info['Count'].mean()

print(num_topics, avg_topic_size)

# Unpack defensively: the grid search may have found no valid combination.
best_umap_arg, best_hdbscan_arg = best_umap_hdbscan if best_umap_hdbscan else (None, None)

# Column is spelled "embedding_model" (was "emebdding_model"), matching the
# other result CSVs written by this notebook.
summary = [{
    "num_topics": num_topics,
    "avg_topic_size": avg_topic_size,
    "coherence_score": coherence_score,
    "embedding_model": best_embedding_model,
    "best_umap": best_umap_arg,
    "best_hdbscan": best_hdbscan_arg,
    "best_vectorizer": best_vectorizer,
}]
df_summary = pd.DataFrame(summary)
df_summary.to_csv(os.path.join(results_dir, f"summary_bertopic_topic_scores_{date_today}.csv"), index=False)

# Save the topic visualisation as a standalone HTML file
fig = topic_model.visualize_topics()
vis_html_path = os.path.join(
    results_dir, f"original_number_topics_visualization_{date_today}.html"
)
fig.write_html(vis_html_path)
print(f"📊 Saved visualization for original_number topics to: {vis_html_path}")