# Latent Dirichlet Allocation (LDA) Training

## Dependencies

In [None]:
import pandas as pd
import itertools
import datetime
from gensim import corpora, models
import os
import matplotlib.pyplot as plt
from src.utils.topic_diversity import topic_diversity
import pyLDAvis.gensim_models
import warnings

# Ignore every Python warning raised from here on (all categories, all modules).
# NOTE(review): presumably meant to keep notebook output readable during the
# long grid searches; it also hides deprecation/convergence warnings.
warnings.filterwarnings("ignore")

## Load Dataset

In [None]:
# Read the processed tweet CSV into a DataFrame, then print its
# column / dtype / memory overview.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/20250516_1955_clean_merged_tweets.csv"
)
df.info()

## Preparation / Config

In [None]:
# CONFIGURATION FOR SAVING
model_name = 'LDA'

# Timestamp of this run in YYYYMMDD_HHMM format; it tags the results directory
# and the file names of the artifacts written by the cells below.
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")

# Directory that collects the outputs of this run
results_dir = f"../results/{date_today}_{model_name}"

# Create the directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)

# Top-N word counts: words exported per topic (show_topics) and words per
# topic used when computing c_v coherence
TOP_DIVERSITY_WORDS_N = 30
TOP_COHERENCE_WORDS_N = 10

# Tokenize on whitespace.
# Missing values are replaced by the empty string first: str(NaN) is the
# literal "nan", which would otherwise enter the vocabulary as a bogus token.
# Such rows are kept as empty token lists so that document positions still
# line up with the rows of `df`.
df['tokenized_content'] = df['final_text'].fillna("").astype(str).str.split()
texts = df['tokenized_content'].tolist()

## Hyperparameter Tuning: Dictionary Filter Extremes (no_below, no_above, keep_n)

In [None]:
import copy

# Parameter grid for Dictionary.filter_extremes
no_below_values = [2, 5, 10, 50, 100]
no_above_values = [0.5, 0.7, 0.9, 0.95]
keep_n_values = [30000, 50000, 70000, 90000]


# Fixed Model Settings (held constant while the filter parameters vary)
NUM_TOPICS = 16
PASSES = 10
WORKERS = 10

# Store results
filter_extremes_hyperparameter = []
best_coherence = -1
best_filter_extremes = None

# The unfiltered dictionary depends only on `texts`, so it is built once.
# filter_extremes() mutates the dictionary in place, hence every combination
# filters its own deep copy instead of re-scanning the whole corpus.
full_dictionary = corpora.Dictionary(texts)

# Start Grid Search
for no_below, no_above, keep_n in itertools.product(no_below_values, no_above_values, keep_n_values):
    # Build the filtered dictionary for this combination
    dictionary = copy.deepcopy(full_dictionary)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)

    # gensim refuses to train LDA on an empty vocabulary; skip such a
    # combination instead of aborting the search and losing earlier results.
    if len(dictionary) == 0:
        print(f"no_below={no_below}, no_above={no_above}, keep_n={keep_n}, dict_size=0  skipped (no tokens left after filtering)")
        continue

    # Create corpus
    corpus = [dictionary.doc2bow(doc) for doc in texts]

    # Train LDA model
    lda_model = models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=NUM_TOPICS,
        passes=PASSES,
        workers=WORKERS,
        random_state=42
    )

    # Compute coherence
    coherence_model = models.CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
    coherence_score = coherence_model.get_coherence()

    # Store result
    filter_extremes_hyperparameter.append({
        "no_below": no_below,
        "no_above": no_above,
        "keep_n": keep_n,
        "coherence": coherence_score,
        "dictionary_size": len(dictionary),
        "num_topics": NUM_TOPICS,
    })

    print(f"no_below={no_below}, no_above={no_above}, keep_n={keep_n}, dict_size={len(dictionary)}  coherence={coherence_score:.4f}")

    # Track best (strict '>' keeps the first combination on ties; a NaN score
    # never compares greater and therefore never becomes the best)
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_filter_extremes = (no_below, no_above, keep_n)


# Summary
print("\nBest Filtering Parameters:")
if best_filter_extremes is None:
    print("No parameter combination produced a usable coherence score.")
else:
    print(f"Best no_below: {best_filter_extremes[0]}, Best no_above: {best_filter_extremes[1]}, Best keep_n: {best_filter_extremes[2]}, Best Coherence Score: {best_coherence:.4f}")

# Save Results
df_filter_extremes_hyperparameter = pd.DataFrame(filter_extremes_hyperparameter)
df_filter_extremes_hyperparameter.to_csv(os.path.join(results_dir, f"filter_extremes_hyperparameter_{date_today}.csv"), index=False)
print(f"Filter extremes results saved in: {results_dir} ")

# Stop here with a clear message (after the results were written) rather than
# failing later with "'NoneType' object is not subscriptable".
if best_filter_extremes is None:
    raise RuntimeError(
        "filter_extremes grid search found no valid parameter combination; "
        "check the corpus and the no_below/no_above/keep_n grids"
    )

## Create Dict Based on Best Filter Params

In [None]:
# Unpack the winning (no_below, no_above, keep_n) triple from the grid search
BEST_NO_BELOW, BEST_NO_ABOVE, BEST_KEEP_N = best_filter_extremes

# Rebuild the dictionary from all texts, apply the best filter settings,
# then convert every document to its bag-of-words representation
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(
    no_below=BEST_NO_BELOW,
    no_above=BEST_NO_ABOVE,
    keep_n=BEST_KEEP_N,
)
corpus = list(map(dictionary.doc2bow, texts))

### Hyperparameter Tuning: Number of Topics (num_topics)

In [None]:
# Candidate topic counts: 2 through 26 inclusive
num_topics_range = list(range(2, 27))

# Training settings held constant across candidates
PASSES, WORKERS = 10, 10

# One row per trained model, plus the best candidate seen so far
topic_num_tuning_results = []
best_coherence, best_num_topics = -1, None

# Train and score one LDA model per candidate topic count
for num_topics in num_topics_range:
    lda_model = models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=PASSES,
        workers=WORKERS,
        random_state=42,
    )

    # c_v coherence over the top TOP_COHERENCE_WORDS_N words of each topic
    coherence_model = models.CoherenceModel(
        model=lda_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    # Record the score together with the dictionary settings it was obtained with
    topic_num_tuning_results.append(dict(
        num_topics=num_topics,
        coherence=coherence_score,
        no_below=BEST_NO_BELOW,
        no_above=BEST_NO_ABOVE,
        keep_n=BEST_KEEP_N,
        dictionary_size=len(dictionary),
    ))

    print(f" num_topics={num_topics} coherence={coherence_score:.4f}")

    # Strictly-greater comparison: the earliest candidate wins ties
    if coherence_score > best_coherence:
        best_coherence, best_num_topics = coherence_score, num_topics

# Report the winner
print("\nBest Number of Topics:")
print(f"Best num_topics: {best_num_topics}, Best Coherence Score: {best_coherence:.4f}")

# Persist the full sweep next to the other artifacts of this run
df_topic_num_tuning = pd.DataFrame(topic_num_tuning_results)
num_topics_csv_path = os.path.join(results_dir, f"num_topics_hyperparameter_{date_today}.csv")
df_topic_num_tuning.to_csv(num_topics_csv_path, index=False)
print(f"Hyperparameter topics results saved in: {results_dir} ")

### Visualize num_topics Grid Search

In [None]:
# Line chart of c_v coherence against the number of topics from the sweep above
topic_counts = df_topic_num_tuning["num_topics"]
coherence_values = df_topic_num_tuning["coherence"]

fig, ax = plt.subplots()
ax.plot(topic_counts, coherence_values, marker='o')
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score (c_v)")
ax.set_title("LDA Topic Coherence vs Number of Topics")
ax.grid(True)
fig.tight_layout()
plt.show()

### Hyperparameter Tuning: Dirichlet Priors (alpha, eta)

In [None]:
# Topic count selected by the num_topics sweep
BEST_NUM_TOPICS = best_num_topics

# Parameter grid: document-topic prior (alpha) and topic-word prior (eta),
# each given either as a named gensim strategy or as a scalar value
alpha_values = ['symmetric', 'asymmetric', 0.01, 0.1, 0.5]
eta_values   = ['symmetric', 0.01, 0.1, 0.5]

# Training settings held constant across combinations
PASSES = 10
WORKERS = 10

# Store results
alpha_eta_tuning_results = []
best_coherence = -1
best_alpha_eta = None

# Grid search over alpha and eta
for alpha, eta in itertools.product(alpha_values, eta_values):
    # Train model
    lda_model = models.LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=BEST_NUM_TOPICS,
        passes=PASSES,
        workers=WORKERS,
        alpha=alpha,
        eta=eta,
        random_state=42
    )

    # Coherence score (c_v over the top TOP_COHERENCE_WORDS_N words per topic)
    coherence_model = models.CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
    coherence_score = coherence_model.get_coherence()

    # Store results together with the settings fixed by the earlier stages
    alpha_eta_tuning_results.append({
        "alpha": alpha,
        "eta": eta,
        "coherence": coherence_score,
        "no_below": BEST_NO_BELOW,
        "no_above": BEST_NO_ABOVE,
        "keep_n": BEST_KEEP_N,
        "num_topics": BEST_NUM_TOPICS,
        "dictionary_size": len(dictionary)
    })

    print(f" alpha={alpha}, eta={eta} coherence={coherence_score:.4f}")

    # Strict '>' keeps the first combination on ties
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_alpha_eta = (alpha, eta)

# Summary
print("\nBest Alpha/Eta Combination:")
print(f"Best alpha: {best_alpha_eta[0]}, Best eta: {best_alpha_eta[1]}, Best Coherence Score: {best_coherence:.4f}")

# Save results
df_alpha_eta_tuning = pd.DataFrame(alpha_eta_tuning_results)
df_alpha_eta_tuning.to_csv(os.path.join(results_dir, f"alpha_eta_hyperparameter_{date_today}.csv"), index=False)
# The message names this stage (it previously repeated the filter-extremes text)
print(f"Alpha/eta results saved in: {results_dir} ")

## Final Model

In [None]:
# Winning settings from the three tuning stages
BEST_NO_BELOW, BEST_NO_ABOVE, BEST_KEEP_N = best_filter_extremes
BEST_NUM_TOPICS = best_num_topics
BEST_ALPHA, BEST_ETA = best_alpha_eta
PASSES = 100
WORKERS = 10


def _artifact_path(filename):
    # Location of an output file inside this run's results directory
    return os.path.join(results_dir, filename)


# Train the final model with the tuned settings and the larger pass count
lda_model = models.LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=BEST_NUM_TOPICS,
    passes=PASSES,
    workers=WORKERS,
    alpha=BEST_ALPHA,
    eta=BEST_ETA,
    random_state=42,
)

# Score it with the same c_v setup used during tuning
coherence_model = models.CoherenceModel(
    model=lda_model,
    texts=texts,
    dictionary=dictionary,
    coherence='c_v',
    topn=TOP_COHERENCE_WORDS_N,
)
coherence_score = coherence_model.get_coherence()
print(f"🎯 Final model coherence (c_v): {coherence_score:.4f}")

# Persist the model, its dictionary and the bag-of-words corpus
lda_model.save(_artifact_path(f"lda_model_{date_today}.gensim"))
dictionary.save(_artifact_path(f"lda_dictionary_{date_today}.dict"))
corpora.MmCorpus.serialize(_artifact_path(f"lda_corpus_{date_today}.mm"), corpus)

# Topic-word distributions: one row per (topic, word) pair
topics = lda_model.show_topics(
    num_topics=BEST_NUM_TOPICS,
    num_words=TOP_DIVERSITY_WORDS_N,
    formatted=False,
)
topic_word_list = [
    {"topic": topic_num, "word": word, "weight": weight}
    for topic_num, topic_words in topics
    for word, weight in topic_words
]

df_topics = pd.DataFrame(topic_word_list)
df_topics.to_csv(_artifact_path(f"lda_topic_word_distributions_{date_today}.csv"), index=False)

# Document-topic distributions: one row per document, one column per topic
doc_topics = [
    {
        "doc_id": i,
        **{
            f"topic_{topic_id}": probability
            for topic_id, probability in lda_model.get_document_topics(doc_bow, minimum_probability=0.0)
        },
    }
    for i, doc_bow in enumerate(corpus)
]

df_doc_topics = pd.DataFrame(doc_topics)
df_doc_topics.to_csv(_artifact_path(f"lda_document_topic_distributions_{date_today}.csv"), index=False)

# Single-row run summary; diversity scores are appended further down
summary = dict(
    no_below=BEST_NO_BELOW,
    no_above=BEST_NO_ABOVE,
    keep_n=BEST_KEEP_N,
    num_topics=BEST_NUM_TOPICS,
    alpha=BEST_ALPHA,
    eta=BEST_ETA,
    passes=PASSES,
    coherence_score=coherence_score,
    dictionary_size=len(dictionary),
    num_documents=len(corpus),
)

# Topic diversity at several top-N cut-offs
top_n_values = [5, 10, 20, 30]
diversity_score_results = []
for top_n in top_n_values:
    diversity_score = topic_diversity(lda_model, top_n=top_n, model_type='lda')
    diversity_score_results.append({"top_n": top_n, "topic_diversity": diversity_score})
    print(f"top_n: {top_n} topic_diversity: {diversity_score}")

df_diversity = pd.DataFrame(diversity_score_results)
df_diversity.to_csv(_artifact_path(f"topic_diversity_scores_{date_today}.csv"), index=False)

# Fold each diversity score into the summary under its own column
summary.update({
    f"diversity_score_top{row['top_n']}": row["topic_diversity"]
    for row in diversity_score_results
})

# Write the summary as a one-row CSV
pd.DataFrame([summary]).to_csv(_artifact_path(f"lda_model_summary_{date_today}.csv"), index=False)

# Build the pyLDAvis data; sort_topics=False is passed through unchanged
lda_vis_data = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)

# Export the interactive visualization as a standalone HTML file
pyLDAvis.save_html(lda_vis_data, _artifact_path(f"lda_pyladavis_visualization_{date_today}.html"))

print(f" Final model, topics, and summaries saved to: {results_dir}")

### View Top-30 Words per Topic

In [None]:
# Top TOP_DIVERSITY_WORDS_N (word, weight) pairs for every topic of the final model
topics = lda_model.show_topics(
    num_topics=BEST_NUM_TOPICS,
    num_words=TOP_DIVERSITY_WORDS_N,
    formatted=False,
)

# Flatten into one record per (topic, word); ranks are 1-based and follow the
# order in which show_topics returned the words
topic_word_data = [
    {"topic": topic_num, "word_rank": rank, "word": word, "weight": weight}
    for topic_num, word_list in topics
    for rank, (word, weight) in enumerate(word_list, start=1)
]
df_topic_words = pd.DataFrame(topic_word_data)

# Write the ranked table next to the other artifacts of this run
topic_words_filename = os.path.join(
    results_dir,
    f"lda_top{TOP_DIVERSITY_WORDS_N}_words_per_topic_{date_today}.csv",
)
df_topic_words.to_csv(topic_words_filename, index=False)

print(f"Top {TOP_DIVERSITY_WORDS_N} words per topic saved to: {topic_words_filename}")

# Echo each topic's words (without weights) for a quick visual check
print(f"\nTop {TOP_DIVERSITY_WORDS_N} Words per Topic:")
for topic_num, word_list in topics:
    joined_words = ', '.join(word for word, _weight in word_list)
    print(f"Topic {topic_num}: {joined_words}")