# Latent Semantic Analysis (LSA) Training

## Dependencies

In [None]:
import pandas as pd
import itertools
import datetime
from gensim import corpora, models
import os
import matplotlib.pyplot as plt
from src.utils.topic_diversity import topic_diversity
import warnings

# Install a blanket "ignore" filter: every warning category is silenced for
# the rest of the session (presumably to keep cell output readable).
warnings.simplefilter("ignore")

## Load Dataset

In [None]:
# Read the processed tweets CSV into a DataFrame, then print its
# column / dtype / memory overview.
processed_tweets_csv = "../data/processed/20250516_1955_clean_merged_tweets.csv"
df = pd.read_csv(processed_tweets_csv)
df.info()

## Preparation / Config

In [None]:
# CONFIGURATION FOR SAVING
model_name = 'LSA'

# Timestamp of this run in YYYYMMDD_HHMM format; it tags the results directory
# and the artifact file names built from it.
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")

# Saved dir path
results_dir = f"../results/{date_today}_{model_name}"

# Create the directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)

# Set Top-N of words
TOP_DIVERSITY_WORDS_N = 30
TOP_COHERENCE_WORDS_N = 10

# Tokenize on whitespace.
# Missing values are replaced with "" first: str(NaN) is "nan", which would
# otherwise inject a spurious "nan" token into the vocabulary. Such rows become
# empty token lists, so row positions stay aligned with `df`.
df['tokenized_content'] = df['final_text'].fillna("").apply(lambda x: str(x).split())
texts = df['tokenized_content'].tolist()

## Hyperparameter Filter Extremes (no_below, no_above, keep_n)

In [None]:
import copy

# Grid search over Dictionary.filter_extremes settings.
# For every (no_below, no_above, keep_n) combination: prune the vocabulary,
# build BOW -> TF-IDF, train an LSI model with a fixed topic count and score it
# with c_v coherence. The best-scoring combination is kept in
# `best_filter_extremes`, and all trials are written to a CSV in `results_dir`.

# Parameter grid
no_below_values = [2, 5, 10, 50, 100]
no_above_values = [0.5, 0.7, 0.9, 0.95]
keep_n_values = [30000, 50000, 70000, 90000]

# Fixed model settings
NUM_TOPICS = 16

# Store results
filter_extremes_hyperparameter = []
best_coherence = -1
best_filter_extremes = None

# Scan the tokenized texts once. The unfiltered vocabulary is identical for
# every trial, so it is built a single time and copied per trial instead of
# re-reading the whole corpus on each iteration.
base_dictionary = corpora.Dictionary(texts)

for no_below, no_above, keep_n in itertools.product(no_below_values, no_above_values, keep_n_values):
    # filter_extremes mutates the dictionary in place, so work on a fresh copy
    dictionary = copy.deepcopy(base_dictionary)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)

    # Create corpus (BOW)
    corpus_bow = [dictionary.doc2bow(doc) for doc in texts]

    # Create TF-IDF model
    tfidf_model = models.TfidfModel(corpus_bow)
    corpus_tfidf = tfidf_model[corpus_bow]

    # Train LSI model on TF-IDF corpus
    lsi_model = models.LsiModel(
        corpus=corpus_tfidf,
        id2word=dictionary,
        num_topics=NUM_TOPICS
    )

    # Compute coherence
    coherence_model = models.CoherenceModel(model=lsi_model, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
    coherence_score = coherence_model.get_coherence()

    # Store result
    filter_extremes_hyperparameter.append({
        "no_below": no_below,
        "no_above": no_above,
        "keep_n": keep_n,
        "coherence": coherence_score,
        "dictionary_size": len(dictionary),
        "num_topics": NUM_TOPICS
    })

    print(f"no_below={no_below}, no_above={no_above}, keep_n={keep_n}, dict_size={len(dictionary)} coherence={coherence_score:.4f}")

    # Track best (strict '>' keeps the first combination on ties)
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_filter_extremes = (no_below, no_above, keep_n)

# Summary
print("\nBest Filtering Parameters:")
print(f"Best no_below: {best_filter_extremes[0]}, Best no_above: {best_filter_extremes[1]}, Best keep_n: {best_filter_extremes[2]}, Best Coherence Score: {best_coherence:.4f}")

# Save Results
df_filter_extremes_hyperparameter = pd.DataFrame(filter_extremes_hyperparameter)
df_filter_extremes_hyperparameter.to_csv(os.path.join(results_dir, f"filter_extremes_hyperparameter_lsa_{date_today}.csv"), index=False)
print(f"Filter extremes results saved in: {results_dir}")

## Create Dict Based on Best Filter Params

In [None]:
# Unpack the selected filter_extremes settings (no_below, no_above, keep_n)
BEST_NO_BELOW, BEST_NO_ABOVE, BEST_KEEP_N = best_filter_extremes

# Rebuild the vocabulary from the tokenized texts and prune it with those settings
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(
    no_below=BEST_NO_BELOW,
    no_above=BEST_NO_ABOVE,
    keep_n=BEST_KEEP_N,
)

# Bag-of-words vector for each tokenized document
corpus_bow = [dictionary.doc2bow(tokens) for tokens in texts]

# Fit TF-IDF on the BOW corpus, then apply it to that same corpus
tfidf_model = models.TfidfModel(corpus_bow)
corpus_tfidf = tfidf_model[corpus_bow]

## Hyperparameter Num Topics (num_topics)

In [None]:
# Grid search over the number of LSI topics, scored by c_v coherence.

# Candidate topic counts: 2 through 26 inclusive
num_topics_range = list(range(2, 27))

# Every trial is logged; the best one seen so far is tracked alongside
topic_num_tuning_results = []
best_coherence = -1
best_num_topics = None

# Values that do not change between trials but are recorded on every row
fixed_columns = {
    "no_below": BEST_NO_BELOW,
    "no_above": BEST_NO_ABOVE,
    "keep_n": BEST_KEEP_N,
    "dictionary_size": len(dictionary),
}

for num_topics in num_topics_range:
    # Fit LSI on the TF-IDF corpus with this topic count
    lsi_model = models.LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics)

    # Score the fitted topics
    coherence_model = models.CoherenceModel(
        model=lsi_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    # Log the trial
    topic_num_tuning_results.append(
        {"num_topics": num_topics, "coherence": coherence_score, **fixed_columns}
    )
    print(f"num_topics={num_topics} coherence={coherence_score:.4f}")

    # Strictly-greater comparison: the first topic count wins on ties
    if coherence_score > best_coherence:
        best_coherence, best_num_topics = coherence_score, num_topics

# Report the winner
print("\nBest Number of Topics:")
print(f"Best num_topics: {best_num_topics}, Best Coherence Score: {best_coherence:.4f}")

# Persist all trials as CSV in the results directory
df_topic_num_tuning = pd.DataFrame(topic_num_tuning_results)
num_topics_csv = os.path.join(results_dir, f"num_topics_hyperparameter_lsa_{date_today}.csv")
df_topic_num_tuning.to_csv(num_topics_csv, index=False)
print(f"LSA num_topics tuning results saved in: {results_dir}")

### Visualize num_topics Grid Search

In [None]:
# Line chart of c_v coherence against the number of topics tried,
# drawn on the current axes of the current figure.
ax = plt.gca()
ax.plot(df_topic_num_tuning["num_topics"], df_topic_num_tuning["coherence"], marker='o')
ax.set_xlabel("Number of Topics")
ax.set_ylabel("Coherence Score (c_v)")
ax.set_title("LSA Topic Coherence vs Number of Topics")
ax.grid(True)
ax.figure.tight_layout()
plt.show()

## Hyperparameter Best Params (chunksize, decay, power_iters)

In [None]:
# Grid search over LsiModel training settings (chunksize, decay, power_iters)
# with the vocabulary filter and topic count already fixed; scored by c_v coherence.
BEST_NUM_TOPICS = best_num_topics

# Candidate values for each setting
chunksize_values = [20000, 50000, 70000, 100000]
decay_values = [0.3, 0.5, 1.0]
power_iters_values = [2, 5, 10]

# Every trial is logged; the best one seen so far is tracked alongside
lsa_param_tuning_results = []
best_coherence = -1
best_lsa_params = None

# Values that do not change between trials but are recorded on every row
constant_fields = {
    "no_below": BEST_NO_BELOW,
    "no_above": BEST_NO_ABOVE,
    "keep_n": BEST_KEEP_N,
    "num_topics": BEST_NUM_TOPICS,
    "dictionary_size": len(dictionary),
}

for candidate in itertools.product(chunksize_values, decay_values, power_iters_values):
    chunksize, decay, power_iters = candidate

    # Fit LSI on the TF-IDF corpus with this combination
    lsi_model = models.LsiModel(
        corpus=corpus_tfidf,
        id2word=dictionary,
        num_topics=BEST_NUM_TOPICS,
        chunksize=chunksize,
        decay=decay,
        power_iters=power_iters,
    )

    # Score the fitted topics
    coherence_model = models.CoherenceModel(
        model=lsi_model,
        texts=texts,
        dictionary=dictionary,
        coherence='c_v',
        topn=TOP_COHERENCE_WORDS_N,
    )
    coherence_score = coherence_model.get_coherence()

    # Log the trial
    lsa_param_tuning_results.append({
        "chunksize": chunksize,
        "decay": decay,
        "power_iters": power_iters,
        "coherence": coherence_score,
        **constant_fields,
    })
    print(f"chunksize={chunksize}, decay={decay}, power_iters={power_iters} coherence={coherence_score:.4f}")

    # Strictly-greater comparison: the first combination wins on ties
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_lsa_params = candidate

# Report the winner
print("\nBest LSA Parameters:")
print(f"Best chunksize: {best_lsa_params[0]}, Best decay: {best_lsa_params[1]}, Best power_iters: {best_lsa_params[2]}, Best Coherence Score: {best_coherence:.4f}")

# Persist all trials as CSV in the results directory
df_lsa_param_tuning = pd.DataFrame(lsa_param_tuning_results)
lsa_param_csv = os.path.join(results_dir, f"lsa_param_tuning_{date_today}.csv")
df_lsa_param_tuning.to_csv(lsa_param_csv, index=False)
print(f"LSA parameter tuning results saved in: {results_dir}")

## Final Model

In [None]:
# Train the final LSI model with the selected settings, score it, and write the
# model, dictionary, corpus, topic/word table, document/topic table, diversity
# scores and a one-row summary into `results_dir`.
BEST_NO_BELOW = best_filter_extremes[0]
BEST_NO_ABOVE = best_filter_extremes[1]
BEST_KEEP_N = best_filter_extremes[2]
BEST_NUM_TOPICS = best_num_topics
BEST_CHUNKSIZE = best_lsa_params[0]
BEST_DECAY = best_lsa_params[1]
BEST_POWER_ITERS = best_lsa_params[2]

# Train final LSA model
lsi_model = models.LsiModel(
    corpus=corpus_tfidf,
    id2word=dictionary,
    num_topics=BEST_NUM_TOPICS,
    chunksize=BEST_CHUNKSIZE,
    decay=BEST_DECAY,
    power_iters=BEST_POWER_ITERS
)

# Compute coherence score
coherence_model = models.CoherenceModel(model=lsi_model, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
coherence_score = coherence_model.get_coherence()
print(f"🎯 Final model coherence (c_v): {coherence_score:.4f}")

# Save model and artifacts
lsi_model.save(os.path.join(results_dir, f"lsa_model_{date_today}.gensim"))
dictionary.save(os.path.join(results_dir, f"lsa_dictionary_{date_today}.dict"))
corpora.MmCorpus.serialize(os.path.join(results_dir, f"lsa_corpus_{date_today}.mm"), corpus_bow)

# Save topic-word distributions to CSV (one row per topic/word pair)
topics = lsi_model.show_topics(num_topics=BEST_NUM_TOPICS, num_words=TOP_DIVERSITY_WORDS_N, formatted=False)
topic_word_list = [
    {"topic": topic_num, "word": word, "weight": weight}
    for topic_num, topic_words in topics
    for word, weight in topic_words
]

df_topics = pd.DataFrame(topic_word_list)
df_topics.to_csv(os.path.join(results_dir, f"lsa_topic_word_distributions_{date_today}.csv"), index=False)

# Save document-topic distributions.
# The model above was trained on the TF-IDF corpus, so documents are projected
# from that same TF-IDF representation rather than from raw BOW counts; feeding
# raw counts would put the inputs on a different scale than the training data.
doc_topics = []
for i, topic_dist in enumerate(lsi_model[corpus_tfidf]):
    row = {"doc_id": i}
    row.update({f"topic_{topic_id}": weight for topic_id, weight in topic_dist})
    doc_topics.append(row)

df_doc_topics = pd.DataFrame(doc_topics)
df_doc_topics.to_csv(os.path.join(results_dir, f"lsa_document_topic_distributions_{date_today}.csv"), index=False)

# Save final summary
summary = {
    "no_below": BEST_NO_BELOW,
    "no_above": BEST_NO_ABOVE,
    "num_topics": BEST_NUM_TOPICS,
    "chunksize": BEST_CHUNKSIZE,
    "decay": BEST_DECAY,
    "power_iters": BEST_POWER_ITERS,
    "keep_n": BEST_KEEP_N,
    "coherence_score": coherence_score,
    "dictionary_size": len(dictionary),
    "num_documents": len(corpus_bow),
}

# Diversity Scores (computed by the project helper for several top-N cut-offs)
top_n_values = [5, 10, 20, 30]
diversity_score_results = []
for top_n in top_n_values:
    diversity_score = topic_diversity(lsi_model, top_n=top_n, model_type='lsa')
    diversity_score_results.append({"top_n": top_n, "topic_diversity": diversity_score})
    print(f"top_n: {top_n} topic_diversity: {diversity_score}")

df_diversity = pd.DataFrame(diversity_score_results)
df_diversity.to_csv(os.path.join(results_dir, f"topic_diversity_scores_{date_today}.csv"), index=False)

# Fold each diversity score into the summary as its own column
for row in diversity_score_results:
    summary[f"diversity_score_top{row['top_n']}"] = row["topic_diversity"]

# Save
pd.DataFrame([summary]).to_csv(os.path.join(results_dir, f"lsa_model_summary_{date_today}.csv"), index=False)
print(f"Final LSA model, topics, and summaries saved to: {results_dir}")

### View Top-30 Words per Topic

In [None]:
# Fetch the top-N words of every topic as (topic_id, [(word, weight), ...]) pairs
topics = lsi_model.show_topics(num_topics=BEST_NUM_TOPICS, num_words=TOP_DIVERSITY_WORDS_N, formatted=False)

# One row per (topic, word); rank starts at 1 and follows the returned order
topic_word_data = [
    {"topic": topic_num, "word_rank": rank, "word": word, "weight": weight}
    for topic_num, word_list in topics
    for rank, (word, weight) in enumerate(word_list, start=1)
]
df_topic_words = pd.DataFrame(topic_word_data)

# Write the table to the results directory
topic_words_filename = os.path.join(results_dir, f"lsa_top{TOP_DIVERSITY_WORDS_N}_words_per_topic_{date_today}.csv")
df_topic_words.to_csv(topic_words_filename, index=False)

print(f"Top {TOP_DIVERSITY_WORDS_N} words per topic saved to: {topic_words_filename}")

# Print each topic's words (weights omitted) on a single line
print(f"\nTop {TOP_DIVERSITY_WORDS_N} Words per Topic:")
for topic_num, word_list in topics:
    joined_words = ', '.join(word for word, _ in word_list)
    print(f"Topic {topic_num}: {joined_words}")