# Non-negative Matrix Factorization (NMF) Training

## Dependencies

In [None]:
import copy
import datetime
import itertools
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from gensim import corpora, models
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

from src.utils.topic_diversity import topic_diversity

# Install a global "ignore" filter: no warning of any category is shown from here on.
# NOTE(review): this presumably also hides scikit-learn convergence warnings raised by
# the NMF fits in this notebook — confirm that silencing them is intended.
warnings.filterwarnings("ignore")

## Load Dataset

In [None]:
# Read the cleaned, merged tweet dataset and print its column summary
processed_tweets_csv = "../data/processed/20250516_1955_clean_merged_tweets.csv"
df = pd.read_csv(processed_tweets_csv)
df.info()

## Preparation / Config

In [None]:
# CONFIGURATION FOR SAVING
model_name = 'NMF'

# Run timestamp in YYYYMMDD_HHMM format. Despite its name, `date_today` also carries
# the hour and minute; it tags the results directory and every output file name.
date_today = datetime.datetime.today().strftime("%Y%m%d_%H%M")

# Saved dir path
results_dir = f"../results/{date_today}_{model_name}"

# Create the directory if it doesn't exist
os.makedirs(results_dir, exist_ok=True)

# Number of top words per topic used for topic extraction / diversity and for coherence
TOP_DIVERSITY_WORDS_N = 30
TOP_COHERENCE_WORDS_N = 10

# Tokenize
# Missing values are replaced by an empty string first: converting them with str()
# would inject the literal token "nan" into both the token lists and the raw documents.
# Rows are kept (not dropped) so positional document ids stay aligned with `df`.
final_text = df['final_text'].fillna("").astype(str)
df['tokenized_content'] = final_text.apply(str.split)
texts = df['tokenized_content'].tolist()   # token lists consumed by gensim
docs_raw = final_text.tolist()             # raw strings consumed by TfidfVectorizer

## Hyperparameter Filter Extremes (no_below, no_above, max_feat)

In [None]:
# Parameter grid. The lists are named after gensim's filter_extremes() arguments;
# the same values are also passed to TfidfVectorizer as min_df / max_df / max_features.
no_below_values = [2, 5, 10, 50, 100]
no_above_values = [0.5, 0.7, 0.95]
max_features_values = [30000, 50000, 70000, 90000]


# Fixed model settings
NUM_TOPICS = 16
# Fixed seed so that coherence differences between grid points come from the
# filtering parameters rather than from run-to-run variation of the NMF fit.
RANDOM_STATE = 42

# The unfiltered dictionary does not depend on the grid point: scan the corpus once
# and filter a copy per iteration instead of rebuilding it for every combination.
base_dictionary = corpora.Dictionary(texts)

# Store results
filter_extremes_hyperparameter = []
best_coherence = float("-inf")
best_filter_extremes = None

for no_below, no_above, max_feat in itertools.product(no_below_values, no_above_values, max_features_values):
    print(f"\n Trying min_df={no_below}, max_df={no_above}, max_features={max_feat}")

    # TF-IDF Vectorizer
    vectorizer = TfidfVectorizer(min_df=no_below, max_df=no_above, max_features=max_feat)
    try:
        X_tfidf = vectorizer.fit_transform(docs_raw)
    except ValueError:
        print(f" Skipping: not enough terms with min_df={no_below}, max_df={no_above}")
        continue

    feature_names = vectorizer.get_feature_names_out()

    # Skip if vocabulary is too small
    if len(feature_names) < 10:
        print(f" Skipping min_df={no_below}, max_df={no_above} — vocab too small ({len(feature_names)} terms)")
        continue

    # Train NMF Model
    nmf_model = NMF(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
    W = nmf_model.fit_transform(X_tfidf)
    H = nmf_model.components_

    # Fresh, independently filtered copy of the gensim dictionary for this grid point
    dictionary = copy.deepcopy(base_dictionary)
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=max_feat)

    # Drop dictionary tokens that are not in the vectorizer vocabulary, so the
    # dictionary is restricted to terms the NMF model can actually produce.
    vocab_set = set(feature_names)
    dictionary.filter_tokens(bad_ids=[tokenid for tokenid, token in dictionary.iteritems() if token not in vocab_set])
    dictionary.compactify()

    # Generate topic words: the highest-weighted terms of every row of H
    topics = [
        [feature_names[i] for i in topic_weights.argsort()[::-1][:TOP_DIVERSITY_WORDS_N]]
        for topic_weights in H
    ]

    # Coherence evaluation
    coherence_model = models.CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
    coherence_score = coherence_model.get_coherence()

    # Store result
    filter_extremes_hyperparameter.append({
        "min_df": no_below,
        "max_df": no_above,
        "max_features": max_feat,
        "coherence": coherence_score,
        "vocab_size": len(feature_names),
        "num_topics": NUM_TOPICS
    })

    print(f" min_df={no_below}, max_df={no_above}, max_features={max_feat}, vocab={len(feature_names)}  coherence={coherence_score:.4f}")

    # Track best (a NaN coherence never compares greater, so it is never selected)
    if coherence_score > best_coherence:
        best_coherence = coherence_score
        best_filter_extremes = (no_below, no_above, max_feat)

# Save Results (done before the summary so partial results survive a failed search)
df_filter_extremes_hyperparameter = pd.DataFrame(filter_extremes_hyperparameter)
df_filter_extremes_hyperparameter.to_csv(os.path.join(results_dir, f"nmf_filter_extremes_hyperparameter_{date_today}.csv"), index=False)
print(f"Filter extremes results saved in: {results_dir}")

# Fail with an explicit message instead of a TypeError on `None[0]` further down
if best_filter_extremes is None:
    raise RuntimeError(
        "Filter-extremes grid search produced no usable result: every parameter "
        "combination was skipped or yielded an invalid coherence score."
    )

# Summary
print("\nBest Filtering Parameters:")
print(f"Best min_df: {best_filter_extremes[0]}, max_df: {best_filter_extremes[1]}, max_features: {best_filter_extremes[2]}, Coherence: {best_coherence:.4f}")

## Create Dict Based on Best Filter Params

In [None]:
# Unpack the winning (min_df, max_df, max_features) triple of the filtering grid search
BEST_MIN_DF, BEST_MAX_DF, BEST_MAX_FEATURES = best_filter_extremes

# gensim dictionary over the tokenized corpus, pruned with the best thresholds
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(
    no_below=BEST_MIN_DF,
    no_above=BEST_MAX_DF,
    keep_n=BEST_MAX_FEATURES,
)

# TF-IDF matrix and its vocabulary under the same thresholds
vectorizer = TfidfVectorizer(
    min_df=BEST_MIN_DF,
    max_df=BEST_MAX_DF,
    max_features=BEST_MAX_FEATURES,
)
X_tfidf = vectorizer.fit_transform(docs_raw)
feature_names = vectorizer.get_feature_names_out()

## Hyperparameter Num Topics (num_topics)

In [None]:
# Define range for num_topics (n_components)
num_topics_range = list(range(2, 27))

# Fixed seed so that coherence differences between topic counts are not caused by
# run-to-run variation of the NMF fit.
RANDOM_STATE = 42

# Store results
topic_num_tuning_results = []
best_coherence = float("-inf")
best_num_topics = None

# Grid search over number of topics (n_components)
for num_topics in num_topics_range:
    print(f"\n Trying num_topics (n_components) = {num_topics}")

    # Best-effort: a failing configuration is reported and skipped, not fatal
    try:
        # Train NMF model
        nmf_model = NMF(
            n_components=num_topics,
            random_state=RANDOM_STATE,
        )
        W = nmf_model.fit_transform(X_tfidf)
        H = nmf_model.components_

        # Extract top words per topic for coherence scoring
        topics = [
            [feature_names[i] for i in topic_weights.argsort()[::-1][:TOP_DIVERSITY_WORDS_N]]
            for topic_weights in H
        ]

        # Compute coherence score using Gensim
        coherence_model = models.CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
        coherence_score = coherence_model.get_coherence()

        # Store result
        topic_num_tuning_results.append({
            "num_topics": num_topics,
            "coherence": coherence_score,
            "min_df": BEST_MIN_DF,
            "max_df": BEST_MAX_DF,
            "max_features": BEST_MAX_FEATURES,
            "dictionary_size": len(dictionary),
        })

        print(f" num_topics={num_topics}  coherence={coherence_score:.4f}")

        # Track best (a NaN coherence never compares greater, so it is never selected)
        if coherence_score > best_coherence:
            best_coherence = coherence_score
            best_num_topics = num_topics

    except Exception as e:
        print(f" Skipping num_topics={num_topics} due to error: {e}")

# Save results (done before the summary so partial results survive a failed search)
df_topic_num_tuning = pd.DataFrame(topic_num_tuning_results)
df_topic_num_tuning.to_csv(os.path.join(results_dir, f"nmf_num_topics_hyperparameter_{date_today}.csv"), index=False)
print(f"Topic number tuning results saved in: {results_dir}")

# Stop here with a clear message: continuing with best_num_topics=None would only
# fail later, in the cells that depend on it.
if best_num_topics is None:
    raise RuntimeError(
        "num_topics grid search produced no usable result: every candidate failed "
        "or yielded an invalid coherence score."
    )

# Summary
print("\nBest Number of Topics:")
print(f"Best num_topics: {best_num_topics}, Best Coherence Score: {best_coherence:.4f}")

### Visualize num_topics Grid Search

In [None]:
# Coherence as a function of the number of topics, one marker per grid point
topic_counts = df_topic_num_tuning["num_topics"]
coherence_values = df_topic_num_tuning["coherence"]

plt.plot(topic_counts, coherence_values, marker='o')
plt.title("NMF Topic Coherence vs Number of Topics")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence Score (c_v)")
plt.grid(True)
plt.tight_layout()
plt.show()

## Hyperparameter Tuning: Initialization and Regularization (init, alpha_W, alpha_H, l1_ratio)

In [None]:
BEST_NUM_TOPICS = best_num_topics

# Fixed seed so that coherence differences between grid points are not caused by
# run-to-run variation of the fit (this matters most for init='random').
RANDOM_STATE = 42

#  Parameter grid
alpha_Ws = [0.0, 0.01, 0.1, 0.5]
alpha_Hs = [0.0, 0.01, 0.1, 0.5]
l1_ratios = [0.0, 0.25, 0.5, 0.75, 1.0]
init_values = ['random', 'nndsvd', 'nndsvda', 'nndsvdar']

#  Grid search
regularization_tuning_results = []
best_coherence = float("-inf")
best_params = None

for init_value, alpha_W, alpha_H, l1_ratio in itertools.product(init_values, alpha_Ws, alpha_Hs, l1_ratios):
    print(f"\n Trying init={init_value}, alpha_W={alpha_W}, alpha_H={alpha_H}, l1_ratio={l1_ratio}")

    # Best-effort: a failing configuration is reported and skipped, not fatal
    try:
        # Fit NMF
        nmf_model = NMF(
            n_components=BEST_NUM_TOPICS,
            init=init_value,
            alpha_W=alpha_W,
            alpha_H=alpha_H,
            l1_ratio=l1_ratio,
            random_state=RANDOM_STATE,
        )
        W = nmf_model.fit_transform(X_tfidf)
        H = nmf_model.components_

        # Topic-word extraction: the highest-weighted terms of every row of H
        topics = [
            [feature_names[i] for i in topic_weights.argsort()[::-1][:TOP_DIVERSITY_WORDS_N]]
            for topic_weights in H
        ]

        # Coherence Score
        coherence_model = models.CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
        coherence_score = coherence_model.get_coherence()

        # Store result
        regularization_tuning_results.append({
            "init_value": init_value,
            "alpha_W": alpha_W,
            "alpha_H": alpha_H,
            "l1_ratio": l1_ratio,
            "coherence": coherence_score,
            "min_df": BEST_MIN_DF,
            "max_df": BEST_MAX_DF,
            "max_features": BEST_MAX_FEATURES,
            "num_topics": BEST_NUM_TOPICS,
            "dictionary_size": len(dictionary)
        })

        print(f" coherence={coherence_score:.4f}")

        # Track best (a NaN coherence never compares greater, so it is never selected)
        if coherence_score > best_coherence:
            best_coherence = coherence_score
            best_params = (init_value, alpha_W, alpha_H, l1_ratio)

    except Exception as e:
        print(f" Skipping init={init_value}, Skipping alpha_W={alpha_W}, Skipping alpha_H={alpha_H}, l1_ratio={l1_ratio} due to error: {e}")


# Save results (done before the summary so partial results survive a failed search)
df_regularization_tuning = pd.DataFrame(regularization_tuning_results)
df_regularization_tuning.to_csv(os.path.join(results_dir, f"nmf_regularization_hyperparameter_{date_today}.csv"), index=False)
print(f"Stage 3 tuning results saved in: {results_dir}")

# Fail with an explicit message instead of a TypeError on `None[0]` in the summary
if best_params is None:
    raise RuntimeError(
        "Regularization grid search produced no usable result: every combination "
        "failed or yielded an invalid coherence score."
    )

#  Summary
print("\nBest Regularization Combination:")
print(f"Best init: {best_params[0]}, Best alpha_W: {best_params[1]}, alpha_H: {best_params[2]}, l1_ratio: {best_params[3]}, Coherence Score: {best_coherence:.4f}")

## Final Model

In [None]:
# Collect the winners of the three tuning stages
BEST_NO_BELOW = best_filter_extremes[0]
BEST_NO_ABOVE = best_filter_extremes[1]
BEST_MAX_FEATURES = best_filter_extremes[2]
BEST_NUM_TOPICS = best_num_topics
BEST_INIT = best_params[0]
BEST_ALPHA_W = best_params[1]
BEST_ALPHA_H = best_params[2]
BEST_L1_RATIO = best_params[3]

# Fixed seed so the saved model can be reproduced from the parameters in the summary
RANDOM_STATE = 42

print("TF-IDF matrix shape:", X_tfidf.shape)
print("Vocabulary size:", len(feature_names))

#  Train final NMF model
nmf_model = NMF(
    n_components=BEST_NUM_TOPICS,
    init=BEST_INIT,
    alpha_W=BEST_ALPHA_W,
    alpha_H=BEST_ALPHA_H,
    l1_ratio=BEST_L1_RATIO,
    random_state=RANDOM_STATE,
)
W = nmf_model.fit_transform(X_tfidf)   # document-topic weights
H = nmf_model.components_              # topic-term weights

# Rank every topic's vocabulary once and reuse the indices for both the coherence
# input and the exported weights (no per-word lookup of the term's position).
topics = []
topic_word_list = []
for topic_idx, topic_weights in enumerate(H):
    top_word_indices = topic_weights.argsort()[::-1][:TOP_DIVERSITY_WORDS_N]
    topics.append([feature_names[i] for i in top_word_indices])
    topic_word_list.extend(
        {"topic": topic_idx, "word": feature_names[i], "weight": topic_weights[i]}
        for i in top_word_indices
    )

#  Compute coherence score
coherence_model = models.CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v', topn=TOP_COHERENCE_WORDS_N)
coherence_score = coherence_model.get_coherence()
print(f"Final model coherence (c_v): {coherence_score:.4f}")

# Save topic-word distributions
df_topics = pd.DataFrame(topic_word_list)
df_topics.to_csv(os.path.join(results_dir, f"nmf_topic_word_distributions_{date_today}.csv"), index=False)

# Save document-topic distributions: one row per document (doc_id is the row
# position in W), one `topic_<k>` column per component.
df_doc_topics = pd.DataFrame(W, columns=[f"topic_{t}" for t in range(BEST_NUM_TOPICS)])
df_doc_topics.insert(0, "doc_id", range(len(df_doc_topics)))
df_doc_topics.to_csv(os.path.join(results_dir, f"nmf_document_topic_distributions_{date_today}.csv"), index=False)

# Save final model summary
summary = {
    "no_below": BEST_NO_BELOW,
    "no_above": BEST_NO_ABOVE,
    "max_features": BEST_MAX_FEATURES,
    "num_topics": BEST_NUM_TOPICS,
    "init": BEST_INIT,
    "alpha_W": BEST_ALPHA_W,
    "alpha_H": BEST_ALPHA_H,
    "l1_ratio": BEST_L1_RATIO,
    "coherence_score": coherence_score,
    "dictionary_size": len(dictionary),
    "num_documents": len(docs_raw),
    "random_state": RANDOM_STATE,
}

# Topic diversity, evaluated at several top-N cut-offs with the project helper
top_n_values = [5, 10, 20, 30]
diversity_score_results = []

for top_n in top_n_values:
    diversity_score = topic_diversity(nmf_model, top_n=top_n, model_type='nmf', feature_names=feature_names)
    diversity_score_results.append({"top_n": top_n, "topic_diversity": diversity_score})
    print(f"top_n: {top_n} topic_diversity: {diversity_score:.4f}")

df_diversity = pd.DataFrame(diversity_score_results)
df_diversity.to_csv(os.path.join(results_dir, f"topic_diversity_scores_{date_today}.csv"), index=False)

# Fold the diversity scores into the one-row summary as diversity_score_top<N> columns
for row in diversity_score_results:
    summary[f"diversity_score_top{row['top_n']}"] = row["topic_diversity"]

df_summary = pd.DataFrame([summary])
df_summary.to_csv(os.path.join(results_dir, f"nmf_model_summary_{date_today}.csv"), index=False)

# Save Model (the fitted vectorizer is stored alongside the NMF model)
joblib.dump(vectorizer, os.path.join(results_dir, f"nmf_vectorizer_{date_today}.pkl"))
joblib.dump(nmf_model, os.path.join(results_dir, f"nmf_model_{date_today}.pkl"))

print(f" Final NMF model, topics, and distributions saved to: {results_dir}")

### View Top-30 Words per Topic

In [None]:
# One record per (topic, rank): the TOP_DIVERSITY_WORDS_N highest-weighted terms of
# every row of H, ranked from 1. Topic ids in this table are 0-based.
topic_word_data = [
    {
        "topic": topic_idx,
        "word_rank": rank,
        "word": feature_names[index],
        "weight": topic_weights[index]
    }
    for topic_idx, topic_weights in enumerate(H)
    for rank, index in enumerate(topic_weights.argsort()[::-1][:TOP_DIVERSITY_WORDS_N], start=1)
]

# Convert to DataFrame
df_topic_words = pd.DataFrame(topic_word_data)

# Save to CSV
topic_words_filename = os.path.join(
    results_dir,
    f"nmf_top{TOP_DIVERSITY_WORDS_N}_words_per_topic_{date_today}.csv",
)
df_topic_words.to_csv(topic_words_filename, index=False)

print(f"Top {TOP_DIVERSITY_WORDS_N} words per topic saved to: {topic_words_filename}")

# Human-readable listing; the printed topic labels are 1-based, unlike the CSV ids
print(f"\n Top {TOP_DIVERSITY_WORDS_N} Words per Topic:")
feature_names = vectorizer.get_feature_names_out()

nmf_topics = []
for topic_number, component in enumerate(nmf_model.components_, start=1):
    ranked_indices = component.argsort()[::-1][:TOP_DIVERSITY_WORDS_N]
    joined_words = ', '.join(feature_names[i] for i in ranked_indices)
    nmf_topics.append(f"Topic {topic_number}: {joined_words}")

for topic_line in nmf_topics:
    print(topic_line)