In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Fetch the full 20 Newsgroups corpus with headers, footers and quotes removed
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
documents = newsgroups.data
labels = newsgroups.target_names  # category names of the dataset

# TF-IDF weighted document-term matrix (fed to NMF below)
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

# Raw term-count document-term matrix (fed to LDA below)
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
feature_names_tf = tf_vectorizer.get_feature_names_out()

# Function to Display Top Words per Topic
def print_top_words(model, feature_names, n_top_words=20):
    """Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic's per-feature weights.
        feature_names: Indexable mapping a feature index to its word.
        n_top_words: Number of (word, weight) pairs to show per topic.

    Each topic is printed as ``Topic #<idx>: [(word, weight), ...]`` followed
    by a blank line, with pairs ordered from largest to smallest weight.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so slice from the end backwards to get the
        # indices of the n_top_words largest weights, largest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        # Convert to built-in float: NumPy 2 scalar reprs would otherwise
        # print every weight as ``np.float64(...)``.
        top_words = [(feature_names[i], float(topic[i])) for i in top_indices]
        print(f"Topic #{topic_idx}: {top_words}\n")

# Function to Train and Display LDA and NMF Models
def train_and_display_models(K):
    """Fit LDA and NMF with ``K`` topics and print each model's top words.

    LDA is fitted on the module-level count matrix ``tf`` and NMF on the
    module-level TF-IDF matrix ``tfidf``; both use ``random_state=42``.
    """
    # LDA on raw term counts
    print(f"\n==== LDA with {K} Topics ====")
    lda_model = LatentDirichletAllocation(n_components=K, random_state=42).fit(tf)
    print_top_words(lda_model, feature_names_tf)

    # NMF on TF-IDF weights
    print(f"\n==== NMF with {K} Topics ====")
    nmf_model = NMF(n_components=K, random_state=42).fit(tfidf)
    print_top_words(nmf_model, feature_names_tfidf)

# Fit and report both models for each number of topics
topic_counts = [10, 20, 50]
for K in topic_counts:
    train_and_display_models(K)



==== LDA with 10 Topics ====
Topic #0: [('space', np.float64(1277.911090850788)), ('00', np.float64(825.7157389676995)), ('armenian', np.float64(742.2094083471989)), ('turkish', np.float64(648.3663222737166)), ('new', np.float64(592.5094960862341)), ('earth', np.float64(524.1869527287594)), ('armenians', np.float64(442.31791818901786)), ('nasa', np.float64(383.1212481555648)), ('50', np.float64(376.20682491029476)), ('10', np.float64(370.8493499309885)), ('turkey', np.float64(368.20636674100933)), ('university', np.float64(341.55906557196295)), ('years', np.float64(318.56396745493083)), ('orbit', np.float64(310.37881520071704)), ('greek', np.float64(309.2268213829357)), ('book', np.float64(304.0090972855739)), ('moon', np.float64(298.45623651277384)), ('shuttle', np.float64(293.9604286041427)), ('genocide', np.float64(291.5348260660997)), ('000', np.float64(290.71132496188847))]

Topic #1: [('use', np.float64(2242.616621935453)), ('windows', np.float64(2222.958800316841)), ('file', np

