In [1]:
import numpy as np
from time import time
from scipy.sparse import csr_matrix
from tqdm import tqdm

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

In [2]:
def get_topics_sklearn(model, feature_names, n_top_words=10):
    topics = []
    for comp in model.components_:
        top_idx = np.argsort(comp)[::-1][:n_top_words]
        topics.append([feature_names[i] for i in top_idx])
    return topics


def compute_coherence(topics, corpus_bow, dictionary, texts):
    cm_umass = CoherenceModel(
        topics=topics, corpus=corpus_bow, dictionary=dictionary, coherence="u_mass"
    )
    cm_cv = CoherenceModel(
        topics=topics, texts=texts, dictionary=dictionary, coherence="c_v"
    )
    return cm_umass.get_coherence(), cm_cv.get_coherence()

In [None]:
class LDA_EM:
    def __init__(self, n_topics, n_iter=30, alpha=0.1, beta=0.01, random_state=42):
        self.K = n_topics
        self.n_iter = n_iter
        self.alpha = alpha
        self.beta = beta
        np.random.seed(random_state)

    def fit(self, X):
        X = csr_matrix(X, dtype=np.float64)
        D, V = X.shape

        self.phi = np.random.dirichlet(np.ones(V), self.K)
        self.theta = np.random.dirichlet(np.ones(self.K), D)

        for _ in tqdm(range(self.n_iter), desc="EM iterations"):
            n_wt = np.zeros((self.K, V))
            n_td = np.zeros((D, self.K))

            rows, cols = X.nonzero()
            data = X.data
            for idx, cnt in enumerate(data):
                d, w = rows[idx], cols[idx]
                p = self.phi[:, w] * self.theta[d]
                p /= p.sum()
                n_wt[:, w] += cnt * p
                n_td[d] += cnt * p

            self.phi = n_wt + self.beta
            self.phi /= self.phi.sum(axis=1, keepdims=True)

            self.theta = n_td + self.alpha
            self.theta /= self.theta.sum(axis=1, keepdims=True)

        return self

    def transform(self, X, n_iter=20):
        X = csr_matrix(X, dtype=np.float64)
        D, _ = X.shape
        theta_new = np.random.dirichlet(np.ones(self.K), D)

        for _ in range(n_iter):
            n_td = np.zeros_like(theta_new)
            rows, cols = X.nonzero()
            data = X.data
            for idx, cnt in enumerate(data):
                d, w = rows[idx], cols[idx]
                p = self.phi[:, w] * theta_new[d]
                p /= p.sum()
                n_td[d] += cnt * p

            theta_new = n_td + self.alpha
            theta_new /= theta_new.sum(axis=1, keepdims=True)

        return theta_new

    def get_topics(self, feature_names, n_top_words=10):
        topics = []
        for k in range(self.K):
            top_idx = np.argsort(self.phi[k])[::-1][:n_top_words]
            topics.append([feature_names[i] for i in top_idx])
        return topics

In [14]:
data = fetch_20newsgroups(
    subset="train", remove=("headers", "footers", "quotes")
).data

In [None]:
texts = [doc.lower().split() for doc in data]
dictionary = Dictionary(texts)
corpus_bow = [dictionary.doc2bow(txt) for txt in texts]

In [None]:
vectorizer = CountVectorizer(
    min_df=5, max_df=0.5, stop_words="english", max_features=2000
)
X = vectorizer.fit_transform(data)
feat_names = vectorizer.get_feature_names_out()

In [18]:
start = time()
lda_em = LDA_EM(n_topics=10, n_iter=50, alpha=0.1, beta=0.01)
lda_em.fit(X)
print(f"Time: {time() - start:.2f}s")

topics_em = lda_em.get_topics(feat_names, n_top_words=10)
umass_em, cv_em = compute_coherence(topics_em, corpus_bow, dictionary, texts)
print("Topics (EM):", topics_em)
print(f"Coherence UMass: {umass_em:.4f}, C_V: {cv_em:.4f}\n")

EM iterations: 100%|██████████| 50/50 [02:54<00:00,  3.48s/it]

Time: 174.17s





Topics (EM): [['good', 'time', 'year', 'better', 'like', 'just', 'years', 'db', 'best', 'make'], ['drive', 'windows', 'thanks', 'use', 'card', 'does', 'problem', 'dos', 'disk', 'scsi'], ['10', '00', '20', '15', '25', '12', '14', '50', '16', '11'], ['game', 'team', 'games', 'play', 'season', 'hockey', 'league', 'players', 'cx', 'period'], ['key', 'use', 'government', 'law', 'gun', 'public', 'encryption', 'used', 'chip', 'people'], ['god', 'people', 'does', 'jesus', 'believe', 'say', 'true', 'think', 'life', 'bible'], ['people', 'armenian', 'israel', 'state', 'states', 'said', 'armenians', 'war', 'new', 'turkish'], ['edu', 'com', 'file', 'space', 'available', 'information', 'program', 'mail', 'use', 'ftp'], ['ax', 'max', 'g9v', 'b8f', 'a86', 'pl', '145', '1d9', '0t', '1t'], ['don', 'know', 'just', 'think', 'like', 'people', 'going', 've', 'want', 'did']]
Coherence UMass: -4.2270, C_V: 0.5170



In [None]:
start = time()
lda_sk = LatentDirichletAllocation(n_components=10, max_iter=50, random_state=42)
lda_sk.fit(X)
print(f"Time: {time() - start:.2f}s")

topics_sk = get_topics_sklearn(lda_sk, feat_names, n_top_words=10)
umass_sk, cv_sk = compute_coherence(topics_sk, corpus_bow, dictionary, texts)
print("Topics (sklearn):", topics_sk)
print(f"Coherence UMass: {umass_sk:.4f}, C_V: {cv_sk:.4f}")

Time: 2.23s
Topics (sklearn): [['space', 'use', 'men', 'university', 'people', 'ago', 'class', 'station', 'nasa', 'control'], ['mr', 'president', 'think', 'going', 'know', 'don', 'people', 'jobs', 'time', 'just'], ['space', 'nasa', 'orbit', 'launch', 'lunar', 'earth', 'shuttle', 'surface', '93', 'satellite'], ['just', 'car', 'like', 'think', 'know', 'people', 'don', 'problem', 've', 'use'], ['edu', 'graphics', 'data', 'software', 'image', 'ftp', 'available', 'information', 'package', 'code']]
Coherence UMass: -4.3693, C_V: 0.4994
