In [1]:
import re
import time
from collections import defaultdict

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
try:
    import nltk
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    nltk.download('stopwords', quiet=True)
    use_nltk = True
except Exception:
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
    PorterStemmer = None
    use_nltk = False

In [3]:
news = fetch_20newsgroups(subset='train', remove=('headers','footers','quotes'))
raw_docs = news.data[:500]

if use_nltk and PorterStemmer:
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
else:
    stop_words = set(stopwords)
    stemmer = None

In [4]:
def preprocess(text):
    tokens = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
    res = []
    for t in tokens:
        if t not in stop_words and len(t)>2:
            res.append(stemmer.stem(t) if stemmer else t)
    return res
docs = [preprocess(doc) for doc in raw_docs]

In [5]:
class GibbsLDA:
    def __init__(self, K, alpha=0.1, beta=0.01, iterations=1000):
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iter = iterations

    def fit(self, docs):
        vocab = {w for doc in docs for w in doc}
        self.word2id = {w:i for i,w in enumerate(vocab)}
        V = len(vocab)
        D = len(docs)
        K = self.K

        nw = np.zeros((K, V)) + self.beta      # слово-тема
        nd = np.zeros((D, K)) + self.alpha     # док-тема
        nwsum = np.zeros(K) + V*self.beta      # всего слов в темах
        z = []  # темы для каждого слова

        for d, doc in enumerate(docs):
            z_doc = []
            for w in doc:
                wid = self.word2id[w]
                topic = np.random.randint(K)
                z_doc.append(topic)
                nw[topic, wid] += 1
                nd[d, topic] += 1
                nwsum[topic] += 1
            z.append(z_doc)

        for it in range(self.iter):
            for d, doc in enumerate(docs):
                for i, w in enumerate(doc):
                    wid = self.word2id[w]
                    t = z[d][i]
                    # удаляем
                    nw[t, wid] -= 1
                    nd[d, t] -= 1
                    nwsum[t] -= 1
                    # вероятности
                    p = (nd[d] * nw[:, wid] / nwsum)
                    p /= p.sum()
                    # новый топик
                    new_t = np.random.choice(K, p=p)
                    z[d][i] = new_t
                    nw[new_t, wid] += 1
                    nd[d, new_t] += 1
                    nwsum[new_t] += 1

        self.phi = nw / nwsum[:, None]
        self.theta = nd / nd.sum(axis=1)[:,None]

In [6]:
def compute_umass(phi, word2id, docs, top_n=10):
    id2w = {i:w for w,i in word2id.items()}
    D_w = defaultdict(int)
    D_pair = defaultdict(int)
    for doc in docs:
        uniq = set(doc)
        for w in uniq:
            D_w[w] += 1
        for w1 in uniq:
            for w2 in uniq:
                if w1 < w2:
                    D_pair[(w1,w2)] +=1
    scores = []
    K = phi.shape[0]
    for k in range(K):
        top_ids = np.argsort(phi[k])[-top_n:]
        words = [id2w[i] for i in top_ids]
        sc, cnt = 0,0
        for i in range(len(words)):
            for j in range(i):
                w1,w2 = words[i],words[j]
                pair = tuple(sorted((w1,w2)))
                Dp = D_pair.get(pair,0)
                if D_w[w2]>0:
                    sc += np.log((Dp+1)/D_w[w2])
                    cnt +=1
        scores.append(sc/cnt if cnt else 0)
    return np.mean(scores)


In [7]:
K=10; iters=50

lda_custom = GibbsLDA(K, iterations=iters)
start = time.time()
lda_custom.fit(docs)
time_custom = time.time()-start
umass_custom = compute_umass(lda_custom.phi, lda_custom.word2id, docs)

vec = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
X = vec.fit_transform([' '.join(d) for d in docs])
lda_sk = LatentDirichletAllocation(n_components=K, learning_method='online',
                                   random_state=42, doc_topic_prior=0.1,
                                   topic_word_prior=0.01, max_iter=iters)
start = time.time()
lda_sk.fit(X)
time_sk = time.time()-start
class Wrap:
    def __init__(self,m,fn):
        self.phi = m.components_ / m.components_.sum(axis=1)[:,None]
        self.word2id = {w:i for i,w in enumerate(fn)}
lda_wr = Wrap(lda_sk, vec.get_feature_names_out())
umass_sk = compute_umass(lda_wr.phi, lda_wr.word2id, docs)

In [8]:
print(f"{'Модель':<15} | {'Время (с)':<10} | {'UMass':<8}")
print('-'*40)
print(f"{'Custom LDA':<15} | {time_custom:<10.2f} | {umass_custom:<8.4f}")
print(f"{'sklearn LDA':<15} | {time_sk:<10.2f} | {umass_sk:<8.4f}")

Модель          | Время (с)  | UMass   
----------------------------------------
Custom LDA      | 48.79      | -1.1486 
sklearn LDA     | 3.53       | -1.5407 
