In [8]:
import numpy as np
from collections import defaultdict
import time
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as SklearnLDA
from sklearn.model_selection import train_test_split

In [None]:
class LDAGibbs:
    def __init__(self, n_topics=10, alpha=0.1, beta=0.1, max_iter=100):
        self.n_topics = n_topics
        self.alpha = alpha
        self.beta = beta
        self.max_iter = max_iter
        self.vocab = None
        self.phi = None
        self.theta = None
        
    def fit(self, X, vocab):
        n_docs = X.shape[0]
        self.vocab = vocab
        self.vocab_size = len(vocab)
        
        self.n_dk = np.zeros((n_docs, self.n_topics)) + self.alpha
        self.n_kw = np.zeros((self.n_topics, self.vocab_size)) + self.beta
        self.n_k = np.zeros(self.n_topics) + self.vocab_size * self.beta
        
        self.z = []
        for d in range(n_docs):
            doc = X[d].tocoo()
            cols = doc.col
            data = doc.data
            word_indices = np.repeat(cols, data)
            n_words_in_doc = len(word_indices)
            
            topics_in_doc = np.random.choice(self.n_topics, size=n_words_in_doc)
            self.z.append(topics_in_doc)
            
            for w, topic in zip(word_indices, topics_in_doc):
                self.n_dk[d, topic] += 1
                self.n_kw[topic, w] += 1
                self.n_k[topic] += 1
        
        for iteration in range(self.max_iter):
            for d in range(n_docs):
                doc = X[d].tocoo()
                cols = doc.col
                data = doc.data
                word_indices = np.repeat(cols, data)
                topics_in_doc = self.z[d]
                
                for i in range(len(word_indices)):
                    w = word_indices[i]
                    old_topic = topics_in_doc[i]
                    
                    self.n_dk[d, old_topic] -= 1
                    self.n_kw[old_topic, w] -= 1
                    self.n_k[old_topic] -= 1
                    
                    p_topics = (self.n_dk[d] / (self.n_dk[d].sum() + self.n_topics * self.alpha)) * \
                               (self.n_kw[:, w] / (self.n_k + 1e-12))
                    p_topics = p_topics / p_topics.sum()
                    
                    new_topic = np.random.choice(self.n_topics, p=p_topics)
                    topics_in_doc[i] = new_topic
                    
                    self.n_dk[d, new_topic] += 1
                    self.n_kw[new_topic, w] += 1
                    self.n_k[new_topic] += 1
        
        self.phi = self.n_kw / self.n_k[:, np.newaxis]
        self.theta = (self.n_dk) / (self.n_dk.sum(axis=1)[:, np.newaxis] + 1e-12)
        return self
    
    def get_topics(self, n_words=10):
        topic_words = []
        for k in range(self.n_topics):
            top_indices = self.phi[k].argsort()[-n_words:][::-1]
            topic_words.append([self.vocab[i] for i in top_indices])
        return topic_words

In [None]:
def calculate_coherence(topics, X, vocab):
    word2id = {word: idx for idx, word in enumerate(vocab)}
    doc_count = np.zeros(len(vocab))
    co_doc_count = defaultdict(int)
    
    for i in range(X.shape[0]):
        doc = X[i].tocoo()
        words = set(doc.col)
        for w in words:
            doc_count[w] += 1
        word_list = list(words)
        for i1 in range(len(word_list)):
            for i2 in range(i1+1, len(word_list)):
                w1, w2 = word_list[i1], word_list[i2]
                if w1 > w2:
                    w1, w2 = w2, w1
                co_doc_count[(w1, w2)] += 1
    
    coherence = 0
    topic_count = 0
    for topic in topics:
        topic_words = [word for word in topic if word in word2id]
        if len(topic_words) < 2:
            continue
            
        topic_coherence = 0
        pair_count = 0
        word_ids = [word2id[w] for w in topic_words]
        
        for i in range(1, len(word_ids)):
            for j in range(i):
                w1, w2 = word_ids[j], word_ids[i]
                if w1 > w2:
                    w1, w2 = w2, w1
                count = co_doc_count.get((w1, w2), 1e-5)
                topic_coherence += np.log((count + 1e-5) / (doc_count[w2] + 1e-5))
                pair_count += 1
        
        if pair_count > 0:
            coherence += topic_coherence / pair_count
            topic_count += 1
    
    return coherence / topic_count if topic_count > 0 else 0

def preprocess_data():
    categories = [
        'comp.windows.x',
        'misc.forsale',
        'rec.autos',
        'rec.sport.hockey',
        'sci.crypt',
        'sci.electronics',
        'sci.med',
        'sci.space',
        'soc.religion.christian',
        'talk.politics.guns']
    newsgroups = fetch_20newsgroups(
        subset='all', categories=categories,
        remove=('headers', 'footers', 'quotes')
    )
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    X = vectorizer.fit_transform(newsgroups.data)
    return X, vectorizer.get_feature_names_out()

In [None]:
def compare_models(X, vocab, n_topics=10, max_iter=20):
    _, X_sample, _, _ = train_test_split(X, np.zeros(X.shape[0]), test_size=0.1, random_state=42)
    
    start = time.time()
    our_lda = LDAGibbs(n_topics=n_topics, max_iter=max_iter).fit(X_sample, vocab)
    our_time = time.time() - start
    our_topics = our_lda.get_topics(n_words=10)
    our_coherence = calculate_coherence(our_topics, X_sample, vocab)
    
    start = time.time()
    sklearn_lda = SklearnLDA(
        n_components=n_topics,
        learning_method='batch',
        max_iter=max_iter,
        random_state=42
    )
    sklearn_lda.fit(X_sample)
    sklearn_time = time.time() - start
    
    sklearn_topics = []
    for topic_weights in sklearn_lda.components_:
        top_indices = topic_weights.argsort()[-10:][::-1]
        sklearn_topics.append([vocab[i] for i in top_indices])
    
    sklearn_coherence = calculate_coherence(sklearn_topics, X_sample, vocab)
    
    return {
        'our_time': our_time,
        'our_coherence': our_coherence,
        'sklearn_time': sklearn_time,
        'sklearn_coherence': sklearn_coherence,
        'our_topics': our_topics,
        'sklearn_topics': sklearn_topics
    }

In [18]:
X, vocab = preprocess_data()

results = compare_models(X, vocab, n_topics=10, max_iter=10)

In [16]:
print(f"Ручной LDA:")
print(f"  Время обучения: {results['our_time']:.2f} сек")
print(f"  Когерентность тем: {results['our_coherence']:.4f}")
for i, topic in enumerate(results['our_topics']):
    print(f"Тема {i+1}: {', '.join(topic[:5])}")

print("\nSklearn LDA:")
print(f"  Время обучения: {results['sklearn_time']:.2f} сек")
print(f"  Когерентность тем: {results['sklearn_coherence']:.4f}")
for i, topic in enumerate(results['sklearn_topics']):
    print(f"Тема {i+1}: {', '.join(topic[:5])}")

Ручной LDA:
  Время обучения: 51.58 сек
  Когерентность тем: -2.1486
Тема 1: entry, output, use, program, file
Тема 2: god, people, don, say, think
Тема 3: right, know, like, power, think
Тема 4: server, work, using, time, use
Тема 5: file, number, program, information, oname
Тема 6: cancer, people, group, book, just
Тема 7: car, think, game, vitamin, good
Тема 8: edu, keyboard, pc, available, xfree86
Тема 9: 00, 10, government, new, 20
Тема 10: like, want, know, just, way

Sklearn LDA:
  Время обучения: 5.61 сек
  Когерентность тем: -2.4519
Тема 1: entry, use, like, xfree86, file
Тема 2: output, file, government, people, like
Тема 3: cancer, clutch, people, hiv, information
Тема 4: 00, 10, 50, 1st, 15
Тема 5: like, know, people, does, just
Тема 6: god, know, want, don, think
Тема 7: keyboard, like, pc, new, price
Тема 8: gm, game, 03, team, 02
Тема 9: 10, 00, people, right, don
Тема 10: vitamin, retinol, use, liver, time
