In [1]:
import numpy as np
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
from nltk.corpus import stopwords
import nltk
import re

In [2]:
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')

In [None]:
class MyLDA:
    def __init__(self, n_topics, n_iter=100, alpha=0.1, beta=0.01):
        self.n_topics = n_topics
        self.n_iter = n_iter
        self.alpha = alpha
        self.beta = beta

    def fit(self, X):
        n_docs, n_words = X.shape
        self.phi = np.random.dirichlet([self.beta] * n_words, self.n_topics)
        self.theta = np.random.dirichlet([self.alpha] * self.n_topics, n_docs)

        for it in range(self.n_iter):
            print(f"Iteration {it+1}/{self.n_iter}")

            # E-шаг
            p_tdw = np.zeros((n_docs, n_words, self.n_topics))
            for d in range(n_docs):
                for w in X[d].nonzero()[1]:
                    probs = self.phi[:, w] * self.theta[d, :]
                    p_tdw[d, w, :] = probs / probs.sum()

            # M-шаг
            nwt = np.zeros((self.n_topics, n_words))
            ntd = np.zeros((n_docs, self.n_topics))
            for d in range(n_docs):
                for w in X[d].nonzero()[1]:
                    count = X[d, w]
                    for t in range(self.n_topics):
                        p = p_tdw[d, w, t]
                        nwt[t, w] += count * p
                        ntd[d, t] += count * p

            # обновление
            self.phi = (nwt + self.beta - 1)
            self.phi /= self.phi.sum(axis=1, keepdims=True)

            self.theta = (ntd + self.alpha - 1)
            self.theta /= self.theta.sum(axis=1, keepdims=True)

    def get_topics(self, vocab, n_top_words=10):
        topics = []
        for t in range(self.n_topics):
            top_indices = self.phi[t].argsort()[-n_top_words:][::-1]
            topics.append([vocab[i] for i in top_indices])
        return topics


In [None]:
def preprocess_text(text):
    text = re.sub(r'\S*@\S*\s?', '', text)  #emails
    text = re.sub(r'\s+', ' ', text)  #newline chars
    text = re.sub(r"\'", "", text)  #single quotes
    text = re.sub(r'[^\w\s]', '', text, re.UNICODE) #punctuation
    return text.lower()

def calculate_coherence(topics, texts, dictionary):
    coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

def print_topics(topics):
    for i, topic in enumerate(topics):
        print(f"Тема {i+1}: {' '.join(topic)}")

In [20]:
N_TOPICS = 4
N_ITERATIONS = 120
N_TOP_WORDS = 10
MAX_FEATURES = 1000

categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups_train = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))

processed_docs = [preprocess_text(doc) for doc in newsgroups_train.data]

stop_words = stopwords.words('english')
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=MAX_FEATURES, stop_words=stop_words)
X = vectorizer.fit_transform(processed_docs)
vocab = vectorizer.get_feature_names_out()

texts_for_coherence = [[word for word in doc.split() if word in vocab] for doc in processed_docs]
dictionary = Dictionary(texts_for_coherence)

my_lda = MyLDA(n_topics=N_TOPICS, n_iter=N_ITERATIONS, alpha=0.4, beta=0.1)

start_time = time.time()
my_lda.fit(X)
my_lda_time = time.time() - start_time

my_lda_topics = my_lda.get_topics(vocab, n_top_words=N_TOP_WORDS)
my_lda_coherence = calculate_coherence(my_lda_topics, texts_for_coherence, dictionary)

sklearn_lda = LatentDirichletAllocation(n_components=N_TOPICS, max_iter=N_ITERATIONS,
                                        learning_method='batch', random_state=42)
start_time = time.time()
sklearn_lda.fit(X)
sklearn_lda_time = time.time() - start_time

sklearn_lda_topics = []
for topic_idx, topic in enumerate(sklearn_lda.components_):
    top_words_indices = topic.argsort()[:-N_TOP_WORDS - 1:-1]
    topic_words = [vocab[i] for i in top_words_indices]
    sklearn_lda_topics.append(topic_words)
sklearn_lda_coherence = calculate_coherence(sklearn_lda_topics, texts_for_coherence, dictionary)

Iteration 1/120
Iteration 2/120
Iteration 3/120
Iteration 4/120
Iteration 5/120
Iteration 6/120
Iteration 7/120
Iteration 8/120
Iteration 9/120
Iteration 10/120
Iteration 11/120
Iteration 12/120
Iteration 13/120
Iteration 14/120
Iteration 15/120
Iteration 16/120
Iteration 17/120
Iteration 18/120
Iteration 19/120
Iteration 20/120
Iteration 21/120
Iteration 22/120
Iteration 23/120
Iteration 24/120
Iteration 25/120
Iteration 26/120
Iteration 27/120
Iteration 28/120
Iteration 29/120
Iteration 30/120
Iteration 31/120
Iteration 32/120
Iteration 33/120
Iteration 34/120
Iteration 35/120
Iteration 36/120
Iteration 37/120
Iteration 38/120
Iteration 39/120
Iteration 40/120
Iteration 41/120
Iteration 42/120
Iteration 43/120
Iteration 44/120
Iteration 45/120
Iteration 46/120
Iteration 47/120
Iteration 48/120
Iteration 49/120
Iteration 50/120
Iteration 51/120
Iteration 52/120
Iteration 53/120
Iteration 54/120
Iteration 55/120
Iteration 56/120
Iteration 57/120
Iteration 58/120
Iteration 59/120
Iterat

In [21]:
results_summary = f"""
Кастом LDA
- Время обучени: {my_lda_time:.2f} сек
- Когерентность: {my_lda_coherence:.4f}
- Темы:
"""
for i, topic in enumerate(my_lda_topics):
    results_summary += f"  - Тема {i+1}: {' '.join(topic)}\n"

results_summary += f"""
scikit-learn LDA
- Время обучени: {sklearn_lda_time:.2f} сек
- Когерентность: {sklearn_lda_coherence:.4f}
- Темы:
"""
for i, topic in enumerate(sklearn_lda_topics):
    results_summary += f"  - Тема {i+1}: {' '.join(topic)}\n"
comparison_summary = f"""\
Когерентность mylda: {my_lda_coherence:.4f}\n\
Когерентность sklearn: {sklearn_lda_coherence:.4f}\n
"""

print(results_summary)
print(comparison_summary)



Кастом LDA
- Время обучени: 558.95 сек
- Когерентность: 0.2811
- Темы:
  - Тема 1: explain baptism death study hate time anyone get said problems
  - Тема 2: explain support per anyone made people posted get problems death
  - Тема 3: support per anyone posted described faq provide control database following
  - Тема 4: explain course made support people guess cant problems issue total

scikit-learn LDA
- Время обучени: 113.53 сек
- Когерентность: 0.6927
- Темы:
  - Тема 1: god one would people jesus believe us church say bible
  - Тема 2: image graphics jpeg file images available data software files also
  - Тема 3: would one dont know like im get think people could
  - Тема 4: medical health 10 disease patients cancer research 1993 information hiv

Когерентность mylda: 0.2811
Когерентность sklearn: 0.6927


