In [4]:
import re
import time
from collections import defaultdict
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models.coherencemodel import CoherenceModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk


nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/wignorbo/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wignorbo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Загрузка и предобработка данных

In [5]:
data = fetch_20newsgroups(
    subset='train', 
    remove=('headers', 'footers', 'quotes'), 
    categories=['rec.sport.baseball', 'comp.sys.mac.hardware', 'sci.med', 'talk.politics.mideast'],
)
documents = data.data

In [8]:
stop_words = set(stopwords.words('english'))

def preprocess(texts):
    processed = []
    for doc in texts:
        tokens = word_tokenize(re.sub(r'\W+', ' ', doc.lower()))
        filtered = [word for word in tokens if word not in stop_words and len(word) > 2]
        processed.append(filtered)
    return processed

processed_docs = preprocess(documents)
processed_docs = [doc for doc in processed_docs if len(doc) > 0]

## Построение словаря и корпуса

In [9]:
dictionary = defaultdict()
dictionary.default_factory = lambda: len(dictionary)
vocab_size = 0

corpus = []
for doc in processed_docs:
    bow = []
    for word in doc:
        idx = dictionary[word]
        bow.append(idx)
    corpus.append(bow)
    vocab_size = max(vocab_size, max(bow) + 1)

## Обучение модели

In [11]:
from lda import LDA


index_to_word = {v: k for k, v in dictionary.items()}

lda_custom = LDA(n_topics=10, n_iter=20)
start_time = time.time()
lda_custom.fit(corpus, vocab_size)
end_time = time.time()

training_time_custom = end_time - start_time
print("Время обучения (custom):", training_time_custom)

topics_custom = lda_custom.get_vocabulary(index_to_word)
for i, words in enumerate(topics_custom):
    print(f"Тема {i}: {', '.join(words)}")

Время обучения (custom): 59.181934118270874
Тема 0: two, well, think, day, government, right, armenia, year, three, anything
Тема 1: jews, israeli, also, jewish, state, greek, human, food, water, rights
Тема 2: health, medical, research, 1993, information, new, university, number, disease, center
Тема 3: people, could, see, years, say, come, think, way, even, came
Тема 4: mac, com, keyboard, scsi, software, used, memory, hardware, monitor, disk
Тема 5: armenian, armenians, one, turkish, said, killed, children, went, first, anti
Тема 6: israel, people, would, genocide, turks, world, right, could, please, give
Тема 7: one, know, problem, use, apple, better, system, still, team, last
Тема 8: would, get, like, time, also, back, something, good, going, much
Тема 9: edu, may, turkey, know, game, soon, cancer, win, first, san


## Оценка когерентности тем

In [13]:
from gensim.corpora import Dictionary


gensim_dictionary = Dictionary(processed_docs)

coherence_model_custom = CoherenceModel(
    topics=topics_custom, 
    texts=processed_docs, 
    dictionary=gensim_dictionary, 
    coherence='c_v',
)
coherence_custom = coherence_model_custom.get_coherence()
print("Когерентность (custom):", coherence_custom)

Когерентность (custom): 0.4995167082720335


## Сравнение с sklearn

In [18]:
from gensim import corpora


vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = vectorizer.fit_transform(documents)

start_time_sk = time.time()
lda_sklearn = LatentDirichletAllocation(n_components=10, random_state=42, max_iter=20)
lda_sklearn.fit(tf)
end_time_sk = time.time()

training_time_sk = end_time_sk - start_time_sk
print("Время обучения (sklearn):", training_time_sk)

def get_sklearn_topics(model, feature_names, n_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(top_words)
    return topics

topics_sklearn = get_sklearn_topics(lda_sklearn, vectorizer.get_feature_names_out(), 10)

dictionary_sk = corpora.Dictionary([vectorizer.get_feature_names_out().tolist()]) 

cm_sklearn = CoherenceModel(
    topics=[
        [dictionary_sk.token2id[w] for w in topic] 
        for topic in topics_sklearn
    ],
    texts=processed_docs,
    dictionary=gensim_dictionary,
    coherence='c_v',
)
coherence_sklearn = cm_sklearn.get_coherence()
print("Когерентность (sklearn):", coherence_sklearn)

Время обучения (sklearn): 7.499375104904175
Когерентность (sklearn): 0.5740620318969729


In [19]:
for i, words in enumerate(topics_sklearn):
    print(f"Тема {i}: {', '.join(words)}")

Тема 0: edu, gordon, banks, soon, geb, pitt, intellect, don, skepticism, n3jxp
Тема 1: mac, apple, bit, card, scsi, problem, 32, use, monitor, color
Тема 2: year, game, team, 00, good, games, runs, players, hit, better
Тема 3: msg, don, water, food, like, just, know, think, adam, people
Тема 4: drive, know, just, thanks, like, does, ve, mail, apple, good
Тема 5: israel, israeli, just, think, arab, people, like, know, don, time
Тема 6: said, people, don, know, didn, went, say, just, came, told
Тема 7: edu, com, university, people, medical, cancer, patients, pain, disease, hiv
Тема 8: armenian, turkish, armenians, jews, people, turkey, government, turks, armenia, greek
Тема 9: health, use, keyboard, 1993, information, 10, number, medical, 20, edu
