# Лабораторная работа №4: Латентное размещение Дирихле (LDA)

## Загрузка и предобработка данных (20 Newsgroups)

In [3]:
import nltk
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')

# Загрузка датасета
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
texts = newsgroups.data

# Предобработка
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z ]', ' ', text)
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words and len(token) > 2]
    return ' '.join(tokens)

texts_clean = [preprocess(doc) for doc in texts]

# Преобразование в мешок слов
vectorizer = CountVectorizer(max_features=2000)
X = vectorizer.fit_transform(texts_clean)

print(f'Размер корпуса: {X.shape}')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...


Размер корпуса: (18846, 2000)


## Обучение и анализ: Ручная реализация LDA

In [4]:
import time
from lda_manual import LDAManual

n_topics = 10
n_iter = 100

lda_manual = LDAManual(n_topics=n_topics, n_iter=n_iter, alpha=0.1, beta=0.01, random_state=42)

start = time.time()
lda_manual.fit(X)
manual_time = time.time() - start

# Получаем топ-10 слов для каждой темы
feature_names = vectorizer.get_feature_names_out()
top_words_manual = lda_manual.get_top_words(feature_names, n_top_words=10)

for idx, words in enumerate(top_words_manual):
    print(f'Тема {idx+1}:', ', '.join(words))

print(f'Время обучения (ручная реализация): {manual_time:.2f} секунд')

Итерация 10/100 завершена
Итерация 20/100 завершена
Итерация 30/100 завершена
Итерация 40/100 завершена
Итерация 50/100 завершена
Итерация 60/100 завершена
Итерация 70/100 завершена
Итерация 80/100 завершена
Итерация 90/100 завершена
Итерация 100/100 завершена
Тема 1: drive, card, do, system, disk, thanks, would, use, window, know
Тема 2: key, government, president, system, public, use, information, chip, state, number
Тема 3: god, jesus, one, christian, say, believe, christ, bible, man, belief
Тема 4: armenian, people, gun, state, said, government, israel, right, child, war
Тема 5: max, space, nasa, earth, disease, system, bhj, patient, medical, stephanopoulos
Тема 6: one, would, get, like, time, car, know, good, back, could
Тема 7: would, people, think, one, like, know, get, make, right, say
Тема 8: one, book, would, people, group, church, article, word, also, time
Тема 9: file, edu, image, window, program, use, com, ftp, available, version
Тема 10: game, team, year, player, play, wi

## Обучение и анализ: LDA из sklearn

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

lda_sklearn = LatentDirichletAllocation(n_components=n_topics, max_iter=10, learning_method='batch', random_state=42)

start = time.time()
lda_sklearn.fit(X)
sklearn_time = time.time() - start

# Получаем топ-10 слов для каждой темы
top_words_sklearn = []
for topic_idx, topic in enumerate(lda_sklearn.components_):
    top = topic.argsort()[::-1][:10]
    top_words_sklearn.append([feature_names[i] for i in top])

for idx, words in enumerate(top_words_sklearn):
    print(f'Тема {idx+1}:', ', '.join(words))

print(f'Время обучения (sklearn): {sklearn_time:.2f} секунд')

Тема 1: window, drive, do, card, system, disk, problem, use, bit, work
Тема 2: space, program, please, thanks, would, information, anyone, know, application, window
Тема 3: armenian, state, year, new, people, war, muslim, turkish, russian, american
Тема 4: god, game, one, jesus, christian, team, would, year, church, player
Тема 5: would, people, key, government, law, right, gun, think, one, know
Тема 6: max, israel, israeli, arab, bhj, giz, jew, medical, disease, palestinian
Тема 7: one, would, people, think, may, like, say, many, thing, make
Тема 8: one, get, like, time, would, know, back, said, say, could
Тема 9: car, new, price, one, like, good, would, sale, get, also
Тема 10: file, edu, image, program, com, available, ftp, version, use, graphic
Время обучения (sklearn): 47.10 секунд


## Оценка когерентности тем и сравнение результатов

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Преобразуем тексты в список токенов
texts_tokens = [doc.split() for doc in texts_clean]
dictionary = Dictionary(texts_tokens)
corpus = [dictionary.doc2bow(text) for text in texts_tokens]

manual_topics = top_words_manual
cm_manual = CoherenceModel(topics=manual_topics, texts=texts_tokens, dictionary=dictionary, coherence='c_v')
coh_manual = cm_manual.get_coherence()
print(f'Когерентность (ручная реализация): {coh_manual:.4f}')

sklearn_topics = top_words_sklearn
cm_sklearn = CoherenceModel(topics=sklearn_topics, texts=texts_tokens, dictionary=dictionary, coherence='c_v')
coh_sklearn = cm_sklearn.get_coherence()
print(f'Когерентность (sklearn): {coh_sklearn:.4f}')

Когерентность (ручная реализация): 0.5897
Когерентность (sklearn): 0.5307
