## Обучаем LDA-модель

In [None]:
from topic_theme_lda import LDAModel
from utils_lda import preprocess_text
from gensim import corpora
from tqdm import tqdm
import pandas as pd

# Minimum text length, in characters.
text_len_threshold = 100
# Number of topics to compute.
num_topics = 10

### Собираем данные из таблицы

In [None]:
# Register tqdm's progress_apply on pandas objects before it is used below.
tqdm.pandas()

# Load the posts table, keeping only the text and publication columns.
data = pd.read_csv('../resources/post.csv')[['description', 'published']]
data['published'] = pd.to_datetime(data['published'], format="%Y-%m-%d %H:%M:%S")

# Replace missing descriptions with empty strings so len() can be applied,
# then keep only rows whose text has at least text_len_threshold characters.
data['description'] = data['description'].fillna('')
data = data[data['description'].map(len) >= text_len_threshold]

# Preprocess every remaining description, showing a progress bar.
# Positional args: min_word_len=4, exclude_hashtags=False, lemmatize=True
data['lemmas'] = data['description'].progress_apply(
    preprocess_text,
    args=(4, False, True),
)

### Обучаем новую модель

In [None]:
# Use only the first 10000 preprocessed texts for training.
text_data = data['lemmas'].iloc[:10000].tolist()

# Build the gensim dictionary and the bag-of-words corpus from those texts.
dictionary = corpora.Dictionary(text_data)
corpus = list(map(dictionary.doc2bow, text_data))

# NOTE(review): start_date/end_date are taken from the whole of `data`, while
# the corpus holds only the first 10000 documents — confirm this is intended.
lda_model = LDAModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=15,
    start_date=data['published'].min(),
    end_date=data['published'].max(),
)

### Предсказание топика по документу

In [None]:
# Take the first retained description as the sample document and show it.
document = data['description'].iloc[0]
print(document)

# Ask the model for the topics of the raw text, using the same preprocessing
# settings (min_word_len, exclude_hashtags, lemmatize) as the training data.
topics = lda_model.predict_raw_document_topic(
    document,
    min_word_len=4,
    exclude_hashtags=False,
    lemmatize=True,
)

# Report the first returned entry (its id and weight), then print that topic.
print('Topic id: {0}, weight: {1}'.format(*topics[0]))
lda_model.print_topic(topics[0][0])

## Визуализация

In [None]:
import pyLDAvis

# Newer pyLDAvis releases (3.x) ship the gensim helpers as
# ``pyLDAvis.gensim_models``; older ones use ``pyLDAvis.gensim``. Try the new
# name first and fall back to the old one so the cell runs on either.
try:
    import pyLDAvis.gensim_models as gensim_vis
except ImportError:
    import pyLDAvis.gensim as gensim_vis

# Prepare the visualisation data from the trained model, corpus and dictionary.
# sort_topics=False asks pyLDAvis not to reorder the topics.
lda_display = gensim_vis.prepare(lda_model, corpus, dictionary, sort_topics=False)

# Hand the prepared data to pyLDAvis for display.
pyLDAvis.show(lda_display)