In [37]:
import spacy
from spacy.lang.zh import Chinese
import gensim
from gensim import corpora
import pickle
import pyLDAvis.gensim

In [38]:
# Shared spaCy Chinese language object; tokenize() calls it to split raw text into tokens.
parser = Chinese()
def tokenize(text):
    """Turn a raw string into the list of token strings fed to LDA.

    The text is run through the module-level ``parser``. Whitespace-only and
    punctuation tokens are dropped. URL-like tokens, tokens starting with
    ``@`` and number-like tokens are replaced by the placeholders ``'URL'``,
    ``'SCREEN_NAME'`` and ``'NUM'`` respectively; every other token is kept
    as its string form.

    Returns:
        list[str]: the normalized tokens, in document order.
    """
    def _normalize(tok):
        # Check order is significant: URL first, then @-mention, then number.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        if tok.like_num:
            return 'NUM'
        return str(tok)

    return [
        _normalize(tok)
        for tok in parser(text)
        if not (tok.orth_.isspace() or tok.is_punct)
    ]

In [39]:
def prepare_data_for_lda(file_path):
    """Read a UTF-8 text file and tokenize it line by line.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, each entry being the
        token list that ``tokenize`` produces for that line.
    """
    with open(file_path, encoding="utf-8") as handle:
        return [tokenize(row) for row in handle]

In [41]:
def train_lda(train_data_path, topic_num, passes=15, random_state=None):
    """Train a gensim LDA model on a line-per-document text file.

    Args:
        train_data_path: Path of the UTF-8 text file; each line is tokenized
            into one document by ``prepare_data_for_lda``.
        topic_num: Number of topics to fit.
        passes: Number of training passes over the corpus. Defaults to 15,
            the value that was previously hard-coded.
        random_state: Seed forwarded to ``LdaModel``. Pass an int to make
            training reproducible; ``None`` keeps the original unseeded
            behaviour.

    Returns:
        Tuple ``(dictionary, corpus, ldamodel)``: the gensim ``Dictionary``
        built from the tokenized documents, the bag-of-words corpus (one
        ``doc2bow`` result per document) and the trained ``LdaModel``.
    """
    text_data = prepare_data_for_lda(train_data_path)
    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=topic_num,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    return dictionary, corpus, ldamodel

In [43]:
# Train the model
TRAIN_DATA_PATH = 'prmt0020.txt'  # input text file, one document per line
NUM_TOPICS = 5  # number of LDA topics to fit
dictionary, corpus, ldamodel = train_lda(
    train_data_path=TRAIN_DATA_PATH,
    topic_num=NUM_TOPICS,
)

In [48]:
# Print every topic of the trained model, showing 10 words per topic.
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

(0, '0.047*"NUM" + 0.047*"的" + 0.033*"自律" + 0.030*"大本营" + 0.026*"班" + 0.015*"和" + 0.012*"送花" + 0.012*"你" + 0.011*"我" + 0.011*"你们"')
(1, '0.022*"老师" + 0.016*"签到" + 0.015*"NUM" + 0.015*"你" + 0.014*"打卡" + 0.013*"http" + 0.009*"刷题" + 0.008*"早安" + 0.008*"害羞" + 0.008*"是"')
(2, '0.058*"的" + 0.031*"了" + 0.023*"我" + 0.018*"是" + 0.017*"你" + 0.013*"都" + 0.010*"在" + 0.010*"撒花" + 0.009*"成绩" + 0.009*"学习"')
(3, '0.037*"的" + 0.023*"我" + 0.018*"了" + 0.015*"有" + 0.013*"会" + 0.010*"你" + 0.010*"想" + 0.010*"￼" + 0.009*"请" + 0.008*"冰冻"')
(4, '0.152*"NUM" + 0.062*"=" + 0.022*"https" + 0.022*"pagedetail" + 0.022*"param" + 0.021*"cc" + 0.021*"ay2tda" + 0.021*"mlinks" + 0.018*"unitidstr" + 0.018*"homework"')


In [44]:
# Visualization
# NOTE(review): sort_topics=False presumably keeps pyLDAvis topic order aligned
# with the model's own topic ids — confirm against the pyLDAvis documentation.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
# Last expression of the cell, so its result is rendered as the cell output.
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [46]:
# Save the model artifacts
# Use a context manager so the pickle file is flushed and closed deterministically
# instead of leaking the handle returned by a bare open().
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')
# The model file name embeds the topic count, e.g. model_5.gensim.
ldamodel.save(f'model_{NUM_TOPICS}.gensim')

In [56]:
# Test the model: reload the saved dictionary and LDA model from disk and visualize them.
# NOTE(review): this is the same file as TRAIN_DATA_PATH in the training cell,
# so the model is not being checked against held-out data.
test_data_path = "prmt0020.txt"
# Rebinds `dictionary` to the copy loaded from disk.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# Relies on NUM_TOPICS from the training cell to locate the saved model file.
ldamodel_ = gensim.models.ldamodel.LdaModel.load(f'model_{NUM_TOPICS}.gensim')
# Tokenize the file and convert each document to bag-of-words with the loaded dictionary.
text_data_ = prepare_data_for_lda(test_data_path)
corpus_ = [dictionary.doc2bow(text) for text in text_data_]
lda_display = pyLDAvis.gensim.prepare(ldamodel_, corpus_, dictionary, sort_topics=False)
# Last expression of the cell, so its result is rendered as the cell output.
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
