数据准备与导入

In [1]:
import pandas as pd #数据表
import numpy as np    #数据处理
import re
import jieba   #中文分词
import matplotlib.pyplot as plt  #
from gensim import corpora, models   #gensim是一个自然语言处理的库
import pyLDAvis   
import pyLDAvis.gensim_models as gensimvis    #gensim的可视化库

In [2]:
# Load the Weibo sample workbook; spreadsheet column 0 is used as the row index.
df = pd.read_excel(
    'text_analysis_weibo_sample.xlsx',
    index_col=0,
)

语料预处理


In [3]:

# Text cleaning and word-segmentation helpers.

# Matches every run of characters outside U+4E00-U+9FA5 (i.e. anything that
# is not a Chinese character in that range: digits, punctuation, Latin, ...).
# Compiled once instead of being looked up on every call.
_NON_CHINESE_RE = re.compile('[^\u4e00-\u9fa5]+')

# Stop words dropped after segmentation. A frozenset is built once and gives
# O(1) membership tests (the original rebuilt a list, with a duplicate entry,
# on every call).
_STOPWORDS = frozenset([
    '的', '了', '在', '和', '是', '我', '有',
    '就', '不', '人', '都', '说', '要', '这', '也', '为', '他',
    '她', '它', '一个', '上', '去', '会', '着', '对', '吗',
])


def clean_text(text):
    """Strip non-Chinese characters, segment with jieba and drop stop words.

    Args:
        text: Raw text. Non-string values (e.g. NaN or None from a DataFrame
            column) are coerced with ``str()`` instead of raising
            ``TypeError``; their textual form contains no Chinese characters,
            so they yield an empty result.

    Returns:
        str: The remaining tokens joined by single spaces; ``''`` when
        nothing is left.
    """
    # Remove symbols, digits and every other non-Chinese character.
    processed = _NON_CHINESE_RE.sub('', str(text))
    # Word segmentation.
    words = jieba.lcut(processed)
    # Filter stop words and return a space-separated string.
    return ' '.join(w for w in words if w not in _STOPWORDS)

# 测试函数
#print(df['标题/微博内容'][0])
#print(clean_text(df['标题/微博内容'][0]))


In [4]:
# Clean and segment every post, then turn the space-separated result of
# clean_text back into a list of tokens (one list per row).
df['微博内容分词'] = (
    df['标题/微博内容']
    .astype(str)
    .apply(clean_text)
    .apply(str.split)
)
# Inspect with: df['微博内容分词']


Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5n/3vzzg7zn6p93tcm8vtx98j_80000gn/T/jieba.cache
Loading model cost 0.352 seconds.
Prefix dict has been built successfully.


LDA主题模型

In [5]:
# Toy example: a corpus made of one tokenized document.
texts = [['human', 'interface', 'computer']]

# Build the dictionary from the toy corpus and show its token -> id mapping.
dictionary = corpora.Dictionary(texts)
print(dictionary.token2id)

# Encode that same document in bag-of-words (BoW) form,
# i.e. a list of (token_id, token_count) tuples.
print(dictionary.doc2bow(texts[0]))



{'computer': 0, 'human': 1, 'interface': 2}
[(0, 1), (1, 1), (2, 1)]


In [6]:
# Extend the toy dictionary in place with two more tokenized documents.
# (Demo only: `dictionary` is rebuilt from the Weibo tokens in the next cell.)
dictionary.add_documents([
    ['cat', 'say', 'meow'],
    ['dog'],
])
# To inspect the updated dictionary and the BoW form of a new document:
#print(dictionary.token2id)
#print(dictionary.doc2bow(["dog","computer","non_existent_word"]))

In [7]:
# Build the dictionary from the segmented posts.
dictionary = corpora.Dictionary(df['微博内容分词'])

# Encode every segmented post as a bag-of-words vector to form the corpus.
corpus = list(map(dictionary.doc2bow, df['微博内容分词']))

# Inspect with: dictionary / corpus



In [8]:
# Train the LDA model: 5 topics, 15 passes over the corpus.
# LDA training is stochastic; a fixed random_state makes the topics (and every
# output derived from them below) reproducible on Restart & Run All.
lda_model = models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

In [9]:
# Inspect the topics: summarize each one by 5 of its words (num_words=5).
topics = lda_model.print_topics(num_words=5)
# Uncomment to print every topic summary:
#for topic in topics:
#    print(topic)

In [10]:
# Rank the topics of the first document from highest to lowest weight and
# print each one together with its top 5 keywords.
first_doc_topics = lda_model[corpus[0]]
for index, score in sorted(first_doc_topics, key=lambda pair: pair[1], reverse=True):
    summary = lda_model.print_topic(index, 5)
    print("Score: {}\t Topic: {}".format(score, summary))

Score: 0.9885679483413696	 Topic: 0.014*"电子" + 0.014*"烟" + 0.008*"郑州" + 0.008*"年" + 0.007*"亿元"
Score: 0.010956695303320885	 Topic: 0.016*"月" + 0.014*"万吨" + 0.009*"市场" + 0.008*"库存" + 0.008*"日"


In [11]:
def infer_topic(lda_model, document, id2word=None):
    """Infer the topic distribution of a single tokenized document.

    The bag-of-words vector is built with the vocabulary attached to the
    model itself rather than with whatever the global ``dictionary`` happens
    to be bound to, so token ids always match the ids the model was trained
    with.

    Args:
        lda_model: Trained model exposing ``get_document_topics`` and, when
            ``id2word`` is not given, an ``id2word`` attribute.
        document: List of tokens (an already segmented document).
        id2word: Optional vocabulary object providing ``doc2bow``; defaults
            to ``lda_model.id2word``.

    Returns:
        The result of ``lda_model.get_document_topics`` for the document's
        bag-of-words vector.

    Raises:
        AttributeError: If the vocabulary in use has no ``doc2bow`` method.
    """
    vocab = lda_model.id2word if id2word is None else id2word
    # Convert the token list into a bag-of-words vector of (word_id, word_count).
    bow = vocab.doc2bow(document)
    # Topic distribution of the document.
    return lda_model.get_document_topics(bow)


# Token lists of all documents, one list per row of the DataFrame.
documents = df['微博内容分词'].tolist()
docunments = documents  # original (misspelled) name kept as an alias for existing references

# Infer the topic distribution of every document and keep all of them;
# previously each result was overwritten by the next iteration.
doc_topic_distributions = []
for i, doc in enumerate(documents):
    doc_topics = infer_topic(lda_model, doc)
    doc_topic_distributions.append(doc_topics)
    # Uncomment to print the topic distribution of each document:
    #print(f"Document {i+1}:")
    #print(doc_topics)
    #print()
    
    

可视化

In [19]:
# Prepare the pyLDAvis data for the trained LDA model
# (useful for exploring the topics when the dataset is large).
lda_vis_data = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
)
# Render the interactive LDA visualization as the cell output.
pyLDAvis.display(lda_vis_data)