In [1]:
from gensim.test.utils import common_texts
from gensim.models import TfidfModel, LdaModel, LsiModel
from gensim.corpora import Dictionary

# 一、数据加载

In [7]:
# 构建字典
common_dictionary = Dictionary(common_texts)
print("原始字典:\n{}".format(common_dictionary))
# 各个文本对应的词袋法的值(doc2bow)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
print("原始数据:\n{}".format(common_texts))
print("\n词袋法后的值:\n{}".format(common_corpus))  # 第一个文本包含三个单词，ID分别为0、1和2的单词各出现一次 共九个文本
print(f"文本数目:{len(common_texts)}")
print(f"去重后单词数目:{len(common_dictionary)}")

原始字典:
Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...>
原始数据:
[['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'], ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]

词袋法后的值:
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
文本数目:9
去重后单词数目:12


In [5]:
other_texts = [
    ['computer', 'time', 'graph', 'graph'],  # 文本1
    ['survey', 'response', 'eps'],  # 文本2
    ['human', 'system', 'computer']  # 文本3
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
print("测试数据对应的词袋法的值:\n{}".format(other_corpus))

测试数据对应的词袋法的值:
[[(0, 1), (6, 1), (10, 2)], [(3, 1), (4, 1), (8, 1)], [(0, 1), (1, 1), (5, 1)]]


# 二、TF-IDF Model

In [6]:
# 模型构建
model = TfidfModel(corpus=common_corpus)

In [8]:
# 计算Tfidf值
vectors = model[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.49182558987264147), (6, 0.49182558987264147), (10, 0.7184811607083769)]
[(3, 0.5773502691896257), (4, 0.5773502691896257), (8, 0.5773502691896257)]
[(0, 0.6282580468670046), (1, 0.6282580468670046), (5, 0.45889394536615247)]


# 三、LDA Model

In [9]:
# 模型构建&训练
model = LdaModel(common_corpus, num_topics=4)

In [10]:
# 模型保存
model.save('./datas/lda_model.pkl')

In [11]:
# 模型加载
lda = LdaModel.load('./datas/lda_model.pkl')

In [12]:
print(lda.print_topics())

[(0, '0.224*"11" + 0.223*"10" + 0.217*"9" + 0.125*"4" + 0.027*"5" + 0.027*"7" + 0.026*"2" + 0.026*"1" + 0.026*"3" + 0.026*"0"'), (1, '0.291*"5" + 0.202*"8" + 0.113*"7" + 0.112*"1" + 0.112*"2" + 0.028*"9" + 0.025*"10" + 0.024*"3" + 0.024*"0" + 0.024*"6"'), (2, '0.159*"7" + 0.159*"6" + 0.159*"3" + 0.090*"5" + 0.090*"9" + 0.089*"4" + 0.089*"0" + 0.089*"10" + 0.019*"2" + 0.019*"11"'), (3, '0.204*"1" + 0.203*"2" + 0.203*"0" + 0.047*"9" + 0.044*"10" + 0.043*"7" + 0.043*"5" + 0.043*"11" + 0.043*"3" + 0.043*"6"')]


In [11]:
# 模型结果获取(文本向量)
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.313532), (1, 0.5485457), (2, 0.064657025), (3, 0.073265254)]
[(0, 0.0626659), (1, 0.06871439), (2, 0.5577589), (3, 0.3108608)]
[(0, 0.06260544), (1, 0.805602), (2, 0.06913189), (3, 0.062660664)]


In [12]:
# 更新模型（在当前模型基础上继续更新模型参数）
lda.update(other_corpus)

In [13]:
# 更新后模型结果获取(文本向量)
vectors = lda[other_corpus]
for vector in vectors:
    print(vector)

[(0, 0.3148391), (1, 0.5582936), (2, 0.062982015), (3, 0.06388527)]
[(0, 0.06256654), (1, 0.063109346), (2, 0.5634132), (3, 0.31091094)]
[(0, 0.06255915), (1, 0.81108934), (2, 0.06378226), (3, 0.06256926)]


# 四、Other

官网文档：https://radimrehurek.com/gensim/apiref.html

In [14]:
print("各个单词对应的主题向量:")
word_embedding_tabel = model.get_topics().T
print(type(word_embedding_tabel))
print(word_embedding_tabel.shape)
print(word_embedding_tabel)

各个单词对应的主题向量:
<class 'numpy.ndarray'>
(12, 4)
[[0.02887763 0.13959666 0.02576802 0.04239996]
 [0.02940821 0.0773494  0.12395072 0.04310027]
 [0.02941975 0.1387495  0.02632878 0.04289774]
 [0.02908398 0.07818598 0.02628603 0.20191553]
 [0.02891098 0.07813057 0.1232695  0.04290731]
 [0.02956096 0.14017616 0.22299403 0.04399477]
 [0.02925362 0.0778763  0.02600799 0.2029333 ]
 [0.02988685 0.14077409 0.0263424  0.20103562]
 [0.02891281 0.07781912 0.12405407 0.04242984]
 [0.35081893 0.01795606 0.02759609 0.04856333]
 [0.24875954 0.01697309 0.12308007 0.04421122]
 [0.13710676 0.01641307 0.12432227 0.04361112]]


In [15]:
common_dictionary.token2id

{'computer': 0,
 'human': 1,
 'interface': 2,
 'response': 3,
 'survey': 4,
 'system': 5,
 'time': 6,
 'user': 7,
 'eps': 8,
 'trees': 9,
 'graph': 10,
 'minors': 11}