# 注意：

如何获取单词与单词之间的相关性？或者说获取单词与单词之间的相似性？

In [3]:
import gensim
import jieba

In [16]:
# Toy corpus: nine short English document titles.
documents = [
    "Human machine interface for lab abc computer applications",
    "A survey of user opinion of computer system response time",
    "The EPS user interface management system",
    "System and human system engineering testing of EPS",
    "Relation of user perceived response time to error measurement",
    "The generation of random binary unordered trees",
    "The intersection graph of paths in trees",
    "Graph minors IV Widths of trees and well quasi ordering",
    "Graph minors A survey",
]

# Tokenize every document with jieba and keep only tokens longer than one
# character (this drops whitespace tokens and single letters such as "A").
texts = [
    [token for token in jieba.cut(document) if len(token) > 1]
    for document in documents
]

In [17]:
# Show the token list produced for each document, one list per line.
for tokens in texts:
    print(tokens)

['Human', 'machine', 'interface', 'for', 'lab', 'abc', 'computer', 'applications']
['survey', 'of', 'user', 'opinion', 'of', 'computer', 'system', 'response', 'time']
['The', 'EPS', 'user', 'interface', 'management', 'system']
['System', 'and', 'human', 'system', 'engineering', 'testing', 'of', 'EPS']
['Relation', 'of', 'user', 'perceived', 'response', 'time', 'to', 'error', 'measurement']
['The', 'generation', 'of', 'random', 'binary', 'unordered', 'trees']
['The', 'intersection', 'graph', 'of', 'paths', 'in', 'trees']
['Graph', 'minors', 'IV', 'Widths', 'of', 'trees', 'and', 'well', 'quasi', 'ordering']
['Graph', 'minors', 'survey']


In [29]:
# Train a Word2Vec model on the tokenized documents.
# sentences: the given sequence of tokenized texts
# size: dimensionality of the resulting word vectors
# min_count=1: presumably keeps every token, even ones seen only once (verify against the gensim docs)
# window: context window size
# workers: number of CPU cores used for training, i.e. the degree of parallelism
# NOTE(review): gensim 4.x renamed the `size` keyword to `vector_size`; this call
# presumably targets gensim 3.x -- confirm the installed version before running.
model = gensim.models.Word2Vec(sentences=texts, size=100, min_count=1, window=3, workers=2)

In [30]:
# Persist the model: save it under the relative path 'word2vec.mm'
model.save('word2vec.mm')

In [31]:
# Load the model: read it back from the relative path 'word2vec.mm' into a new object
model2 = gensim.models.Word2Vec.load('word2vec.mm')

In [34]:
# Using the model
# 1. Inspect word-to-word similarity: list the 10 words most similar to `word`
word = "Graph"
print("和单词{}最相似的单词已经相似度为：".format(word))
sim_word_list = model2.wv.most_similar(word, topn=10)
# Each entry is a (similar word, similarity score) pair.
for neighbour, score in sim_word_list:
    print("相似单词为:{}, 相似度为:{}".format(neighbour, score))

和单词Graph最相似的单词已经相似度为：
相似单词为:minors, 相似度为:0.19433052837848663
相似单词为:trees, 相似度为:0.17099346220493317
相似单词为:perceived, 相似度为:0.1704540103673935
相似单词为:for, 相似度为:0.12178187817335129
相似单词为:unordered, 相似度为:0.11858190596103668
相似单词为:to, 相似度为:0.08818459510803223
相似单词为:quasi, 相似度为:0.06915955990552902
相似单词为:applications, 相似度为:0.06531854718923569
相似单词为:paths, 相似度为:0.06520514190196991
相似单词为:of, 相似度为:0.061371441930532455


In [39]:
# Using the model
# 2. Convert the text data into feature-matrix form
# a. Build the dictionary from the tokenized documents
dictionary = gensim.corpora.Dictionary(texts)
# b. Convert each document to its bag-of-words representation
corpus = list(map(dictionary.doc2bow, texts))
# c. Fit a TF-IDF model on the bag-of-words counts
tfidf = gensim.models.TfidfModel(corpus=corpus)
# d. Re-express the corpus in TF-IDF form
corpus_tfidf = tfidf[corpus]
# e. Preview the result: print the first six TF-IDF documents
count = 0
for doc in corpus_tfidf:
    print(doc)
    count += 1
    if count >= 6:
        break
# f. Train gensim's built-in LDA topic model
# NOTE(review): LDA is trained on the TF-IDF weights here rather than on the raw
# bag-of-words counts in `corpus` -- confirm that this is intended.
lda = gensim.models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=10)
# g. Print the LDA topics
print(lda.print_topics(5))

[(0, 0.37967204593042664), (1, 0.37967204593042664), (2, 0.37967204593042664), (3, 0.25989885074199909), (4, 0.37967204593042664), (5, 0.25989885074199909), (6, 0.37967204593042664), (7, 0.37967204593042664)]
[(3, 0.36534835293964857), (8, 0.19697923755694743), (9, 0.53371746832234968), (10, 0.36534835293964857), (11, 0.36534835293964857), (12, 0.26685873416117484), (13, 0.36534835293964857), (14, 0.26685873416117484)]
[(5, 0.41758763104062496), (12, 0.30501548925628524), (14, 0.30501548925628524), (15, 0.41758763104062496), (16, 0.30501548925628524), (17, 0.61003097851257049)]
[(8, 0.080759315951762001), (12, 0.21881827845311883), (15, 0.29957759440488085), (18, 0.43763655690623765), (19, 0.29957759440488085), (20, 0.43763655690623765), (21, 0.43763655690623765), (22, 0.43763655690623765)]
[(8, 0.073984529194696252), (10, 0.27444644643304716), (13, 0.27444644643304716), (14, 0.20046191723835088), (23, 0.40092383447670177), (24, 0.40092383447670177), (25, 0.40092383447670177), (26, 0.4