### gensim中的word2vec的使用

In [3]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np

# NOTE(review): "<ubk>" looks like a typo for "<unk>"; it is left unchanged so
# that the recorded outputs of the following cells still match.
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"], ["<ubk>"]]

# Create an untrained model, then build its vocabulary and train it explicitly.
# min_count=1: words that occur fewer than min_count times are dropped.
model = Word2Vec(min_count=1)
model.build_vocab(sentences)
# total_examples must be the number of sentences (model.corpus_count, set by
# build_vocab), not the number of words (model.corpus_total_words).
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

(1, 35)

In [4]:
# Alternative way to handle unknown words (left disabled): register a random
# vector for '<unk>' directly on the model.
# NOTE(review): 100 presumably matches the model's vector size — confirm.
# model.wv.add_vector('<unk>', np.random.randn(100))

# Rank the other vocabulary words by their similarity to 'cat'.
model.wv.most_similar(positive='cat')

[('meow', 0.13887986540794373),
 ('dog', 0.13149003684520721),
 ('woof', 0.0640898123383522),
 ('<ubk>', 0.009391184896230698),
 ('say', -0.059876278042793274)]

In [5]:
# Looking up a word that is not in the vocabulary raises KeyError.
# Catch it and print the message so this demonstration does not abort "Run All".
try:
    oov_vector = model.wv['王天赐']
except KeyError as err:
    print(f"KeyError: {err}")
else:
    # Only reached if the word is unexpectedly present in the vocabulary.
    print(oov_vector)

KeyError: "Key '王天赐' not present"

### 如何处理word2vec词表中没有的单词（OOV）

- [word2vec缺少单词怎么办？](https://www.zhihu.com/question/329708785/answer/739525740)

In [None]:
from collections import Counter
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]

# Flatten the tokenised sentences into one flat list of words, e.g. ["hello", "world"].
word_list = [token for tokens in sentences for token in tokens]

print(word_list)

# Count how often each word occurs.
word_dict = Counter()
word_dict.update(word_list)
print(word_dict)

# The vocabulary is the list of distinct words, in first-seen order.
vocab = list(word_dict.keys())
print(vocab)

In [None]:


# Placeholder token intended to stand in for out-of-vocabulary words.
UNK = '<unk>'
# Add a sentence that contains only the UNK token to the corpus.
# Guarded so that re-running this cell does not append duplicate entries.
if [UNK] not in sentences:
    sentences.append([UNK])

print(sentences)

In [None]:
# Create an untrained model, then build its vocabulary and train it explicitly.
# min_count=1: words that occur fewer than min_count times are dropped.
model = Word2Vec(min_count=1)
model.build_vocab(sentences)
# total_examples must be the number of sentences (model.corpus_count, set by
# build_vocab), not the number of words (model.corpus_total_words).
model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Show the vocabulary's key-to-index lookup of the model's word vectors.
model.wv.key_to_index

In [None]:
# Show the vocabulary itself: the list of keys, addressable by index.
model.wv.index_to_key

In [6]:
# Load a Word2Vec model that was saved at ../out_dir/word2vec.model.
# NOTE(review): gensim's load() presumably unpickles the file — only load model
# files that come from a trusted source.
weibo_model = Word2Vec.load("../out_dir/word2vec.model")

In [7]:
# Peek at the first 10 entries of the loaded model's vocabulary list.
weibo_model.wv.index_to_key[:10]

['[', ']', '/', '@', ':', '泪', '~', '嘻嘻', '#', '爱']

In [11]:
# Index the word vectors with a list of keys to fetch several vectors at once.
# The result is a numpy.ndarray (presumably one row per key — confirm the shape).
line_vec = weibo_model.wv[['cat', 'say']]

In [12]:
# Check which container type the multi-key lookup returned.
print(type(line_vec))

<class 'numpy.ndarray'>
