## 第一节 加载公开词向量

In [1]:
from gensim.models import KeyedVectors

# Location of the pre-trained vectors file (a .bz2 archive, judging by the extension).
model_path = '/root/wordvectors/sgns.weibo.word.bz2'
# Parse the file as word2vec-format vectors into a KeyedVectors object.
model = KeyedVectors.load_word2vec_format(model_path)

In [4]:
# 1. Dimensionality of each word vector
model.vector_size

300

In [3]:
# 2. Vocabulary size (number of keys in the loaded vectors)
len(model.index_to_key)

195202

In [5]:
# 3. Look up the vector stored for a single word
model['地铁']

array([ 2.92064e-01, -5.18680e-02, -2.13720e-01,  1.82131e-01,
        2.82900e-03,  4.14104e-01,  1.56440e-01, -1.27940e-02,
       -3.28332e-01, -8.25000e-02, -8.46890e-02, -2.14700e-02,
        1.18650e-01, -4.73659e-01, -1.97850e-02,  1.13939e-01,
        1.82734e-01, -6.46420e-02,  5.60832e-01, -6.65230e-02,
       -1.97960e-01,  1.26039e-01, -3.28720e-01, -3.09730e-02,
       -3.46580e-01, -1.53190e-01, -2.96226e-01, -5.75517e-01,
        1.10684e-01,  8.19220e-02, -1.04721e-01, -1.77477e-01,
       -1.21332e-01,  1.49816e-01,  2.86278e-01, -8.11200e-03,
        6.72540e-02,  6.92220e-02, -3.50973e-01, -5.49500e-02,
       -7.80250e-02, -1.92952e-01, -1.70920e-01, -1.28289e-01,
        1.08204e-01, -7.24913e-01, -1.11735e-01, -6.75000e-03,
        4.38086e-01, -8.75720e-02, -1.41320e-01, -1.91726e-01,
        1.68363e-01, -7.85700e-02, -1.79772e-01, -1.27950e-01,
        3.24675e-01,  2.70616e-01,  1.96330e-02, -3.09431e-01,
       -4.02670e-02,  5.80160e-02, -1.06603e-01,  2.480

In [6]:
# 4. Similarity score between two words
model.similarity('地铁', '图书馆')

np.float32(0.2721027)

In [7]:
# 5. Most similar words: the top five results for an analogy-style query
#    that treats '男人' and '女孩' as positive terms and '男孩' as a negative term.
model.most_similar(positive=['男人', '女孩'], negative=['男孩'], topn=5)

[('女人', 0.6578881740570068),
 ('女孩子', 0.515068531036377),
 ('女生', 0.45194485783576965),
 ('女人真', 0.4420627951622009),
 ('女人们', 0.43698593974113464)]

## 第二节 训练自己的词向量

In [8]:
import pandas as pd
import jieba
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

In [9]:
# Read the shopping-review CSV, then discard every row that has a missing value.
df = (
    pd.read_csv('./data/online_shopping_10_cats.csv', encoding='utf-8')
    .dropna()
)

In [10]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,cat,label,review
0,书籍,1,做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持一...
1,书籍,1,作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...
2,书籍,1,作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...
3,书籍,1,作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...
4,书籍,1,作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...


In [11]:
def _tokenize(text):
    """Segment ``text`` with jieba, dropping empty and whitespace-only tokens."""
    return [piece for piece in jieba.lcut(text) if piece.strip() != '']


# One token list per review, in the same order as the DataFrame rows.
sentences = [_tokenize(review) for review in df['review']]

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.796 seconds.
Prefix dict has been built successfully.


In [12]:
# Training hyper-parameters, collected in one place.
w2v_params = dict(
    vector_size=100,  # dimensionality of the word vectors
    window=5,  # context window size
    min_count=2,  # minimum word frequency (rarer words are ignored)
    sg=1,  # 1: Skip-Gram, 0: CBOW
    workers=4,  # number of parallel training threads
)

# Train on the pre-tokenized sentence sequence.
model = Word2Vec(sentences, **w2v_params)

In [13]:
# Export the trained model's word vectors (model.wv) in word2vec format.
model.wv.save_word2vec_format('./data/word2vec.txt')

In [14]:
# Round-trip check: reload the exported file and look up one word's vector.
KeyedVectors.load_word2vec_format('./data/word2vec.txt')['地铁']

array([ 0.42350453,  0.2974304 , -0.59897876,  0.0121823 ,  0.02840736,
       -0.34541267,  0.5101222 ,  0.33443198, -0.7562844 , -0.30642283,
       -0.11158749, -0.44964233,  0.14501119,  0.05794272,  0.11984475,
        0.03522236,  0.53707635, -0.33583412, -0.035938  , -0.5602812 ,
        0.5781546 , -0.30486673,  0.38048387, -0.33548173, -0.346148  ,
       -0.24371345,  0.60412323, -0.09432408, -0.10564709, -0.27668947,
        0.40151602, -0.16383585,  0.5436203 , -0.43483347,  0.6263288 ,
        0.13166222, -0.36251763, -0.10096867, -0.17781648, -0.5142302 ,
       -0.3717342 , -0.20507288, -0.05918563,  0.3358518 ,  0.46693254,
       -0.17390488, -0.31133175,  0.63135064, -0.07080424,  0.35630476,
        0.24654943,  0.026253  , -0.09652434, -0.09773483,  0.5330527 ,
       -0.17318478,  0.19410142,  0.72254133, -0.28426206,  0.2116275 ,
        0.08558647,  0.3870699 ,  0.01804344,  0.15544029, -0.04218333,
        0.05805991,  0.45783716,  0.5429634 ,  0.305183  ,  0.59

## 第三节 词向量应用

In [15]:
from torch import nn
from gensim.models import KeyedVectors
import torch
import jieba

In [16]:
# 1. Load the word vectors from ./data/word2vec.txt
wv = KeyedVectors.load_word2vec_format('./data/word2vec.txt')

In [17]:
# 2. Handle out-of-vocabulary (OOV) words:
#    put a dedicated <unk> entry at index 0, ahead of the known vocabulary.
unk_token = '<unk>'
index2word = [unk_token, *wv.index_to_key]
# Reverse mapping: word -> row index.
word2index = dict(zip(index2word, range(len(index2word))))

In [18]:
# 3. Build the embedding weight matrix.
#    Row 0 belongs to the <unk> token and keeps its random (standard-normal)
#    initialisation; rows 1..N receive the pre-trained vectors, in the same
#    order as wv.index_to_key.
num_embeddings = len(index2word)
embedding_dim = wv.vector_size

# The bulk copy below is only correct while index2word is exactly
# [unk_token] + wv.index_to_key, so fail loudly if that layout ever changes.
assert index2word[1:] == list(wv.index_to_key), \
    "index2word must be [unk_token] followed by wv.index_to_key"

embedding_matrix = torch.randn(num_embeddings, embedding_dim)
# A single vectorised copy replaces the per-word Python loop, which did a
# membership test, a vector lookup and a fresh tensor allocation for every
# word in the vocabulary.
embedding_matrix[1:] = torch.as_tensor(wv.vectors)

In [19]:
# 4. Create the Embedding layer from the prepared weight matrix.
#    freeze=True is from_pretrained's default, written out here so it is
#    visible that the weights are not updated during training.
embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=True)

In [20]:
# 5. Smoke test: tokenize a sentence, map each token to its row index
#    (unknown tokens fall back to the <unk> row) and embed the result.
text = "我喜欢乘坐宇宙飞船"
tokens = jieba.lcut(text)
unk_id = word2index[unk_token]
input_ids = [word2index.get(token, unk_id) for token in tokens]
input_tensor = torch.tensor(input_ids)
# Displayed shape is (number of tokens, embedding dimension).
embedding(input_tensor).shape

torch.Size([4, 100])