In [2]:
import re
import jieba
import pandas as pd
# 引入 word2vec
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText
from gensim.models import word2vec
import gensim
import numpy as np

# Set up logging configuration
import logging

# Emit records at INFO level and above, formatted as "<time> : <level> : <message>"
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 0. 作业2要求：

1. 通过gensim训练词向量
 + 1.1 利用分词后的项目数据生成训练词向量用的训练数据
 + 1.2 保存词向量训练数据
 + 1.3 应用gensim中Word2Vec或Fasttext训练词向量
 + 1.4 保存训练好的词向量

2. 构建embedding_matrix

> 读取上一步训练得到的词向量和构建好的`vocab`词表，以`vocab`中的`index`为`key`值构建`embedding_matrix`。

`eg: embedding_matrix[i] = [embedding_vector]`

# 1. 路径

In [3]:
# Path of the training corpus (merged train/test data after word segmentation, per the file name)
merger_data_path = 'data/merged_train_test_seg_data.csv'
# Path where the trained model is saved
save_model_path='data/wv/word2vec.model'

# 2. 训练模型

## 2.1 使用word2vec训练

In [5]:
# Train Word2Vec on the corpus file streamed through LineSentence.
# NOTE(review): LineSentence treats each line as one sentence of whitespace-separated
# tokens; the input has a .csv extension — confirm it is really stored in that layout.
# sg=1 selects skip-gram, min_count=5 ignores words seen fewer than 5 times,
# size=200 is the vector dimensionality, workers=8 is the number of training threads.
model_wv = word2vec.Word2Vec(LineSentence(merger_data_path), sg=1,workers=8,min_count=5,size=200)

2019-11-24 20:05:04,851 : INFO : collecting all words and their counts
2019-11-24 20:05:04,852 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-11-24 20:05:05,061 : INFO : PROGRESS: at sentence #10000, processed 937272 words, keeping 36653 word types
2019-11-24 20:05:05,276 : INFO : PROGRESS: at sentence #20000, processed 1889030 words, keeping 53934 word types
2019-11-24 20:05:05,484 : INFO : PROGRESS: at sentence #30000, processed 2829438 words, keeping 66706 word types
2019-11-24 20:05:05,687 : INFO : PROGRESS: at sentence #40000, processed 3741912 words, keeping 77607 word types
2019-11-24 20:05:05,905 : INFO : PROGRESS: at sentence #50000, processed 4714603 words, keeping 87459 word types
2019-11-24 20:05:06,135 : INFO : PROGRESS: at sentence #60000, processed 5748572 words, keeping 97387 word types
2019-11-24 20:05:06,370 : INFO : PROGRESS: at sentence #70000, processed 6805872 words, keeping 106963 word types
2019-11-24 20:05:06,581 : INFO : PROGRE

2019-11-24 20:05:40,014 : INFO : EPOCH 4 - PROGRESS: at 8.47% examples, 711598 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:41,030 : INFO : EPOCH 4 - PROGRESS: at 18.66% examples, 787799 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:42,032 : INFO : EPOCH 4 - PROGRESS: at 28.97% examples, 819991 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:43,036 : INFO : EPOCH 4 - PROGRESS: at 39.03% examples, 822826 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:44,037 : INFO : EPOCH 4 - PROGRESS: at 48.42% examples, 824737 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:45,041 : INFO : EPOCH 4 - PROGRESS: at 56.74% examples, 816662 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:46,052 : INFO : EPOCH 4 - PROGRESS: at 65.71% examples, 820300 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:47,076 : INFO : EPOCH 4 - PROGRESS: at 75.19% examples, 822586 words/s, in_qsize 15, out_qsize 0
2019-11-24 20:05:48,086 : INFO : EPOCH 4 - PROGRESS: at 85.90% examples, 824090 words/s, 

In [7]:
# Sanity check: show the 10 words closest to the query word in the Word2Vec space
model_wv.wv.most_similar(['奇瑞'], topn=10)

[('瑞虎', 0.7333983182907104),
 ('瑞虎5', 0.6896098852157593),
 ('瑞虎3', 0.6474862694740295),
 ('风云', 0.6450612545013428),
 ('昌河', 0.6410837769508362),
 ('江淮', 0.6389217376708984),
 ('华普', 0.6363241076469421),
 ('鹰', 0.626153826713562),
 ('旗云1', 0.6259176731109619),
 ('名爵', 0.6205928921699524)]

## 2.2 使用FastText训练

In [8]:
# Train FastText on the same corpus with the same workers/min_count/size settings
# as the Word2Vec cell. No `sg` argument is passed here, so the library default
# training algorithm is used (the Word2Vec cell sets sg=1 explicitly).
model_ft = FastText(sentences=LineSentence(merger_data_path), workers=8, min_count=5, size=200)

2019-11-24 20:06:39,377 : INFO : resetting layer weights


KeyboardInterrupt: 

In [133]:
# Sanity check: show the 10 words closest to the query word in the FastText space.
# Requires `model_ft` from the FastText training cell to have finished training.
model_ft.wv.most_similar(['奇瑞'], topn=10)

2019-11-23 21:24:34,080 : INFO : precomputing L2-norms of word weight vectors
2019-11-23 21:24:34,115 : INFO : precomputing L2-norms of ngram weight vectors


[('奇瑞X1', 0.9358711242675781),
 ('奇瑞E5', 0.8860869407653809),
 ('奇瑞A1', 0.8819175958633423),
 ('瑞虎', 0.8770526051521301),
 ('瑞虎5', 0.8756348490715027),
 ('奇瑞A5', 0.874131977558136),
 ('奇瑞A3', 0.871688961982727),
 ('东风皮卡', 0.8686317205429077),
 ('海马', 0.8678934574127197),
 ('瑞虎7', 0.8597326278686523)]

## 2.3 模型保存

In [135]:
# Persist the trained Word2Vec model to save_model_path
model_wv.save(save_model_path)

2019-11-23 21:25:34,295 : INFO : saving Word2Vec object under data/wv/word2vec.model, separately None
2019-11-23 21:25:34,296 : INFO : not storing attribute vectors_norm
2019-11-23 21:25:34,296 : INFO : not storing attribute cum_table
2019-11-23 21:25:34,653 : INFO : saved data/wv/word2vec.model


## 2.4 模型的加载

In [145]:
# Reload the saved Word2Vec model from disk into a separate variable
model = word2vec.Word2Vec.load(save_model_path)

2019-11-23 21:31:50,327 : INFO : loading Word2Vec object from data/wv/word2vec.model
2019-11-23 21:31:50,591 : INFO : loading wv recursively from data/wv/word2vec.model.wv.* with mmap=None
2019-11-23 21:31:50,592 : INFO : setting ignored attribute vectors_norm to None
2019-11-23 21:31:50,592 : INFO : loading vocabulary recursively from data/wv/word2vec.model.vocabulary.* with mmap=None
2019-11-23 21:31:50,593 : INFO : loading trainables recursively from data/wv/word2vec.model.trainables.* with mmap=None
2019-11-23 21:31:50,593 : INFO : setting ignored attribute cum_table to None
2019-11-23 21:31:50,593 : INFO : loaded data/wv/word2vec.model


## 2.5 测试效果

In [146]:
# Check the reloaded model by querying the 10 nearest neighbours of the same word
model.wv.most_similar(['奇瑞'], topn=10)

2019-11-23 21:32:14,207 : INFO : precomputing L2-norms of word weight vectors


[('瑞虎5', 0.7280553579330444),
 ('瑞虎', 0.7237233519554138),
 ('风云', 0.6439235806465149),
 ('瑞虎3', 0.6422736644744873),
 ('东南', 0.6421748399734497),
 ('风云2', 0.6406161189079285),
 ('瑞麒', 0.6265493631362915),
 ('威麟', 0.6246023774147034),
 ('吉利', 0.6150368452072144),
 ('名爵', 0.61062091588974)]

# 3. 构建embedding_matrix

## 3.1 构建vocab

In [12]:
# index -> word, following the order of the model's index2word list
reverse_vocab = dict(enumerate(model_wv.wv.index2word))
# word -> index, obtained by inverting the mapping above
vocab = {word: index for index, word in reverse_vocab.items()}

## 3.2 获取embedding_matrix

## 方法一

In [14]:
save_embedding_matrix_path='data/embedding_matrix.txt'

def get_embedding_matrix(wv_model, save_path=None):
    """Build the embedding matrix of a trained model and save it as text.

    Row ``i`` of the result is the vector of ``wv_model.wv.index2word[i]``.

    Args:
        wv_model: trained model exposing ``wv.vocab``, ``wv.vector_size``,
            ``wv.index2word`` and word lookup through ``wv[word]``.
        save_path: destination passed to ``np.savetxt``. When ``None`` the
            module-level ``save_embedding_matrix_path`` is used.

    Returns:
        A ``float32`` array of shape ``(vocab_size, embedding_dim)``.
    """
    if save_path is None:
        # Use the configured path variable, not a string spelling its name
        save_path = save_embedding_matrix_path
    keyed_vectors = wv_model.wv
    # Vocabulary size
    vocab_size = len(keyed_vectors.vocab)
    # Embedding dimensionality
    embedding_dim = keyed_vectors.vector_size
    print('vocab_size, embedding_dim:', vocab_size, embedding_dim)
    # Allocate once as float32 so no per-row dtype conversion (full copy) is needed
    embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')
    # Fill rows in index order
    for i in range(vocab_size):
        embedding_matrix[i, :] = keyed_vectors[keyed_vectors.index2word[i]]
    # Check that the shape is as expected
    assert embedding_matrix.shape == (vocab_size, embedding_dim)
    # Save the matrix, one row per line
    np.savetxt(save_path, embedding_matrix, fmt='%0.8f')
    print('embedding matrix extracted')
    return embedding_matrix

In [16]:
# Method 1: build the matrix from the Word2Vec model with the helper above and show its shape
embedding_matrix=get_embedding_matrix(model_wv)
print(embedding_matrix.shape)

vocab_size, embedding_dim: 32800 200
embedding matrix extracted
(32800, 200)


In [144]:
# Inspect the matrix dimensions as (rows, columns)
embedding_matrix.shape

(32906, 200)

## 方法二

In [17]:
# Method 2: take the vector array directly from the model's `wv.vectors` attribute
embedding_matrix_wv=model_wv.wv.vectors

In [18]:
# Inspect the dimensions of the array obtained with method 2
embedding_matrix_wv.shape

(32800, 200)

## 对比

In [20]:
# Element-wise comparison of the two matrices; yields a boolean array
embedding_matrix==embedding_matrix_wv

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [19]:
# Collapse the element-wise comparison to one boolean: True only if every entry matches
(embedding_matrix==embedding_matrix_wv).all()

True

### Q1. 有没有一个标准的数据处理流程？担心前期的数据处理会影响后期的项目效果。

# 参考

1. https://radimrehurek.com/gensim/models/word2vec.html 