In [2]:
import re
import jieba
import pandas as pd
# 引入 word2vec
from gensim.models.word2vec import LineSentence
from gensim.models.fasttext import FastText
from gensim.models import word2vec
import gensim
import numpy as np

# Configure logging: emit INFO-level records (such as gensim's training progress)
# using a "timestamp : level : message" layout.
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# 0. 作业2要求：

1. 通过gensim训练词向量 即Gensim工具的使用
 + 1.1 利用分词后的项目数据生成训练词向量用的训练数据
 + 1.2 保存词向量训练数据
 + 1.3 应用gensim中Word2Vec或Fasttext训练词向量
 + 1.4 保存训练好的词向量

2. 构建embedding_matrix

> 读取上步计算词向量和构建的`vocab`词表，以`vocab`中的`index`为`key`值构建`embedding_matrix`

`eg: embedding_matrix[i] = [embedding_vector]`

# 1. 路径

In [3]:
# Path of the corpus file that is fed to the word-vector training below.
merger_data_path = 'data/merged_train_test_seg_data.csv'

# Path under which the trained Word2Vec model is stored.
save_model_path = 'data/wv/word2vec.model'

# 2. 训练模型

## 2.1 使用word2vec训练

In [4]:
# When the `?` suffix does not work for inspecting an object, help() can be used instead.
help(word2vec.Word2Vec)
里边有一个sg参数，通过设置该参数来指定是使用哪一个算法
sg : {0, 1}, optional
           Training algorithm: 1 for skip-gram; otherwise CBOW.
    
上一节讲到的一个softmax的优化的方法，这里使用下边这个参数就可以指定使用哪一个优化方法
hs : {0, 1}, optional
           If 1, hierarchical softmax will be used for model training. 分层softmax
           If 0, and `negative` is non-zero, negative sampling will be used.  负采样

Help on class Word2Vec in module gensim.models.word2vec:

class Word2Vec(gensim.models.base_any2vec.BaseWordEmbeddingsModel)
 |  Word2Vec(sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000, compute_loss=False, callbacks=(), max_final_vocab=None)
 |  
 |  Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
 |  
 |  Once you're finished training a model (=no more updates, only querying)
 |  store and use only the :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `self.wv` to reduce memory.
 |  
 |  The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save` and
 |  :meth:`~gensim.models.word2vec.Word2Vec.load` methods.
 |  
 |  The trained word vectors can a

In [5]:
# Train word vectors with gensim's Word2Vec; the fitted model is bound to
# `model_wv` ("wv" is short for word2vec).
# How skip-gram vs. CBOW is selected is described in help(word2vec.Word2Vec).
model_wv = word2vec.Word2Vec(
    LineSentence(merger_data_path),
    sg=1,         # 1 selects skip-gram; any other value selects CBOW
    workers=8,    # number of worker threads used for training
    min_count=5,  # words occurring fewer than 5 times are dropped
    size=200,     # dimensionality of the trained word vectors
)

2020-03-04 17:16:32,152 : INFO : collecting all words and their counts
2020-03-04 17:16:32,163 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-04 17:16:32,467 : INFO : PROGRESS: at sentence #10000, processed 941657 words, keeping 36796 word types
2020-03-04 17:16:32,787 : INFO : PROGRESS: at sentence #20000, processed 1897796 words, keeping 54149 word types
2020-03-04 17:16:33,107 : INFO : PROGRESS: at sentence #30000, processed 2842477 words, keeping 66984 word types
2020-03-04 17:16:33,437 : INFO : PROGRESS: at sentence #40000, processed 3759167 words, keeping 77921 word types
2020-03-04 17:16:33,787 : INFO : PROGRESS: at sentence #50000, processed 4736386 words, keeping 87832 word types
2020-03-04 17:16:34,134 : INFO : PROGRESS: at sentence #60000, processed 5775137 words, keeping 97810 word types
2020-03-04 17:16:34,470 : INFO : PROGRESS: at sentence #70000, processed 6837177 words, keeping 107437 word types
2020-03-04 17:16:34,796 : INFO : PROGRE

2020-03-04 17:17:15,294 : INFO : EPOCH 3 - PROGRESS: at 32.40% examples, 456888 words/s, in_qsize 16, out_qsize 0
2020-03-04 17:17:16,300 : INFO : EPOCH 3 - PROGRESS: at 37.79% examples, 453977 words/s, in_qsize 16, out_qsize 0
2020-03-04 17:17:17,309 : INFO : EPOCH 3 - PROGRESS: at 42.68% examples, 449442 words/s, in_qsize 16, out_qsize 0
2020-03-04 17:17:18,353 : INFO : EPOCH 3 - PROGRESS: at 48.20% examples, 452521 words/s, in_qsize 15, out_qsize 0
2020-03-04 17:17:19,378 : INFO : EPOCH 3 - PROGRESS: at 53.23% examples, 452572 words/s, in_qsize 15, out_qsize 0
2020-03-04 17:17:20,384 : INFO : EPOCH 3 - PROGRESS: at 58.34% examples, 455815 words/s, in_qsize 16, out_qsize 0
2020-03-04 17:17:21,402 : INFO : EPOCH 3 - PROGRESS: at 63.01% examples, 454484 words/s, in_qsize 14, out_qsize 1
2020-03-04 17:17:22,410 : INFO : EPOCH 3 - PROGRESS: at 67.87% examples, 455591 words/s, in_qsize 16, out_qsize 0
2020-03-04 17:17:23,416 : INFO : EPOCH 3 - PROGRESS: at 73.01% examples, 455330 words/s,

In [6]:
# Sanity check: list the 10 words whose Word2Vec vectors are closest to '奇瑞'.
model_wv.wv.most_similar(['奇瑞'], topn=10)

2020-03-04 17:19:31,773 : INFO : precomputing L2-norms of word weight vectors


[('瑞虎5', 0.734700083732605),
 ('瑞虎', 0.7169025540351868),
 ('风云', 0.6548324227333069),
 ('东方之子', 0.6316857933998108),
 ('旗云1', 0.6134951710700989),
 ('昌河', 0.611372172832489),
 ('福田', 0.6112220287322998),
 ('瑞虎3', 0.6103478670120239),
 ('吉利', 0.6091591119766235),
 ('旗云2', 0.6053952574729919)]

## 2.2 使用FastText训练

In [7]:
# Train word vectors with gensim's FastText on the same corpus; the fitted
# model is bound to `model_ft`.
model_ft = FastText(
    sentences=LineSentence(merger_data_path),
    workers=8,
    min_count=5,
    size=200,
)

2020-03-04 17:19:52,710 : INFO : resetting layer weights
2020-03-04 17:20:04,056 : INFO : collecting all words and their counts
2020-03-04 17:20:04,063 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-03-04 17:20:04,381 : INFO : PROGRESS: at sentence #10000, processed 941657 words, keeping 36796 word types
2020-03-04 17:20:04,701 : INFO : PROGRESS: at sentence #20000, processed 1897796 words, keeping 54149 word types
2020-03-04 17:20:04,953 : INFO : PROGRESS: at sentence #30000, processed 2842477 words, keeping 66984 word types
2020-03-04 17:20:05,255 : INFO : PROGRESS: at sentence #40000, processed 3759167 words, keeping 77921 word types
2020-03-04 17:20:05,587 : INFO : PROGRESS: at sentence #50000, processed 4736386 words, keeping 87832 word types
2020-03-04 17:20:05,945 : INFO : PROGRESS: at sentence #60000, processed 5775137 words, keeping 97810 word types
2020-03-04 17:20:06,298 : INFO : PROGRESS: at sentence #70000, processed 6837177 words, keeping 

2020-03-04 17:20:44,804 : INFO : EPOCH - 3 : training on 9748591 raw words (8612003 effective words) took 10.2s, 847034 effective words/s
2020-03-04 17:20:45,817 : INFO : EPOCH 4 - PROGRESS: at 8.12% examples, 686271 words/s, in_qsize 0, out_qsize 0
2020-03-04 17:20:46,818 : INFO : EPOCH 4 - PROGRESS: at 17.36% examples, 740582 words/s, in_qsize 0, out_qsize 0
2020-03-04 17:20:47,828 : INFO : EPOCH 4 - PROGRESS: at 25.74% examples, 736571 words/s, in_qsize 3, out_qsize 0
2020-03-04 17:20:48,839 : INFO : EPOCH 4 - PROGRESS: at 35.12% examples, 744946 words/s, in_qsize 0, out_qsize 0
2020-03-04 17:20:49,840 : INFO : EPOCH 4 - PROGRESS: at 44.29% examples, 750204 words/s, in_qsize 10, out_qsize 1
2020-03-04 17:20:50,849 : INFO : EPOCH 4 - PROGRESS: at 53.23% examples, 762425 words/s, in_qsize 14, out_qsize 1
2020-03-04 17:20:51,849 : INFO : EPOCH 4 - PROGRESS: at 61.87% examples, 772474 words/s, in_qsize 12, out_qsize 1
2020-03-04 17:20:52,856 : INFO : EPOCH 4 - PROGRESS: at 71.13% exampl

In [8]:
# Same nearest-neighbour query as for Word2Vec, this time against the FastText model.
model_ft.wv.most_similar(['奇瑞'], topn=10)

2020-03-04 17:22:04,837 : INFO : precomputing L2-norms of word weight vectors
2020-03-04 17:22:04,870 : INFO : precomputing L2-norms of ngram weight vectors


[('奇瑞E5', 0.8922537565231323),
 ('奇瑞A1', 0.8768239617347717),
 ('奇瑞A5', 0.8766922950744629),
 ('东南', 0.8725718855857849),
 ('奇瑞QQ', 0.8679873943328857),
 ('奇瑞QQ6', 0.8551181554794312),
 ('瑞虎5', 0.8534832000732422),
 ('瑞虎', 0.8531208038330078),
 ('奇瑞A3', 0.8509798049926758),
 ('奇瑞E3', 0.8476381301879883)]

## 2.3 模型保存

In [9]:
# Persist the trained Word2Vec model to save_model_path.
model_wv.save(save_model_path)

2020-03-04 17:22:21,372 : INFO : saving Word2Vec object under data/wv/word2vec.model, separately None
2020-03-04 17:22:21,373 : INFO : not storing attribute vectors_norm
2020-03-04 17:22:21,374 : INFO : not storing attribute cum_table
2020-03-04 17:22:21,930 : INFO : saved data/wv/word2vec.model


## 2.4 模型的加载

In [10]:
# Load the Word2Vec model back from save_model_path.
model = word2vec.Word2Vec.load(save_model_path)

2020-03-04 17:22:29,916 : INFO : loading Word2Vec object from data/wv/word2vec.model
2020-03-04 17:22:30,351 : INFO : loading wv recursively from data/wv/word2vec.model.wv.* with mmap=None
2020-03-04 17:22:30,351 : INFO : setting ignored attribute vectors_norm to None
2020-03-04 17:22:30,352 : INFO : loading vocabulary recursively from data/wv/word2vec.model.vocabulary.* with mmap=None
2020-03-04 17:22:30,353 : INFO : loading trainables recursively from data/wv/word2vec.model.trainables.* with mmap=None
2020-03-04 17:22:30,353 : INFO : setting ignored attribute cum_table to None
2020-03-04 17:22:30,354 : INFO : loaded data/wv/word2vec.model


## 2.5 测试效果

In [11]:
# Repeat the nearest-neighbour query on the reloaded model to check it behaves like the original.
model.wv.most_similar(['奇瑞'], topn=10)

2020-03-04 17:22:35,805 : INFO : precomputing L2-norms of word weight vectors


[('瑞虎5', 0.734700083732605),
 ('瑞虎', 0.7169025540351868),
 ('风云', 0.6548324227333069),
 ('东方之子', 0.6316857933998108),
 ('旗云1', 0.6134951710700989),
 ('昌河', 0.611372172832489),
 ('福田', 0.6112220287322998),
 ('瑞虎3', 0.6103478670120239),
 ('吉利', 0.6091591119766235),
 ('旗云2', 0.6053952574729919)]

# 3. 构建embedding_matrix
这里提前构建好词的embedding矩阵，这样的话后边进行模型训练的时候，就不用再进行词的Embedding了，直接将这里的Embedding矩阵导入即可。
这里就相当于在研究BERT的Attention时，拿着要输入的词到词向量的表里查找对应的词向量一样，这里就是构建词向量表的一个过程。（一个字典的感觉，根据key查找value的过程）

## 3.1 构建vocab
查看构建出来的vocab效果好不好：
在Gensim里边有这么一个方法：score，用来判断这个词向量好还是不好，也就是看一下它的输入和输出，它的输入是词，输出也是词。后边自己试一下（注：按gensim文档，`score` 计算的是句子的对数概率，且只支持用 `hs=1, negative=0` 训练的模型，使用前需确认）

In [12]:
# The vocabulary is taken straight from the trained model: it was trained with
# min_count=5, so words occurring fewer than 5 times are already filtered out
# and the raw corpus does not need to be re-counted here.
# `vocab` maps word -> row index (dictionary lookup by key).
vocab = {token: position for position, token in enumerate(model_wv.wv.index2word)}
# `reverse_vocab` is the inverse mapping: row index -> word.
reverse_vocab = dict(enumerate(model_wv.wv.index2word))

## 3.2 获取embedding_matrix

## 方法一
这种方法就完整的复现了第二次课里所描述的方法，就是直接拿到第i个词的词向量赋值给初始化的矩阵

In [12]:
# Default location of the text dump written by get_embedding_matrix.
save_embedding_matrix_path = 'data/embedding_matrix.txt'


def get_embedding_matrix(wv_model, save_path=None):
    """Build the embedding matrix of a trained gensim model and save it as text.

    Row ``i`` of the result is the vector of ``wv_model.wv.index2word[i]``, so
    the row order matches a vocab built by enumerating ``index2word``.

    Args:
        wv_model: Trained model exposing ``wv.vocab``, ``wv.vector_size``,
            ``wv.index2word`` and ``wv[word]`` lookup.
        save_path: Destination passed to ``np.savetxt``. When ``None`` the
            module-level ``save_embedding_matrix_path`` is used.

    Returns:
        A ``float32`` array of shape ``(vocab_size, embedding_dim)``.
    """
    if save_path is None:
        save_path = save_embedding_matrix_path

    keyed_vectors = wv_model.wv
    # Number of words in the vocabulary and width of each vector.
    vocab_size = len(keyed_vectors.vocab)
    embedding_dim = keyed_vectors.vector_size
    print('vocab_size, embedding_dim:', vocab_size, embedding_dim)

    # Allocate as float32 once. The original converted the dtype inside the
    # loop, which copied the whole matrix on every iteration.
    embedding_matrix = np.zeros((vocab_size, embedding_dim), dtype='float32')
    # Fill the rows in index order.
    for i in range(vocab_size):
        embedding_matrix[i, :] = keyed_vectors[keyed_vectors.index2word[i]]

    # Check that the matrix has the intended shape.
    assert embedding_matrix.shape == (vocab_size, embedding_dim)

    # Write to the configured path. The original passed the quoted literal
    # 'save_embedding_matrix_path', so the file landed under that literal name.
    np.savetxt(save_path, embedding_matrix, fmt='%0.8f')
    print('embedding matrix extracted')
    return embedding_matrix

In [13]:
# Method 1: build the matrix row by row from the Word2Vec model, then show its shape.
embedding_matrix = get_embedding_matrix(model_wv)
print(embedding_matrix.shape)

vocab_size, embedding_dim: 32905 200
embedding matrix extracted
(32905, 200)


In [14]:
# Display the (vocab_size, embedding_dim) shape of the matrix built by method 1.
embedding_matrix.shape

(32905, 200)

## 方法二
这里直接通过这里的方法，直接拿矩阵也可以

In [15]:
# Method 2: take the vector array directly from the trained model.
embedding_matrix_wv=model_wv.wv.vectors

In [16]:
# Display the shape of the matrix obtained by method 2 for comparison with method 1.
embedding_matrix_wv.shape

(32905, 200)

## 对比
对比两种方法得到的矩阵，所有元素都完全相同，所以整体来说方法二（直接读取 `model_wv.wv.vectors`）要好一些，直接拿取即可

In [17]:
# Element-wise comparison of the two matrices; the result is a boolean array.
embedding_matrix==embedding_matrix_wv

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

In [18]:
# Reduce the element-wise comparison to a single flag: True only if every entry matches.
(embedding_matrix==embedding_matrix_wv).all()

True

### Q1. 有没有一个标准的处理流程,怕前期数据处理影响后期项目效果? 
对于数据处理这个部分，一开始的方法可能会是一个比较low的方法，后边会不断的去完善数据处理这个部分，结合任务，不断的优化这个模块，这是一个不断修改，不断矫正的过程

# 参考

1. https://radimrehurek.com/gensim/models/word2vec.html 