### 使用 gensim 训练word2vec词向量

In [7]:
import pandas as pd
import jieba
import os
from gensim.models.word2vec import Word2Vec

In [3]:
review_data = pd.read_csv("../data/weibo_senti_100k.csv", encoding="utf-8")['review']
review_data.head()

0                ﻿更博了，爆照了，帅的呀，就是越来越爱你！生快傻缺[爱你][爱你][爱你]
1    @张晓鹏jonathan 土耳其的事要认真对待[哈哈]，否则直接开除。@丁丁看世界 很是细心...
2    姑娘都羡慕你呢…还有招财猫高兴……//@爱在蔓延-JC:[哈哈]小学徒一枚，等着明天见您呢/...
3                                           美~~~~~[爱你]
4                                    梦想有多大，舞台就有多大![鼓掌]
Name: review, dtype: object

In [5]:
stop_words = open("../data/cn_stopwords.txt", "r", encoding="utf-8").readlines()
stop_words = [word.strip() for word in stop_words]
stop_words.append('')
stop_words.append(' ')
stop_words.append('\n')
print(len(stop_words))
print(stop_words[:5])

749
['$', '0', '1', '2', '3']


In [8]:
# 对数据进行分词
sentence_cut_list = []
for sentence in review_data:
    # 去除左右空格
    sentence = sentence.strip()
    # 分词
    seq_list = jieba.cut(sentence, cut_all=False)
    seq_res = []
    for seq in seq_list:
        if seq in stop_words:
            continue
        seq_res.append(seq)
    sentence_cut_list.append(seq_res)
print(len(sentence_cut_list))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WANGTI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.657 seconds.
Prefix dict has been built successfully.


119988


In [9]:
sentence_cut_list[:3]

[['\ufeff',
  '更博',
  '爆照',
  '帅',
  '越来越',
  '爱',
  '生快',
  '傻',
  '缺',
  '[',
  '爱',
  ']',
  '[',
  '爱',
  ']',
  '[',
  '爱',
  ']'],
 ['@',
  '张晓鹏',
  'jonathan',
  '土耳其',
  '事要',
  '认真对待',
  '[',
  ']',
  '直接',
  '开除',
  '@',
  '丁丁',
  '世界',
  '细心',
  '酒店',
  'OK'],
 ['姑娘',
  '羡慕',
  '…',
  '招财猫',
  '高兴',
  '…',
  '…',
  '/',
  '/',
  '@',
  '爱',
  '蔓延',
  '-',
  'JC',
  ':',
  '[',
  ']',
  '学徒',
  '一枚',
  '明天',
  '见',
  '/',
  '/',
  '@',
  '李欣芸',
  'SharonLee',
  ':',
  '大佬',
  '范儿',
  '[',
  '书呆子',
  ']']]

In [10]:
# 设定词向量训练的参数
num_features = 300    # 词向量的维度
min_word_count = 40   # 最小的单词数
num_workers = 4       # 线程数
context = 10          # 上下文窗口大小

In [18]:
model = Word2Vec(sentence_cut_list, workers=num_workers,
                 vector_size=num_features, min_count = min_word_count,
                 window = context)
model.init_sims(replace=True)
model.save("../out_dir/word2vec.model")

  after removing the cwd from sys.path.


In [21]:
word2vec_model = Word2Vec.load("../out_dir/word2vec.model")
word2vec_model.predict_output_word(['帅'])

[('帅', 0.0004354505),
 ('夜千', 0.00031023467),
 ('night', 0.00030464306),
 ('组合', 0.00028996187),
 ('威武', 0.00028367087),
 ('哼哼', 0.00028320987),
 ('酷', 0.00028053677),
 ('花心', 0.000275052),
 ('帅气', 0.00027003844),
 ('好听', 0.00026634455)]

In [25]:
word2vec_model.wv['帅']

array([-2.02073175e-02, -8.58505908e-03,  5.01995012e-02,  2.86155008e-02,
       -3.32855210e-02, -7.22161680e-02,  1.67416167e-02, -1.65169016e-02,
       -9.88638401e-02, -7.76550174e-02,  2.58547924e-02,  3.50956880e-02,
       -6.32177964e-02, -6.08485751e-03, -8.09701607e-02,  6.20651729e-02,
       -1.66047495e-02, -2.23584101e-02,  4.24207672e-02, -4.29108366e-03,
       -2.87506226e-02, -1.09143378e-02, -3.83164957e-02, -7.46205216e-03,
        9.13354626e-04, -6.38783956e-03,  4.46041264e-02,  1.30476048e-02,
       -9.08714086e-02, -5.87853976e-03,  4.49670292e-02,  6.90870802e-04,
        7.64244795e-02, -5.74476831e-02, -1.88916475e-02,  2.30713040e-02,
       -3.52734551e-02, -1.98989753e-02, -4.01794873e-02, -2.01420337e-02,
       -2.22034892e-03,  6.89975247e-02, -6.46331608e-02,  4.14755009e-02,
        1.02841355e-01,  4.28765304e-02, -9.69242491e-03,  2.03673653e-02,
       -3.09160873e-02,  8.03758577e-02,  1.04790861e-02,  8.90563354e-02,
       -4.71113175e-02, -