### 训练微博评论数据集的词向量

In [17]:
import pandas as pd
import jieba
import os
from collections import Counter
from gensim.models.word2vec import Word2Vec

In [18]:
weibo_data = pd.read_csv("weibo/weibo_cut.csv")  # weibo_cut 是使用停用词过滤后的表
weibo_data.head()

Unnamed: 0,label,review
0,1,﻿ 更博 爆照 帅 越来越 爱 生快 傻 缺 爱 爱 爱
1,1,张晓鹏 jonathan 土耳其 事要 认真对待 直接 开除 丁丁 世界 细心 酒店 OK
2,1,姑娘 羡慕 … 招财猫 高兴 … … 爱 蔓延 - JC : 学徒 一枚 明天 见 李欣芸 ...
3,1,美 ~ ~ ~ ~ ~ 爱
4,1,梦想 舞台 鼓掌


In [19]:
# 获取词频表
comments = weibo_data['review']
comments = [sentence.strip().split(" ") for sentence in comments]
print(comments[:3])

[['\ufeff', '更博', '爆照', '帅', '越来越', '爱', '生快', '傻', '缺', '爱', '爱', '爱'], ['张晓鹏', 'jonathan', '土耳其', '事要', '认真对待', '直接', '开除', '丁丁', '世界', '细心', '酒店', 'OK'], ['姑娘', '羡慕', '…', '招财猫', '高兴', '…', '…', '爱', '蔓延', '-', 'JC', ':', '学徒', '一枚', '明天', '见', '李欣芸', 'SharonLee', ':', '大佬', '范儿', '书呆子']]


In [30]:
# 设定词向量训练的参数
num_features = 300    # 词向量的维度
min_word_count = 1   # 最小的单词数
num_workers = 4       # 线程数
context = 10          # 上下文窗口大小
UNK = '<unk>'          # 如果词表中不存在则替换为 <unk>

comments.append([UNK])

In [31]:
model = Word2Vec(min_count=min_word_count, workers=num_workers, vector_size=num_features, window=context)
model.build_vocab(comments) # 构建词表

In [36]:
model.wv.index_to_key[:10]

[':', '泪', '~', '嘻嘻', '爱', '抓狂', '鼓掌', '…', '回复', '-']

In [38]:
'<unk>' in model.wv.index_to_key

True

In [39]:
model.train(comments, total_examples=model.corpus_total_words, epochs=model.epochs)

(11329233, 12515370)

In [42]:
print(model.wv['<unk>'])
print(model.wv['张晓鹏'])

[-2.5804567e-03 -3.1805993e-03 -2.2079516e-03 -7.0928497e-04
 -1.2957271e-03  8.3487667e-04 -6.5390271e-04  2.6049621e-03
  2.2124005e-03  3.3123025e-03  2.3598417e-03  1.5071471e-04
  1.2264061e-03  3.1785909e-03  8.7896187e-05 -1.3145423e-03
  3.2714002e-03 -2.0803658e-03 -7.0173346e-04  4.8452854e-04
  2.0922001e-03  7.3691685e-04 -7.3983433e-04 -8.8982104e-04
  2.7111045e-03 -3.1673925e-03 -7.3265156e-04 -2.4654707e-03
  2.6862780e-03  7.6295214e-04 -8.0040138e-04 -1.7321737e-03
  6.7659694e-04  7.9957169e-04  2.7723359e-03  1.3515607e-03
 -4.6602883e-05  1.9866705e-03 -2.0364006e-03 -2.3961894e-03
 -3.2697883e-03  8.0254872e-04 -2.3216684e-03 -2.1229975e-03
 -2.4958546e-03 -2.2913211e-03  8.6786749e-05 -1.6633432e-03
 -2.8697355e-03 -2.7279179e-03  1.9369730e-03 -2.8416570e-03
  2.2408215e-03  3.1895614e-03 -9.0886513e-04  1.2645491e-03
  1.3965893e-03 -1.3480290e-03 -1.9903190e-03 -2.0644036e-03
 -2.1815863e-03 -2.1211306e-06  1.3321352e-03  7.4155093e-04
  1.1536701e-03 -5.03642

In [43]:
model.save("weibo/word2vec.model")