## 读取数据

In [80]:
import pandas as pd

# Load the crawled Douban review table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='douban.csv')
df.head(n=5)

Unnamed: 0,ID,Movie_Name_EN,Movie_Name_CN,Crawl_Date,Number,Username,Date,Star,Comment,Like
0,0,Avengers Age of Ultron,复仇者联盟2,2017-01-22,1,然潘,2015-05-13,3,连奥创都知道整容要去韩国。,2404
1,1,Avengers Age of Ultron,复仇者联盟2,2017-01-22,2,更深的白色,2015-04-24,2,非常失望，剧本完全敷衍了事，主线剧情没突破大家可以理解，可所有的人物都缺乏动机，正邪之间、...,1231
2,2,Avengers Age of Ultron,复仇者联盟2,2017-01-22,3,有意识的贱民,2015-04-26,2,2015年度最失望作品。以为面面俱到，实则画蛇添足；以为主题深刻，实则老调重弹；以为推陈出...,1052
3,3,Avengers Age of Ultron,复仇者联盟2,2017-01-22,4,不老的李大爷耶,2015-04-23,4,《铁人2》中勾引钢铁侠，《妇联1》中勾引鹰眼，《美队2》中勾引美国队长，在《妇联2》中终于...,1045
4,4,Avengers Age of Ultron,复仇者联盟2,2017-01-22,5,ZephyrO,2015-04-22,2,虽然从头打到尾，但是真的很无聊啊。,723


In [86]:
# Report the number of distinct movies and the number of review rows in df.
print("电影  : {} 部".format(df['Movie_Name_CN'].nunique()))
print("评论  : {} 条".format(df.shape[0]))

电影  : 28 部
评论  : 2125056 条


## 训练模型
使用cntext库训练词向量word2vec模型

In [None]:
from cntext import W2VModels
import os

# Train the word2vec model.
# NOTE(review): the original trailing note on the next line read "corpus data", but the
# argument passed there is the current working directory — confirm what cwd controls.
model = W2VModels(cwd=os.getcwd())  # working directory handed to cntext
model.train(input_txt_file='douban.txt')  # text file used as the training input

```
Step 1/4:...预处理    语料 ...
Step 2/4:...训练   word2vec模型
            耗时   2001 s
        
```

cntext可以用于扩展词典

In [None]:
# Run the seed-word lookup once for pos.txt and once for neg.txt, with topn=100 each
# (the run log describes this step as preparing similar candidate words per seed).
model.find(seedword_txt_file='pos.txt', 
           topn=100)
model.find(seedword_txt_file='neg.txt', 
           topn=100)

```
Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...
Step 4/4 完成! 耗时 2302 s
Step 3/4:...准备 每个seed在word2vec模型中的相似候选词...
Step 4/4 完成! 耗时 2303 s
```

在代码所在文件夹内可以找到 
- output/w2v_candi_words/w2v.model
- 新的  pos.txt
- 新的  neg.txt


新的pos.txt是对pos.txt词典的扩展。

<br>


## 导入模型
有的时候数据量特别大，模型训练十分不易。

这时，保存已训练好的模型，不仅下次无需用同样的数据重新训练，还可以分享给其他人使用。


训练结束后，在代码所在文件夹内可以找到 ``output/w2v_candi_words/w2v.model``

In [4]:
from gensim.models import KeyedVectors

# Load the saved vectors from the output folder into w2v_model.
w2v_model = KeyedVectors.load('output/w2v_candi_words/w2v.model')
# Echo the loaded object so its repr is shown as the cell output.
w2v_model

<gensim.models.keyedvectors.KeyedVectors at 0x7fb0c8fedc70>

## 玩转词向量
感觉用用户级的数据（如在线评论）训练出的词向量会更准一些：词与词之间的方向关系，以及近义、反义关系，在向量中都有体现。

例如本文使用的是28部电影的2125056条影评，一般评论内容包含电影相关信息，如电影题材、是否值得观影等。

In [87]:
# Look up the embedding vector stored for a single word.
w2v_model.get_vector(key='给力')

array([ 0.06488553,  0.74188954,  0.25468495,  0.89755714,  1.8139195 ,
       -0.6950082 ,  0.24339403, -1.2188634 ,  0.543618  , -0.9988698 ,
        0.27471313,  0.9325699 , -0.5860608 , -0.5081917 ,  1.6423215 ,
       -0.0490295 , -0.3927043 ,  0.659067  ,  0.03185922, -1.021391  ,
       -1.3214804 , -0.28208104, -0.7819419 , -0.30637202, -1.5944146 ,
       -0.12383854, -0.70463836,  0.45689437,  1.223081  , -1.9453759 ,
       -0.5538997 , -0.9750523 , -0.10031194, -0.9568689 ,  0.30341247,
        1.1102395 ,  0.667315  , -1.1600997 , -0.26674765, -0.55144155,
       -0.3246094 ,  0.82902473, -0.47339582, -0.9009957 ,  1.7722464 ,
        0.28959563, -0.03453476,  0.4786787 , -0.48074463, -0.23090109,
       -0.49390873,  0.71246386,  2.1557336 ,  2.4899387 , -0.51481706,
        0.5579966 , -0.6973235 , -1.1408254 ,  0.72495663, -1.0326954 ,
       -0.5455598 ,  0.98941576, -1.2155218 , -0.9088408 ,  1.9184568 ,
       -0.21800426, -1.2009395 ,  0.29684314,  1.3672423 , -2.26

In [88]:
# From the candidate keys, return the one most similar to the reference key.
w2v_model.most_similar_to_given('太空', ['爱情', '悬疑', '飞船', '历史', '战争'])

'飞船'

In [142]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the two word vectors, computed with scikit-learn.
# Each vector is wrapped in a list so it is passed as a one-row matrix.
cosine_similarity(X=[w2v_model.get_vector('理想')],
                  Y=[w2v_model.get_vector('现实')])[0, 0]

0.5371934

In [144]:
# The same cosine measure, obtained through gensim with one-word key sets.
w2v_model.n_similarity(ws1=['理想'], ws2=['现实'])

0.5371934

In [143]:
# Cosine similarity between two sets of keys (praise words vs. complaint words).
w2v_model.n_similarity(ws1=['给力', '精彩', '赞', '推荐'],
                       ws2=['无聊', '尴尬', '垃圾'])

0.35008422

In [132]:
# Cosine similarity between the {理想, 梦想} key set and the {现实, 生活} key set.
w2v_model.n_similarity(ws1=['理想', '梦想'], ws2=['现实', '生活'])

0.48020104

In [92]:
# List every key that lies closer to the first key than the second key does.
w2v_model.closer_than('理想', '现实')

['梦想', '妥协', '追梦', '愿望', '骨感']

In [159]:
# Stated analogy: 开心 - 难过 ~= 享受 - d
# NOTE(review): solving that relation for d gives c - a + b, whereas the expression at the
# end of this cell evaluates a - b + c (开心 - 难过 + 享受) — confirm which one is intended.
a = w2v_model.get_vector('开心')
b = w2v_model.get_vector('难过')
c = w2v_model.get_vector('享受')
# Candidate vector for the unknown d, shown as the cell output.
a-b+c

array([ 1.9152211 , -1.7800813 ,  0.09506518,  3.659023  , -1.6512613 ,
       -2.6480322 , -1.6235403 , -2.2486987 , -0.71400046, -0.1728031 ,
       -1.6929617 ,  0.37878633, -3.7019467 ,  1.1462314 ,  5.63078   ,
       -1.2278146 , -0.05264823,  0.5286775 ,  4.2917743 , -0.45218396,
        3.3316884 , -0.09896609,  0.4882021 , -3.69908   , -6.1497726 ,
       -0.88942206, -1.3687794 ,  2.216267  ,  1.5074555 ,  0.33952543,
        0.20799491,  1.4562494 ,  3.1210434 , -4.191899  ,  0.9389907 ,
        2.7055738 ,  2.4715266 , -2.9045568 , -0.07290494,  2.5370574 ,
       -3.8020585 , -2.1317313 , -3.217988  , -3.2676027 , -0.73200524,
       -0.52575606, -0.71774274,  0.8610803 ,  3.2065763 ,  3.887546  ,
        0.37415326, -2.172914  , -0.7803238 ,  1.6013707 , -2.8930688 ,
        2.0645087 ,  0.3473823 ,  0.8250154 ,  4.3268127 , -0.00809503,
        1.2304852 , -1.6286395 ,  0.31568825, -0.05730438, -0.17140332,
       -3.1898942 ,  0.15534413, -2.7835934 , -1.3714645 , -1.35

In [160]:
# Nearest vocabulary entries to the combined vector built from a, b and c.
w2v_model.similar_by_vector(vector=a - b + c, topn=10)

[('享受', 0.7833479046821594),
 ('开心', 0.6825607419013977),
 ('愉快', 0.6298696994781494),
 ('娱乐', 0.6215130090713501),
 ('感官', 0.6085000038146973),
 ('图个', 0.6052624583244324),
 ('图一乐', 0.6039161682128906),
 ('休闲', 0.60273677110672),
 ('视觉享受', 0.6006160378456116),
 ('轻松愉快', 0.5961319804191589)]

In [None]:
# Top-10 words nearest to the three praise words with the negative word subtracted out.
w2v_model.most_similar(['给力', '精彩', '过瘾'], ['垃圾'], topn=10)

[('激动人心', 0.6859163045883179),
 ('惊心动魄', 0.6767394542694092),
 ('带感', 0.6723690032958984),
 ('惊险刺激', 0.667783796787262),
 ('刺激', 0.6445038318634033),
 ('燃', 0.6429688930511475),
 ('爽快', 0.6287934184074402),
 ('带劲', 0.6254130005836487),
 ('爽', 0.624543309211731),
 ('酣畅淋漓', 0.6140543818473816)]

In [None]:
# NOTE(review): this repeats the query of the preceding cell, and the output recorded
# beneath it (an array followed by a 享受-led list) matches other cells — it looks stale;
# confirm whether this cell is still needed.
w2v_model.most_similar(
    positive=['给力', '精彩', '过瘾'],
    negative=['垃圾'],
    topn=10,
)

array([ 1.9152211 , -1.7800813 ,  0.09506518,  3.659023  , -1.6512613 ,
       -2.6480322 , -1.6235403 , -2.2486987 , -0.71400046, -0.1728031 ,
       -1.6929617 ,  0.37878633, -3.7019467 ,  1.1462314 ,  5.63078   ,
       -1.2278146 , -0.05264823,  0.5286775 ,  4.2917743 , -0.45218396,
        3.3316884 , -0.09896609,  0.4882021 , -3.69908   , -6.1497726 ,
       -0.88942206, -1.3687794 ,  2.216267  ,  1.5074555 ,  0.33952543,
        0.20799491,  1.4562494 ,  3.1210434 , -4.191899  ,  0.9389907 ,
        2.7055738 ,  2.4715266 , -2.9045568 , -0.07290494,  2.5370574 ,
       -3.8020585 , -2.1317313 , -3.217988  , -3.2676027 , -0.73200524,
       -0.52575606, -0.71774274,  0.8610803 ,  3.2065763 ,  3.887546  ,
        0.37415326, -2.172914  , -0.7803238 ,  1.6013707 , -2.8930688 ,
        2.0645087 ,  0.3473823 ,  0.8250154 ,  4.3268127 , -0.00809503,
        1.2304852 , -1.6286395 ,  0.31568825, -0.05730438, -0.17140332,
       -3.1898942 ,  0.15534413, -2.7835934 , -1.3714645 , -1.35

[('享受', 0.7833479046821594),
 ('开心', 0.6825607419013977),
 ('愉快', 0.6298696994781494),
 ('娱乐', 0.6215130090713501),
 ('感官', 0.6085000038146973),
 ('图个', 0.6052624583244324),
 ('图一乐', 0.6039161682128906),
 ('休闲', 0.60273677110672),
 ('视觉享受', 0.6006160378456116),
 ('轻松愉快', 0.5961319804191589)]

In [151]:
# Stated analogy: 伦敦 - 巴黎 ~= 英国 - 法国
# NOTE(review): if the unknown d is meant to be 法国, that relation gives d = c - a + b,
# whereas the expression at the end of this cell evaluates a - b + c
# (伦敦 - 巴黎 + 英国) — confirm which one is intended.
a = w2v_model.get_vector('伦敦')
b = w2v_model.get_vector('巴黎')
c = w2v_model.get_vector('英国')
# Candidate vector for the unknown d, shown as the cell output.
a-b+c

array([ 1.007507  , -0.1820916 ,  0.6697757 ,  0.88806605,  0.6691683 ,
        1.7563941 , -0.85471684,  0.8821401 , -0.2569713 , -0.44992787,
        0.6418032 ,  0.7345715 , -0.43072355,  0.46093544, -0.7151015 ,
       -2.056971  ,  1.0439651 , -0.42023313, -2.5863557 ,  0.03025109,
        0.13442376, -0.38248014, -0.8919534 , -0.23705997, -0.7003383 ,
        0.07063363,  0.68377745,  0.47872806,  0.6220162 ,  1.5900486 ,
       -0.4905202 , -0.95338947,  0.47789815, -0.06848627,  0.6765355 ,
        0.3313188 , -0.04952534, -1.4087405 , -0.81394625,  0.05646482,
       -0.49954718,  0.28358608,  0.87388206,  0.08769192,  0.57763284,
        1.9625483 , -1.3734789 ,  0.16545641, -0.5438298 ,  0.30143985,
        0.00978538, -1.137804  ,  0.22528765,  0.30789745,  1.9370279 ,
        0.01426886,  0.4425897 ,  0.3120418 ,  0.42083612, -0.09379338,
        1.2180228 , -0.02684045,  0.663282  ,  0.22101693,  0.80935216,
        0.30412102,  0.585623  , -0.8644108 ,  0.32378218,  0.15

In [152]:
# Nearest vocabulary entries to the combined vector built from a, b and c.
w2v_model.similar_by_vector(vector=a - b + c, topn=10)

[('院线', 0.4055056571960449),
 ('浩克', 0.39985811710357666),
 ('公安部', 0.39095765352249146),
 ('一波', 0.3905521631240845),
 ('国庆', 0.3844847083091736),
 ('国产', 0.3787783980369568),
 ('灭霸', 0.37772294878959656),
 ('贺岁', 0.3775702714920044),
 ('年底', 0.3746964931488037),
 ('业界', 0.3677688539028168)]

In [None]:
# Idea to explore: vec('理想') + vec('坚持') ~= vec('现实') + vec('骨感')
# Kept as a comment because the bare expression is not valid Python and would raise a
# SyntaxError when the notebook is run top to bottom.
# NOTE(review): the original note wrote '骨干'; '骨感' is presumably what was meant — confirm.

In [None]:
# Idea to explore: find x such that vec('生活') + vec('梦想') = vec('理想') + vec(x)
# Kept as a comment because assigning to an expression is not valid Python and would
# raise a SyntaxError when the notebook is run top to bottom.



In [125]:
# Top-10 words nearest to 生活 + 理想 with 梦想 subtracted out.
w2v_model.most_similar(['生活', '理想'], ['梦想'], topn=10)

[('现实', 0.720781683921814),
 ('现实生活', 0.7144508957862854),
 ('梦想', 0.710375189781189),
 ('人生', 0.6180729866027832),
 ('自由', 0.6059634685516357),
 ('向往', 0.5970909595489502),
 ('渴望', 0.5810344219207764),
 ('不如意', 0.5783224701881409),
 ('奋斗', 0.5777333378791809),
 ('憧憬', 0.5702699422836304)]

In [126]:
# Top-10 words nearest to 坚持 + 理想 with 放弃 subtracted out.
w2v_model.most_similar(['坚持', '理想'], ['放弃'], topn=10)

[('梦想', 0.6077551245689392),
 ('向往', 0.46950867772102356),
 ('残酷', 0.4592956304550171),
 ('骨感', 0.4574880301952362),
 ('追梦', 0.4359501600265503),
 ('精神', 0.4269462525844574),
 ('向上', 0.42693281173706055),
 ('奋斗', 0.4259515404701233),
 ('执着', 0.4232107698917389),
 ('愿望', 0.4147609770298004)]

In [128]:
# Raw embedding vector stored for 理想.
w2v_model.get_vector(key='理想')

array([-1.5740788 , -1.1827817 , -2.0982406 , -1.8619287 , -1.9427722 ,
        0.89860815, -0.5405978 , -1.0203604 ,  1.0329467 ,  0.41594455,
       -0.04104965, -0.22347333,  0.4418202 , -1.379047  ,  0.8790959 ,
       -0.2371473 , -1.9836859 , -2.2639606 ,  1.0326494 ,  1.1482949 ,
        0.9831494 , -0.56956273,  0.71161836,  1.0069033 ,  0.47646594,
       -0.65231335, -0.54108256,  1.5010564 , -1.8829947 , -1.976282  ,
       -0.8263211 ,  1.3937911 ,  0.9372222 ,  3.0390866 ,  1.2789438 ,
       -1.0426775 , -1.7056977 ,  0.08206071,  0.3006815 ,  0.164831  ,
        0.42811653, -1.4167889 , -1.0358403 , -0.99934614, -1.7999685 ,
       -0.05767645, -1.2424793 , -0.86518973,  0.6469621 , -1.0046158 ,
       -1.2974502 ,  2.3626518 , -0.08805981,  1.2402917 , -0.10417724,
       -2.237515  , -0.05165293,  1.3529747 ,  0.1513781 ,  0.4328735 ,
       -2.5062704 ,  0.48801896, -0.8842916 , -2.6616333 ,  0.84164405,
       -1.2898173 , -1.0246799 , -1.7479022 , -0.95494103,  1.02

In [129]:
# Raw embedding vector stored for 现实.
w2v_model.get_vector(key='现实')

array([-2.62357759e+00, -1.12201679e+00, -1.35010016e+00, -9.06375885e-01,
       -8.44908416e-01, -1.17541742e+00, -2.70668983e+00, -1.13971758e+00,
        1.41478813e+00, -1.93978393e+00, -1.01404130e+00, -1.08490455e+00,
       -2.16125384e-01, -3.16928685e-01,  1.39019653e-01, -5.70508957e-01,
       -5.35224259e-01,  1.31409988e-01,  1.37573099e+00,  1.24135053e+00,
        2.00920248e+00, -4.08619076e-01, -1.73826182e+00,  5.34517002e+00,
       -1.49165249e+00,  1.21048737e+00,  1.19520001e-01,  1.19064927e+00,
       -1.45098913e+00,  6.02748767e-02, -1.61535919e-01,  9.57701743e-01,
       -1.17666078e+00,  1.28897297e+00,  3.05026472e-01, -5.99268436e-01,
       -1.63650227e+00, -1.78395641e+00,  1.34983480e+00, -2.59409666e-01,
       -8.29024971e-01, -1.62081158e+00,  7.98488915e-01, -5.79234302e-01,
       -2.03107572e+00,  1.90959501e+00,  1.90077388e+00,  1.62042901e-01,
        1.96331277e-01, -2.27028966e+00,  4.26672131e-01,  7.86847115e-01,
       -2.45918930e-01,  

In [130]:
from scipy import spatial

# NOTE(review): dataSetI and dataSetII are not referenced anywhere else in this cell —
# they look like leftovers from a copied example; confirm nothing later uses them before removing.
dataSetI = [3, 45, 7, 2]
dataSetII = [2, 54, 13, 15]
# Cosine similarity recovered as 1 minus SciPy's cosine distance between the two word vectors.
result = 1 - spatial.distance.cosine(w2v_model.get_vector('现实'), w2v_model.get_vector('理想'))

# Show the value as the cell output.
result

0.5371934771537781