# Word embedding

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sys
sys.path.insert(0, '..')

In [4]:
import os
import json
import jieba
import gensim
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import tensorflow as tf
import tensorflow_datasets as tfds

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [5]:
%matplotlib inline

In [6]:
tf.enable_eager_execution()

## Load dataset

In [8]:
from senti_analysis.data import load_data_set

In [9]:
train_data_set, validation_data_set, test_data_set = load_data_set()

In [14]:
train_data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105000 entries, 0 to 104999
Data columns (total 22 columns):
id                                          105000 non-null int64
content                                     105000 non-null object
location_traffic_convenience                105000 non-null int64
location_distance_from_business_district    105000 non-null int64
location_easy_to_find                       105000 non-null int64
service_wait_time                           105000 non-null int64
service_waiters_attitude                    105000 non-null int64
service_parking_convenience                 105000 non-null int64
service_serving_speed                       105000 non-null int64
price_level                                 105000 non-null int64
price_cost_effective                        105000 non-null int64
price_discount                              105000 non-null int64
environment_decoration                      105000 non-null int64
environment_noise             

In [17]:
len(train_data_set), len(validation_data_set), len(test_data_set)

(105000, 15000, 15000)

In [18]:
content = train_data_set['content'] + validation_data_set['content'] + test_data_set['content']

In [35]:
content = np.concatenate([np.array(train_data_set['content']), 
                          np.array(validation_data_set['content']),
                          np.array(test_data_set['content'])])

In [36]:
len(content)

135000

## Train word vector

In [37]:
def cut(text):
    return jieba.lcut(text)

In [39]:
# cut('今天北京天气不错')

In [42]:
sentences = map(cut, content)

In [50]:
model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)

In [56]:
model.save('w2v.model')

In [57]:
model = gensim.models.Word2Vec.load('w2v.model')

In [58]:
model['难吃']

array([-2.3488642e-03, -1.5344648e-03, -1.5566864e-03,  3.7902326e-04,
       -2.6193100e-03, -1.4347362e-03,  3.6481237e-05,  4.6536615e-04,
       -4.0649245e-03, -2.0662528e-03,  2.6442355e-03, -3.0504286e-03,
       -4.7339215e-03,  4.0190141e-03, -1.9351526e-03,  1.7505509e-03,
        3.6532586e-03, -3.3966466e-03,  2.2384415e-04,  4.7573023e-03,
       -3.9277603e-03, -3.5877095e-03,  1.3816286e-03, -2.0394996e-03,
       -4.2933300e-03,  7.0108863e-04, -4.1266875e-03,  1.2294045e-03,
        1.6835389e-04, -8.0646662e-04,  2.2160515e-04,  1.2269986e-03,
       -7.9008844e-04, -2.3935421e-03,  1.7712463e-03, -2.3664902e-03,
       -1.6686361e-03,  3.2037019e-03,  3.3067637e-03,  4.0673264e-03,
        2.8056928e-03, -1.0077041e-03, -1.0118972e-03, -2.7633724e-03,
        2.7836757e-03, -2.5133409e-03,  1.7807141e-03,  1.3973416e-03,
       -3.4294503e-03, -1.4809686e-03, -3.7603283e-03,  2.1080626e-03,
       -3.9907265e-03, -2.5366067e-03,  4.0879007e-03, -3.8268329e-03,
      

In [59]:
model.most_similar('难吃')

[('地缝', 0.4177316725254059),
 ('清酱', 0.40707892179489136),
 ('一溜烟儿', 0.4059597849845886),
 ('空台', 0.3946380615234375),
 ('听听看', 0.3938753604888916),
 ('云尚', 0.38981521129608154),
 ('普遍现象', 0.3804473280906677),
 ('达新', 0.37946367263793945),
 ('布菜', 0.37557995319366455),
 ('为表', 0.3741552531719208)]