## 1. Gensim Word2Vec
+ Python NLP开源库，可以用来训练自己的word2vec model
+ 官网: https://radimrehurek.com/gensim/

In [1]:
from gensim.models import Word2Vec

In [3]:
# Toy corpus: each inner list is one already-tokenized sentence.
sentences = [['cat', 'say', 'meow'], ['dog', 'say', 'woof']]
# min_count=1 keeps every word; with the default threshold (5) all words in
# this tiny corpus would be discarded as too rare.
model = Word2Vec(sentences, min_count=1)

**Word2Vec hyperparameters**:
   + size:词向量的维度，默认为100
   + window: 训练时单词的窗口大小，默认为5
   + min_count: 单词出现的最少次数，低于该值的单词会被忽略，默认为5
   + workers:训练时使用的工作线程，默认为3
   + sg: 使用的训练算法，CBOW(0) or skip gram(1)，默认为0（CBOW）

In [6]:
# Query nearest neighbours through the KeyedVectors object (`model.wv`), as the
# later cells do; calling `most_similar` on the model itself is deprecated and
# emits a warning (it is removed in gensim 4).
model.wv.most_similar('cat')

  """Entry point for launching an IPython kernel.


[('dog', 0.1765635907649994),
 ('meow', 0.0343756377696991),
 ('woof', -0.03496800363063812),
 ('say', -0.06903990358114243)]

In [7]:
# Look up the learned embedding vector for the word 'cat'.
model.wv['cat']

array([-1.0252096e-03,  3.5024254e-04,  8.9952600e-04,  8.1324240e-04,
        2.2397263e-03, -9.5045325e-06, -4.9190847e-03,  1.0138294e-03,
       -6.2838451e-05, -9.7167504e-04,  3.3577061e-03, -4.2472850e-03,
       -4.1422326e-04,  4.7333757e-03, -2.8519572e-03, -1.1334373e-03,
        5.5171543e-04, -1.1575579e-03,  4.9818577e-03,  4.5214556e-03,
        4.5040757e-03,  4.9387119e-03,  3.3122343e-03, -8.6123793e-04,
        4.6829185e-03,  3.5381521e-04,  7.6662033e-04,  4.2540319e-03,
       -2.3883095e-03,  2.0248261e-03, -1.7996741e-03, -3.7043875e-03,
       -1.3572722e-03,  3.1715005e-03, -2.5201011e-03,  2.6401537e-03,
       -2.2769771e-03,  2.3374776e-03, -2.5612239e-03,  1.2790461e-03,
       -4.5682681e-03, -4.1522128e-03,  2.6593274e-03, -3.3317008e-03,
        3.6170965e-03, -2.4682209e-03, -4.8208451e-03, -1.1158064e-03,
       -4.0345476e-03,  2.3370866e-04, -2.6659463e-03, -4.7948947e-03,
        3.7836691e-03,  1.1267568e-03, -1.6608114e-03,  4.6734572e-03,
      

## 2. 使用新闻语料库训练Word2Vec

In [9]:
# Path to the news corpus CSV, relative to the notebook's working directory.
database = '../../DATA/news_data.csv'

In [10]:
import pandas as pd

In [44]:
# Load the news CSV into a DataFrame; the file is decoded as GB18030.
content = pd.read_csv(database, encoding='gb18030')

In [47]:
# Discard rows containing missing values so later text processing only
# receives non-null cells.
content = content.dropna()

### 2.1 分词

In [59]:
import re
import jieba
from gensim.models.word2vec import LineSentence

In [49]:
# Select the 'content' column (presumably the article body text — confirm
# against the CSV schema).
CONTENT = content['content']

In [30]:
def token(string):
    """Strip punctuation from *string*, keeping only word characters.

    Every maximal run of word characters (Unicode letters, CJK characters,
    digits and underscore) is extracted and the runs are joined with single
    spaces.

    Args:
        string: The raw text to clean.

    Returns:
        The word-character runs of *string* separated by single spaces; an
        empty string when *string* contains no word characters.
    """
    # Raw string avoids invalid-escape warnings.  `\w` already covers digits,
    # and a `|` inside a character class would be a literal pipe rather than
    # alternation, so the class is reduced to a plain `\w+`.
    return ' '.join(re.findall(r'\w+', string))

In [51]:
def cut(string):
    """Segment *string* with jieba and return the pieces joined by single spaces."""
    segments = jieba.cut(string)
    return ' '.join(segments)

In [54]:
# Clean and segment every document, then write one space-separated document
# per line.  The encoding is pinned to UTF-8 because LineSentence decodes the
# file as UTF-8 when it is read back; relying on the platform default would
# corrupt (or fail on) the Chinese text on systems whose default differs.
with open('news_data_cut.txt', 'w', encoding='utf-8') as f:
    for s in CONTENT:
        f.write(cut(token(s)) + '\n')

### 2.2 训练Word2Vec模型

In [56]:
# Stream the segmented corpus from disk: one sentence per line, tokens
# separated by whitespace.
sentences = LineSentence('news_data_cut.txt')

In [57]:
# Train Word2Vec on the news corpus; min_count=1 keeps even words that occur
# only once in the vocabulary.
model = Word2Vec(sentences, min_count=1)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [60]:
# Look up the learned embedding vector for '小米' (Xiaomi).
model.wv['小米']

array([-3.16706210e-01, -4.45590466e-01,  1.32990524e-03, -9.31498170e-01,
       -3.96499366e-01, -4.73828107e-01, -3.21005732e-01, -6.74238086e-01,
       -5.44709146e-01, -6.47872210e-01, -8.28664362e-01, -2.16736943e-01,
       -1.08737320e-01,  9.50725794e-01,  1.04933694e-01,  7.45153308e-01,
        4.54015374e-01, -5.68031132e-01, -1.88064668e-02,  1.58507094e-01,
       -1.58174515e-01, -4.14594412e-01,  3.79645318e-01,  1.06192879e-01,
       -1.17517292e+00, -1.69858977e-01, -5.77672422e-01, -8.26854482e-02,
       -9.19363618e-01, -7.91207373e-01,  2.24485546e-01, -2.20182985e-01,
       -9.15670916e-02,  4.60807294e-01,  9.77684975e-01, -5.67515083e-02,
       -7.53349578e-03, -1.04793096e+00,  9.65224743e-01,  1.02000880e+00,
        4.14113030e-02,  8.28962803e-01,  5.36135733e-01, -6.06545210e-01,
       -1.04322064e+00,  1.63138911e-01,  9.12252665e-02, -5.56860380e-02,
        1.11694467e+00, -1.21796392e-01,  1.26015615e+00,  6.06578588e-01,
       -6.64035439e-01, -

In [67]:
# List the words closest to '小米' (Xiaomi) with their similarity scores.
model.wv.most_similar('小米')

[('代工', 0.7128967046737671),
 ('空调', 0.6765233278274536),
 ('Paytm', 0.6634405851364136),
 ('华为', 0.6613297462463379),
 ('苹果', 0.6596662998199463),
 ('茶叶', 0.6523712277412415),
 ('高科技', 0.6487736105918884),
 ('中药', 0.6457774639129639),
 ('生鲜', 0.6370936036109924),
 ('电商', 0.6343600749969482)]

In [65]:
# List the words closest to '苹果' (Apple / apple) with their similarity scores.
model.wv.most_similar('苹果')

[('谷歌', 0.682900607585907),
 ('亚马逊', 0.6691598892211914),
 ('小米', 0.6596662998199463),
 ('微软', 0.6547811031341553),
 ('音箱', 0.6402027606964111),
 ('华为', 0.636533260345459),
 ('百度', 0.6337459087371826),
 ('智能', 0.6224048733711243),
 ('智能手机', 0.6182833909988403),
 ('空调', 0.6028869152069092)]

In [66]:
# List the words closest to '手机' (mobile phone) with their similarity scores.
model.wv.most_similar('手机')

[('电脑', 0.8197458982467651),
 ('扫码', 0.7714399695396423),
 ('APP', 0.7663639187812805),
 ('刷', 0.7377079725265503),
 ('二维码', 0.7342149019241333),
 ('智能手机', 0.7300183176994324),
 ('摄像头', 0.7200523018836975),
 ('下单', 0.7168540954589844),
 ('短信', 0.7141807079315186),
 ('屏幕', 0.7110484838485718)]

## 3. NER 和 Dependency Parsing 
+ 区分出人名（person），组织机构名（organization）和地点（location）
+ 哈工大 LTP：https://github.com/HIT-SCIR/ltp
+ Stanford CoreNLP：https://stanfordnlp.github.io/CoreNLP/