# 使用gensim来体验word2vec 

[docs](https://radimrehurek.com/gensim/auto_examples/)

In [2]:
import gensim.downloader as api
# wv = api.load('word2vec-google-news-300') #1662.8MB

## 训练自己的模型

使用政府工作报告和五年规划的语料做训练

### 获取语料

In [38]:
import os
import requests
import hashlib

def cache_dir():
    work_dir = "./.cache"
    if not os.path.isdir(work_dir):
        os.makedirs(work_dir)
    return work_dir

def data_dir():
    datadir = "./data"
    if not os.path.isdir(datadir):
        os.makedirs(datadir)
    return datadir

def get(url, cache=True, force=False):
    md5 = hashlib.md5(url.encode('utf-8')).hexdigest()
    f = "%s/%s" % (cache_dir(), md5)
    if cache and not force and os.path.isfile(f):
        return open(f, "rb").read()
    content = requests.get(url).content
    if cache:
        open(f,"wb").write(content)
    return content

In [28]:
import lxml
import lxml.html
def crawl_report_list():
    '''
    抓取政府工作报告列表
    http://www.gov.cn/guoqing/2006-02/16/content_2616810.htm
    '''
    content = get("http://www.gov.cn/guoqing/2006-02/16/content_2616810.htm")
    if content is None: return []
    doc = lxml.html.document_fromstring(content)
    return doc.xpath("*//td//a/@href")

In [29]:
def crawl_plan_list():
    return ['http://www.npc.gov.cn/wxzl/gongbao/2000-12/26/content_5001764.htm', #八五
            'http://www.npc.gov.cn/wxzl/gongbao/2000-12/28/content_5002538.htm',
            'http://www.npc.gov.cn/wxzl/gongbao/2001-01/02/content_5003506.htm',
            'http://www.npc.gov.cn/wxzl/gongbao/2001-03/19/content_5134505.htm',
            'http://www.gov.cn/ztzl/2006-03/16/content_228841.htm',
            'http://www.gov.cn/2011lh/content_1825838.htm',
            'http://politics.people.com.cn/n/2015/1103/c1001-27772701-2.html', #十三五
            ]

In [30]:
urls = crawl_report_list() + crawl_plan_list()

In [40]:
import pyce3

for url in urls:
    enc, t, title, text, next_url = pyce3.parse(url, get(url))
    text = text.strip()
    title = title.strip()
    if len(text) > 0 and len(title) > 0:
        open(data_dir()+"/%s.txt"%title, "w").write(text)

In [41]:
import glob
files = glob.glob(data_dir() + "/*.txt")

In [48]:
text = open(files[15]).read()

In [50]:
import re
text = re.sub(pyce3.RE_TAG, '', text)

In [66]:
import unicodedata
import uniseg.wordbreak
import uniseg.sentencebreak
olines = [unicodedata.normalize('NFKC', x.strip()) for x in text.split('\n') if x.strip() != '']
lines = [unicodedata.normalize('NFKC', x.strip()) for x in uniseg.sentencebreak.sentences(text) if x.strip() != '']

In [72]:
words = [x for x in uniseg.wordbreak.words(lines[2])]

In [94]:
def corpus(files):
    ret = []
    for f in files:
        text = re.sub(pyce3.RE_TAG, '', open(f).read())
        lines = [unicodedata.normalize('NFKC', x.strip())\
                 #for x in uniseg.sentencebreak.sentences(text)\
                 for x in text.split('\n')\
                 if x.strip() != '']
        for line in lines:
            ret.append([x for x in uniseg.wordbreak.words(line)])
    return ret

In [95]:
texts = corpus(files)

### 训练模型

In [100]:
from gensim.models import Word2Vec
model = Word2Vec(texts)

In [101]:
for i, word in enumerate(model.wv.vocab):
    if i == 10:
        break
    print(word)

1996
年
政
府
工
作
报
告
关
于


In [105]:
model.wv.most_similar('我')

[('它', 0.7728102207183838),
 ('他', 0.655555009841919),
 ('家', 0.5821338891983032),
 ('祖', 0.549433708190918),
 ('尤', 0.39443439245224),
 ('帝', 0.38537460565567017),
 ('际', 0.3738643229007721),
 ('全', 0.3556334376335144),
 ('内', 0.35547029972076416),
 ('中', 0.3451741933822632)]

### 词语挖掘

通过word2phrase

In [106]:
from gensim.models import Phrases

In [107]:
ret = Phrases(texts)

In [114]:
ret[texts[0]]

['1996', '年', '政_府', '工', '作', '报_告']