# 練習以gensim訓練詞向量：
## 利用wiki data，以skip-gram model來建立word2vec向量模型。

### Modules

In [None]:
from collections import Counter
import nltk
import json
import pandas
import pickle
import gensim
from gensim import corpora, models, similarities, matutils
from gensim.corpora import WikiCorpus
from gensim import models
import logging
from gensim.models import word2vec
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
import sys
import jieba

# 1. 取得語料
### 1-1 取得中文維基數據，本次練習是採用 2018/12/20 的資料。（https://zh.wikipedia.org/wiki/Wikipedia:%E6%95%B0%E6%8D%AE%E5%BA%93%E4%B8%8B%E8%BD%BD）
### 1-2 將下載後的維基數據置於與專案同個目錄，再使用 wiki_to_txt.py 從 xml 中提取出維基文章

In [None]:
# Dump every article of the Chinese Wikipedia archive into a plain-text file:
# one article per line, tokens separated by single spaces.
input_file = 'zhwiki-20181220-pages-articles.xml.bz2'

# NOTE(review): the empty `dictionary` is presumably passed so WikiCorpus skips
# building its own vocabulary from the dump, and `lemmatize` is a gensim 3.x
# argument -- confirm both against the installed gensim version.
wiki = gensim.corpora.WikiCorpus(input_file, lemmatize=False, dictionary={})

# The corpus is created first so that a bad archive path does not truncate an
# existing zhwiki.txt. The context manager guarantees the output is flushed and
# closed even if extraction fails part-way; the handle used to be left open.
with open('zhwiki.txt', encoding='utf8', mode='w') as f:
    for text in wiki.get_texts():
        f.write(' '.join(text) + '\n')

In [None]:
{
  "name": "Traditional Chinese to Simplified Chinese",
  "segmentation": {
    "type": "mmseg",
    "dict": {
      "type": "ocd",
      "file": "TSPhrases.ocd"
    }
  },
  "conversion_chain": [{
    "dict": {
      "type": "group",
      "dicts": [{
        "type": "ocd",
        "file": "TSPhrases.ocd"
      }, {
        "type": "ocd",
        "file": "TSCharacters.ocd"
      }]
    }
  }]
}

# 2. 使用 OpenCC 將維基文章統一轉換為繁體中文

In [5]:
# opencc -i zhwiki.txt -o zhwiki_tw.txt -c s2twp.json

# 3.  使用jieba 對文本斷詞，並去除停用詞

In [None]:
def main(dict_path='/Users/airmac/Desktop/PTT/crawler/userdictionary/dict2_PTT.txt',
         stopwords_path='/Users/airmac/Desktop/PTT/crawler/stopwords.txt',
         input_path='zhwiki_tw.txt',
         output_path='wiki_seg.txt'):
    """Segment the converted wiki text with jieba and drop stopwords.

    Each line of ``input_path`` is cut with jieba; every token that is not in
    the stopword set is written to ``output_path`` followed by a single space,
    and each input line produces exactly one output line.

    Args:
        dict_path: Dictionary file handed to ``jieba.set_dictionary``.
        stopwords_path: UTF-8 text file holding one stopword per line.
        input_path: UTF-8 text file to segment, one document per line.
        output_path: Destination of the segmented text (overwritten).

    The defaults reproduce the paths the notebook was originally run with;
    pass your own paths when running on another machine.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary(dict_path)

    # Load the stopword set (only the newline is stripped from each entry).
    with open(stopwords_path, 'r', encoding='utf-8') as stopwords:
        stopword_set = {stopword.strip('\n') for stopword in stopwords}

    # Both files are managed by one `with` so the output is always closed,
    # even when segmentation raises. The input is opened first so a missing
    # input file does not leave behind an empty output file.
    with open(input_path, 'r', encoding='utf-8') as content, \
            open(output_path, 'w', encoding='utf-8') as output:
        for texts_num, line in enumerate(content, start=1):
            words = jieba.cut(line.strip('\n'), cut_all=False)
            for word in words:
                if word not in stopword_set:
                    output.write(word + ' ')
            output.write('\n')

            # Progress report every 10,000 lines.
            if texts_num % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞", texts_num)


if __name__ == '__main__':
    main()

# 4. 使用gensim 的 word2vec 模型進行訓練

In [10]:
def main(corpus_path='./wiki_seg.txt', model_path="word2vec.model_wiki"):
    """Train a skip-gram word2vec model on the segmented corpus and save it.

    Args:
        corpus_path: Text file read through ``word2vec.LineSentence``.
        model_path: Where the trained model is saved, and reloaded from.

    Returns:
        The model as reloaded from ``model_path``, so the caller receives the
        object that was actually persisted instead of it being discarded.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    sentences = word2vec.LineSentence(corpus_path)

    # sg=1 selects skip-gram; vectors have 250 dimensions and words seen fewer
    # than 3 times are dropped from the vocabulary.
    model_SG = word2vec.Word2Vec(sentences, size=250, window=10, sg=1, min_count=3)

    # Persist the model for later use.
    model_SG.save(model_path)

    # Reload from disk to demonstrate (and verify) how the model is read back.
    return word2vec.Word2Vec.load(model_path)


if __name__ == "__main__":
    main()

2019-05-22 22:01:52,770 : INFO : collecting all words and their counts
2019-05-22 22:01:52,778 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-05-22 22:01:52,866 : INFO : collected 33301 word types from a corpus of 102451 raw words and 60 sentences
2019-05-22 22:01:52,868 : INFO : Loading a fresh vocabulary
2019-05-22 22:01:52,919 : INFO : effective_min_count=3 retains 5983 unique words (17% of original 33301, drops 27318)
2019-05-22 22:01:52,920 : INFO : effective_min_count=3 leaves 71078 word corpus (69% of original 102451, drops 31373)
2019-05-22 22:01:52,951 : INFO : deleting the raw counts dictionary of 33301 items
2019-05-22 22:01:52,955 : INFO : sample=0.001 downsamples 21 most-common words
2019-05-22 22:01:52,960 : INFO : downsampling leaves estimated 68690 word corpus (96.6% of prior 71078)
2019-05-22 22:01:52,994 : INFO : estimated required memory for 5983 words and 250 dimensions: 14957500 bytes
2019-05-22 22:01:52,996 : INFO : resetting layer

# 5. 測試訓練模型

In [None]:
def main():
    """Interactively query the saved word2vec model from standard input.

    Each input line is split on whitespace and dispatched on its word count:

    * one word    -- print the 100 most similar words with their scores;
    * two words   -- print the cosine similarity of the pair;
    * three words -- analogy query with the first two words as positive
      examples and the third as the negative one (extra words are ignored).

    Blank lines are skipped. The loop ends when standard input reaches EOF;
    errors raised while answering a query (for example a word missing from
    the vocabulary) are printed and the loop continues.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model_SG = word2vec.Word2Vec.load("word2vec.model_wiki")

    # Query through the KeyedVectors in `.wv`: the same-named methods on the
    # Word2Vec object itself are deprecated in gensim 3.x and removed in 4.0.
    vectors = model_SG.wv

    print("提供 3 種測試模式\n")
    print("輸入一個詞，則去尋找前一百個該詞的相似詞")
    print("輸入兩個詞，則去計算兩個詞的餘弦相似度")
    print("輸入三個詞，進行類比推理")

    while True:
        try:
            query = input()
        except EOFError:
            # No more input: stop instead of reporting EOFError forever.
            break

        q_list = query.split()
        if not q_list:
            # Nothing to look up; avoids an IndexError in the analogy branch.
            continue

        try:
            if len(q_list) == 1:
                print("相似詞前 100 排序")
                res = vectors.most_similar(q_list[0], topn=100)
                for word, score in res:
                    print(word + "," + str(score))

            elif len(q_list) == 2:
                print("計算 Cosine 相似度")
                res = vectors.similarity(q_list[0], q_list[1])
                print(res)
            else:
                print("%s之於%s，如%s之於" % (q_list[0], q_list[2], q_list[1]))
                res = vectors.most_similar([q_list[0], q_list[1]], [q_list[2]], topn=100)
                for word, score in res:
                    print(word + "," + str(score))
            print("----------------------------")
        except Exception as e:
            # Best-effort REPL: report the failure and keep accepting queries.
            print(repr(e))


if __name__ == "__main__":
    main()

2019-05-23 21:00:20,461 : INFO : loading Word2Vec object from word2vec.model_wiki
2019-05-23 21:00:20,708 : INFO : loading wv recursively from word2vec.model_wiki.wv.* with mmap=None
2019-05-23 21:00:20,709 : INFO : setting ignored attribute vectors_norm to None
2019-05-23 21:00:20,714 : INFO : loading vocabulary recursively from word2vec.model_wiki.vocabulary.* with mmap=None
2019-05-23 21:00:20,716 : INFO : loading trainables recursively from word2vec.model_wiki.trainables.* with mmap=None
2019-05-23 21:00:20,718 : INFO : setting ignored attribute cum_table to None
2019-05-23 21:00:20,721 : INFO : loaded word2vec.model_wiki


提供 3 種測試模式

輸入一個詞，則去尋找前一百個該詞的相似詞
輸入兩個詞，則去計算兩個詞的餘弦相似度
輸入三個詞，進行類比推理
維基


2019-05-23 21:00:33,550 : INFO : precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


相似詞前 100 排序
鮑姆,0.9982941150665283
塔能,0.9981815814971924
專利,0.997909665107727
奇異,0.9978553056716919
傳送,0.9976803064346313
坎寧安,0.9976129531860352
下載,0.9974729418754578
執行緒,0.9974358081817627
git,0.9973118305206299
原生,0.9972920417785645
標,0.9972141981124878
搜尋,0.9971519112586975
磁帶機,0.9970462918281555
相容性,0.9967495203018188
power,0.996607780456543
group,0.9965818524360657
芬蘭赫爾辛,0.9965640902519226
python,0.9965051412582397
赫爾辛,0.9964991807937622
用來,0.9964420795440674
一份,0.9963663816452026
用語,0.9963191747665405
讀,0.9962285757064819
專訪,0.996159553527832
提議,0.9960776567459106
起訴,0.995819628238678
分享,0.9955840110778809
路徑,0.9955559968948364
應器,0.995527446269989
美國國家,0.9954522848129272
文學手,0.9951719045639038
environment,0.995078980922699
保證,0.9950613379478455
理器,0.9950302839279175
理察,0.9950070381164551
指南,0.9949942827224731
md,0.9949941635131836
人選,0.99495530128479
辦法,0.9949389696121216
貝爾,0.9949385523796082
屬,0.9949334263801575
基大學,0.99492347240448
ic,0.9949181079864502
語法,0.994899570941925
冊,

# 6. 分析結果提取

In [2]:
# Load the model saved as "word2vec.model_wiki" into a notebook-level variable.
model_SG = word2vec.Word2Vec.load("word2vec.model_wiki")