# 作業 : 調整 word2vec 模型的不同訓練參數

# [作業目標]
- 調整 word2vec 模型的不同參數, 分別觀察效果並比較

# [作業重點]
- 調整 word2vec 模型的不同訓練參數, 分別觀察效果並比較

In [1]:
# Load gensim and its word2vec module
import gensim
from gensim.models import word2vec

# Suppress warning messages for the rest of the session
import warnings
warnings.filterwarnings("ignore")

In [2]:
!wget http://mattmahoney.net/dc/text8.zip
!unzip text8.zip

--2021-01-27 07:19:30--  http://mattmahoney.net/dc/text8.zip
Resolving mattmahoney.net (mattmahoney.net)... 67.195.197.24
Connecting to mattmahoney.net (mattmahoney.net)|67.195.197.24|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 31344016 (30M) [application/zip]
Saving to: ‘text8.zip’


2021-01-27 07:19:45 (2.04 MB/s) - ‘text8.zip’ saved [31344016/31344016]

Archive:  text8.zip
  inflating: text8                   


# Word2Vec 訓練參數
- size : 詞向量的維度（gensim 4.0 起此參數更名為 vector_size）
- min_count : 最小次數，一個詞出現的次數若小於 min_count，則拋棄不參與訓練。
- window : 訓練窗格大小，也就是一個詞在看上下文關係時，上下應該各看幾個字的意思。
- 更多參數說明，請參閱官方文件
https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec

In [3]:
# Train word2vec embeddings with gensim on the text8 corpus.
sentences = word2vec.Text8Corpus('./text8')

# Baseline: 10-dimensional vectors, every other hyper-parameter left at its
# library default. A variant with explicit min_count=3 / window=5 is trained
# in a later cell for comparison.
model = word2vec.Word2Vec(
    sentences=sentences,
    size=10,
)

In [4]:
# Show the words closest to 'woman'.
# Query through model.wv (KeyedVectors): Word2Vec.most_similar is a deprecated
# alias that gensim 4 removed, and the other cells already use model.wv.
model.wv.most_similar(positive=['woman'])

[('angry', 0.9443917870521545),
 ('siblings', 0.9379791617393494),
 ('husband', 0.9265410304069519),
 ('gibeah', 0.9244379997253418),
 ('sing', 0.917561948299408),
 ('daughters', 0.9072643518447876),
 ('heard', 0.9055014252662659),
 ('love', 0.9036343097686768),
 ('lover', 0.9000006318092346),
 ('dying', 0.8943802714347839)]

In [5]:
# Analogy query: top-5 words closest to (woman + king - man).
# 'positive' words are added and 'negative' words subtracted before ranking.
# Uses model.wv because the model-level most_similar alias is deprecated.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('tsar', 0.9282662868499756),
 ('assassinated', 0.914304256439209),
 ('emperor', 0.9057285785675049),
 ('crowned', 0.905256986618042),
 ('deposed', 0.9031223058700562)]

In [6]:
# Pick the word that fits least with the rest of the group.
candidates = "breakfast cereal dinner lunch".split()
model.wv.doesnt_match(candidates)

'cereal'

In [7]:
# Show the similarity score between two words.
model.wv.similarity(w1='woman', w2='man')

0.868004

In [8]:
# Show the embedding vector of a word.
# Index model.wv rather than the model itself: Word2Vec.__getitem__ is
# deprecated in gensim 3.x and no longer available in gensim 4.
model.wv['computer']

array([-5.189927  , -5.043546  , -2.3028586 ,  1.414655  ,  0.36990553,
        5.31249   ,  2.578273  , -2.26718   , -3.1036706 , -0.93289906],
      dtype=float32)

In [9]:
# Retrain on the same corpus with explicit min_count and window settings so
# the queries below can be compared against the baseline results above.
# This rebinds `model`; the baseline model is no longer reachable afterwards.
model = word2vec.Word2Vec(
    sentences=sentences,
    size=10,
    min_count=3,
    window=5,
)

In [10]:
# Show the words closest to 'woman' for the current model.
# Query through model.wv (KeyedVectors): Word2Vec.most_similar is a deprecated
# alias that gensim 4 removed, and the other cells already use model.wv.
model.wv.most_similar(positive=['woman'])

[('angry', 0.9302777647972107),
 ('lennon', 0.9302104115486145),
 ('elisha', 0.9244479537010193),
 ('loved', 0.9238390922546387),
 ('sing', 0.9125450849533081),
 ('odin', 0.9122686982154846),
 ('love', 0.91185462474823),
 ('herself', 0.9101935625076294),
 ('angels', 0.9101755023002625),
 ('younger', 0.9099645018577576)]

In [11]:
# Analogy query: top-5 words closest to (woman + king - man).
# 'positive' words are added and 'negative' words subtracted before ranking.
# Uses model.wv because the model-level most_similar alias is deprecated.
model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

[('reigned', 0.9503540992736816),
 ('deposed', 0.9493151903152466),
 ('tsar', 0.9366494417190552),
 ('mindaugas', 0.9364131689071655),
 ('sultan', 0.9297733902931213)]

In [12]:
# Pick the word that fits least with the rest of the group.
candidates = "breakfast cereal dinner lunch".split()
model.wv.doesnt_match(candidates)

'cereal'

In [13]:
# Show the similarity score between two words.
model.wv.similarity(w1='woman', w2='man')

0.8778384

In [14]:
# Show the embedding vector of a word.
# Index model.wv rather than the model itself: Word2Vec.__getitem__ is
# deprecated in gensim 3.x and no longer available in gensim 4.
model.wv['computer']

array([-2.03388   , -4.4214687 , -0.73669857, -1.495707  , -1.4887706 ,
        7.449845  ,  3.4516423 , -2.9107134 , -3.8585677 ,  2.0577643 ],
      dtype=float32)