<a href="https://colab.research.google.com/github/komazawa-deep-learning/komazawa-deep-learning.github.io/blob/master/notebooks/2020_0619word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 2020 word2vec によるアナロジー

<div align='right'>
<a href='mailto:asakawa@ieee.org'>Shin Asakawa</a>, all rights reserved.<br>
Date: 19/Jun/2020<br>
 MIT license
</div>



<center>

<img src="https://komazawa-deep-learning.github.io/assets/2013Mikolov_KingQueenFig.svg" width="600"><br/>
From Mikolov et al. (2013) Linguistic Regularities in Continuous Space Word Representations, Fig. 2<br/>
<br/>
<br/>

<img src="https://komazawa-deep-learning.github.io/assets/2013Mikolov_FigCountries.svg" width="680"><br/>
Modified from Mikolov et al. (2013) Distributed Representations of Words and Phrases and their Compositionality, Fig. 2<br/>

<br/><br/>
<img src="https://komazawa-deep-learning.github.io/assets/2013Mikolov_Fig1.svg" width="840"><br/>
</center>



$$
\ell=\frac{1}{T}\sum_{t=1}^{T}\sum_{-c\le j\le c,\,j\ne0}\log p\left(w_{t+j}\vert w_{t}\right),
$$

$$
p\left(w_o\vert w_i\right)=\frac{\exp\left({v'_{w_o}}^{\top} v_{w_i}\right)}{\sum_{w=1}^{W}\exp\left({v'_{w}}^{\top} v_{w_i}\right)}
$$

In [None]:
# -*- coding: utf-8 -*-
from six.moves.urllib import request

import numpy as np
import os
import sys
import gensim

In [None]:
!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_cbow.bin.gz
#!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_sgns.bin.gz
#!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid300_win20_neg20_sgns.bin.gz'
#!wget http://www.cis.twcu.ac.jp/~asakawa/2017jpa/2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_cbow.bin.g

In [None]:
# File name of the downloaded vectors; it must match the file fetched by wget.
# NOTE(review): the name presumably encodes the training setup (200 hidden
# units, window 20, 20 negative samples, CBOW) -- confirm with the author.
word2vec_file = '2017Jul_jawiki-wakati_neologd_hid200_win20_neg20_cbow.bin.gz'

# Load the vectors in word2vec binary format. unicode_errors='replace' keeps
# loading when a vocabulary entry contains bytes that are not valid UTF-8.
model = gensim.models.KeyedVectors.load_word2vec_format(
    word2vec_file,
    binary=True,
    encoding='utf-8',
    unicode_errors='replace',
)

In [None]:
# Analogy query: king - man + woman, five nearest words.
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=5,
)

In [None]:
# Same analogy with Japanese tokens: king - man + woman, ten nearest words.
model.most_similar(
    positive=['王','女'],
    negative=['男'],
    topn=10,
)

In [None]:
# Jupyter only echoes the value of the LAST expression in a cell, so each
# query is passed to display() explicitly; otherwise the first two results
# would be computed and silently thrown away.

# "Japanese Psychological Association" minus "psychology".
display(model.most_similar(positive=['日本心理学会'], negative=['心理学'], topn=10))
# "psychology" minus "science".
display(model.most_similar(positive=['心理学'], negative=['科学'], topn=10))
# "psychology" minus "experiment".
display(model.most_similar(positive=['心理学'], negative=['実験'], topn=10))

In [None]:
# Jupyter only echoes the value of the LAST expression in a cell, so each
# query is passed to display() explicitly to make both results visible.

# "language" plus "experiment".
display(model.most_similar(positive=['言語','実験'], topn=10))
# "linguistics" plus "psychology".
display(model.most_similar(positive=['言語学','心理学'], topn=10))

In [None]:
# Jupyter only echoes the value of the LAST expression in a cell, so each
# query is passed to display() explicitly to make all four results visible.

# "hearing" plus "cognitive science".
display(model.most_similar(positive=['聴覚','認知科学'], topn=5))
# "vision" plus "cognitive science".
display(model.most_similar(positive=['視覚','認知科学'], topn=5))
# "vision" plus "cognitive psychology".
display(model.most_similar(positive=['視覚','認知心理学'], topn=5))
# "language" plus "thought".
display(model.most_similar(positive=['言語','思考'], topn=5))

In [None]:
# "Uniqlo" + "Kei Nishikori" - "tennis", ten nearest words.
model.most_similar(
    positive=['ユニクロ','錦織圭'],
    negative=['テニス'],
    topn=10,
)

In [None]:
# "Giants" + "America" - "Japan", ten nearest words.
model.most_similar(
    positive=['ジャイアンツ','アメリカ'],
    negative=['日本'],
    topn=10,
)

In [None]:
# Jupyter only echoes the value of the LAST expression in a cell, so each
# query is passed to display() explicitly to make all three results visible.

# "Uniqlo" minus "apparel".
display(model.most_similar(positive=['ユニクロ'], negative=['アパレル'], topn=10))
# "Uniqlo" + "America" - "Japan".
display(model.most_similar(positive=['ユニクロ','アメリカ'], negative=['日本'], topn=10))
# "Uniqlo" + "SoftBank" - "apparel".
display(model.most_similar(positive=['ユニクロ','ソフトバンク'], negative=['アパレル'], topn=10))

In [None]:
# Jupyter only echoes the value of the LAST expression in a cell, so each
# query is passed to display() explicitly to make all three results visible.

# Digit and operator tokens treated as ordinary vocabulary entries.
display(model.most_similar(positive=['1','10'], negative=['+'], topn=5))
display(model.most_similar(positive=['2','1'], negative=['+'], topn=5))
# Here '-' is an added (positive) token, not a subtraction.
display(model.most_similar(positive=['2','1','-'], topn=5))

In [None]:
# "NTT" plus "China", ten nearest words.
model.most_similar(
    positive=['NTT', '中国'],
    topn=10,
)

In [None]:
# Jupyter only echoes the value of the LAST expression in a cell, so each
# query is passed to display() explicitly to make all three results visible.

# "Tokyo" plus "Imperial Palace".
display(model.most_similar(positive=['東京', '皇居'], topn=10))
# "Uniqlo" + "Rakuten" - "IT company".
display(model.most_similar(positive=['ユニクロ', '楽天'], negative=['IT企業'], topn=10))
# "Uniqlo" plus "Muji".
display(model.most_similar(positive=['ユニクロ', '無印良品'], topn=10))

In [None]:
# Cosine similarity between the vectors for "female" and "male".
model.similarity(
    '女性',
    '男性',
)

In [None]:
# Look up the stored embedding for the English token; the cell output is the
# raw NumPy vector of that word.
model['computer']

In [None]:
# Look up the stored embedding for the Japanese token ("computer"); the cell
# output is the raw NumPy vector of that word.
model['コンピュータ']

In [None]:
# Cosine similarity between the English and Japanese tokens for "computer".
model.similarity(
    'computer',
    'コンピュータ',
)

In [None]:
# Cosine similarity between two Japanese words for "woman" / "female".
model.similarity(
    '女性',
    '女',
)

In [None]:
# Cosine similarity between "coffee" and "black tea".
model.similarity(
    'コーヒー',
    '紅茶',
)

In [None]:
# Cosine similarity between "beer" and "sake".
model.similarity(
    'ビール',
    '日本酒',
)

In [None]:
# English tokens: Berlin + Germany - Paris, single nearest word.
model.most_similar(
    positive=['Berlin', 'Germany'],
    negative=['Paris'],
    topn=1,
)

In [None]:
# Japanese tokens: "Germany" + "France" - "Berlin", five nearest words.
model.most_similar(
    positive=['ドイツ','フランス'],
    negative=['ベルリン'],
    topn=5,
)

In [None]:
# "Imperial Palace" - "Tokyo" + "Kyoto", five nearest words.
model.most_similar(
    positive=['京都','皇居'],
    negative=['東京'],
    topn=5,
)

In [None]:
# "Tokyo" + "Waseda University" - "Osaka", five nearest words.
model.most_similar(
    positive=['東京','早稲田大学'],
    negative=['大阪'],
    topn=5,
)

In [None]:
# "natto" + "tofu" - "soybean", ten nearest words.
model.most_similar(
    positive=['納豆','豆腐'],
    negative=['大豆'],
    topn=10,
)

In [None]:
# "power" plus "narcotics", ten nearest words.
model.most_similar(
    positive=['権力', '麻薬'],
    topn=10,
)

In [None]:
# "treatment" plus "repair", ten nearest words.
model.most_similar(
    positive=['治療', '修理'],
    topn=10,
)

In [None]:
# Sum of two tokens, ten nearest words.
# NOTE(review): presumably "moon/month" and "Obon" -- confirm intended senses.
model.most_similar(
    positive=['月', 'お盆'],
    topn=10,
)