# 単語ベクトル
## 単語の意味を実ベクトルで表現する単語ベクトル（単語埋め込み）に関して，以下の処理を行うプログラムを作成せよ．
#### https://nlp100.github.io/ja/ch07.html

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/iamtatsuki05/NLP_100/blob/NLP_100_9/NLP_100_7.ipynb)

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

# 単語ベクトルの読み込みと表示
## Google Newsデータセット（約1,000億単語）での学習済み単語ベクトル（300万単語・フレーズ，300次元）をダウンロードし，”United States”の単語ベクトルを表示せよ．ただし，”United States”は内部的には”United_States”と表現されていることに注意せよ．

In [None]:
# !wget https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?resourcekey=0-wjGZdNAUop6WykTtMip30g

In [None]:
# ダウンロード制限の為ダウンロードしたものを直接読み込む
# FILE_ID = "0B7XkCwpI5KDYNlNUTTlSS21pQmM"
# FILE_NAME = "GoogleNews-vectors-negative300.bin.gz"
# !wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=$FILE_ID' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=$FILE_ID" -O $FILE_NAME && rm -rf /tmp/cookies.txt

In [None]:
from gensim.models import KeyedVectors

# Location of the previously downloaded vectors file (word2vec binary format, .gz).
WORD2VEC_PATH = '/content/drive/MyDrive/Tutorial/GoogleNews-vectors-negative300.bin.gz'

# Load the vectors once; the cells below reuse `model`.
model = KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
# Look up the vector stored under the key 'United_States' (underscore instead of a space).
model['United_States']

In [None]:
# 参考https://skume.net/entry/2020/11/23/043738
# https://blog.amedama.jp/entry/gensim-fasttext-pre-trained-word-vectors

# 単語の類似度
## “United States”と”U.S.”のコサイン類似度を計算せよ

In [None]:
# Similarity between the vectors for 'United_States' and 'U.S.'.
model.similarity('United_States', 'U.S.')

In [None]:
# https://qiita.com/DancingEnginee1/items/b10c8ef7893d99aa53be

# 類似度の高い単語10件
## “United States”とコサイン類似度が高い10語と，その類似度を出力せよ．

In [None]:
# Ten entries most similar to 'United_States', each paired with its similarity score.
model.most_similar(positive=['United_States'], negative=[], topn=10)

In [None]:
# 参考https://www.google.com/search?q=most_similar&sxsrf=APq-WBsf2tcpBPSTroJK4JcGk9mRtGKwxA%3A1647830959279&ei=r-c3YsTWEMul2roPo5GA6AU&ved=0ahUKEwiE6ujXmNb2AhXLklYBHaMIAF0Q4dUDCA4&uact=5&oq=most_similar&gs_lcp=Cgdnd3Mtd2l6EAMyBQgAEIAEMgUIABCABDIFCAAQgAQyBQgAEIAEMgQIABAeMgQIABAeMgQIABAeMgQIABAeOgcIIxDqAhAnSgQIQRgASgQIRhgAUNMHWNMHYL8KaAFwAXgAgAFUiAFUkgEBMZgBAKABAaABArABCsABAQ&sclient=gws-wiz
# https://teratail.com/questions/306461
# https://qiita.com/iss-f/items/aec567ee5c79464413dc

# 加法構成性によるアナロジー
## “Spain”の単語ベクトルから”Madrid”のベクトルを引き，”Athens”のベクトルを足したベクトルを計算し，そのベクトルと類似度の高い10語とその類似度を出力せよ．

In [None]:
# Analogy query for Spain - Madrid + Athens: 'Spain' and 'Athens' are passed as
# positive terms, 'Madrid' as the negative term; the top 10 matches are returned.
model.most_similar(positive=['Spain', 'Athens'], negative=['Madrid'], topn=10)

# アナロジーデータでの実験
## 単語アナロジーの評価データをダウンロードし，vec(2列目の単語) - vec(1列目の単語) + vec(3列目の単語)を計算し，そのベクトルと類似度が最も高い単語と，その類似度を求めよ．求めた単語と類似度は，各事例の末尾に追記せよ．

In [None]:
# Download the word-analogy evaluation data into the current directory.
!wget http://download.tensorflow.org/data/questions-words.txt

In [None]:
# Show the first 30 lines of the downloaded file to check its format.
!head -30 questions-words.txt

In [None]:
# Athens Greece Tokyo Japan
# Baghdad Iraq Bangkok Thailand

# with open('questions-words.txt', 'r') as f1:
#     questions = f1.readlines()

# with open('result.txt', 'w') as f2:
#     for idx, question in enumerate(questions):
#         words = question.split()
#         if len(words) == 4:
#             scores = model.most_similar(positive=[words[1], words[2]], negative=[words[0]], topn=1)[0]
#             words += [scores[0], str(scores[1])]
#             output = ' '.join(words) + '\n'
#         else:
#             output = question
#         f2.write(output)

In [None]:
# Append the model's analogy answer to every 4-word line of questions-words.txt
# and write the result to result.txt.
# The input is streamed line by line; any line that is not a 4-word analogy
# (for example the ": <section>" header lines) is copied through unchanged.
with open('questions-words.txt', 'r') as fin, open('result.txt', 'w') as fout:
    for question in fin:
        words = question.split()
        if len(words) == 4:
            # vec(words[1]) - vec(words[0]) + vec(words[2]); keep only the best match.
            predicted, similarity = model.most_similar(
                positive=[words[1], words[2]], negative=[words[0]], topn=1
            )[0]
            output = ' '.join(words + [predicted, str(similarity)]) + '\n'
        else:
            output = question
        fout.write(output)

In [None]:
# with open('questions-words.txt', 'r') as f1:
#     questions = f1.readlines()

# with open('result.txt', 'w') as f2:
#     for idx, question in enumerate(questions):
#         words = question.split()
#         if len(words) == 4:
#             scores = model.most_similar(positive=[words[1], words[2]], negative=[words[0]], topn=1)[0]
#             output = ' '.join(words + [scores[0], str(scores[1])]) + '\n'
#         else:
#             output = question
#         f2.write(output)

In [None]:
# Show the first 30 lines of the generated result file.
!head -30 result.txt

In [None]:
# https://blog.codecamp.jp/posts-34408
# https://yu-nix.com/blog/2021/6/2/python-write-file/
# https://dr-kayai.hatenablog.com/entry/2014/02/24/131634
# https://blog.codecamp.jp/posts-34408
# https://naruport.com/blog/2019/9/14/python-tutorial-open-and-close-file/
# http://web.wakayama-u.ac.jp/~kazama/lab/python/i18n.html
# https://qiita.com/shun-shun123/items/13474f481eebbc508a8c

# アナロジータスクでの正解率
## 問題64（アナロジーデータでの実験）の実行結果を用い，意味的アナロジー（semantic analogy）と文法的アナロジー（syntactic analogy）の正解率を測定せよ．

In [None]:
from sklearn.metrics import accuracy_score

# Gold answers (4th column) and predictions (5th column) from result.txt,
# collected separately for the two kinds of analogy the task asks about.
label_true = {'semantic': [], 'syntactic': []}
label_pred = {'semantic': [], 'syntactic': []}
analogy_type = None  # set by the first ": <section>" header line

with open('result.txt', 'r') as fin:
    for text in fin:
        words = text.split()
        if not words:
            continue  # tolerate blank lines
        if words[0] == ':':
            # Section header. In questions-words.txt the syntactic sections are
            # the ones whose name starts with "gram"; all others are semantic.
            section = words[1] if len(words) > 1 else ''
            analogy_type = 'syntactic' if section.startswith('gram') else 'semantic'
            continue
        # A data line before any header would raise KeyError here (analogy_type is None).
        label_true[analogy_type].append(words[3])
        label_pred[analogy_type].append(words[4])

for kind in ('semantic', 'syntactic'):
    print(kind, accuracy_score(label_true[kind], label_pred[kind]))

In [None]:
#参考https://itsakura.com/python-startswith

# WordSimilarity-353での評価
## The WordSimilarity-353 Test Collectionの評価データをダウンロードし，単語ベクトルにより計算される類似度のランキングと，人間の類似度判定のランキングの間のスピアマン相関係数を計算せよ．

In [None]:
# Download the WordSimilarity-353 archive into the current directory.
!wget http://www.gabrilovich.com/resources/data/wordsim353/wordsim353.zip

In [None]:
# Extract the archive; the cells below read combined.csv from the current directory.
!unzip wordsim353.zip

In [None]:
# Show the first 10 lines of combined.csv to check its layout.
!head -10 'combined.csv'

In [None]:
#単語ベクトルにより計算される類似度
w_s_353 = []
with open('./combined.csv', 'r') as fin:
    next(fin)
    for text in fin:
        words = [_.strip() for _ in text.split(',')]
        words.append(model.similarity(words[0], words[1]))
        w_s_353.append(words)

for i in range(30):
    print(w_s_353[i])

In [None]:
# import numpy as np
# from scipy.stats import spearmanr

# human = np.array(w_s_353).T[2]
# w2v = np.array(w_s_353).T[3]

# correlation, pvalue = spearmanr(human, w2v)
# print(correlation)

In [None]:
import numpy as np
from scipy.stats import spearmanr

# Each row of w_s_353 holds strings in columns 0-2 and the model similarity in
# column 3. Calling np.array() on the whole table would coerce every cell to a
# string, and the scores would then be ranked lexicographically ("10.00" < "6.77").
# Convert the two score columns to float explicitly so they are ranked numerically.
human = np.array([float(row[2]) for row in w_s_353])
w2v = np.array([float(row[3]) for row in w_s_353])

# Spearman rank correlation between the human scores and the model similarities.
correlation, pvalue = spearmanr(human, w2v)
print(correlation)

In [None]:
#参考https://qiita.com/dacciinfo/items/88debe69f9f4e927aafc

# k-meansクラスタリング
## 国名に関する単語ベクトルを抽出し，k-meansクラスタリングをクラスタ数k=5として実行せよ

In [None]:
# Line, word and byte counts of the analogy file.
!wc questions-words.txt

In [None]:
# !head -19558 questions-words.txt

In [None]:
# !head -100 questions-words.txt

In [None]:
# with open('questions-words.txt' , 'r') as f:
#     questions = f.read()
# questions.splitlines()
# questions

In [None]:
from sklearn.cluster import KMeans
import gensim

# Country names: the 2nd column of every analogy line that appears before the
# ": currency" section (those lines look like "Athens Greece Tokyo Japan").
countries = set()
with open('questions-words.txt', 'r') as fin:
    for text in fin:
        words = text.split()
        if not words:
            continue  # tolerate blank lines

        if words[0] == ':':
            # Section header: stop at the currency section, otherwise skip the header.
            if len(words) > 1 and words[1] == 'currency':
                break
            continue

        countries.add(words[1])

# Set iteration order can differ between interpreter runs; sort so that the row
# order of countries_vec (and everything fitted on it) is reproducible.
countries = sorted(countries)

# Extract the word vectors for the country names
countries_vec = model[countries]

In [None]:
# Number of clusters, used both to fit the model and to print the groups.
n_clusters = 5

km = KMeans(n_clusters=n_clusters, random_state=42)
km.fit(countries_vec)

# Cluster index assigned to each country, in the same order as `countries`.
predict_list = list(km.predict(countries_vec))

# Print the members of each cluster in alphabetical order.
for cluster_id in range(n_clusters):
    print(cluster_id, ' : ')
    country_class = sorted(
        country for country, label in zip(countries, predict_list) if label == cluster_id
    )
    print(' '.join(country_class))

# Ward法によるクラスタリング
## 国名に関する単語ベクトルに対し，Ward法による階層型クラスタリングを実行せよ．さらに，クラスタリング結果をデンドログラムとして可視化せよ．

In [None]:
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Hierarchical clustering of the country vectors with Ward linkage
ward_linkage = linkage(countries_vec, method='ward')

# Plot the dendrogram, labelling each leaf with its country name
plt.figure(figsize=(50, 20))
dendrogram(ward_linkage, labels=countries)
plt.show()

In [None]:
#参考https://qiita.com/pontyo4/items/a2e7dec57c3699c519a5
#https://di-acc2.com/programming/python/4478/

# t-SNEによる可視化
## ベクトル空間上の国名に関する単語ベクトルをt-SNEで可視化せよ

In [None]:
# Model and data
from sklearn.manifold import TSNE

# Project the country vectors down to 2 dimensions (n_components=2)
ts = TSNE(n_components=2, random_state=42)
country_vec_embedded = ts.fit_transform(countries_vec)
# Each embedded row has two coordinates; split them into x and y sequences
xs, ys = zip(*country_vec_embedded)

# Plot
fig = plt.figure(figsize=(50, 20))
plt.scatter(xs, ys)
ax = plt.gca()  # switch from the pyplot interface to the object-oriented interface

# Label every point with its country name
for country, x, y in zip(countries, xs, ys):
    ax.annotate(country, (x, y))
plt.show()

In [None]:
# 参考https://qiita.com/g-k/items/120f1cf85ff2ceae4aba
# https://qiita.com/skotaro/items/08dc0b8c5704c94eafb9