# 第7章: 単語ベクトル

In [None]:
# ライブラリ読み込み
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from tqdm import tqdm
tqdm.pandas()
%matplotlib inline

In [None]:
# Fetch the data files. NOTE(review): the original note here says the model is
# downloaded directly from Google Drive, yet the first wget below pulls it from
# an S3 URL — confirm which source is intended.
! wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
! wget http://download.tensorflow.org/data/questions-words.txt
! wget http://www.gabrilovich.com/resources/data/wordsim353/wordsim353.zip
! unzip wordsim353.zip
! wget https://gist.githubusercontent.com/cupnoodlegirl/ba10cf7a412a1840714c/raw/0fbb6a53a35d5461ccf2ae6d97ec5cc5155e758a/country_list.csv

### 60. 単語ベクトルの読み込みと表示

In [None]:
# Load the word vectors from the gzipped word2vec file (binary format).
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [None]:
# Display the vector stored under the key "United_States".
model["United_States"]

### 61. 単語の類似度

In [None]:
def cosine(v1, v2):
    """Return the cosine similarity of ``v1`` and ``v2``.

    The value is the dot product of the two vectors divided by the product of
    their Euclidean norms.
    """
    inner = np.dot(v1, v2)
    scale = np.linalg.norm(v1) * np.linalg.norm(v2)
    return inner / scale

In [None]:
# Cosine similarity between the vectors for "United_States" and "U.S.".
cosine(model["United_States"], model["U.S."])

### 62. 類似度の高い単語10件

In [None]:
# Ask the model for the 10 entries most similar to "United_States".
model.most_similar("United_States", topn = 10)

### 63. 加法構成性によるアナロジー

In [None]:
# Analogy query: "Spain" and "Athens" are positive terms, "Madrid" is the
# negative term (Spain - Madrid + Athens); list the top 10 results.
model.most_similar(positive=["Spain", "Athens"], negative=["Madrid"], topn=10)

### 64. アナロジーデータでの実験

In [None]:
# Peek at the first 10 lines of the analogy dataset.
! head -n 10 questions-words.txt

In [None]:
# Problems 64 and 65 are computed together from this one table.
#
# Parse questions-words.txt into rows of (category, w1, w2, w3, w4).
# Lines starting with ":" are section headers; sections whose header starts
# with ": gram" are labelled "syntactic", every other section "semantic".
rows = []
header = None
with open("questions-words.txt", encoding="utf-8") as f:
    # Stream the file line by line instead of loading it all with readlines().
    for line in f:
        if line.startswith(":"):
            header = "syntactic" if line.startswith(": gram") else "semantic"
            continue
        ws = line.split()
        if not ws:
            # Blank line (e.g. a trailing newline): nothing to record.
            continue
        # Only the first four tokens are used, as before; a line with fewer
        # than four tokens still raises IndexError.
        rows.append((header, ws[0], ws[1], ws[2], ws[3]))

df = pd.DataFrame(rows, columns=["header", "w1", "w2", "w3", "w4"])
df.head()

In [None]:
def most_similar_from_row(row):
    """Answer one analogy row with the model's single best candidate.

    ``row`` supplies words under the keys "w1", "w2" and "w3". The query uses
    w2 and w3 as positive terms and w1 as the negative term. The first entry
    of the ``topn=1`` result is wrapped in a ``pd.Series`` and returned.
    """
    candidates = model.most_similar(
        positive=[row["w2"], row["w3"]],
        negative=[row["w1"]],
        topn=1,
    )
    top_hit = candidates[0]
    return pd.Series(top_hit)

In [None]:
# Run the analogy query on every row (progress_apply shows a tqdm progress bar)
# and store the two returned values in the "most" and "score" columns.
df[["most", "score"]] = df.progress_apply(most_similar_from_row, axis=1)

In [None]:
# Preview the four analogy words next to the predicted word and its score.
df[["w1", "w2", "w3", "w4", "most", "score"]].head()

### 65. アナロジータスクでの正解率

In [None]:
# Split the analogy table by category.
semantic = df[df["header"] == "semantic"]
syntactic = df[df["header"] == "syntactic"]

# Accuracy is the share of rows whose top prediction equals the expected word w4.
semantic_hits = (semantic["w4"] == semantic["most"]).sum()
syntactic_hits = (syntactic["w4"] == syntactic["most"]).sum()

print("accuracy for semantic analogy: {}".format(semantic_hits / len(semantic)))
print("accuracy for syntactic analogy: {}".format(syntactic_hits / len(syntactic)))

### 66. WordSimilarity-353での評価

In [None]:
# Load combined.csv for the WordSimilarity-353 evaluation (presumably unpacked
# from wordsim353.zip in the download cell — confirm) and preview it.
df2 = pd.read_csv("combined.csv")
df2.head()

In [None]:
def cosine_from_row(row):
    """Return the cosine similarity of the two words in ``row``.

    The words are read from the "Word 1" and "Word 2" entries, looked up in
    ``model``, and their vectors are passed to ``cosine``.
    """
    return cosine(model[row["Word 1"]], model[row["Word 2"]])

In [None]:
# Add the model's cosine similarity for each word pair as a new column, then preview.
df2["similarity"] = df2.apply(cosine_from_row, axis = 1)
df2.head()

In [None]:
# Spearman correlation between the "Human (mean)" column and the model similarity.
df2[["Human (mean)", "similarity"]].corr(method="spearman")

### 67. k-meansクラスタリング

In [None]:
# Shared by the problems that follow: country names and their word vectors.
# Spaces in the names are replaced with "_" (the same form as the
# "United_States" key used earlier) before looking them up in the model.
countries = list(pd.read_csv("country_list.csv")["ISO 3166-1に於ける英語名"].map(lambda x: x.replace(" ", "_")))

names = []
vecs = []
for country in countries:
    try:
        vec = model[country]
    except KeyError:
        # No vector for this name: leave the country out. Only the lookup
        # failure is caught, so other errors (and Ctrl-C) are no longer
        # silently swallowed.
        continue
    names.append(country)
    vecs.append(vec)

In [None]:
# Cluster the country vectors into K groups with k-means (fixed seed).
K = 5
X = np.array(vecs)
kmeans = KMeans(n_clusters=K, random_state=0)
clusters = kmeans.fit_predict(X)

# One row per country with the cluster id it was assigned.
df3 = pd.DataFrame({"country": names})
df3["cluster_id"] = clusters

df3

### 68. Ward法によるクラスタリング

In [None]:
# Hierarchical clustering of the country vectors with Ward linkage.
X = np.array(vecs)
clusters = linkage(X, "ward")

# Draw the resulting tree on a large figure, labelling leaves with country names.
plt.figure(figsize=(16, 12), dpi=200, facecolor="w", edgecolor="k")
dendrogram(clusters, labels=names)
plt.show()

### 69. t-SNEによる可視化

In [None]:
# Embed the country vectors with t-SNE (fixed seed).
X = np.array(vecs)
tsne = TSNE(random_state=0)
embs = tsne.fit_transform(X)

fig, ax = plt.subplots(figsize=(12, 12))

# Label every point with its country name, then draw the points themselves
# using the first two embedding coordinates.
for name, point in zip(names, embs):
    ax.annotate(name, point)

ax.scatter(embs[:, 0], embs[:, 1])
plt.show()
plt.show()