# -*- coding: utf-8 -*-
# Chapter 10: Vector Space Methods (II)
import argparse
import json
import re
from gensim.models import word2vec
from gensim.models import Word2Vec
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
import scipy.spatial.distance
import scipy.stats
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.manifold import TSNE
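
# NOTE: this script uses the pre-4.0 gensim API (model['word'],
# model.similarity(), model.most_similar() called directly on the model);
# with gensim >= 4.0 these calls moved to model.wv, and Word2Vec's size=
# parameter was renamed to vector_size=.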
'''
Chapter 10 continues the previous chapter's work on learning word vectors.
'''
# Country names used by the clustering knocks (96-99).
country_list = [
    "Japan", "Germany", "Antigua_and_Barbuda", "Bosnia_and_Herzegovina",
    "Burkina_Faso", "Cabo_Verde", "Central_African_Republic", "Costa_Rica",
    "Cote_d'Ivoire", "Czech_Republic", "Democratic_Republic_of_the_Congo",
    "Dominican_Republic", "East_Timor", "El_Salvador", "Equatorial_Guinea",
    "Guinea_Bissau", "Holy_See", "Hong_Kong", "Marshall_Islands",
    "New_Zealand", "North_Korea", "Palestinian_Territories",
    "Papua_New_Guinea", "Republic_of_the_Congo", "Saint_Kitts_and_Nevis",
    "Saint_Lucia", "Saint_Vincent_and_the_Grenadines", "San_Marino",
    "Sao_Tome_and_Principe", "Saudi_Arabia", "Sierra_Leone", "Sint_Maarten",
    "Solomon_Islands", "South_Africa", "South_Korea", "South_Sudan",
    "Sri_Lanka", "The_Bahamas", "The_Gambia", "Timor_Leste",
    "Trinidad_and_Tobago", "United_Arab_Emirates", "United_Kingdom",
    "United_States_of_America", "United_States",
]
'''
90. Training with word2vec
Apply word2vec to the corpus created in knock 81 and learn word vectors.
Then convert the format of the learned word vectors and run the programs
from knocks 86-89.
'''
def knock90(src_filename:str, dst_filename:str):
    data = word2vec.Text8Corpus(src_filename)
    model = word2vec.Word2Vec(data, size=300)
    model.save(dst_filename)
    print("Knock86: ", end="")
    print(model['United_States'])
    print("Knock87: ", end="")
    print(model.similarity('United_States', 'U.S'))
    print("Knock88: ", end="")
    print(model.most_similar(positive=["England"]))
    print("Knock89: ", end="")
    print(model.most_similar(positive=["Spain", "Athens"], negative=["Madrid"]))
    return("Completed")
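# A minimal usage sketch for the saved model (assuming knock90 has already
# been run so that "temp_knock90" exists; gensim < 4.0 API):
#   model = Word2Vec.load("temp_knock90")
#   print(model.most_similar(positive=["Japan"]))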
'''
91. Preparing the analogy data
Download the word-analogy evaluation data
(https://word2vec.googlecode.com/svn/trunk/questions-words.txt). In this
data, a line starting with ": " is a section name; for example, the line
": capital-common-countries" marks the start of the
"capital-common-countries" section. Extract the evaluation examples
contained in the "family" section and save them to a file.
'''
def knock91(src_text_filename:str, dst_filename:str):
    with open(src_text_filename, 'r') as fds:
        with open(dst_filename, 'w') as fdd:
            # Skip ahead to the ": family" section header.
            for line in fds:
                if(re.match('^: family', line)):
                    break
            # Copy lines until the next section header.
            for line in fds:
                if(re.match('^:', line)):
                    break
                fdd.write(line)
    return("Completed")
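# For reference, each line in the extracted "family" section is a four-word
# analogy of the form "A B C D" (e.g. "boy girl brother sister"), read as
# A is to B as C is to D.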
'''
92. Applying the model to the analogy data
For each example in the evaluation data created in knock 91, compute
vec(word in column 2) - vec(word in column 1) + vec(word in column 3),
find the word whose vector is most similar to the result together with its
similarity, and append both to the end of the example. Apply this program
to the word vectors created in knock 85 and in knock 90.
'''
def knock92(question_words_filename:str, wv_90_filename:str, wv_85_model_filename:str, wv_85_dict_filename:str):
    return_value = ""
    model_90 = Word2Vec.load(wv_90_filename)
    model_85 = np.load(wv_85_model_filename)
    with open(wv_85_dict_filename, 'r') as fds:
        dict_85 = json.load(fds)
    with open(question_words_filename, 'r') as fds:
        for line in fds:
            word_distance_dict = dict()
            line = line.rstrip()
            (word1, word2, word3, word4) = line.split(' ')
            try:
                similar_word = model_90.most_similar(positive=[word2, word3], negative=[word1])
                vec = model_85[dict_85[word2]] - model_85[dict_85[word1]] + model_85[dict_85[word3]]
                return_value += "Model90\t" + word2 + " - " + word1 + " + " + word3 + " ==> " + similar_word[0][0] + " (" + str(similar_word[0][1]) + ")\n"
                # For the knock-85 vectors, rank the whole vocabulary by
                # cosine distance to the analogy vector (smallest = closest).
                for key in dict_85.keys():
                    word_distance_dict[key] = scipy.spatial.distance.cosine(model_85[dict_85[key]], vec)
                (word, value) = sorted(word_distance_dict.items(), key=lambda x:x[1])[0]
                return_value += "Model85\t" + word2 + " - " + word1 + " + " + word3 + " ==> " + word + " (" + str(value) + ")\n"
            except KeyError:
                # Skip examples that contain out-of-vocabulary words.
                continue
    return(return_value)
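# The returned string holds one line per model per example, e.g. (with
# hypothetical words and scores):
#   Model90  girl - boy + brother ==> sister (0.87)
#   Model85  girl - boy + brother ==> sister (0.12)
# Note the Model85 number is a cosine *distance* (smaller is better), while
# the Model90 number is a cosine similarity (larger is better).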
'''
93. Computing the accuracy of the analogy task
Using the data produced in knock 92, compute the accuracy of each model on
the analogy task.
'''
def knock93(question_words_filename:str, wv_90_filename:str, wv_85_model_filename:str, wv_85_dict_filename:str):
    lines = knock92(question_words_filename, wv_90_filename, wv_85_model_filename, wv_85_dict_filename)
    lines_list = lines.split('\n')
    total = 0
    correct = 0
    # knock92 emits alternating Model90/Model85 lines for each example, so
    # this compares the two models' predictions against each other (i.e. an
    # agreement rate, since the gold fourth word is not in knock92's output).
    while(True):
        try:
            model_90_line = lines_list.pop(0)
            model_85_line = lines_list.pop(0)
        except IndexError:
            break
        model_90_word = re.sub(r'^.*==> (.*?) \(.*$', '\\1', model_90_line)
        model_85_word = re.sub(r'^.*==> (.*?) \(.*$', '\\1', model_85_line)
        if(model_90_word.lower() == model_85_word.lower()):
            correct += 1
        total += 1
    return(float(correct / total))
'''
94. Similarity computation on WordSimilarity-353
Take the evaluation data of The WordSimilarity-353 Test Collection
(http://www.cs.technion.ac.il/~gabr/resources/data/wordsim353/) as input,
compute the similarity between the words in the first and second columns,
and append the similarity value to the end of each line. Apply this program
to the word vectors created in knock 85 and in knock 90.
'''
def knock94_word2vec(word1:str, word2:str, wv_90_model):
    try:
        return_value = wv_90_model.similarity(word1, word2)
    except KeyError:
        return_value = 0.0
    return(return_value)
def knock94_myword2vec(word1:str, word2:str, wv_85_model, wv_85_dict:dict):
    try:
        # scipy's cosine() is a distance (1 - cosine similarity), so convert
        # it back to a similarity before returning.
        return_value = 1.0 - scipy.spatial.distance.cosine(wv_85_model[wv_85_dict[word1]], wv_85_model[wv_85_dict[word2]])
    except KeyError:
        return_value = 0.0
    return(return_value)
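# A small sanity check (hypothetical vectors) of the distance/similarity
# relationship used above:
#   a = np.array([1.0, 0.0]); b = np.array([1.0, 1.0])
#   scipy.spatial.distance.cosine(a, b)        # ~0.293 (distance)
#   1.0 - scipy.spatial.distance.cosine(a, b)  # ~0.707 (cos of 45 degrees)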
def knock94(eval_set_filename:str, wv_90_filename:str, wv_85_model_filename:str, wv_85_dict_filename:str):
    print("== Results using Word2Vec ==")
    model_90 = Word2Vec.load(wv_90_filename)
    with open(eval_set_filename, 'r') as fds:
        for line in fds:
            words_list = line.split('\t')
            word1 = words_list[0]
            word2 = words_list[1]
            if(word1 == 'Word 1' and word2 == 'Word 2'):
                continue  # skip the header row
            similarity = knock94_word2vec(word1, word2, model_90)
            print(word1 + " " + word2 + ": " + str(similarity))
    print("== Results using PCA vectors from knock85 ==")
    model_85 = np.load(wv_85_model_filename)
    with open(wv_85_dict_filename, 'r') as fds:
        dict_85 = json.load(fds)
    with open(eval_set_filename, 'r') as fds:
        for line in fds:
            words_list = line.split('\t')
            word1 = words_list[0]
            word2 = words_list[1]
            if(word1 == 'Word 1' and word2 == 'Word 2'):
                continue  # skip the header row
            similarity = knock94_myword2vec(word1, word2, model_85, dict_85)
            print(word1 + " " + word2 + ": " + str(similarity))
    return("Completed")
'''
95. Evaluation on WordSimilarity-353
Using the data produced in knock 94, compute the Spearman rank correlation
coefficient between the similarity ranking output by each model and the
ranking of the human similarity judgments.
'''
def knock95(eval_set_filename:str, wv_90_filename:str, wv_85_model_filename:str, wv_85_dict_filename:str):
    model_90 = Word2Vec.load(wv_90_filename)
    model_85 = np.load(wv_85_model_filename)
    model_90_similarity_list = []
    model_90_human_list = []
    model_85_similarity_list = []
    model_85_human_list = []
    with open(wv_85_dict_filename, 'r') as fds:
        dict_85 = json.load(fds)
    with open(eval_set_filename, 'r') as fds:
        for line in fds:
            words_list = line.split('\t')
            word1 = words_list[0]
            word2 = words_list[1]
            if(word1 == 'Word 1' and word2 == 'Word 2'):
                continue  # skip the header row
            human_score = float(words_list[2])  # spearmanr needs numbers, not strings
            similarity_90 = knock94_word2vec(word1, word2, model_90)
            similarity_85 = knock94_myword2vec(word1, word2, model_85, dict_85)
            # A similarity of exactly 0.0 is the out-of-vocabulary sentinel.
            if(similarity_90 != 0.0):
                model_90_similarity_list.append(similarity_90)
                model_90_human_list.append(human_score)
            if(similarity_85 != 0.0):
                model_85_similarity_list.append(similarity_85)
                model_85_human_list.append(human_score)
    return_value = str(scipy.stats.spearmanr(model_90_similarity_list, model_90_human_list)[0])
    return_value += "\t" + str(scipy.stats.spearmanr(model_85_similarity_list, model_85_human_list)[0])
    return(return_value)
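# Spearman's rho compares rankings rather than raw values; a quick toy
# illustration:
#   scipy.stats.spearmanr([1, 2, 3], [10, 20, 30])[0]  # 1.0 (same ranking)
#   scipy.stats.spearmanr([1, 2, 3], [30, 20, 10])[0]  # -1.0 (reversed)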
'''
96. Extracting vectors for country names
From the word2vec training result, extract only the vectors for country
names.
'''
def knock96(wv_90_filename:str):
    model_90 = Word2Vec.load(wv_90_filename)
    for country in country_list:
        try:
            print(country + "\t" + str(model_90[country]))
        except KeyError:
            print("Skipping " + country + "...")
    return("Completed")
'''
97. k-means clustering
Run k-means clustering with k=5 on the country-name word vectors from
knock 96.
'''
def knock97(wv_90_filename:str):
    model_90 = Word2Vec.load(wv_90_filename)
    cluster_list = []
    country_src_list = []
    # Collect the vectors for all in-vocabulary country names.
    for country in country_list:
        try:
            cluster_list.append(model_90[country])
            country_src_list.append(country)
        except KeyError:
            pass
    cluster_nparray = np.array(cluster_list)
    kmeans_model = KMeans(n_clusters=5, random_state=10).fit(cluster_nparray)
    labels = kmeans_model.labels_
    for i in range(len(labels)):
        print(country_src_list[i] + "\t" + str(labels[i]))
    return("Completed")
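# A follow-up sketch (standard library only) for grouping the printed
# labels into clusters, assuming country_src_list and labels as above:
#   from collections import defaultdict
#   clusters = defaultdict(list)
#   for country, label in zip(country_src_list, labels):
#       clusters[label].append(country)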
'''
98. Clustering with Ward's method
Run hierarchical clustering with Ward's method on the country-name word
vectors from knock 96, and visualize the clustering result as a dendrogram.
'''
def knock98(wv_90_filename:str):
    model_90 = Word2Vec.load(wv_90_filename)
    cluster_list = []
    country_src_list = []
    for country in country_list:
        try:
            cluster_list.append(model_90[country])
            country_src_list.append(country)
        except KeyError:
            pass
    cluster_nparray = np.array(cluster_list)
    Z = linkage(cluster_nparray, 'ward')
    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        labels=country_src_list,
        leaf_font_size=8.,  # font size for the x-axis labels
    )
    plt.show()
    return("Completed")
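# If flat clusters are wanted from the same Ward linkage (e.g. to compare
# with the k=5 k-means result of knock97), scipy can cut the tree; a sketch:
#   from scipy.cluster.hierarchy import fcluster
#   flat_labels = fcluster(Z, t=5, criterion='maxclust')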
'''
99. Visualization with t-SNE
Visualize the vector space of the country-name word vectors from knock 96
with t-SNE.
'''
def knock99(wv_90_filename:str):
    model_90 = Word2Vec.load(wv_90_filename)
    cluster_list = []
    country_src_list = []
    for country in country_list:
        try:
            cluster_list.append(model_90[country])
            country_src_list.append(country)
        except KeyError:
            pass
    cluster_nparray = np.array(cluster_list)
    t_sne_result = TSNE().fit_transform(cluster_nparray)
    fig, ax = plt.subplots()
    ax.scatter(t_sne_result[:, 0], t_sne_result[:, 1])
    for index, label in enumerate(country_src_list):
        ax.annotate(label, xy=(t_sne_result[index, 0], t_sne_result[index, 1]))
    plt.show()
    return("Completed")
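# On a headless machine plt.show() displays nothing; an alternative sketch
# (the filename here is hypothetical) is to save the figure instead:
#   plt.savefig("knock99_tsne.png")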
if(__name__ == '__main__'):
    parser = argparse.ArgumentParser(description='Ch 10')
    parser.add_argument('knock', type=int, help="Number of knock")
    parser.add_argument('-a', '--arg', help="Additional argument where appropriate (currently unused)")
    args = parser.parse_args()
    if(args.knock == 0 or args.knock == 90):
        print(knock90("temp_knock81_enwiki.txt", "temp_knock90"))
    if(args.knock == 1 or args.knock == 91):
        print(knock91("questions-words.txt", "temp_knock91"))
    if(args.knock == 2 or args.knock == 92):
        print(knock92("temp_knock91", "temp_knock90", "temp_knock85_matrix.npy", "temp_knock85_word_dict.json"))
    if(args.knock == 3 or args.knock == 93):
        print(knock93("temp_knock91", "temp_knock90", "temp_knock85_matrix.npy", "temp_knock85_word_dict.json"))
    if(args.knock == 4 or args.knock == 94):
        print(knock94("combined.tab", "temp_knock90", "temp_knock85_matrix.npy", "temp_knock85_word_dict.json"))
    if(args.knock == 5 or args.knock == 95):
        print(knock95("combined.tab", "temp_knock90", "temp_knock85_matrix.npy", "temp_knock85_word_dict.json"))
    if(args.knock == 6 or args.knock == 96):
        print(knock96("temp_knock90"))
    if(args.knock == 7 or args.knock == 97):
        print(knock97("temp_knock90"))
    if(args.knock == 8 or args.knock == 98):
        print(knock98("temp_knock90"))
    if(args.knock == 9 or args.knock == 99):
        print(knock99("temp_knock90"))