# wordSim評価
# wordAnalogy評価

# -----------初期設定-----------

In [1]:
# -*- coding: utf-8 -*-
# python3
#
import argparse
import gzip
import math
import numpy
import re
import sys
import numpy as np
from copy import deepcopy
import codecs

In [2]:
# Used with .search(): matches any token that contains at least one digit.
isNumber = re.compile(r'\d+.*')


def norm_word(word):
    """Map a token to its normalized lexicon form.

    Returns '---num---' when the token contains a digit, '---punc---' when
    the token has no word characters at all (this includes the empty
    string), and the lower-cased token otherwise.
    """
    lowered = word.lower()
    if isNumber.search(lowered):
        return '---num---'
    # Nothing is left once every run of non-word characters is removed
    if not re.sub(r'\W+', '', word):
        return '---punc---'
    return lowered

In [3]:
"""Read all the word vectors and normalize them"""
def read_word_vecs(filename):
    """Read word vectors from a text file and L2-normalize each of them.

    Every non-blank line is read as a word followed by its
    whitespace-separated vector components.  No line is special-cased, so
    a word2vec-style header line ("<vocab size> <dim>") is stored as a word
    with a one-component vector.  Files whose name ends in ``.gz`` are
    decompressed on the fly.  Both kinds of file are decoded as UTF-8 text
    (undecodable bytes are ignored), so the returned keys are always ``str``.

    Args:
        filename: Path of the vector file (plain text or gzip).

    Returns:
        dict mapping each word to a 1-D ``numpy`` float array scaled to
        approximately unit length.  If a word occurs twice, the last
        occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    opener = gzip.open if filename.endswith('.gz') else open
    wordVectors = {}
    # Text mode matters for gzip: its default binary mode would yield bytes
    # keys that never match the str keys of the lexicon.
    with opener(filename, 'rt', encoding='utf-8', errors='ignore') as fileObject:
        for line in fileObject:
            tokens = line.split()
            if not tokens:
                # Blank line: nothing to read
                continue
            vector = numpy.array([float(t) for t in tokens[1:]], dtype=float)
            # The small epsilon keeps an all-zero vector from dividing by zero
            vector /= math.sqrt((vector ** 2).sum() + 1e-6)
            wordVectors[tokens[0]] = vector

    sys.stderr.write("Vectors read from: "+filename+" \n")
    return wordVectors

In [4]:
"""Write word vectors to file"""
def print_word_vecs(wordVectors, outFileName):
    """Write word vectors to a UTF-8 text file, one word per line.

    Each line is the word, a space, then every component formatted with
    four decimals and followed by a space (so lines end with a trailing
    space before the newline).  The file is written as UTF-8 so that
    ``read_word_vecs``, which decodes UTF-8, can read it back regardless of
    the platform's default encoding.

    Args:
        wordVectors: Mapping from word to an iterable of numeric components.
        outFileName: Path of the file to create or overwrite.
    """
    sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
    # The context manager closes the file even if a write fails part-way
    with open(outFileName, 'w', encoding='utf-8') as outFile:
        for word, values in wordVectors.items():
            components = ''.join('%.4f ' % val for val in values)
            outFile.write(word + ' ' + components + '\n')

In [5]:
"""Read the PPDB.etc word relations as a dictionary"""
def read_lexicon(filename):
    """Read a word-relation lexicon (PPDB etc.) into a dictionary.

    Every non-blank line is lower-cased and split on whitespace.  The first
    token is the entry word and the remaining tokens are its related words;
    all tokens are passed through ``norm_word``.  The file is decoded as
    UTF-8, the same encoding ``read_word_vecs`` uses for the vectors.

    Args:
        filename: Path of the lexicon text file.

    Returns:
        dict mapping each normalized entry word to the list of its
        normalized related words.  If an entry occurs twice, the last
        occurrence wins.
    """
    lexicon = {}
    # The context manager guarantees the file is closed after reading
    with open(filename, 'r', encoding='utf-8') as fileObject:
        for line in fileObject:
            words = line.lower().split()
            if not words:
                # Blank line: there is no entry word to index
                continue
            lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
    return lexicon

In [6]:
"""Retrofit word vectors to a lexicon"""
def retrofit(wordVecs, lexicon, numIters):
    """Retrofit word vectors to a lexicon and return the updated copy.

    ``wordVecs`` itself is not modified: the work is done on a deep copy.
    Only words present in both ``wordVecs`` and ``lexicon`` are updated, and
    only lexicon neighbours that have a vector take part in the update.

    Args:
        wordVecs: Mapping from word to its original vector.
        lexicon: Mapping from word to a list of neighbour words.
        numIters: Number of update sweeps over the shared vocabulary.

    Returns:
        A new mapping holding the retrofitted vectors.
    """
    updated = deepcopy(wordVecs)
    vocab = set(updated)
    # Words that have both a vector and a lexicon entry
    targets = vocab & set(lexicon)

    for _ in range(numIters):
        for word in targets:
            # Lexicon neighbours of this word that also have a vector
            neighbours = set(lexicon[word]) & vocab
            count = len(neighbours)
            if count > 0:
                # The original vector is weighted by the number of
                # neighbours; each neighbour's current vector has weight 1.
                blended = count * wordVecs[word]
                for other in neighbours:
                    blended += updated[other]
                updated[word] = blended / (2 * count)
            # With no usable neighbour the word keeps its current estimate
    return updated

In [7]:
def similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    This is the dot product of ``v1`` and ``v2`` divided by the product of
    their Euclidean norms.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    return np.dot(v1, v2) / norm_product

## ・パス指定

In [8]:
# Original word embeddings (before retrofitting)
input_arg = './word2vec/vectors.model'
# WordNet dictionary used as the lexicon
lexicon_arg = './result.txt'
# New vectors produced by retrofitting
output_arg = './out_vec.txt'

In [9]:
# Number of iterations (presumably the sweep count for retrofit(); it is not
# passed to anything in the visible cells)
numiter_arg = 10

## ・初期vecとnewvecとwordsim辞書のread

In [10]:
# Iteration count converted to an int
numIter = int(numiter_arg)

In [11]:
# Alias for the output path of the retrofitted vectors
outFileName = output_arg

In [12]:
# Load the lexicon; checkSim and the analogy cell look words up in it
lexicon = read_lexicon(lexicon_arg)

In [13]:
# Load the original vectors (read_word_vecs normalizes each one)
wordVecs = read_word_vecs(input_arg)

Vectors read from: ./word2vec/vectors.model 


In [14]:
# Load the retrofitted vectors from the output file
new_vec = read_word_vecs(output_arg)

Vectors read from: ./out_vec.txt 


# -----------WordSim評価-----------

In [15]:
# word2vec: alias for the original vectors used by the evaluation cells
vecs_wordVecs = wordVecs

In [16]:
# retrofitting: alias for the retrofitted vectors used by the evaluation cells
vecs_new_vec = new_vec

In [17]:
def _report_neighbours(label, word):
    """Print the neighbour statistics of one word and return its neighbours.

    ``label`` is the tag shown in the output ("v1" or "v2").  For every
    lexicon neighbour that has a vector, the similarity to ``word`` is
    printed before and after retrofitting, followed by the similarity
    between ``word`` and the mean neighbour vector.
    """
    if word not in lexicon:
        # The word has no entry in the lexicon
        print("{}(={})はwordnetのkeyに存在しません".format(label, word))
        neighbours = set()
    else:
        # Synonyms of the word that also have a vector (the update targets).
        # Membership tests avoid copying the whole vocabulary into a set.
        neighbours = {w for w in lexicon[word] if w in wordVecs}
        print('{}(={})における更新対象のneighbour数 : {}'.format(label, word, len(neighbours)))

    sum_word2vec = np.zeros_like(wordVecs[word])
    sum_retrofit = np.zeros_like(wordVecs[word])
    for neighbour in neighbours:
        print('{}とneighbour(={}) : {} -> {}'.format(
            label, neighbour,
            similarity(wordVecs[neighbour], wordVecs[word]),
            similarity(vecs_new_vec[neighbour], vecs_new_vec[word])))
        sum_word2vec += wordVecs[neighbour]
        sum_retrofit += vecs_new_vec[neighbour]

    if neighbours:
        mean_word2vec = similarity(sum_word2vec / len(neighbours), wordVecs[word])
        mean_retrofit = similarity(sum_retrofit / len(neighbours), vecs_new_vec[word])
    else:
        # No neighbours: the mean vector is undefined.  Report nan directly
        # instead of dividing by zero.
        mean_word2vec = mean_retrofit = float('nan')
    print('{}とneighboursの平均ベクトル : {} -> {}'.format(label, mean_word2vec, mean_retrofit))
    print(" ")
    return neighbours


def _print_similarity(label, vecs_a, word_a, vecs_b, word_b):
    """Print ``label : similarity`` for two looked-up vectors, or 'error'."""
    try:
        value = similarity(vecs_a[word_a], vecs_b[word_b])
    except (KeyError, ValueError):
        # Missing word, or vectors whose shapes cannot be combined
        print('error')
    else:
        print("{} : {}".format(label, value))


def checkSim(v1, v2):
    """Print a word-similarity report for two words, before and after retrofitting.

    Reads the module-level ``lexicon``, ``wordVecs``, ``vecs_wordVecs`` and
    ``vecs_new_vec``.  The report contains, in order: the neighbour
    statistics of ``v1`` and of ``v2``, the number of neighbours they
    share, the similarity of ``v1`` and ``v2`` in each vector set, and the
    similarity of each word's original vector to its retrofitted vector.

    Args:
        v1: First word.
        v2: Second word.

    Raises:
        KeyError: If ``v1`` or ``v2`` has no vector in ``wordVecs``.
    """
    wordNeighbours_1 = _report_neighbours('v1', v1)
    wordNeighbours_2 = _report_neighbours('v2', v2)

    # Number of update targets shared by v1 and v2
    print('更新対象における重複数 : {}'.format(len(wordNeighbours_1 & wordNeighbours_2)))
    print(" ")

    # Similarity between v1 and v2 under word2vec and under retrofitting
    _print_similarity("word2vec", vecs_wordVecs, v1, vecs_wordVecs, v2)
    _print_similarity("retrofitting", vecs_new_vec, v1, vecs_new_vec, v2)

    print(" ")
    # Similarity between the word2vec and retrofitted vectors of the same word
    _print_similarity("v1", vecs_wordVecs, v1, vecs_new_vec, v1)
    _print_similarity("v2", vecs_wordVecs, v2, vecs_new_vec, v2)

## ・WordSimのテスト

In [18]:
# Word pair to compare: 'フットボール' (football) and 'サッカー' (soccer)
v1 = 'フットボール'
v2 = 'サッカー'
checkSim(v1, v2)

v1(=フットボール)における更新対象のneighbour数 : 4
v1とneighbour(=サッカー) : 0.6700097012259578 -> 0.9023682559373559
v1とneighbour(=フートボール) : 0.41945202281722355 -> 0.8277756438161145
v1とneighbour(=蹴球) : 0.4714661654042284 -> 0.8447562314376742
v1とneighbour(=アソシエーションフットボール) : 0.44592722294673065 -> 0.8335533028545058
v1とneighboursの平均ベクトル : 0.6533150118662724 -> 0.9099880161508288
 
v2(=サッカー)における更新対象のneighbour数 : 4
v2とneighbour(=フートボール) : 0.3560841190233665 -> 0.8074251482783557
v2とneighbour(=フットボール) : 0.6700097012259578 -> 0.9023682559373559
v2とneighbour(=蹴球) : 0.49144473934055627 -> 0.8494646429952011
v2とneighbour(=アソシエーションフットボール) : 0.3838424344300787 -> 0.8132187145866909
v2とneighboursの平均ベクトル : 0.6121738109726407 -> 0.8980829186067604
 
更新対象における重複数 : 3
 
word2vec : 0.6700097012259578
retrofitting : 0.9023682559373559
 
v1 : 0.9490976274917442
v2 : 0.9421475806597656


In [19]:
# Word pair: 'ドル' (dollar) and '円' (yen); the checkSim call is disabled
v1 = 'ドル'
v2 = '円'
# checkSim(v1, v2)

In [20]:
# Word pair: '生' (life) and '死' (death); the checkSim call is disabled
v1 = '生'
v2 = '死'
# checkSim(v1, v2)

In [21]:
# Word pair: '美術館' (art museum) and '劇場' (theatre); the checkSim call is disabled
v1 = '美術館'
v2 = '劇場'
# checkSim(v1, v2)

In [22]:
# Word pair: '王' (king) and '城' (castle); the checkSim call is disabled
v1 = '王'
v2 = '城'
# checkSim(v1, v2)

In [23]:
# Word pair: 'テレビ' (television) and 'ラジオ' (radio); the checkSim call is disabled
v1 = 'テレビ'
v2 = 'ラジオ'
# checkSim(v1, v2)

In [24]:
# Word pair: '飲む' (to drink) and '食べる' (to eat); the checkSim call is disabled
v1 = '飲む'
v2 = '食べる'
# checkSim(v1, v2)

In [25]:
# Word pair: '医師' (doctor) and '看護師' (nurse); the checkSim call is disabled
v1 = '医師'
v2 = '看護師'
# checkSim(v1, v2)

# -----------wordAnalogy評価-----------

In [26]:
def checkAnalogy(vecs, w_vec, negative=False, threshold=0.3, max_candidates=20):
    """Print words of ``vecs`` whose vectors are similar (or dissimilar) to ``w_vec``.

    The vocabulary is scanned once in iteration order.  A word is kept when
    its similarity to ``w_vec`` passes the current border.  After every
    fifth kept word the border is tightened by 0.05.  The scan stops as
    soon as more than ``max_candidates`` words have been kept, so the
    output is the first words that pass the moving border, not a global
    top-N.  Kept words are printed as "word, similarity", best match first.

    Args:
        vecs: Mapping from word to vector.
        w_vec: Query vector; words whose vector has a different shape are
            reported as invalid and skipped.
        negative: If False (default) keep similar words (similarity >=
            border); if True keep dissimilar ones (similarity <= border).
        threshold: Starting border.  A value that is not positive selects
            the built-in fallbacks (0.9 for similar, 0.2 for dissimilar).
        max_candidates: The scan stops once more than this many words are
            kept.
    """
    # Starting borders
    border_positive = threshold if threshold > 0 else 0.9
    border_negative = threshold if threshold > 0 else 0.2
    print('{} < thd < {}'.format(border_negative, border_positive))

    candidates = {}

    for w in vecs:
        try:
            comparable = w_vec.shape == vecs[w].shape
            s = similarity(w_vec, vecs[w]) if comparable else None
        except (AttributeError, TypeError, ValueError):
            # The vector cannot be compared with the query at all
            s = None
        if s is None:
            print(w + " is not valid word.")
            continue

        keep = s <= border_negative if negative else s >= border_positive
        if keep:
            candidates[w] = s
            # Tighten the border after every fifth accepted word
            if len(candidates) % 5 == 0:
                if negative:
                    border_negative -= 0.05
                else:
                    border_positive += 0.05

        if len(candidates) > max_candidates:
            break

    # Print the kept words, best match first
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

## ・「v1 + v2 - v3」と「v4」の類似度算出

In [27]:
# Analogy words: '兄' (older brother), '姉' (older sister),
# '祖父' (grandfather), '祖母' (grandmother)
v1 = '兄'
v2 = '姉'
v3 = '祖父'
v4 = '祖母'

In [28]:
# Report every analogy word that has no entry in the lexicon
for _label, _word in (("v1", v1), ("v2", v2), ("v3", v3), ("v4", v4)):
    if _word not in lexicon:
        print("{} not found error in dict".format(_label))

# Similarity between "v1 + v2 - v3" and v4 with the original vectors.
# Only lookup failures and shape mismatches are reported as 'error'.
try:
    print('word2vec : {}'.format(similarity(vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3], vecs_wordVecs[v4])))
except (KeyError, ValueError):
    print('error')

# The same comparison with the retrofitted vectors
try:
    print('retrofitting : {}'.format(similarity(vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3], vecs_new_vec[v4])))
except (KeyError, ValueError):
    print('error')

word2vec : 0.3855693677632116
retrofitting : 0.46470763443894203


## ・「v1 + v2 - v3」と近い単語を挙げる→「v4」が結果に出るか

In [40]:
# With the original vectors: list words close to "v1 + v2 - v3"
checkAnalogy(vecs_wordVecs, vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3])
print(' ')
# With the retrofitted vectors
checkAnalogy(vecs_new_vec, vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3])

0.3 < thd < 0.3
754069 is not valid word.
姉, 0.8433279735101462
兄, 0.7622779357580127
妹, 0.7553375375315672
弟, 0.7011033258786433
双子, 0.5541898022518956
夫, 0.5241162973051148
年上, 0.5228285283574702
次女, 0.49053484292919564
兄弟, 0.4801388417438813
叔父, 0.48004084731816965
甥, 0.4646477248521072
母, 0.4618720699656536
長女, 0.4522498603989746
娘。, 0.4484254297338895
彼女, 0.41874909255787995
妻, 0.4180556386599047
息子, 0.38730153006548584
父, 0.3860297212008132
結婚, 0.3574390676533354
男, 0.35215392646403093
2人, 0.32434570907744076
 
0.3 < thd < 0.3
754069 is not valid word.
弟, 0.735540018182597
彼女, 0.5708677571388876
母, 0.543018985376124
息子, 0.5335641681270236
妻, 0.5269046411495635
娘。, 0.5115487458496082
女性, 0.4747866974855413
子供, 0.46754522886759947
彼, 0.464979913115947
父, 0.4579658210443133
結婚, 0.4532385554739422
男性, 0.4523917286989586
子, 0.4158272591171027
2人, 0.41108535260447876
自分, 0.3953707483145449
思う, 0.376416952702512
出演, 0.36485803889961343
メンバー, 0.35210435795316825
後, 0.3356467345493145
活動,

# -----------ある単語の類似する単語を挙げる-----------

In [36]:
def checkSim_by_word(vecs, word, negative=False, threshold=0.3,
                     max_candidates=20, lexicon=None):
    """Print words of ``vecs`` that are similar (or dissimilar) to ``word``.

    The vocabulary is scanned once in iteration order.  A word is kept when
    its similarity to ``word`` passes the current border.  After every
    fifth kept word the border is tightened by 0.05.  The scan stops as
    soon as more than ``max_candidates`` words have been kept, so the
    output is the first words that pass the moving border, not a global
    top-N.  Kept words are printed as "word, similarity", best match first.

    A word missing from the lexicon is only reported; the search still runs.

    Args:
        vecs: Mapping from word to vector.
        word: Query word; must be non-empty and present in ``vecs``.
        negative: If False (default) keep similar words (similarity >=
            border); if True keep dissimilar ones (similarity <= border).
        threshold: Starting border.  A value that is not positive selects
            the built-in fallbacks (0.8 for similar, 0.3 for dissimilar).
        max_candidates: The scan stops once more than this many words are
            kept.
        lexicon: Optional lexicon that is already loaded.  When omitted the
            lexicon is read from ``lexicon_arg`` as before; passing one
            avoids re-reading the file on every call.

    Raises:
        ValueError: If ``word`` is empty.
        LookupError: If ``word`` has no vector in ``vecs``.
    """
    # Starting borders
    border_positive = threshold if threshold > 0 else 0.8
    border_negative = threshold if threshold > 0 else 0.3

    candidates = {}

    # Validate the query word
    if not word:
        raise ValueError("word is missing")
    if word not in vecs:
        raise LookupError("Sorry, this word is not registered in model.")

    w_vec = vecs[word]

    # Check whether the word exists in the knowledge graph (the lexicon)
    if lexicon is None:
        lexicon = read_lexicon(lexicon_arg)
    if word not in lexicon:
        print("not found error in dict")

    for w in vecs:
        try:
            comparable = w_vec.shape == vecs[w].shape
            s = similarity(w_vec, vecs[w]) if comparable else None
        except (AttributeError, TypeError, ValueError):
            # The vector cannot be compared with the query at all
            s = None
        if s is None:
            print(w + " is not valid word.")
            continue

        keep = s <= border_negative if negative else s >= border_positive
        if keep:
            candidates[w] = s
            # Tighten the border after every fifth accepted word
            if len(candidates) % 5 == 0:
                if negative:
                    border_negative -= 0.05
                else:
                    border_positive += 0.05

        if len(candidates) > max_candidates:
            break

    # Print the kept words, best match first
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [37]:
# Query word: '彼女' (she / girlfriend)
word = '彼女'

In [39]:
# With the original vectors
checkSim_by_word(vecs_wordVecs, word)
print(' ')
# With the retrofitted vectors
checkSim_by_word(vecs_new_vec, word)

754069 is not valid word.
彼女, 1.0
彼, 0.7881711200018091
自分, 0.670073388263443
彼ら, 0.640895342831867
自身, 0.5246801791121426
女性, 0.5234110524560043
自ら, 0.5182341702311083
妻, 0.5180212881638562
たち, 0.5033317418848571
その, 0.45799455304278275
う, 0.4364369398637639
そして, 0.4343208537947781
それ, 0.38248465445508495
人, 0.3587975823461335
た, 0.35252306602737504
という, 0.351366702926194
この, 0.3510216224354535
に, 0.33693028236821
の, 0.3280942013100719
を, 0.3240346814325094
、, 0.3091203720976004
 
754069 is not valid word.
彼, 0.864642855479408
自身, 0.5674859399826648
知る, 0.4617902743271744
人, 0.4615519253815723
それ, 0.4602458636767502
家, 0.4544736289164974
者, 0.4413520603902445
その, 0.4247001308463406
いる, 0.41938810278738964
こと, 0.41417631043315545
中, 0.40988732720672916
後, 0.40122504221479827
また, 0.4009454755117826
ある, 0.3994005874913116
なる, 0.3741953852747088
た, 0.3673450095585146
する, 0.3590333640691226
に, 0.34534366653098586
の, 0.3441150380679108
、, 0.33847056209787146
は, 0.31613429952648975
