In [1]:
import sys
sys.getdefaultencoding()

'utf-8'

In [2]:
# -*- coding: utf-8 -*-
# python3
#
import argparse
import gzip
import math
import numpy
import re
import sys
import numpy as np
from copy import deepcopy
import codecs

In [3]:
isNumber = re.compile(r'\d+.*')
def norm_word(word):
    if isNumber.search(word.lower()):
        return '---num---'
    elif re.sub(r'\W+', '', word) == '':
        return '---punc---'
    else:
        return word.lower()

In [4]:
"""Read all the word vectors and normalize them"""
def read_word_vecs(filename):
    wordVectors = {}
    # ファイル読み込み
    if filename.endswith('.gz'): 
        fileObject = gzip.open(filename, 'r')
    else: 
        fileObject = codecs.open(filename, "r", "utf-8", 'ignore')
        
    for line in fileObject:
        # line = line.strip().lower()
        line = line.strip()
        word = line.split()[0]
        wordVectors[word] = numpy.zeros(len(line.split())-1, dtype=float)
        for index, vecVal in enumerate(line.split()[1:]):
            wordVectors[word][index] = float(vecVal)
        """normalize weight vector"""
        wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6)

    sys.stderr.write("Vectors read from: "+filename+" \n")
    return wordVectors

In [5]:
"""Write word vectors to file"""
def print_word_vecs(wordVectors, outFileName):
    sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
    outFile = open(outFileName, 'w')  
    for word, values in wordVectors.items():
        outFile.write(word+' ')
        for val in wordVectors[word]:
            outFile.write('%.4f' %(val)+' ')
        outFile.write('\n')      
    outFile.close()

In [6]:
"""Read the PPDB.etc word relations as a dictionary"""
def read_lexicon(filename):
    lexicon = {}
    fileObject = open(filename, 'r')
    for line in fileObject:
        words = line.lower().strip().split()
        lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
    return lexicon

In [7]:
"""Retrofit word vectors to a lexicon"""
def retrofit(wordVecs, lexicon, numIters):
    # Input word vecs
    newWordVecs = deepcopy(wordVecs)
    # Input word vecsの単語リスト
    wvVocab = set(newWordVecs.keys())
    # wvVocabとlexiconの共通単語
    loopVocab = wvVocab.intersection(set(lexicon.keys()))

    for _ in range(numIters): #10回程度
        # loop through every node also in ontology (else just use data estimate)
        for word in loopVocab:
            # lexicon wordの近傍単語とwvVocabの共通単語とその個数
            wordNeighbours = set(lexicon[word]).intersection(wvVocab)
            numNeighbours = len(wordNeighbours)
            # no neighbours -> pass - use data estimate
            if numNeighbours == 0:
                continue
            """分散表現の更新手続き"""
            # the weight of the data estimate if the number of neighbours
            newVec = numNeighbours * wordVecs[word]
            # loop over neighbours and add to new vector (currently with weight 1)
            for ppWord in wordNeighbours:
                newVec += newWordVecs[ppWord]
            newWordVecs[word] = newVec/(2*numNeighbours)
    return newWordVecs

In [8]:
def similarity(v1, v2):
    n1 = np.linalg.norm(v1) # v1のノルム
    n2 = np.linalg.norm(v2) # v2のノルム
    return np.dot(v1, v2) / (n1*n2) # 内積 / 

### 変数設定

In [9]:
# Input word vecs -> original
input_arg = './word2vec/vectors.model'
# Lexicon file name
lexicon_arg = './lexicons/wordnet-jpn.txt'
# Lexicon file name（keyとvalueに共通の単語を含まない）
lexicon_arg = './result.txt'
# Num iterations
numiter_arg = 10
# Output word vecs -> retrofitting
output_arg = './out_vec.txt'

In [10]:
numIter = int(numiter_arg)

In [11]:
outFileName = output_arg

In [12]:
lexicon = read_lexicon(lexicon_arg)

In [13]:
wordVecs = read_word_vecs(input_arg)

Vectors read from: ./word2vec/vectors.model 


In [14]:
new_vec = retrofit(wordVecs, lexicon, numIter)

### retrofittingした分散表現を保存する

In [15]:
"""Enrich the word vectors using ppdb and print the enriched vectors"""
print_word_vecs(new_vec, outFileName)


Writing down the vectors in ./out_vec.txt


### 評価

In [16]:
# word2vec
vecs_wordVecs = wordVecs

In [17]:
# retrofitting
vecs_new_vec = new_vec

In [18]:
word = '年'
newWordVecs = deepcopy(wordVecs)
wvVocab = set(wordVecs.keys())
wordNeighbours = set(lexicon[word]).intersection(wvVocab)
print('wordNeighbours : {}'.format(wordNeighbours))
numNeighbours = len(wordNeighbours)
print('numNeighbours : {}'.format(numNeighbours))
newVec = numNeighbours * wordVecs[word]
for ppWord in wordNeighbours:
    newVec += newWordVecs[ppWord]
print('分母 : {}'.format(2*numNeighbours))
newWordVecs[word] = newVec/(2*numNeighbours)

wordNeighbours : {'齢', '一年', '年紀', '年次', '老年', '年歯', '年中', '老いらく', '老い', '高齢', '年代', '年間', '老齢', '年頃', '年齢', '年令', '馬齢', '歳', '歳次', '年度'}
numNeighbours : 20
分母 : 40


## ●wordsim算出

In [102]:
def checkSim(v1, v2):
    if v1 not in lexicon: # 注目単語がwordnetに含まれない場合，
        print("v1(={})はwordnetのkeyに存在しません".format(v1))
        wordNeighbours_1 = set()
    else:
        # 注目単語の同義語リストとword2vecのkeyリストと重複する単語リスト（更新対象か？）
        wordNeighbours_1 = set(lexicon[v1]).intersection(set(wordVecs.keys()))
        print('v1(={})における更新対象のneighbour数 : {}'.format(v1, len(wordNeighbours_1)))
        
    # neighboursの中で注目単語と類似度が低いもの
    ave_1_word2vec = np.zeros_like(wordVecs[v1])  #neighboursの平均ベクトル
    ave_1_retrofit = np.zeros_like(wordVecs[v1])
    for neighbour in wordNeighbours_1:
        print('v1とneighbour(={}) : {} -> {}'.format(neighbour, 
                                                                   similarity(wordVecs[neighbour], wordVecs[v1]), 
                                                                   similarity(vecs_new_vec[neighbour], vecs_new_vec[v1])))
        ave_1_word2vec += wordVecs[neighbour]
        ave_1_retrofit += vecs_new_vec[neighbour]
    print('v1とneighboursの平均ベクトル : {} -> {}'.format(similarity(ave_1_word2vec/len(wordNeighbours_1), wordVecs[v1]), 
                                                              similarity(ave_1_retrofit/len(wordNeighbours_1), vecs_new_vec[v1])))
    print(" ")
    
    if v2 not in lexicon:
        print("v2(={})はwordnetのkeyに存在しません".format(v2))
        wordNeighbours_2 = set()
    else:
        wordNeighbours_2 = set(lexicon[v2]).intersection(set(wordVecs.keys()))
        print('v2(={})における更新対象のneighbour数 : {}'.format(v2, len(wordNeighbours_2))) # 更新対象数
    
    # neighboursの中で注目単語と類似度が低いもの
    ave_2_word2vec = np.zeros_like(wordVecs[v2])  #neighboursの平均ベクトル
    ave_2_retrofit = np.zeros_like(wordVecs[v2])
    for neighbour in wordNeighbours_2:
        print('v2とneighbour(={}) : {} -> {}'.format(neighbour, 
                                                                   similarity(wordVecs[neighbour], wordVecs[v2]), 
                                                                   similarity(vecs_new_vec[neighbour], vecs_new_vec[v2])))
        ave_2_word2vec += wordVecs[neighbour]
        ave_2_retrofit += vecs_new_vec[neighbour]
    print('v2とneighboursの平均ベクトル : {} -> {}'.format(similarity(ave_2_word2vec/len(wordNeighbours_2), wordVecs[v2]), 
                                                              similarity(ave_2_retrofit/len(wordNeighbours_2), vecs_new_vec[v2])))
    print(" ")
    
    # v1とv2における更新対象のneighbourに重複があるか
    print('更新対象における重複数 : {}'.format(len(set(wordNeighbours_1).intersection(set(wordNeighbours_2)))))
    print(" ")

    """v1とv2のword2vecとretrofittingにおける類似度"""
    try:
        print("word2vec : {}".format(similarity(vecs_wordVecs[v1], vecs_wordVecs[v2])))
    except:
        print('error')

    try:
        print("retrofitting : {}".format(similarity(vecs_new_vec[v1], vecs_new_vec[v2])))
    except:
        print('error')

    print(" ")
    """同一単語におけるword2vecとretrofittingの類似度"""
    try:
        print("v1 : {}".format(similarity(vecs_wordVecs[v1], vecs_new_vec[v1])))
    except:
        print('error')

    try:
        print("v2 : {}".format(similarity(vecs_wordVecs[v2], vecs_new_vec[v2])))
    except:
        print('error')

### マイナス

In [111]:
# 共通項がないやつで最終値が一番高い
v1 = 'ドル'
v2 = '円'
checkSim(v1, v2)

v1(=ドル)はwordnetのkeyに存在しません
v1とneighboursの平均ベクトル : nan -> nan
 
v2(=円)における更新対象のneighbour数 : 8
v2とneighbour(=巡回) : 0.1131635014205176 -> 0.4291887263221031




v2とneighbour(=丸形) : 0.21967795296196008 -> 0.6015340394512537
v2とneighbour(=サークル) : 0.17831748010221335 -> 0.4648514078651976
v2とneighbour(=循環) : 0.15366664240606967 -> 0.4915772471454811
v2とneighbour(=丸) : 0.27237524443096445 -> 0.5800608916545316
v2とneighbour(=円型) : 0.22594052119243904 -> 0.6060745381380435
v2とneighbour(=輪) : 0.2726846607881181 -> 0.5878950110144121
v2とneighbour(=円形) : 0.34518353641197497 -> 0.6758589356721078
v2とneighboursの平均ベクトル : 0.39251172887826713 -> 0.7335681831744241
 
更新対象における重複数 : 0
 
word2vec : 0.5411942571094619
retrofitting : 0.5183508062187686
 
v1 : 1.0
v2 : 0.9399721324548693


### 共通

In [107]:
# 共通項があるやつで最終値が一番高い
v1 = 'フットボール'
v2 = 'サッカー'
checkSim(v1, v2)

v1(=フットボール)における更新対象のneighbour数 : 4
v1とneighbour(=アソシエーションフットボール) : 0.44592722294673065 -> 0.8335826927035681
v1とneighbour(=フートボール) : 0.41945202281722355 -> 0.8278195137015855
v1とneighbour(=蹴球) : 0.4714661654042284 -> 0.8447621939082396
v1とneighbour(=サッカー) : 0.6700097012259578 -> 0.9024019593223032
v1とneighboursの平均ベクトル : 0.6533150118662724 -> 0.9100273993864155
 
v2(=サッカー)における更新対象のneighbour数 : 4
v2とneighbour(=アソシエーションフットボール) : 0.3838424344300787 -> 0.8132863977242077
v2とneighbour(=蹴球) : 0.49144473934055627 -> 0.8494863730782928
v2とneighbour(=フートボール) : 0.3560841190233665 -> 0.8074712182772082
v2とneighbour(=フットボール) : 0.6700097012259578 -> 0.9024019593223032
v2とneighboursの平均ベクトル : 0.6121738109726408 -> 0.8983739758369889
 
更新対象における重複数 : 3
 
word2vec : 0.6700097012259578
retrofitting : 0.9024019593223032
 
v1 : 0.9490886483333562
v2 : 0.9421231737699743


### 低い

In [105]:
v1 = '生'
v2 = '死'
checkSim(v1, v2)

v1(=生)における更新対象のneighbour数 : 26
v1とneighbour(=ライフ) : 0.17652417447173702 -> 0.43791106925070644
v1とneighbour(=毛並) : 0.24577676333904072 -> 0.572265884228388
v1とneighbour(=血統) : 0.14742782305704158 -> 0.526484784694368
v1とneighbour(=出自) : 0.16291115673841247 -> 0.5404569876542337
v1とneighbour(=身元) : 0.17757387864207355 -> 0.5072710415210289
v1とneighbour(=門閥) : 0.15888040040420326 -> 0.49112465754259804
v1とneighbour(=血脈) : 0.06475758927170373 -> 0.4700031284223761
v1とneighbour(=系統) : 0.20493965847265672 -> 0.5114783957811527
v1とneighbour(=毛並み) : 0.07192749128803916 -> 0.45536094889912493
v1とneighbour(=家系) : 0.10273288731876168 -> 0.5004052368505814
v1とneighbour(=命) : 0.2019877674547781 -> 0.4559616578569955
v1とneighbour(=種姓) : 0.2628761047023309 -> 0.5665000055647801
v1とneighbour(=閥) : 0.1931630728994868 -> 0.4937570369455544
v1とneighbour(=門) : 0.17700279978925296 -> 0.4484143125201698
v1とneighbour(=氏素性) : 0.11371815329503893 -> 0.5076226578073788
v1とneighbour(=生まれ) : 0.17343376652421144 

In [118]:
v1 = '美術館'
v2 = '劇場'
checkSim(v1, v2)

v1(=美術館)における更新対象のneighbour数 : 4
v1とneighbour(=ギャラリー) : 0.7021229078972576 -> 0.915751866800859
v1とneighbour(=画廊) : 0.6152072871038751 -> 0.8807404110283732
v1とneighbour(=ミュージアム) : 0.6293827471798114 -> 0.8964914766435316
v1とneighbour(=博物館) : 0.7262213696149932 -> 0.9141538954372825
v1とneighboursの平均ベクトル : 0.8186927006032015 -> 0.9666284454731989
 
v2(=劇場)における更新対象のneighbour数 : 5
v2とneighbour(=能楽堂) : 0.3547952875318167 -> 0.7718612705973279
v2とneighbour(=芝居小屋) : 0.48358660663894676 -> 0.8210933834428926
v2とneighbour(=戯場) : 0.21712467833665977 -> 0.7150136651358888
v2とneighbour(=シアター) : 0.6401493559053145 -> 0.8757172267540604
v2とneighbour(=テアトル) : 0.5394930833350355 -> 0.822213310854582
v2とneighboursの平均ベクトル : 0.6630547249790616 -> 0.9011015248957814
 
更新対象における重複数 : 0
 
word2vec : 0.42690805296907663
retrofitting : 0.5202503207311584
 
v1 : 0.977424165460638
v2 : 0.9574142056264471


In [112]:
# 共通項がないやつで最終値が一番高い
v1 = '王'
v2 = '城'
checkSim(v1, v2)

v1(=王)における更新対象のneighbour数 : 14
v1とneighbour(=王者) : 0.3101894359652949 -> 0.562964970904805
v1とneighbour(=王さま) : 0.2860579492700634 -> 0.6512339301522426
v1とneighbour(=クイーン) : 0.24872054073498523 -> 0.5972356601724406
v1とneighbour(=クィーン) : 0.21705241353816102 -> 0.5813464441629039
v1とneighbour(=王様) : 0.4239317157852163 -> 0.7312840466383078
v1とneighbour(=豪商) : 0.2599904642630862 -> 0.5687664870691005
v1とneighbour(=大立者) : 0.1979678673564072 -> 0.5440496504516424
v1とneighbour(=レックス) : 0.12846845975009447 -> 0.6034419344930807
v1とneighbour(=キング) : 0.27940652363442164 -> 0.6843970094996994
v1とneighbour(=君王) : 0.3819989838711822 -> 0.6770689717690949
v1とneighbour(=財界人) : 0.13009340003740402 -> 0.43928054528802907
v1とneighbour(=巨頭) : 0.19682008052235375 -> 0.5026156264495584
v1とneighbour(=第一人者) : 0.17573438535269445 -> 0.5852458595220515
v1とneighbour(=国王) : 0.6917026014021463 -> 0.8529199498319596
v1とneighboursの平均ベクトル : 0.5390497618857002 -> 0.8343685647230328
 
v2(=城)における更新対象のneighbour数 : 25

### 高い

In [109]:
# 共通項がないやつで最終値が一番高い
v1 = 'テレビ'
v2 = 'ラジオ'
checkSim(v1, v2)

v1(=テレビ)における更新対象のneighbour数 : 4
v1とneighbour(=テレビ受像機) : 0.5441313221781042 -> 0.8613849019964129
v1とneighbour(=テレビ放送) : 0.6787932434324452 -> 0.9056473604391946
v1とneighbour(=テレビジョン) : 0.5616445110157754 -> 0.8723406943883184
v1とneighbour(=テレヴィジョン) : 0.37064471779783603 -> 0.8088971992240203
v1とneighboursの平均ベクトル : 0.723982904839291 -> 0.9298441484397826
 
v2(=ラジオ)における更新対象のneighbour数 : 2
v2とneighbour(=チューナー) : 0.42850172988676133 -> 0.8709666273032112
v2とneighbour(=チューナ) : 0.3938496885949077 -> 0.8624978933689752
v2とneighboursの平均ベクトル : 0.45522564671014276 -> 0.884955517314295
 
更新対象における重複数 : 0
 
word2vec : 0.7361300573266628
retrofitting : 0.7592624761244849
 
v1 : 0.9616185424261647
v2 : 0.9217192863974057


In [113]:
# 共通項がないやつで最終値が一番高い
v1 = '飲む'
v2 = '食べる'
checkSim(v1, v2)

v1(=飲む)における更新対象のneighbour数 : 10
v1とneighbour(=押しまくる) : 0.29192872169906087 -> 0.6494121335549915
v1とneighbour(=圧す) : 0.21701712636899895 -> 0.5871563986556991
v1とneighbour(=飲酒) : 0.5462529604926939 -> 0.8198364349988605
v1とneighbour(=飲みこむ) : 0.34430900120028607 -> 0.6647725687899138
v1とneighbour(=呑む) : 0.6053115809833923 -> 0.8095862008190163
v1とneighbour(=呑みこむ) : 0.3170492369722226 -> 0.6353214396254244
v1とneighbour(=押す) : 0.2621257996819309 -> 0.5859205498007677
v1とneighbour(=圧倒) : 0.22653396847640545 -> 0.6242290161236901
v1とneighbour(=呑み込む) : 0.3827670006466599 -> 0.6760116046036065
v1とneighbour(=圧する) : 0.15395362894024356 -> 0.5172680632146465
v1とneighboursの平均ベクトル : 0.5383063562437775 -> 0.82455317801076
 
v2(=食べる)における更新対象のneighbour数 : 12
v2とneighbour(=食む) : 0.38105936119163064 -> 0.7170363742151195
v2とneighbour(=召し上がる) : 0.49685808939480847 -> 0.7677150899669016
v2とneighbour(=喰う) : 0.4828591555898598 -> 0.7221916464278607
v2とneighbour(=食する) : 0.816408708129915 -> 0.91782644938044

In [119]:
# 共通項がないやつで最終値が一番高い
v1 = '医師'
v2 = '看護師'
checkSim(v1, v2)

v1(=医師)における更新対象のneighbour数 : 11
v1とneighbour(=薬師) : 0.19296580520475656 -> 0.6480136951021512
v1とneighbour(=ドクター) : 0.4804415296902404 -> 0.7746588857647472
v1とneighbour(=医員) : 0.484714624518716 -> 0.7857234703444463
v1とneighbour(=開業医) : 0.6909833714035468 -> 0.8743027476679038
v1とneighbour(=医家) : 0.43677196823172926 -> 0.764525263642084
v1とneighbour(=国手) : 0.20735460565847408 -> 0.5062540810772626
v1とneighbour(=杏林) : 0.19466620372975943 -> 0.6494180845807439
v1とneighbour(=医) : 0.7552187090764066 -> 0.8939365285647622
v1とneighbour(=ドクトル) : 0.3581458967787516 -> 0.726976044274809
v1とneighbour(=医者) : 0.7569605809715519 -> 0.9026890514771313
v1とneighbour(=内科医) : 0.7212434425390394 -> 0.8871307653911633
v1とneighboursの平均ベクトル : 0.7936331349555631 -> 0.9231214358404376
 
v2(=看護師)における更新対象のneighbour数 : 4
v2とneighbour(=看護婦) : 0.7768066373887185 -> 0.9401185437306926
v2とneighbour(=看護士) : 0.6944085594045406 -> 0.916656789019836
v2とneighbour(=ナース) : 0.5219851825453762 -> 0.8674098573168365
v2とneigh

### トラと虎とタイガーがwordvecのkeyとwordnetのvalueにどの程度含まれているか

### ●類似度から特徴を捉える
#### ・（やや）上がりやすい単語の組み合わせの特徴
#### ・（やや）下がりやすい単語の組み合わせの特報
#### ・類似度が下がっていない組み合わせはないか？（現在，見つかっていない）

### ●wordnetに含まれているかどうか
#### ・wordnetに含まれているもの同士（？）
#### ・wordnetに片方だけ含まれているもの同士（？）
#### ・wordnetに両方含まれているもの同士（ベクトルは変化しない）

### ●ベクトル内の比較
#### ・word2vecとretrofittingでどう変わったか（次元）
#### ・変化のパターンを見つけたり，，，

### ●イテレーション回数
#### ・イテレーション回数が少ない，周辺単語同士を考慮したベクトル表現になっているはず
#### ・イテレーション回数が多いと，遠い単語同士を考慮したベクトル表現になっているはず

In [26]:
# deepcopy
ex_1_dict = deepcopy(vecs_wordVecs)

In [27]:
# 更新手続き
for i in range(10):
    ex_1 = 2*vecs_wordVecs['トラ'] + ex_1_dict['虎'] + ex_1_dict['タイガー']
    ex_1_dict['トラ'] = ex_1/4
    
    ex_1 = 2*vecs_wordVecs['虎'] + ex_1_dict['トラ'] + ex_1_dict['タイガー']
    ex_1_dict['虎'] = ex_1/4
    
    ex_1 = 2*vecs_wordVecs['タイガー'] + ex_1_dict['トラ'] + ex_1_dict['虎']
    ex_1_dict['タイガー'] = ex_1/4

In [28]:
for i in range(10):
    ex_1 = 3*vecs_wordVecs['ネコ'] + ex_1_dict['ねんねこ'] + ex_1_dict['猫'] + ex_1_dict['にゃんにゃん']
    ex_1_dict['ネコ'] = ex_1/6
    
    ex_1 = 9*vecs_wordVecs['ねんねこ'] + ex_1_dict['ネコ'] + ex_1_dict['御寝'] + ex_1_dict['睡眠'] + ex_1_dict['にゃんにゃん'] + ex_1_dict['眠り'] + ex_1_dict['ねね'] + ex_1_dict['就眠'] + ex_1_dict['スリープ'] + ex_1_dict['猫']
    ex_1_dict['ねんねこ'] = ex_1/18
    
    ex_1 = 3*vecs_wordVecs['猫'] + ex_1_dict['ネコ'] + ex_1_dict['ねんねこ'] + ex_1_dict['にゃんにゃん']
    ex_1_dict['猫'] = ex_1/6
    
    ex_1 = 3*vecs_wordVecs['にゃんにゃん'] + ex_1_dict['ネコ'] + ex_1_dict['ねんねこ'] + ex_1_dict['猫']
    ex_1_dict['にゃんにゃん'] = ex_1/6

In [29]:
# word2vec
similarity(vecs_wordVecs['トラ'], vecs_wordVecs['ネコ'])

0.46740306874140686

In [30]:
# retrofitting
similarity(ex_1_dict['トラ'], ex_1_dict['ネコ'])

0.5452202960439427

## ●アナロジー算出
### v1+v2-v3, v4の類似度

In [31]:
def checkAnalogy(vecs, w_vec):  
    negative = False # Falseなら似た単語を候補で上げる
    threshold = 0.3 # -1なら閾値固定

    # 閾値の設定
    border_positive = threshold if threshold > 0 else 0.9
    border_negative = threshold if threshold > 0 else 0.2
    print('{} < thd < {}'.format(border_negative, border_positive))

    # 候補数の設定
    max_candidates = 20
    candidates = {}

    for w in vecs:
        try:
            if w_vec.shape != vecs[w].shape:
                raise Exception("size not match")
            s = similarity(w_vec, vecs[w])
        except Exception as ex:
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05

        if len(candidates) > max_candidates:
            break

    # 類義語算出
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [32]:
v1 = '兄'
v2 = '姉'
v3 = '祖父'
v4 = '祖母'

if v1 not in lexicon:
    print("v1 not found error in dict")
if v2 not in lexicon:
    print("v2 not found error in dict")
if v3 not in lexicon:
    print("v3 not found error in dict")
if v4 not in lexicon:
    print("v4 not found error in dict")

try:
    print('word2vec : {}'.format(similarity(vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3], vecs_wordVecs[v4])))
except:
    print('error')

try:
    print('retrofitting : {}'.format(similarity(vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3], vecs_new_vec[v4])))
except:
    print('error')

word2vec : 0.3855693677632116
retrofitting : 0.4551626248488288


### v1+v2-v3の上位10単語

In [33]:
checkAnalogy(vecs_wordVecs, vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3])

0.3 < thd < 0.3
754069 is not valid word.
姉, 0.8433279735101462
兄, 0.7622779357580127
妹, 0.7553375375315672
弟, 0.7011033258786433
双子, 0.5541898022518956
夫, 0.5241162973051148
年上, 0.5228285283574702
次女, 0.49053484292919564
兄弟, 0.4801388417438813
叔父, 0.48004084731816965
甥, 0.4646477248521072
母, 0.4618720699656536
長女, 0.4522498603989746
娘。, 0.4484254297338895
彼女, 0.41874909255787995
妻, 0.4180556386599047
息子, 0.38730153006548584
父, 0.3860297212008132
結婚, 0.3574390676533354
男, 0.35215392646403093
2人, 0.32434570907744076


In [34]:
checkAnalogy(vecs_new_vec, vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3])

0.3 < thd < 0.3
754069 is not valid word.
弟, 0.7363773428035553
彼女, 0.5633848612459265
母, 0.5338945985417203
息子, 0.5296489064677692
妻, 0.5185961666462241
娘。, 0.5039797679904815
男, 0.48125900743836425
女性, 0.46642097473328686
子供, 0.46185837631087945
彼, 0.46079802351631
父, 0.453286636253022
結婚, 0.4467901599528763
子, 0.41050218358373636
2人, 0.4091102121032748
自分, 0.3918471991585603
思う, 0.3722482847040549
出演, 0.3608708000529299
メンバー, 0.35132074590314577
後, 0.3332142011063982
優勝, 0.3050190898147509
活動, 0.30106505940926076


In [35]:
# # ベクトルの設定
# w_vec = vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3]
# vecs = vecs_wordVecs
# for w in vecs:
#     try:
#         if w_vec.shape != vecs[w].shape:
#             raise Exception("size not match")
#         s = similarity(w_vec, vecs[w])
#     except Exception as ex:
#         print(w + " is not valid word.")
#         continue

#     if negative and s <= border_negative:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_negative -= 0.05
#     elif not negative and s >= border_positive:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_positive += 0.05

#     if len(candidates) > max_candidates:
#         break

# # 類義語算出
# sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
# for c in sorted_candidates:
#     print("{0}, {1}".format(c, candidates[c]))

In [36]:
# # ベクトルの設定
# w_vec = vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3]
# vecs = vecs_new_vec
# for w in vecs:
#     try:
#         if w_vec.shape != vecs[w].shape:
#             raise Exception("size not match")
#         s = similarity(w_vec, vecs[w])
#     except Exception as ex:
#         print(w + " is not valid word.")
#         continue

#     if negative and s <= border_negative:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_negative -= 0.05
#     elif not negative and s >= border_positive:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_positive += 0.05

#     if len(candidates) > max_candidates:
#         break

# # 類義語算出
# sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
# for c in sorted_candidates:
#     print("{0}, {1}".format(c, candidates[c]))

## ●wordに対する上位10単語

In [37]:
word = '彼女'
# path = "./fastText/model.vec"
negative = False # Falseなら似た単語を候補で上げる
threshold = 0.6 # -1なら閾値固定

In [38]:
def checkSim_by_word(vecs, word):
    # 閾値の設定
    border_positive = threshold if threshold > 0 else 0.8
    border_negative = threshold if threshold > 0 else 0.3

    # 候補数の設定
    max_candidates = 20
    candidates = {}
    
    # wordの設定確認
    if not word:
        raise Exception("word is missing")

    # wordがモデルにない場合，
    if word not in vecs:
        raise Exception("Sorry, this word is not registered in model.")

    # ベクトルの設定
    w_vec = vecs[word]

    # ナレッジグラフにあるかどうかの確認
    lexicon = read_lexicon(lexicon_arg)
    if word not in lexicon:
    #     raise Exception("not found error in dict")
        print("not found error in dict")
    
    for w in vecs:
        try:
            if w_vec.shape != vecs[w].shape:
                raise Exception("size not match")
            s = similarity(w_vec, vecs[w])
        except Exception as ex:
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05

        if len(candidates) > max_candidates:
            break
    
    # 類義語算出
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [39]:
checkSim_by_word(vecs_wordVecs, word)

754069 is not valid word.
彼女, 1.0
彼, 0.7881711200018091
彼女自身, 0.7454731084176603
彼女たち, 0.7367573651943607
彼女ら, 0.6977034431164688
自分, 0.670073388263443
ルービツ, 0.654221608969978
フィリピエヴナ, 0.654103015163623
彼ら, 0.640895342831867
私, 0.6133480216235981


In [40]:
checkSim_by_word(vecs_new_vec, word)

754069 is not valid word.
彼女, 1.0
ガールフレンド, 0.8702002688497589
彼, 0.8646146980898385
恋人, 0.8645802396686143
女友達, 0.8311694694676265
ボーイフレンド, 0.8124717823463654
彼氏, 0.7801450844152102
愛人, 0.7650726007804837
姉, 0.7288931991907749
母親, 0.7185615661404968
彼女たち, 0.7185170361874652
少女, 0.7085028585700932
夫, 0.7052696073185539
友人, 0.6895894431116874
妻, 0.6895129106064973
女, 0.6675841870719544
女性, 0.6668317027438901
自分, 0.6601343182581626
結婚, 0.6003956506916117


In [41]:
# # wordの設定確認
# if not word:
#     raise Exception("word is missing")
    
# # wordがモデルにない場合，
# if word not in vecs:
#     raise Exception("Sorry, this word is not registered in model.")

# # ベクトルの設定
# w_vec = vecs[word]

# # ナレッジグラフにあるかどうかの確認
# lexicon = read_lexicon(lexicon_arg)
# if word not in lexicon:
# #     raise Exception("not found error in dict")
#     print("not found error in dict")

# # 閾値の設定
# border_positive = threshold if threshold > 0 else 0.8
# border_negative = threshold if threshold > 0 else 0.3

# # 候補数の設定
# max_candidates = 20
# candidates = {}

In [42]:
# for w in vecs:
#     try:
#         if w_vec.shape != vecs[w].shape:
#             raise Exception("size not match")
#         s = similarity(w_vec, vecs[w])
#     except Exception as ex:
#         print(w + " is not valid word.")
#         continue

#     if negative and s <= border_negative:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_negative -= 0.05
#     elif not negative and s >= border_positive:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_positive += 0.05

#     if len(candidates) > max_candidates:
#         break

In [43]:
# # 類義語算出
# sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
# for c in sorted_candidates:
#     print("{0}, {1}".format(c, candidates[c]))