In [1]:
import sys
sys.getdefaultencoding()

'utf-8'

In [2]:
# -*- coding: utf-8 -*-
# python3
#
import argparse
import gzip
import math
import numpy
import re
import sys
import numpy as np
from copy import deepcopy
import codecs

In [3]:
isNumber = re.compile(r'\d+.*')
def norm_word(word):
    if isNumber.search(word.lower()):
        return '---num---'
    elif re.sub(r'\W+', '', word) == '':
        return '---punc---'
    else:
        return word.lower()

In [4]:
"""Read all the word vectors and normalize them"""
def read_word_vecs(filename):
    wordVectors = {}
    # ファイル読み込み
    if filename.endswith('.gz'): 
        fileObject = gzip.open(filename, 'r')
    else: 
        fileObject = codecs.open(filename, "r", "utf-8", 'ignore')
        
    for line in fileObject:
        # line = line.strip().lower()
        line = line.strip()
        word = line.split()[0]
        wordVectors[word] = numpy.zeros(len(line.split())-1, dtype=float)
        for index, vecVal in enumerate(line.split()[1:]):
            wordVectors[word][index] = float(vecVal)
        """normalize weight vector"""
        wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6)

    sys.stderr.write("Vectors read from: "+filename+" \n")
    return wordVectors

In [5]:
"""Write word vectors to file"""
def print_word_vecs(wordVectors, outFileName):
    sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
    outFile = open(outFileName, 'w')  
    for word, values in wordVectors.items():
        outFile.write(word+' ')
        for val in wordVectors[word]:
            outFile.write('%.4f' %(val)+' ')
        outFile.write('\n')      
    outFile.close()

In [6]:
"""Read the PPDB.etc word relations as a dictionary"""
def read_lexicon(filename):
    lexicon = {}
    fileObject = open(filename, 'r')
    for line in fileObject:
        words = line.lower().strip().split()
        lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
    return lexicon

In [7]:
"""Retrofit word vectors to a lexicon"""
def retrofit(wordVecs, lexicon, numIters):
    # Input word vecs
    newWordVecs = deepcopy(wordVecs)
    # Input word vecsの単語リスト
    wvVocab = set(newWordVecs.keys())
    # wvVocabとlexiconの共通単語
    loopVocab = wvVocab.intersection(set(lexicon.keys()))

    for _ in range(numIters): #10回程度
        # loop through every node also in ontology (else just use data estimate)
        for word in loopVocab:
            # lexicon wordの近傍単語とwvVocabの共通単語とその個数
            wordNeighbours = set(lexicon[word]).intersection(wvVocab)
            numNeighbours = len(wordNeighbours)
            # no neighbours -> pass - use data estimate
            if numNeighbours == 0:
                continue
            """分散表現の更新手続き"""
            # the weight of the data estimate if the number of neighbours
            newVec = numNeighbours * wordVecs[word]
            # loop over neighbours and add to new vector (currently with weight 1)
            for ppWord in wordNeighbours:
                newVec += newWordVecs[ppWord]
            newWordVecs[word] = newVec/(2*numNeighbours)
    return newWordVecs

In [8]:
def similarity(v1, v2):
    n1 = np.linalg.norm(v1) # v1のノルム
    n2 = np.linalg.norm(v2) # v2のノルム
    return np.dot(v1, v2) / (n1*n2) # 内積 / 

### 変数設定

In [9]:
# Input word vecs -> original
input_arg = './word2vec/vectors.model'
# Lexicon file name
lexicon_arg = './lexicons/wordnet-jpn.txt'
# Lexicon file name（keyとvalueに共通の単語を含まない）
lexicon_arg = './result.txt'
# Num iterations
numiter_arg = 10
# Output word vecs -> retrofitting
output_arg = './out_vec.txt'

In [10]:
numIter = int(numiter_arg)

In [11]:
outFileName = output_arg

In [12]:
lexicon = read_lexicon(lexicon_arg)

In [13]:
wordVecs = read_word_vecs(input_arg)

Vectors read from: ./word2vec/vectors.model 


In [14]:
new_vec = retrofit(wordVecs, lexicon, numIter)

### retrofittingした分散表現を保存する

In [15]:
"""Enrich the word vectors using ppdb and print the enriched vectors"""
print_word_vecs(new_vec, outFileName)


Writing down the vectors in ./out_vec.txt


### 評価

In [16]:
# word2vec
vecs_wordVecs = wordVecs

In [17]:
# retrofitting
vecs_new_vec = new_vec

In [18]:
word = '年'
newWordVecs = deepcopy(wordVecs)
wvVocab = set(wordVecs.keys())
wordNeighbours = set(lexicon[word]).intersection(wvVocab)
print('wordNeighbours : {}'.format(wordNeighbours))
numNeighbours = len(wordNeighbours)
print('numNeighbours : {}'.format(numNeighbours))
newVec = numNeighbours * wordVecs[word]
for ppWord in wordNeighbours:
    newVec += newWordVecs[ppWord]
print('分母 : {}'.format(2*numNeighbours))
newWordVecs[word] = newVec/(2*numNeighbours)

wordNeighbours : {'高齢', '年中', '老年', '歳', '年紀', '一年', '年歯', '年間', '老齢', '老いらく', '年令', '年度', '老い', '年齢', '馬齢', '年頃', '年次', '齢', '歳次', '年代'}
numNeighbours : 20
分母 : 40


## ●wordsim算出

In [137]:
def checkSim(v1, v2):
    if v1 not in lexicon:
        print("v1 not found error in dict")
    else:
        wordNeighbours_1 = set(lexicon[v1]).intersection(set(wordVecs.keys()))
        print('wordNeighbours_v1 : {}'.format(wordNeighbours_1))
        print('numNeighbours_v1 : {}'.format(len(wordNeighbours_1)))
    if v2 not in lexicon:
        print("v2 not found error in dict")
    else:
        wordNeighbours_2 = set(lexicon[v2]).intersection(set(wordVecs.keys()))
        print('wordNeighbours_v2 : {}'.format(wordNeighbours_2))
        print('numNeighbours_v2 : {}'.format(len(wordNeighbours_2)))
    print(" ")
    print('wordNeighboursの共通単語 : {}'.format(set(wordNeighbours_1).intersection(set(wordNeighbours_2))))
    print(" ")

    """v1とv2のword2vecとretrofittingにおける類似度"""
    try:
        print("word2vec : {}".format(similarity(vecs_wordVecs[v1], vecs_wordVecs[v2])))
    except:
        print('error')

    try:
        print("retrofitting : {}".format(similarity(vecs_new_vec[v1], vecs_new_vec[v2])))
    except:
        print('error')

    print(" ")
    """同一単語におけるword2vecとretrofittingの類似度"""
    try:
        print("v1 : {}".format(similarity(vecs_wordVecs[v1], vecs_new_vec[v1])))
    except:
        print('error')

    try:
        print("v2 : {}".format(similarity(vecs_wordVecs[v2], vecs_new_vec[v2])))
    except:
        print('error')

In [226]:
v1 = 'トラ'
v2 = 'ネコ'
checkSim(v1, v2)

wordNeighbours_v1 : {'虎', 'タイガー'}
numNeighbours_v1 : 2
wordNeighbours_v2 : {'ねんねこ', '猫', 'にゃんにゃん'}
numNeighbours_v2 : 3
 
wordNeighboursの共通単語 : set()
 
word2vec : 0.5214821607397867
retrofitting : 0.545527993271341
 
v1 : 0.9460903473614132
v2 : 0.9137972129149805


In [220]:
ex_1_dict = deepcopy(vecs_wordVecs)

In [None]:
"""明日の方針"""
# トラと虎とタイガーがwordvecのkeyとwordnetのvalueにどの程度含まれているか

# 類似度から特徴を捉える
## （やや）上がりやすい単語の組み合わせの特徴
## （やや）下がりやすい単語の組み合わせの特報
## 類似度が下がっていない組み合わせはないか？（現在，見つかっていない）

# wordnetに含まれているかどうか
## wordnetに含まれているもの同士（？）
## wordnetに片方だけ含まれているもの同士（？）
## wordnetに両方含まれているもの同士（ベクトルは変化しない）

# ベクトル内の比較
## word2vecとretrofittingでどう変わったか（次元）
## 変化のパターンを見つけたり，，，

# イテレーション回数
## イテレーション回数が少ない，周辺単語同士を考慮したベクトル表現になっているはず
## イテレーション回数が多いと，遠い単語同士を考慮したベクトル表現になっているはず

In [222]:
for i in range(10):
    ex_1 = 2*vecs_wordVecs['トラ'] + ex_1_dict['虎'] + ex_1_dict['タイガー']
    ex_1_dict['トラ'] = ex_1/4
    
    ex_1 = 2*vecs_wordVecs['虎'] + ex_1_dict['トラ'] + ex_1_dict['タイガー']
    ex_1_dict['虎'] = ex_1/4
    
    ex_1 = 2*vecs_wordVecs['タイガー'] + ex_1_dict['トラ'] + ex_1_dict['虎']
    ex_1_dict['タイガー'] = ex_1/4

In [223]:
for i in range(10):
    ex_1 = 3*vecs_wordVecs['ネコ'] + ex_1_dict['ねんねこ'] + ex_1_dict['猫'] + ex_1_dict['にゃんにゃん']
    ex_1_dict['ネコ'] = ex_1/6
    
    ex_1 = 9*vecs_wordVecs['ねんねこ'] + ex_1_dict['ネコ'] + ex_1_dict['御寝'] + ex_1_dict['睡眠'] + ex_1_dict['にゃんにゃん'] + ex_1_dict['眠り'] + ex_1_dict['ねね'] + ex_1_dict['就眠'] + ex_1_dict['スリープ'] + ex_1_dict['猫']
    ex_1_dict['ねんねこ'] = ex_1/18
    
    ex_1 = 3*vecs_wordVecs['猫'] + ex_1_dict['ネコ'] + ex_1_dict['ねんねこ'] + ex_1_dict['にゃんにゃん']
    ex_1_dict['猫'] = ex_1/6
    
    ex_1 = 3*vecs_wordVecs['にゃんにゃん'] + ex_1_dict['ネコ'] + ex_1_dict['ねんねこ'] + ex_1_dict['猫']
    ex_1_dict['にゃんにゃん'] = ex_1/6

In [225]:
similarity(vecs_wordVecs['トラ'], vecs_wordVecs['ネコ'])

0.5214821607397867

In [224]:
similarity(ex_1_dict['トラ'], ex_1_dict['ネコ'])

0.5214821350254598

## ●アナロジー算出
### v1+v2-v3, v4の類似度

In [20]:
def checkAnalogy(vecs, w_vec):  
    negative = False # Falseなら似た単語を候補で上げる
    threshold = 0.3 # -1なら閾値固定

    # 閾値の設定
    border_positive = threshold if threshold > 0 else 0.9
    border_negative = threshold if threshold > 0 else 0.2
    print('{} < thd < {}'.format(border_negative, border_positive))

    # 候補数の設定
    max_candidates = 20
    candidates = {}

    for w in vecs:
        try:
            if w_vec.shape != vecs[w].shape:
                raise Exception("size not match")
            s = similarity(w_vec, vecs[w])
        except Exception as ex:
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05

        if len(candidates) > max_candidates:
            break

    # 類義語算出
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [21]:
v1 = '兄'
v2 = '姉'
v3 = '祖父'
v4 = '祖母'

if v1 not in lexicon:
    print("v1 not found error in dict")
if v2 not in lexicon:
    print("v2 not found error in dict")
if v3 not in lexicon:
    print("v3 not found error in dict")
if v4 not in lexicon:
    print("v4 not found error in dict")

try:
    print('word2vec : {}'.format(similarity(vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3], vecs_wordVecs[v4])))
except:
    print('error')

try:
    print('retrofitting : {}'.format(similarity(vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3], vecs_new_vec[v4])))
except:
    print('error')

word2vec : 0.3855693677632116
retrofitting : 0.455162528951861


### v1+v2-v3の上位10単語

In [22]:
checkAnalogy(vecs_wordVecs, vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3])

0.3 < thd < 0.3
754069 is not valid word.
姉, 0.8433279735101462
兄, 0.7622779357580127
妹, 0.7553375375315672
弟, 0.7011033258786433
双子, 0.5541898022518956
夫, 0.5241162973051148
年上, 0.5228285283574702
次女, 0.49053484292919564
兄弟, 0.4801388417438813
叔父, 0.48004084731816965
甥, 0.4646477248521072
母, 0.4618720699656536
長女, 0.4522498603989746
娘。, 0.4484254297338895
彼女, 0.41874909255787995
妻, 0.4180556386599047
息子, 0.38730153006548584
父, 0.3860297212008132
結婚, 0.3574390676533354
男, 0.35215392646403093
2人, 0.32434570907744076


In [23]:
checkAnalogy(vecs_new_vec, vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3])

0.3 < thd < 0.3
754069 is not valid word.
弟, 0.7363773440569735
彼女, 0.5633848313623006
母, 0.5338945818801998
息子, 0.5296488931378206
妻, 0.5185961348730093
娘。, 0.5039797586656193
男, 0.48125902457162945
女性, 0.46642095554235585
子供, 0.4618583755895169
彼, 0.46079785092814735
父, 0.45328663166247235
結婚, 0.44679014492847346
子, 0.4105021820226309
2人, 0.4091101937240521
自分, 0.3918472036563103
思う, 0.37224827169148844
出演, 0.3608708120523299
メンバー, 0.3513207187726019
後, 0.33321416563642
優勝, 0.3050190720842435
活動, 0.3010650673095817


In [24]:
# # ベクトルの設定
# w_vec = vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3]
# vecs = vecs_wordVecs
# for w in vecs:
#     try:
#         if w_vec.shape != vecs[w].shape:
#             raise Exception("size not match")
#         s = similarity(w_vec, vecs[w])
#     except Exception as ex:
#         print(w + " is not valid word.")
#         continue

#     if negative and s <= border_negative:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_negative -= 0.05
#     elif not negative and s >= border_positive:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_positive += 0.05

#     if len(candidates) > max_candidates:
#         break

# # 類義語算出
# sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
# for c in sorted_candidates:
#     print("{0}, {1}".format(c, candidates[c]))

In [25]:
# # ベクトルの設定
# w_vec = vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3]
# vecs = vecs_new_vec
# for w in vecs:
#     try:
#         if w_vec.shape != vecs[w].shape:
#             raise Exception("size not match")
#         s = similarity(w_vec, vecs[w])
#     except Exception as ex:
#         print(w + " is not valid word.")
#         continue

#     if negative and s <= border_negative:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_negative -= 0.05
#     elif not negative and s >= border_positive:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_positive += 0.05

#     if len(candidates) > max_candidates:
#         break

# # 類義語算出
# sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
# for c in sorted_candidates:
#     print("{0}, {1}".format(c, candidates[c]))

## ●wordに対する上位10単語

In [26]:
word = '彼女'
# path = "./fastText/model.vec"
negative = False # Falseなら似た単語を候補で上げる
threshold = 0.6 # -1なら閾値固定

In [27]:
def checkSim_by_word(vecs, word):
    # 閾値の設定
    border_positive = threshold if threshold > 0 else 0.8
    border_negative = threshold if threshold > 0 else 0.3

    # 候補数の設定
    max_candidates = 20
    candidates = {}
    
    # wordの設定確認
    if not word:
        raise Exception("word is missing")

    # wordがモデルにない場合，
    if word not in vecs:
        raise Exception("Sorry, this word is not registered in model.")

    # ベクトルの設定
    w_vec = vecs[word]

    # ナレッジグラフにあるかどうかの確認
    lexicon = read_lexicon(lexicon_arg)
    if word not in lexicon:
    #     raise Exception("not found error in dict")
        print("not found error in dict")
    
    for w in vecs:
        try:
            if w_vec.shape != vecs[w].shape:
                raise Exception("size not match")
            s = similarity(w_vec, vecs[w])
        except Exception as ex:
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05

        if len(candidates) > max_candidates:
            break
    
    # 類義語算出
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [28]:
checkSim_by_word(vecs_wordVecs, word)

754069 is not valid word.
彼女, 1.0
彼, 0.7881711200018091
彼女自身, 0.7454731084176603
彼女たち, 0.7367573651943607
彼女ら, 0.6977034431164688
自分, 0.670073388263443
ルービツ, 0.654221608969978
フィリピエヴナ, 0.654103015163623
彼ら, 0.640895342831867
私, 0.6133480216235981


In [29]:
checkSim_by_word(vecs_new_vec, word)

754069 is not valid word.
彼女, 1.0
ガールフレンド, 0.8702002492417416
彼, 0.8646146491713587
恋人, 0.8645802592529555
女友達, 0.8311695203495161
ボーイフレンド, 0.8124718679947734
彼氏, 0.7801455673281661
愛人, 0.7650726037527209
姉, 0.7288931909201077
母親, 0.7185616060061439
彼女たち, 0.7185170521135521
少女, 0.7085028367621702
夫, 0.7052696026212519
友人, 0.6895894359368071
妻, 0.6895129146007162
女, 0.6675841616411355
女性, 0.6668317039751089
自分, 0.6601343505245548
結婚, 0.6003957244867856


In [30]:
# # wordの設定確認
# if not word:
#     raise Exception("word is missing")
    
# # wordがモデルにない場合，
# if word not in vecs:
#     raise Exception("Sorry, this word is not registered in model.")

# # ベクトルの設定
# w_vec = vecs[word]

# # ナレッジグラフにあるかどうかの確認
# lexicon = read_lexicon(lexicon_arg)
# if word not in lexicon:
# #     raise Exception("not found error in dict")
#     print("not found error in dict")

# # 閾値の設定
# border_positive = threshold if threshold > 0 else 0.8
# border_negative = threshold if threshold > 0 else 0.3

# # 候補数の設定
# max_candidates = 20
# candidates = {}

In [31]:
# for w in vecs:
#     try:
#         if w_vec.shape != vecs[w].shape:
#             raise Exception("size not match")
#         s = similarity(w_vec, vecs[w])
#     except Exception as ex:
#         print(w + " is not valid word.")
#         continue

#     if negative and s <= border_negative:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_negative -= 0.05
#     elif not negative and s >= border_positive:
#         candidates[w] = s
#         if len(candidates) % 5 == 0:
#             border_positive += 0.05

#     if len(candidates) > max_candidates:
#         break

In [32]:
# # 類義語算出
# sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
# for c in sorted_candidates:
#     print("{0}, {1}".format(c, candidates[c]))