# wordSim評価
# wordAnalogy評価

# -----------初期設定-----------

In [1]:
# -*- coding: utf-8 -*-
# python3
#
import argparse
import gzip
import math
import numpy
import re
import sys
import numpy as np
from copy import deepcopy
import codecs

In [2]:
isNumber = re.compile(r'\d+.*')


def norm_word(word):
    """Map a token to its normalized lexicon form.

    Returns '---num---' when the lower-cased token contains a digit,
    '---punc---' when nothing remains after stripping non-word characters,
    and the lower-cased token otherwise.
    """
    lowered = word.lower()
    # Any digit anywhere in the token marks it as numeric.
    if isNumber.search(lowered):
        return '---num---'
    # Tokens made only of non-word characters (or empty) count as punctuation.
    if not re.sub(r'\W+', '', word):
        return '---punc---'
    return lowered

In [3]:
"""Read all the word vectors and normalize them"""
def read_word_vecs(filename):
    """Read word vectors from a text file and scale each to (about) unit length.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated float components.

    Args:
        filename: Path to the vector file. A name ending in '.gz' is read
            through gzip; anything else is read as plain text. Both are
            decoded as UTF-8, ignoring undecodable bytes.

    Returns:
        dict mapping each word (str) to a 1-D numpy float array divided by
        sqrt(sum of squares + 1e-7).

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    wordVectors = {}
    # Text mode for both branches so that keys are always str (gzip's plain
    # 'r' mode yields bytes in Python 3).
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt', encoding='utf-8', errors='ignore') as fileObject:
        for line in fileObject:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            word = fields[0]
            vector = numpy.array([float(vecVal) for vecVal in fields[1:]],
                                 dtype=float)
            # Normalize the weight vector; the epsilon avoids dividing by
            # zero for an all-zero vector.
            vector /= math.sqrt((vector ** 2).sum() + 1e-7)
            wordVectors[word] = vector

    sys.stderr.write("Vectors read from: "+filename+" \n")
    return wordVectors

In [4]:
"""Read all the word vectors and normalize them"""
def read_word_vecs_non(filename):
    """Read word vectors from a text file WITHOUT normalizing them.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated float components. The values are stored as read.

    Args:
        filename: Path to the vector file. A name ending in '.gz' is read
            through gzip; anything else is read as plain text. Both are
            decoded as UTF-8, ignoring undecodable bytes.

    Returns:
        dict mapping each word (str) to a 1-D numpy float array holding the
        raw components.

    Raises:
        ValueError: if a component cannot be parsed as a float.
    """
    wordVectors = {}
    # Text mode for both branches so that keys are always str (gzip's plain
    # 'r' mode yields bytes in Python 3).
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt', encoding='utf-8', errors='ignore') as fileObject:
        for line in fileObject:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            wordVectors[fields[0]] = numpy.array(
                [float(vecVal) for vecVal in fields[1:]], dtype=float)

    sys.stderr.write("Vectors read from: "+filename+" \n")
    return wordVectors

In [5]:
"""Write word vectors to file"""
def print_word_vecs(wordVectors, outFileName):
    """Write word vectors to a UTF-8 text file, one word per line.

    Each line is the word, a space, then every component formatted with
    '%.4f' and followed by a space.

    Args:
        wordVectors: Mapping from word (str) to an iterable of numbers.
        outFileName: Path of the file to create or overwrite.
    """
    sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
    # Explicit UTF-8 matches the encoding the readers in this file use, and
    # the context manager closes the file even if a write fails.
    with open(outFileName, 'w', encoding='utf-8') as outFile:
        for word, values in wordVectors.items():
            components = ''.join('%.4f ' % val for val in values)
            outFile.write(word + ' ' + components + '\n')

In [6]:
"""Read the PPDB.etc word relations as a dictionary"""
def read_lexicon(filename):
    """Read a word-relation lexicon (e.g. PPDB) into a dictionary.

    Each non-blank line holds a head word followed by its related words,
    separated by whitespace. Every token is lower-cased and passed through
    norm_word.

    Args:
        filename: Path to the lexicon file, read as UTF-8.

    Returns:
        dict mapping the normalized head word to the list of its normalized
        related words. If two lines normalize to the same head word, the
        later line replaces the earlier one.
    """
    lexicon = {}
    # UTF-8 matches the encoding used for the vector files in this module;
    # the context manager guarantees the handle is closed.
    with open(filename, 'r', encoding='utf-8') as fileObject:
        for line in fileObject:
            words = line.lower().split()
            if not words:
                # Blank line: no head word to index.
                continue
            lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
    return lexicon

In [7]:
def similarity(v1, v2):
    """Return the cosine similarity of v1 and v2.

    Computed as the dot product divided by the product of the two
    Euclidean norms.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    inner = np.dot(v1, v2)
    return inner / norm_product

## ・パス指定

In [8]:
# Original distributed representations (word embeddings)
input_arg = './sample/sample_vecs.txt'
# newvec: the vectors produced by retrofitting
output_arg = './sample/newvec.txt'
output_arg_1 = './sample/newvec_non.txt'

## ・初期vecとnewvecとwordsim辞書のread

In [9]:
# Alias for the retrofitted-vector path.
# NOTE(review): not referenced again in the visible cells — confirm it is still needed.
outFileName = output_arg

In [10]:
# Load the original embeddings; read_word_vecs length-normalizes every vector.
wordVecs = read_word_vecs(input_arg)

Vectors read from: ./sample/sample_vecs.txt 


In [11]:
# Load the retrofitted embeddings, length-normalized the same way.
new_vec = read_word_vecs(output_arg)

Vectors read from: ./sample/newvec.txt 


In [12]:
# Load the second retrofitted file with its raw values (no normalization).
new_vec_non = read_word_vecs_non(output_arg_1)

Vectors read from: ./sample/newvec_non.txt 


# -----------各次元の分散が大きい単語top5-----------

In [13]:
from statistics import variance

In [14]:
def Qualitative_Evaluation(vectors):
    """Print the highest-variance dimensions and the words that score highest on them.

    Collects every word's value per dimension, computes the sample variance
    of each dimension, and prints:
      * the indices of the 5 dimensions with the largest variance,
      * the raw value lists of the highest- and lowest-variance dimension,
      * for each of those 5 dimensions, the words holding the 10 largest
        values, listed in the iteration order of ``vectors`` (not ranked).

    Args:
        vectors: Mapping from word to a sequence of numbers; all sequences
            are expected to have the same length.

    Returns:
        None. All results are printed.
    """
    # Store the values of every word, grouped by dimension index.
    value_per_dimention = {}
    for vec in vectors.values():
        for j, value in enumerate(vec):
            value_per_dimention.setdefault(j, []).append(value)

    # Sample variance of each dimension, indexed by dimension.
    var_per_dimention = [variance(value_per_dimention[j])
                         for j in range(len(value_per_dimention))]

    # The 5 dimensions with the largest / smallest variance.
    top5_var = np.argsort(-np.array(var_per_dimention))[:5]
    worst5_var = np.argsort(np.array(var_per_dimention))[:5]
    print('分散が大きいtop5の次元 : {}'.format(top5_var))
    print(" ")
    print(value_per_dimention[top5_var[0]])
    print(" ")
    print(value_per_dimention[worst5_var[0]])

    print(" ")
    # For each top-variance dimension, list the words with the largest values.
    # NOTE(review): the printed label says "top5" but 10 words are selected —
    # kept as in the original; confirm which is intended.
    for var in top5_var:
        top_positions = set(
            np.argsort(-np.array(value_per_dimention[var]))[:10].tolist())
        print("{}次元のtop5の単語".format(var))
        # Positions in the value list line up with the dict's iteration order.
        for position, key in enumerate(vectors):
            if position in top_positions:
                print(key)
        print(" ")

In [15]:
# Variance report for the original embeddings.
Qualitative_Evaluation(wordVecs)

分散が大きいtop5の次元 : [ 84 110  14  22  42]
 
[-0.041188563617448735, 0.0024033520297079276, -0.1050169586560986, 0.03710340335907547, 0.06733420776607296, -0.0631791105369332, 0.07828356080260786, -0.027740007510587954, 0.027922274818868702, 0.0025967566319395822, -0.07956504089898361, 0.01700037638041317, -0.10660092665852255, 0.03441858152022766, -0.0317711416556715, 0.05503710963572882, 0.04654471007945353, -0.07684494565332704, -0.02694488862135539, -0.03327638050329605, -0.01396007561487291, 0.013479415001757644, 0.042467467486799144, 0.024545383366806617, -0.07650318403674446, -0.03380191043464119, -0.024750055353817357, -0.06679460477238902, -0.08819207684311735, -0.05982059418074669, -0.05250198231592341, 0.00855522842868417, -0.028290623332081304, -0.009189475078995403, -0.030302306209432084, -0.1695535433702774, 0.07318426223607034, -0.005460272004753298, -0.04197614020255475, -0.09852796321943769, -0.08141770174604317, -0.057538037111713514, -0.04744812404393036, -0.1157429084241

In [16]:
# Variance report for the retrofitted (normalized) embeddings.
Qualitative_Evaluation(new_vec)

分散が大きいtop5の次元 : [  69 2145 1157 1989 1844]
 
[-0.027368007183783717, -0.028325058991896787, -0.029751978724367394, -0.03197102243404834, -0.033594829846398784, -0.03688731255742202, -0.0390599486444076, -0.04157409390535357, -0.04963555711138379, 0.02423755070107002, 0.020068642347818924, 0.01929271107714021, 0.018926809445226435, 0.01873805815746836, 0.018658086097636466, 0.018577724530984684, 0.018536339600633212, 0.018508855896844242, 0.018511799802759243, 0.018505902246792615, 0.01851398226366217, 0.018543683949226432, 0.01863297365998667, -0.018695656010012565, 0.018824742314480943, -0.01858474992154049, 0.018736073924059845, 0.019472434032926013, 0.017967287797045584, 0.019949179588974512, 0.01874826761195103, -0.01863880453445565, 0.018660566898797606, -0.01867629350290394, 0.018670547487414844, -0.01897132967642473, -0.019330411815342396, 0.018810022114927178, 0.019114670882843594, -0.018790432746739597, -0.01890688785380977, 0.0189221420284534, 0.01875110370423547, -0.01885676

In [17]:
# Variance report for the retrofitted embeddings loaded without normalization.
Qualitative_Evaluation(new_vec_non)

分散が大きいtop5の次元 : [ 369 1247 1241 2055 2059]
 
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
 
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0

In [None]:
# Hard-coded dimension indices; they match the top-5 list printed above for
# the original embeddings (wordVecs).
word2vec_top5_dimention = [ 84, 110, 14, 22, 42]

In [None]:
# Store the values of every word in new_vec, grouped by dimension index.
value_per_dimention = {}
for vec in new_vec.values():
    for j, value in enumerate(vec):
        value_per_dimention.setdefault(j, []).append(value)

# For each dimension listed in word2vec_top5_dimention, print the words that
# hold the largest values in new_vec (in dict order, not ranked).
# NOTE(review): the label says "top5" but 10 words are selected — confirm.
for var in word2vec_top5_dimention:
    top5_id = set(np.argsort(-np.array(value_per_dimention[var]))[:10].tolist())
    print("{}次元のtop5の単語".format(var))
    # Positions in the value list line up with the dict's iteration order.
    for cnt, key in enumerate(new_vec):
        if cnt in top5_id:
            print(key)
    print(" ")

# -----------値の分散が最大な次元top5-----------

# -----------ある単語の類似する単語を挙げる-----------

In [None]:
def checkSim_by_word(vecs, word):
    """Print up to 20 words from ``vecs`` that are similar to ``word``.

    Words are scanned in the iteration order of ``vecs``. A word becomes a
    candidate when its cosine similarity to ``word`` reaches the current
    border (initially 0.5); every time the candidate count hits a multiple
    of 5 the border is tightened by 0.05. Scanning stops once 20 candidates
    have been collected, so the result is "first matches found", not a
    global top-k. Candidates are printed as "word, similarity", most
    similar first.

    Args:
        vecs: Mapping from word to a numpy vector.
        word: The query word; must be a key of ``vecs``.

    Raises:
        Exception: if ``word`` is empty or is not a key of ``vecs``.
    """
    # Validate the query word before doing any work.
    if not word:
        raise Exception("word is missing")
    # The word is not in the model.
    if word not in vecs:
        raise Exception("Sorry, this word is not registered in model.")

    # Threshold settings.
    negative = False  # False: propose similar words; True: dissimilar ones
    threshold = 0.5  # a non-positive value (e.g. -1) selects the fixed borders below
    border_positive = threshold if threshold > 0 else 0.8
    border_negative = threshold if threshold > 0 else 0.3

    # Candidate-count settings.
    max_candidates = 20
    candidates = {}

    w_vec = vecs[word]
    for w in vecs:
        other = vecs[w]
        # Skip entries whose vector cannot be compared with the query vector.
        if getattr(other, 'shape', None) != w_vec.shape:
            print(w + " is not valid word.")
            continue
        try:
            s = similarity(w_vec, other)
        except (TypeError, ValueError):
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05

        # Stop at exactly max_candidates (the original '>' collected one extra).
        if len(candidates) >= max_candidates:
            break

    # Print the candidates, best match first.
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [None]:
# Query word for the similar-word listing below.
word = 'on'

In [None]:
# 初期vecの場合，
# wordVecs / new_vec are used directly here: the vecs_wordVecs / vecs_new_vec
# aliases are only assigned in later cells, so referring to them at this
# point fails when the cells run top to bottom.
checkSim_by_word(wordVecs, word)
print(' ')
# Same query against the retrofitted vectors (newvec).
checkSim_by_word(new_vec, word)

# -----------WordSim評価-----------

In [None]:
# word2vec: alias of the original embeddings loaded into wordVecs
vecs_wordVecs = wordVecs

In [None]:
# newvec: alias of the retrofitted embeddings loaded into new_vec
vecs_new_vec = new_vec

In [None]:
def _report_neighbours(label, word):
    """Print neighbour statistics for one word and return its neighbour set.

    The neighbours are the lexicon entries of ``word`` that also have a
    vector in ``wordVecs``. For each neighbour the similarity to ``word`` is
    printed before and after retrofitting, followed by the similarity
    between ``word`` and the mean neighbour vector.
    """
    if word not in lexicon:  # the word is not a key of the lexicon (wordnet)
        print("{}(={})はwordnetのkeyに存在しません".format(label, word))
        neighbours = set()
    else:
        # Lexicon synonyms of the word that also have a word2vec vector.
        neighbours = {n for n in lexicon[word] if n in wordVecs}
        print('{}(={})における更新対象のneighbour数 : {}'.format(
            label, word, len(neighbours)))

    # Accumulate the neighbour vectors to form their mean below.
    sum_word2vec = np.zeros_like(wordVecs[word])
    sum_retrofit = np.zeros_like(wordVecs[word])
    for neighbour in neighbours:
        print('{}とneighbour(={}) : {} -> {}'.format(
            label, neighbour,
            similarity(wordVecs[neighbour], wordVecs[word]),
            similarity(vecs_new_vec[neighbour], vecs_new_vec[word])))
        sum_word2vec += wordVecs[neighbour]
        sum_retrofit += vecs_new_vec[neighbour]

    if neighbours:
        count = len(neighbours)
        mean_word2vec = similarity(sum_word2vec / count, wordVecs[word])
        mean_retrofit = similarity(sum_retrofit / count, vecs_new_vec[word])
    else:
        # No neighbours: the mean is undefined. Report nan directly instead
        # of dividing by zero (the printed text is the same "nan -> nan").
        mean_word2vec = mean_retrofit = float('nan')
    print('{}とneighboursの平均ベクトル : {} -> {}'.format(
        label, mean_word2vec, mean_retrofit))
    print(" ")
    return neighbours


def checkSim(v1, v2):
    """Print a before/after-retrofitting similarity report for two words.

    Reads the module-level ``lexicon``, ``wordVecs``, ``vecs_wordVecs`` and
    ``vecs_new_vec``. The report contains, in order: neighbour statistics
    for v1 and for v2, the overlap of their neighbour sets, the v1-v2
    similarity under word2vec and under retrofitting, and each word's
    similarity between its word2vec and retrofitted vectors.

    Args:
        v1: First word.
        v2: Second word.

    Raises:
        KeyError: if v1 or v2 has no vector in ``wordVecs``, or a neighbour
            has no vector in ``vecs_new_vec``.
    """
    wordNeighbours_1 = _report_neighbours('v1', v1)
    wordNeighbours_2 = _report_neighbours('v2', v2)

    # Do the update targets (neighbours) of v1 and v2 overlap?
    overlap = wordNeighbours_1 & wordNeighbours_2
    print('更新対象における重複 : {}'.format(overlap))
    print('更新対象における重複数 : {}'.format(len(overlap)))
    print(" ")

    # Similarity between v1 and v2 under word2vec and under retrofitting.
    try:
        print("word2vec : {}".format(similarity(vecs_wordVecs[v1], vecs_wordVecs[v2])))
    except (KeyError, ValueError):
        print('error')

    try:
        print("retrofitting : {}".format(similarity(vecs_new_vec[v1], vecs_new_vec[v2])))
    except (KeyError, ValueError):
        print('error')

    print(" ")
    # Similarity between the word2vec and retrofitted vector of the same word.
    try:
        print("v1 : {}".format(similarity(vecs_wordVecs[v1], vecs_new_vec[v1])))
    except (KeyError, ValueError):
        print('error')

    try:
        print("v2 : {}".format(similarity(vecs_wordVecs[v2], vecs_new_vec[v2])))
    except (KeyError, ValueError):
        print('error')

## ・WordSimのテスト

In [None]:
# Word pair for the WordSim check (Japanese for "football" and "soccer").
v1 = 'フットボール'
v2 = 'サッカー'
# checkSim(v1, v2)

# -----------wordAnalogy評価-----------

In [None]:
def checkAnalogy(vecs, w_vec):
    """Print up to 20 words from ``vecs`` whose vectors are similar to ``w_vec``.

    First prints the two starting borders. Words are then scanned in the
    iteration order of ``vecs``. A word becomes a candidate when its cosine
    similarity to ``w_vec`` reaches the current border (initially 0.3);
    every time the candidate count hits a multiple of 5 the border is
    tightened by 0.05. Scanning stops once 20 candidates have been
    collected, so the result is "first matches found", not a global top-k.
    Candidates are printed as "word, similarity", most similar first.

    Args:
        vecs: Mapping from word to a numpy vector.
        w_vec: Query vector (e.g. the result of an analogy computation).
    """
    negative = False  # False: propose similar words; True: dissimilar ones
    threshold = 0.3  # a non-positive value (e.g. -1) selects the fixed borders below

    # Threshold settings.
    border_positive = threshold if threshold > 0 else 0.9
    border_negative = threshold if threshold > 0 else 0.2
    print('{} < thd < {}'.format(border_negative, border_positive))

    # Candidate-count settings.
    max_candidates = 20
    candidates = {}

    for w in vecs:
        other = vecs[w]
        # Skip entries whose vector cannot be compared with the query vector.
        if getattr(other, 'shape', None) != w_vec.shape:
            print(w + " is not valid word.")
            continue
        try:
            s = similarity(w_vec, other)
        except (TypeError, ValueError):
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05

        # Stop at exactly max_candidates (the original '>' collected one extra).
        if len(candidates) >= max_candidates:
            break

    # Print the candidates, best match first.
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

## ・「v1 + v2 - v3」と「v4」の類似度算出

In [None]:
# Words for the analogy check: "v1 + v2 - v3" is compared against v4.
# (Japanese: older brother, older sister, grandfather, grandmother.)
v1 = '兄'
v2 = '姉'
v3 = '祖父'
v4 = '祖母'

In [None]:
# Report which of the four analogy words are missing from the lexicon.
# NOTE(review): `lexicon` is not assigned in the visible cells — presumably
# it comes from read_lexicon(...); confirm it is loaded before this runs.
for _label, _analogy_word in (('v1', v1), ('v2', v2), ('v3', v3), ('v4', v4)):
    if _analogy_word not in lexicon:
        print("{} not found error in dict".format(_label))

# Similarity between "v1 + v2 - v3" and v4 with the original vectors.
# Only the expected failures are caught: a word without a vector (KeyError)
# or vectors of different lengths (ValueError).
try:
    print('word2vec : {}'.format(similarity(vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3], vecs_wordVecs[v4])))
except (KeyError, ValueError):
    print('error')

# Same comparison with the retrofitted vectors.
try:
    print('retrofitting : {}'.format(similarity(vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3], vecs_new_vec[v4])))
except (KeyError, ValueError):
    print('error')

## ・「v1 + v2 - v3」と近い単語を挙げる→「v4」が結果に出るか

In [None]:
# With the initial (original) vectors: list words close to "v1 + v2 - v3".
checkAnalogy(vecs_wordVecs, vecs_wordVecs[v1] + vecs_wordVecs[v2] - vecs_wordVecs[v3])
print(' ')
# With the retrofitted vectors (newvec).
checkAnalogy(vecs_new_vec, vecs_new_vec[v1] + vecs_new_vec[v2] - vecs_new_vec[v3])