In [1]:
# -*- coding: utf-8 -*-
import argparse
import gzip
import math
import numpy
import re
import sys
import numpy as np
from copy import deepcopy
import codecs
from statistics import mean, median,variance,stdev
import pandas as pd
import pandas_profiling as pdp
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.externals import joblib

# ベクトルの読み込み

In [2]:
## file generator
def file_generator(filename):
        cnt = 0
        with open(filename, encoding="utf-8", errors="ignore") as infile:
            for line in infile:
                cnt += 1
                if cnt == 1: # 1行目をskip
                    pass
                else:
                    yield line

In [3]:
"""vectorsのread + normalize"""
def ReadVecsFromFile(filename):
    sys.stderr.write("Vectors read from: " + filename + " \n")
    print("wordVecsのread中")
    wordVectors = {}
    # 以下，yierdによる処理
    gen = file_generator(filename)
    for line in gen:
        line = line.strip()
        word = line.split()[0]
        wordVectors[word] = np.zeros(
            len(line.split()) - 1, dtype=np.float16
        )  # (L,)
        for index, vecVal in enumerate(line.split()[1:]):
            wordVectors[word][index] = float(vecVal)
        """normalize"""
        # wordVectors[word] /= [math.sqrt((wordVectors[word]**2).sum() + 1e-6)]
        wordVectors[word] = np.array(
            [wordVectors[word]], dtype=np.float16
        )  # (1, L)

        """サイズが異なる場合のエラー処理"""
        if wordVectors[word].shape[1] != 300:
            print("word: {}, shape: {}".format(word, wordVectors[word].shape))
            del wordVectors[word]
    print("wordVecsのread完了")
    return wordVectors

In [4]:
## word2vecの読み込み
w2v = ReadVecsFromFile("/Users/1-10robotics/Desktop/Sparse_Overcomplete/vectors.model")

Vectors read from: /Users/1-10robotics/Desktop/Sparse_Overcomplete/vectors.model 


wordVecsのread中
word: ゲオルク・カントール, shape: (1, 33)
wordVecsのread完了


In [5]:
## Sparse Over Completeしたベクトルの読み込み
newvec = joblib.load("./model/factor10_iter42/newvec_42.pkl")

# 類似度計算

In [6]:
## 類似度計算
def similarity(v1, v2):
    n1 = np.linalg.norm(v1) # v1のノルム
    n2 = np.linalg.norm(v2) # v2のノルム
    return np.dot(v1[0], v2[0]) / (n1*n2)

# 検証

In [7]:
# newvecの単語のみを抜き出し，リスト化
newvec_word = list(newvec.keys())

In [8]:
# wordとidを対応づけるDataFrame
f_word = pd.DataFrame(newvec_word, columns=['word'])

In [9]:
# newvecのベクトルのみを抜き出し，リスト化
newvec_dimention = list(newvec.values())

In [10]:
# 次元毎にdictを作成
# key: 次元, value: 単語毎のベクトル値
var_per_dimention = {}
for v in tqdm(range(len(newvec_dimention))):
    for j in range(len(newvec_dimention[v][0])):
        try:
            var_per_dimention[j].append(newvec_dimention[v][0][j])
        except:
            var_per_dimention[j] = [newvec_dimention[v][0][j]]

100%|██████████| 192953/192953 [08:38<00:00, 372.31it/s]


## ・次元のスパース性を求める

In [None]:
x = 234 # 入力次元
vocab_num = len(var_per_dimention[x])
sparse_cnt = var_per_dimention[x].count(0)
sparse_rate = sparse_cnt/vocab_num

non_sparse_cnt = len([i for i in var_per_dimention[x] if i > 0])
non_sparse_rate = non_sparse_cnt/vocab_num

print("vocab_num: {}".format(vocab_num))
print("non_sparse_cnt: {}".format(sparse_cnt))
print("non_sparse_rate: {}".format(sparse_rate))
print("sparse_cnt: {}".format(non_sparse_cnt))
print("sparse_rate: {}".format(non_sparse_rate))

## ・スパース性の高い次元を求める

In [13]:
sparse_list = {}
for v in tqdm(range(len(var_per_dimention))):
    try:
        sparse_list[v].append(len([i for i in var_per_dimention[v] if i > 0]))
    except:
        sparse_list[v] =len([i for i in var_per_dimention[v] if i > 0])

100%|██████████| 3000/3000 [31:50<00:00,  1.81it/s]  


In [16]:
df_sparse_list = pd.DataFrame(list(sparse_list.items()), columns=['ID', 'sparse_cnt'])

In [17]:
## 度数分布
pdp.ProfileReport(df_sparse_list)

0,1
Number of variables,2
Number of observations,3000
Total Missing (%),0.0%
Total size in memory,47.0 KiB
Average record size in memory,16.0 B

0,1
Numeric,2
Categorical,0
Boolean,0
Date,0
Text (Unique),0
Rejected,0
Unsupported,0

0,1
Distinct count,3000
Unique (%),100.0%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,1499.5
Minimum,0
Maximum,2999
Zeros (%),0.0%

0,1
Minimum,0.0
5-th percentile,149.95
Q1,749.75
Median,1499.5
Q3,2249.2
95-th percentile,2849.0
Maximum,2999.0
Range,2999.0
Interquartile range,1499.5

0,1
Standard deviation,866.17
Coef of variation,0.57764
Kurtosis,-1.2
Mean,1499.5
MAD,750
Skewness,0
Sum,4498500
Variance,750250
Memory size,23.5 KiB

Value,Count,Frequency (%),Unnamed: 3
2047,1,0.0%,
1040,1,0.0%,
1058,1,0.0%,
1056,1,0.0%,
1054,1,0.0%,
1052,1,0.0%,
1050,1,0.0%,
1048,1,0.0%,
1046,1,0.0%,
1044,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
0,1,0.0%,
1,1,0.0%,
2,1,0.0%,
3,1,0.0%,
4,1,0.0%,

Value,Count,Frequency (%),Unnamed: 3
2995,1,0.0%,
2996,1,0.0%,
2997,1,0.0%,
2998,1,0.0%,
2999,1,0.0%,

0,1
Distinct count,948
Unique (%),31.6%
Missing (%),0.0%
Missing (n),0
Infinite (%),0.0%
Infinite (n),0

0,1
Mean,346.07
Minimum,0
Maximum,5358
Zeros (%),0.5%

0,1
Minimum,0.0
5-th percentile,9.95
Q1,49.0
Median,153.0
Q3,418.0
95-th percentile,1412.0
Maximum,5358.0
Range,5358.0
Interquartile range,369.0

0,1
Standard deviation,517.59
Coef of variation,1.4956
Kurtosis,13.779
Mean,346.07
MAD,331.75
Skewness,3.2005
Sum,1038198
Variance,267900
Memory size,23.5 KiB

Value,Count,Frequency (%),Unnamed: 3
21,24,0.8%,
16,22,0.7%,
27,22,0.7%,
45,22,0.7%,
15,21,0.7%,
11,20,0.7%,
26,20,0.7%,
36,20,0.7%,
3,20,0.7%,
2,19,0.6%,

Value,Count,Frequency (%),Unnamed: 3
0,16,0.5%,
1,14,0.5%,
2,19,0.6%,
3,20,0.7%,
4,16,0.5%,

Value,Count,Frequency (%),Unnamed: 3
3544,1,0.0%,
3665,1,0.0%,
3682,1,0.0%,
4577,1,0.0%,
5358,1,0.0%,

Unnamed: 0,ID,sparse_cnt
0,0,65
1,1,160
2,2,816
3,3,75
4,4,468


In [18]:
## 少ない順（よりスパース）
sorted(sparse_list.items(), key=lambda x: x[1])

[(60, 0),
 (160, 0),
 (166, 0),
 (192, 0),
 (321, 0),
 (584, 0),
 (589, 0),
 (681, 0),
 (811, 0),
 (862, 0),
 (1626, 0),
 (1869, 0),
 (2038, 0),
 (2097, 0),
 (2390, 0),
 (2480, 0),
 (70, 1),
 (75, 1),
 (178, 1),
 (710, 1),
 (929, 1),
 (1649, 1),
 (1700, 1),
 (1862, 1),
 (1987, 1),
 (2071, 1),
 (2216, 1),
 (2668, 1),
 (2830, 1),
 (2984, 1),
 (26, 2),
 (95, 2),
 (282, 2),
 (301, 2),
 (331, 2),
 (719, 2),
 (730, 2),
 (1545, 2),
 (1616, 2),
 (1794, 2),
 (1997, 2),
 (2056, 2),
 (2061, 2),
 (2174, 2),
 (2285, 2),
 (2412, 2),
 (2920, 2),
 (2935, 2),
 (2940, 2),
 (69, 3),
 (99, 3),
 (142, 3),
 (389, 3),
 (441, 3),
 (533, 3),
 (538, 3),
 (636, 3),
 (641, 3),
 (1016, 3),
 (1122, 3),
 (1211, 3),
 (1332, 3),
 (1439, 3),
 (1473, 3),
 (1590, 3),
 (1762, 3),
 (1826, 3),
 (1874, 3),
 (2108, 3),
 (234, 4),
 (502, 4),
 (639, 4),
 (732, 4),
 (763, 4),
 (800, 4),
 (954, 4),
 (1102, 4),
 (1150, 4),
 (1159, 4),
 (1636, 4),
 (2335, 4),
 (2524, 4),
 (2590, 4),
 (2682, 4),
 (2898, 4),
 (58, 5),
 (230, 5),
 (87

 (70, 1),
 (75, 1),
 (178, 1),
 (710, 1),
 (929, 1),
 (1649, 1),
 (1700, 1),
 (1862, 1),
 (1987, 1),
 (2071, 1),
 (2216, 1),
 (2668, 1),
 (2830, 1),
 (2984, 1),

In [19]:
## 多い順（非スパース）
sorted(sparse_list.items(), key=lambda x: x[1], reverse=True)

[(1526, 5358),
 (197, 4577),
 (521, 3682),
 (859, 3665),
 (1672, 3544),
 (1921, 3517),
 (1068, 3516),
 (900, 3433),
 (1604, 3358),
 (1396, 3350),
 (884, 3315),
 (86, 3287),
 (1749, 3280),
 (624, 3217),
 (214, 3193),
 (870, 3167),
 (1132, 3117),
 (2228, 3007),
 (1938, 2969),
 (2471, 2922),
 (1409, 2875),
 (1903, 2844),
 (2075, 2758),
 (873, 2756),
 (2740, 2709),
 (1648, 2700),
 (1733, 2683),
 (16, 2661),
 (1447, 2633),
 (2245, 2632),
 (251, 2628),
 (1550, 2618),
 (94, 2599),
 (2377, 2598),
 (1650, 2574),
 (62, 2557),
 (2520, 2529),
 (842, 2498),
 (1965, 2472),
 (892, 2412),
 (757, 2386),
 (477, 2354),
 (431, 2344),
 (2131, 2336),
 (2366, 2330),
 (1535, 2324),
 (1928, 2321),
 (525, 2316),
 (412, 2276),
 (337, 2245),
 (2681, 2214),
 (2230, 2212),
 (1218, 2193),
 (2542, 2192),
 (2414, 2168),
 (422, 2151),
 (1847, 2146),
 (2543, 2143),
 (2560, 2108),
 (2259, 2084),
 (324, 2043),
 (59, 2034),
 (1357, 2029),
 (2264, 2026),
 (1750, 2022),
 (1560, 2016),
 (886, 2012),
 (1722, 2012),
 (2342, 200

## [重い処理] ・0占有率（non-sparse rate）を比率毎にリスト化

In [11]:
# ## 次元ごとに0占有率に関するリストを作成する
# memo_80 = []
# memo_80_85 = []
# memo_85_90 = []
# memo_90_95 = []
# memo_95 = []
# memo_96 = []
# memo_97 = []
# memo_98 = []
# memo_99 = []
# memo_other = []
# # i: 次元
# for i in tqdm(range(len(list(var_per_dimention.values())))):
#     # 100次元までをそれぞれ見る
#     x = list(var_per_dimention.values())[i]
#     if (x.count(0) / len(x)) * 100 < 80:
#         memo_80.append(i)
#     elif (x.count(0) / len(x)) * 100 < 85:
#         memo_80_85.append(i)
#     elif (x.count(0) / len(x)) * 100 < 90:
#         memo_85_90.append(i)
#     elif (x.count(0) / len(x)) * 100 < 95:
#         memo_90_95.append(i)
#     elif (x.count(0) / len(x)) * 100 < 96:
#         memo_95.append(i)
#     elif (x.count(0) / len(x)) * 100 < 97:
#         memo_96.append(i)
#     elif (x.count(0) / len(x)) * 100 < 98:
#         memo_97.append(i)
#     elif (x.count(0) / len(x)) * 100 < 99:
#         memo_98.append(i)
#     elif (x.count(0) / len(x)) * 100 < 100:
#         memo_99.append(i)
#     else:
#         memo_other.append(i)

In [12]:
# print("~80%: {}".format(len(memo_80)))
# print("80~85%: {}".format(len(memo_80_85)))
# print("85~90%: {}".format(len(memo_85_90)))
# print("90~95%: {}".format(len(memo_90_95)))
# print("95%: {}".format(len(memo_95)))
# print("96%: {}".format(len(memo_96)))
# print("97%: {}".format(len(memo_97)))
# print("98%: {}".format(len(memo_98)))
# print("99: {}".format(len(memo_99)))
# print("other: {}".format(len(memo_other)))

## ・平均，分散，類似度など

In [58]:
d = 234 # 0から数えて
# d = memo_99[memo_99.index(537)]  # memoの辞書を作成している場合

x = list(var_per_dimention.values())[d]
# 平均を求める
mean_x = np.average(x)
print("d: {}, mean: {}\n".format(d, mean_x))

# 求めた平均から各単語の分散を求める
var_x = [math.sqrt((mean_x - x[i]) ** 2) for i in range(len(x))]

# 分散の大きさ順にソートし，indexを返す
index_x_sorted = sorted(
    range(len(var_x)), key=lambda k: var_x[k], reverse=True
)

# indexの上位5個
target = index_x_sorted[:20]

# indexの上位5個の単語を返す
# result = "index\tword\tVar(sort)\tnew_CosSimilarity(vmax.vs)\tw2v_CosSimilarity(vmax.vs)\n"
result = "index\tword\tVar(sort)\tw2v_CosSimilarity(vmax.vs)\n"
result1 = "index\tword\tnew_CosSimilarity(vmax.vs)\tindex_from\n"
for i in target:
    best_word = newvec_word[target[0]]
    result += "{}\t{}\t{}\t".format(i, newvec_word[i], var_x[i])
    result += "{}\n".format(similarity(w2v[best_word], w2v[newvec_word[i]]))
    result1 += "{}\t{}\t".format(i, newvec_word[i])
    result1 += "{}\t{}\n".format(similarity(newvec[best_word], newvec[newvec_word[i]]), np.where(newvec[newvec_word[i]]!=0)[1])
# print("d: {}, target_word: {}\n".format(d, f_word.iloc[target]))
print(result)

d: 234, mean: 9.97781753540039e-05

index	word	Var(sort)	w2v_CosSimilarity(vmax.vs)
131745	入構	5.06640625	1.0
71482	義肢	4.89453125	0.06298828125
57602	英也	4.6953125	0.114501953125
88663	柳本	4.59375	0.1221923828125
0	</s>	9.97781753540039e-05	-0.1190185546875
1	の	9.97781753540039e-05	0.11932373046875
2	、	9.97781753540039e-05	0.1099853515625
3	に	9.97781753540039e-05	0.11181640625
4	は	9.97781753540039e-05	0.1009521484375
5	する	9.97781753540039e-05	0.1383056640625
6	を	9.97781753540039e-05	0.1275634765625
7	た	9.97781753540039e-05	0.107421875
8	が	9.97781753540039e-05	0.14208984375
9	て	9.97781753540039e-05	0.1446533203125
10	と	9.97781753540039e-05	0.076171875
11	で	9.97781753540039e-05	0.06439208984375
12	だ	9.97781753540039e-05	0.1270751953125
13	れる	9.97781753540039e-05	0.1055908203125
14	ある	9.97781753540039e-05	0.18359375
15	いる	9.97781753540039e-05	0.1815185546875





In [59]:
print(result1)

index	word	new_CosSimilarity(vmax.vs)	index_from
131745	入構	1.0	[   4   24   54   62   81   84   94  100  114  122  139  171  184  194
  197  234  248  249  251  252  270  300  308  360  362  377  411  412
  415  422  445  454  465  477  492  499  515  550  597  606  609  617
  647  666  692  703  707  722  733  750  779  781  826  833  864  877
  888  889  893  896  900  912  917  918  938  966 1030 1037 1039 1040
 1045 1086 1131 1153 1163 1169 1196 1199 1204 1261 1304 1320 1337 1396
 1407 1428 1485 1486 1516 1517 1519 1520 1525 1558 1573 1574 1578 1587
 1588 1595 1600 1615 1682 1688 1689 1706 1720 1761 1778 1798 1822 1823
 1835 1845 1846 1856 1857 1864 1883 1886 1903 1907 1923 1976 1992 2029
 2033 2062 2068 2080 2103 2121 2128 2129 2130 2132 2157 2191 2196 2199
 2209 2223 2248 2250 2251 2278 2329 2338 2342 2377 2388 2394 2406 2417
 2421 2445 2458 2479 2484 2530 2542 2565 2598 2600 2693 2697 2703 2732
 2738 2740 2773 2789 2817 2827 2844 2878 2884 2886 2931 2942 2944 2956
 2975 2997]
71

## ・類似単語を見つける

In [6]:
def checkSim_by_word(vecs, word):
    # 閾値の設定
    negative = False # Falseなら似た単語を候補で上げる
    threshold = 0.5 # -1なら閾値固定
    border_positive = threshold if threshold > 0 else 0.8
    border_negative = threshold if threshold > 0 else 0.3

    # 候補数の設定
    max_candidates = 20
    candidates = {}
    
    # wordの設定確認
    if not word:
        raise Exception("word is missing")

    # wordがモデルにない場合，
    if word not in vecs:
        raise Exception("Sorry, this word is not registered in model.")

    # ベクトルの設定
    w_vec = vecs[word]
    for w in vecs:
        try:
            if w_vec.shape != vecs[w].shape:
                raise Exception("size not match")
            s = similarity(w_vec, vecs[w])
        except Exception as ex:
            print(w + " is not valid word.")
            continue

        if negative and s <= border_negative:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_negative -= 0.05
        elif not negative and s >= border_positive:
            candidates[w] = s
            if len(candidates) % 5 == 0:
                border_positive += 0.05
    
    # 類義語算出
    sorted_candidates = sorted(candidates, key=candidates.get, reverse=not negative)
    for c in sorted_candidates:
        print("{0}, {1}".format(c, candidates[c]))

In [18]:
word = "外国語学研究科"
# w2vの場合，
checkSim_by_word(w2v, word)

外国語学研究科, 1.0
大阪大学大学院文学研究科, 0.708984375
単位取得満期退学, 0.6884765625
文学研究科, 0.677734375
満期退学, 0.67529296875
単位取得退学, 0.66845703125
社会学研究科, 0.66748046875
一橋大学大学院法学研究科, 0.6669921875
京都大学大学院法学研究科, 0.6513671875
大学院, 0.6484375
修士課程, 0.63330078125
経済学研究科, 0.61865234375
東京外国語大学, 0.611328125
研究科, 0.60888671875
文学部, 0.58544921875
助教授, 0.55810546875
専攻, 0.55517578125
京都大学, 0.5537109375
修了, 0.54541015625
東京大学, 0.54052734375
博士, 0.5234375


In [12]:
word = "死語"
# newvecの場合，
checkSim_by_word(newvec, word)



死語, 0.99951171875
なんちゃって, 0.78662109375
言い方, 0.6171875
口語, 0.6171875
語義, 0.6171875
古語, 0.6171875
熟語, 0.6171875
詞, 0.61669921875
Wikipedia, 0.61669921875
ニュアンス, 0.61669921875
サンスクリット, 0.61669921875
同音, 0.61669921875
強勢, 0.61669921875
語形, 0.61669921875
訛る, 0.61669921875
スラング, 0.6162109375
流行語, 0.57421875
