In [2]:
import numpy as np
from tqdm import tqdm
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

sparknlp.start()
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [3]:
documentAssembler = DocumentAssembler() \
.setInputCol("text") \
.setOutputCol("document")

sentence = SentenceDetector() \
.setInputCols(["document"]) \
.setOutputCol("sentence")

word_segmenter = WordSegmenterModel.pretrained("wordseg_gsd_ud", "ja") \
.setInputCols(["sentence"]) \
.setOutputCol("token")

lemmatizer = LemmatizerModel.pretrained("lemma", "ja") \
.setInputCols(["token"]) \
.setOutputCol("lemma")

embeddings = WordEmbeddingsModel.pretrained("japanese_cc_300d", "ja") \
.setInputCols("sentence", "token") \
.setOutputCol("embeddings")

pipeline = Pipeline().setStages([
documentAssembler,
sentence,
word_segmenter,
# lemmatizer,
embeddings
])

wordseg_gsd_ud download started this may take some time.
Approximate size to download 979 KB
[OK!]
lemma download started this may take some time.
Approximate size to download 3.4 MB
[OK!]
japanese_cc_300d download started this may take some time.
Approximate size to download 1.2 GB
[OK!]


In [108]:
import math
def sigmoid(x):
    if x == 1:
        return 100
    return 100 * np.tanh(x)
def score(a, b):
    a = np.array(a)
    b = np.array(b)
    x = a * b
    x = sum(x) / math.sqrt(sum(a*a)*sum(b*b))
    x = round(x, 6)
#     return x
    return sigmoid(x)

In [5]:
from enum import Enum
class KeyType(Enum):
    INVALID = 1
    HASHTAG = 2
    VALID = 0
def isValidKey(key): 
    if len(key) == 0:
        return KeyType.INVALID
    elif key[0] == '#':
        return KeyType.HASHTAG
    for char in key:
        if ord(char) <= 128:
            return KeyType.INVALID
    else:
        return KeyType.VALID

In [6]:
# read data into dict
word_list = []

with open('/Users/bigsad/Downloads/jawiki.all_vectors.300d.txt') as f:
    line = f.readline()
    nWords, vecSize = line.split(' ')
    nWords = int(nWords)
    vecSize = int(vecSize)

    for i in tqdm(range(nWords)):
        line = f.readline()
        line = line[:-1] # remove newline
        arr = line.split(' ')
        # first element is the japanese word and the rest are the vector values
        key = arr[0]
        
        if isValidKey(key) == KeyType.INVALID:
            pass
        elif isValidKey(key) == KeyType.HASHTAG:
            key = key[2:-2]
            word_list.append([key])
        else:
            word_list.append([key])

100%|██████████| 1511782/1511782 [00:38<00:00, 39689.57it/s]


In [124]:
from bs4 import BeautifulSoup
import requests as req

urls = [
        "https://japanesetest4you.com/jlpt-n1-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n2-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n3-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n4-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n5-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n1-grammar-list/",
        "https://japanesetest4you.com/jlpt-n2-grammar-list/",
        "https://japanesetest4you.com/jlpt-n3-grammar-list/",
        "https://japanesetest4you.com/jlpt-n4-grammar-list/",
        "https://japanesetest4you.com/jlpt-n5-grammar-list/",
       ]
target_words = set()
words = list()

for url in urls:
    content = req.get(url).text
    soup = BeautifulSoup(content, 'lxml')
    
    for div in soup.find_all('div', class_='entry clearfix'):
        for p in tqdm(div.find_all('p')):
            word = p.text.split(' ')[0]
            
            words.append([word])
            
data = spark.createDataFrame(words).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
result = result.select('embeddings').collect()

# print(result[0].embeddings)

for res in tqdm(result):
    if len(res.embeddings) == 1 and sum(res.embeddings[0].embeddings) != 0.0:
        target_words.add(res.embeddings[0].result)

100%|██████████| 442/442 [00:00<00:00, 248177.02it/s]
100%|██████████| 1621/1621 [00:00<00:00, 379004.78it/s]
100%|██████████| 1731/1731 [00:00<00:00, 333134.82it/s]
100%|██████████| 544/544 [00:00<00:00, 232969.31it/s]
100%|██████████| 562/562 [00:00<00:00, 271629.27it/s]
100%|██████████| 224/224 [00:00<00:00, 200538.76it/s]
100%|██████████| 209/209 [00:00<00:00, 118813.98it/s]
100%|██████████| 130/130 [00:00<00:00, 157180.61it/s]
100%|██████████| 115/115 [00:00<00:00, 155595.15it/s]
100%|██████████| 60/60 [00:00<00:00, 108333.29it/s]
100%|██████████| 5638/5638 [00:00<00:00, 118545.65it/s]


In [125]:
url = "https://kyoan.u-biq.org/tangosearch.html"
content = req.get(url)
content = req.get(url).text
content = content.encode('latin1')
soup = BeautifulSoup(content, 'lxml')

table = soup.find_all('table', class_='hyou')

words = list()

for t in soup.find_all('table', class_='hyou'):
    for td in tqdm(t.tbody.find_all('td')):
        if td.text and td.text[0] == '(':
            word = td.text.split(')')[1]
        else:
            word = td.text
        
        words.append([word])
            
data = spark.createDataFrame(words).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
result = result.select('embeddings').collect()

for res in tqdm(result):
    if len(res.embeddings) == 1 and sum(res.embeddings[0].embeddings) != 0.0:
        target_words.add(res.embeddings[0].result)

100%|██████████| 6014/6014 [00:00<00:00, 202001.59it/s]
100%|██████████| 6014/6014 [00:00<00:00, 148659.50it/s]


In [128]:
len(target_words)

4375

In [9]:
splitters = [ i for i in range(0, len(word_list), 100000) ]
splitters.append(len(word_list))
vector_list = list()

for i in range(len(splitters) - 1):
    data = spark.createDataFrame(word_list[splitters[i]:splitters[i+1]]).toDF("text")
    model = pipeline.fit(data)
    result = model.transform(data)
    result = result.select('embeddings').collect()

    for res in tqdm(result):
        if len(res.embeddings) > 0:
            vector_list.append(res.embeddings[0].embeddings)

100%|██████████| 100000/100000 [00:00<00:00, 243337.51it/s]
100%|██████████| 100000/100000 [00:00<00:00, 230071.84it/s]
100%|██████████| 100000/100000 [00:00<00:00, 223766.63it/s]
100%|██████████| 100000/100000 [00:00<00:00, 218204.78it/s]
100%|██████████| 100000/100000 [00:00<00:00, 218816.66it/s]
100%|██████████| 100000/100000 [00:00<00:00, 208477.16it/s]
100%|██████████| 100000/100000 [00:00<00:00, 214845.39it/s]
100%|██████████| 100000/100000 [00:00<00:00, 207893.84it/s]
100%|██████████| 100000/100000 [00:00<00:00, 214095.27it/s]
100%|██████████| 100000/100000 [00:00<00:00, 204397.39it/s]
100%|██████████| 100000/100000 [00:00<00:00, 194534.32it/s]
100%|██████████| 100000/100000 [00:00<00:00, 131301.12it/s]
100%|██████████| 35624/35624 [00:00<00:00, 214751.53it/s]


In [21]:
import hnswlib
def fit_hnsw_index(features, ef=100, M=16, save_index_file=False):
    # Convenience function to create HNSW graph
    # features : list of lists containing the embeddings
    # ef, M: parameters to tune the HNSW algorithm
    
    num_elements = len(features)
    labels_index = np.arange(num_elements)
    EMBEDDING_SIZE = len(features[0])
    
    # Declaring index
    # possible space options are l2, cosine or ip
    p = hnswlib.Index(space='cosine', dim=EMBEDDING_SIZE)
    
    # Initing index - the maximum number of elements should be known
    p.init_index(max_elements=num_elements, ef_construction=ef, M=M)
    
    # Element insertion
    int_labels = p.add_items(features, labels_index)
    
    # Controlling the recall by setting ef
    # ef should always be > k
    p.set_ef(ef) 
    
    # If you want to save the graph to a file
    if save_index_file:
         p.save_index(save_index_file)
    
    return p

In [47]:
from datetime import datetime
k = 1000
n = 1000000
ann_start = datetime.now()
p = fit_hnsw_index(vector_list[:n], ef=k*10)
ann_neighbor_indices, ann_distances = p.knn_query(vector_list[0], k)
ann_end = datetime.now()
print(f"ran in {(ann_end - ann_start).total_seconds()} seconds")

ran in 3923.7843 seconds


In [127]:
target_word_list = list()
target_vector_list = list()
for word in target_words:
    target_word_list.append([word])
    
data = spark.createDataFrame(target_word_list).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
result = result.select('embeddings').collect()

for res in tqdm(result):
    target_vector_list.append(res.embeddings[0].embeddings)

ann_neighbor_indices, ann_distances = p.knn_query(target_vector_list, k)



100%|██████████| 4375/4375 [00:00<00:00, 263510.49it/s]


TypeError: 'NoneType' object is not callable

In [126]:
for i in range(len(ann_neighbor_indices)):
    vec = target_vector_list[i]
    top25vec = vector_list[ann_neighbor_indices[i][25]]
    top100vec = vector_list[ann_neighbor_indices[i][100]]
    top500vec = vector_list[ann_neighbor_indices[i][500]]
    top1000vec = vector_list[ann_neighbor_indices[i][999]]
    print(f"{target_word_list[i]} \t\t {score(vec, top25vec):.2f} \t {score(vec, top100vec):.2f} \t {score(vec, top500vec):.2f} \t {score(vec, top1000vec):.2f}")

['新品'] 		 43.04 	 39.38 	 33.96 	 31.98
['乗り継ぐ'] 		 50.79 	 44.95 	 38.60 	 36.00
['開設'] 		 55.47 	 48.21 	 44.00 	 41.04
['夜'] 		 100.00 	 100.00 	 55.09 	 49.41
['地図'] 		 49.15 	 45.45 	 37.93 	 36.37
['愛情'] 		 52.83 	 51.61 	 46.48 	 40.54
['出社'] 		 47.88 	 45.09 	 37.45 	 34.87
['払う'] 		 49.09 	 43.88 	 40.31 	 38.74
['快い'] 		 43.97 	 40.02 	 36.87 	 34.48
['痛い'] 		 51.91 	 49.54 	 41.52 	 38.84
['分かる'] 		 57.41 	 52.69 	 50.28 	 47.56
['だらけ'] 		 45.62 	 42.09 	 40.16 	 38.91
['素材'] 		 45.86 	 42.65 	 39.66 	 37.29
['依存'] 		 48.17 	 43.61 	 39.65 	 37.76
['結構'] 		 61.52 	 57.10 	 53.70 	 52.03
['けど'] 		 68.92 	 68.92 	 65.24 	 58.42
['共産'] 		 100.00 	 60.89 	 48.02 	 46.42
['停止'] 		 50.00 	 45.56 	 39.91 	 37.77
['警官'] 		 60.91 	 60.91 	 47.28 	 43.31
['入る'] 		 59.80 	 52.45 	 46.11 	 45.84
['予想'] 		 50.06 	 43.56 	 40.23 	 37.86
['たら'] 		 62.87 	 62.87 	 59.92 	 57.95
['選抜'] 		 48.53 	 44.77 	 42.38 	 39.31
['消しゴム'] 		 46.53 	 41.37 	 34.44 	 32.27
['料理'] 		 100.00 	 51.39 	 46.04

  x = sum(x) / math.sqrt(sum(a*a)*sum(b*b))



['遠く'] 		 66.79 	 47.65 	 41.62 	 39.95
['克服'] 		 49.29 	 43.39 	 39.82 	 37.13
['カリフォルニア'] 		 100.00 	 62.38 	 56.31 	 56.31
['引き受ける'] 		 50.75 	 46.40 	 41.58 	 38.63
['全額'] 		 44.88 	 40.93 	 38.37 	 37.14
['柔道'] 		 100.00 	 52.00 	 48.11 	 44.51
['植物'] 		 100.00 	 55.79 	 47.26 	 44.13
['占領'] 		 57.03 	 50.98 	 44.43 	 41.29
['死亡'] 		 58.32 	 52.00 	 44.37 	 40.73
['泉'] 		 100.00 	 48.70 	 43.19 	 40.91
['どちら'] 		 54.07 	 54.07 	 53.09 	 50.16
['採用'] 		 53.02 	 48.02 	 43.03 	 39.98
['国家'] 		 100.00 	 100.00 	 50.46 	 48.63
['内緒'] 		 47.56 	 44.09 	 39.02 	 37.99
['十分'] 		 54.63 	 53.26 	 47.65 	 45.39
['予算'] 		 48.24 	 44.05 	 38.65 	 37.50
['いぬ'] 		 48.47 	 45.52 	 37.05 	 35.53
['別'] 		 100.00 	 52.12 	 45.27 	 43.70
['回す'] 		 50.27 	 45.73 	 39.54 	 37.59
['記入'] 		 48.82 	 43.11 	 40.67 	 38.34
['国'] 		 100.00 	 100.00 	 46.05 	 43.30
['貯金'] 		 48.95 	 42.25 	 39.09 	 35.66
['シャンプー'] 		 50.95 	 43.36 	 38.17 	 35.11
['付き合う'] 		 52.15 	 48.91 	 43.12 	 40.03
['成分'] 		 49.87 	 4

['画家'] 		 56.67 	 50.43 	 47.65 	 42.91
['個々'] 		 59.54 	 51.46 	 48.15 	 45.11
['行為'] 		 51.42 	 48.88 	 43.92 	 41.15
['説明'] 		 52.19 	 46.75 	 41.38 	 39.64
['両端'] 		 52.91 	 46.63 	 45.87 	 44.45
['譲歩'] 		 49.98 	 45.50 	 40.97 	 38.78
['問いかける'] 		 52.83 	 45.80 	 43.74 	 40.59
['視覚'] 		 59.81 	 47.40 	 41.75 	 38.45
['来場'] 		 54.43 	 46.01 	 39.63 	 37.74
['転職'] 		 57.86 	 46.33 	 41.45 	 37.71
['建設'] 		 100.00 	 52.89 	 49.03 	 42.82
['勝手'] 		 50.47 	 44.27 	 41.59 	 40.22
['慣れ'] 		 53.83 	 48.72 	 44.43 	 42.64
['解ける'] 		 44.17 	 39.09 	 34.25 	 31.96
['賞金'] 		 49.80 	 44.48 	 37.23 	 35.85
['順'] 		 51.56 	 38.05 	 33.45 	 32.48
['優良'] 		 44.73 	 42.39 	 36.28 	 36.23
['重役'] 		 57.26 	 47.66 	 40.52 	 37.65
['開く'] 		 59.33 	 43.83 	 39.15 	 37.22
['芸術'] 		 100.00 	 100.00 	 51.87 	 48.01
['本能'] 		 51.87 	 43.81 	 39.26 	 37.38
['貧困'] 		 51.15 	 45.57 	 43.59 	 43.29
['芸'] 		 42.22 	 42.22 	 35.88 	 32.41
['塵'] 		 45.32 	 41.44 	 37.26 	 33.98
['お腹'] 		 46.74 	 40.87 	 32.33 	 31

['最新'] 		 51.26 	 51.26 	 43.75 	 42.91
['止す'] 		 52.67 	 49.59 	 44.57 	 41.51
['前後'] 		 49.55 	 47.67 	 46.41 	 44.21
['昼寝'] 		 59.29 	 46.42 	 40.04 	 36.36
['入場'] 		 52.60 	 42.47 	 38.33 	 35.81
['は'] 		 100.00 	 100.00 	 100.00 	 100.00
['到達'] 		 53.09 	 46.58 	 42.35 	 40.41
['事例'] 		 48.59 	 45.69 	 42.65 	 39.21
['日常'] 		 56.10 	 55.41 	 44.38 	 39.39
['したぎ'] 		 nan 	 nan 	 nan 	 nan
['スカート'] 		 57.96 	 48.10 	 41.82 	 38.65
['終'] 		 41.63 	 37.36 	 32.79 	 32.79
['広い'] 		 54.86 	 48.98 	 41.60 	 38.71
['両替'] 		 45.35 	 41.77 	 36.24 	 35.17
['40'] 		 73.41 	 71.58 	 68.98 	 66.86
['暗記'] 		 47.38 	 47.06 	 39.43 	 37.04
['上昇'] 		 55.29 	 48.67 	 41.17 	 38.48
['不可'] 		 60.99 	 48.55 	 39.68 	 36.80
['生産'] 		 100.00 	 54.47 	 43.01 	 39.46
['経費'] 		 55.91 	 46.30 	 41.84 	 38.27
['夫'] 		 63.27 	 51.18 	 44.08 	 40.93
['整える'] 		 46.50 	 41.20 	 35.64 	 33.60
['燃料'] 		 100.00 	 48.01 	 43.47 	 40.50
['畑'] 		 100.00 	 51.37 	 44.38 	 40.49
['実家'] 		 56.29 	 51.67 	 45.94 	 42.72
[

['水面'] 		 50.74 	 46.04 	 42.78 	 37.34
['どんぶり'] 		 49.00 	 43.53 	 37.10 	 34.42
['区別'] 		 51.04 	 48.46 	 43.04 	 42.35
['明瞭'] 		 46.19 	 44.75 	 39.71 	 37.20
['愛社'] 		 47.36 	 47.36 	 35.36 	 34.52
['足元'] 		 43.77 	 43.77 	 35.76 	 34.89
['溜める'] 		 50.19 	 38.68 	 34.66 	 32.69
['外観'] 		 48.89 	 45.73 	 42.30 	 39.21
['打ち合わせ'] 		 48.44 	 44.87 	 40.36 	 38.17
['友情'] 		 52.83 	 44.43 	 43.80 	 39.74
['旧\xa0'] 		 nan 	 nan 	 nan 	 nan
['西洋'] 		 100.00 	 54.93 	 49.56 	 48.62
['昔'] 		 100.00 	 57.15 	 50.88 	 46.03
['感心'] 		 54.52 	 49.70 	 46.03 	 42.52
['相性'] 		 48.07 	 47.35 	 42.27 	 39.43
['登場'] 		 53.60 	 51.41 	 44.78 	 43.85
['現代'] 		 100.00 	 100.00 	 49.56 	 48.06
['悪戯'] 		 44.05 	 41.90 	 36.80 	 35.36
['立場'] 		 53.52 	 48.65 	 44.97 	 42.13
['箱根'] 		 100.00 	 55.48 	 48.69 	 48.01
['獲得'] 		 53.22 	 47.84 	 42.30 	 40.08
['多彩'] 		 59.31 	 49.55 	 45.37 	 42.91
['広がる'] 		 51.61 	 45.72 	 40.00 	 37.62
['支える'] 		 50.87 	 46.32 	 41.64 	 39.60
['簡単'] 		 55.65 	 49.73 	 43.75 	

['劇場'] 		 100.00 	 100.00 	 47.29 	 42.55
['症状'] 		 51.73 	 46.94 	 40.79 	 38.27
['餌'] 		 48.23 	 46.57 	 41.06 	 39.15
['計画'] 		 56.98 	 46.86 	 44.52 	 38.98
['迅速'] 		 51.21 	 42.31 	 37.94 	 35.31
['夕食'] 		 55.28 	 47.99 	 44.65 	 40.34
['発揮'] 		 51.52 	 48.55 	 44.31 	 43.42
['教員'] 		 59.54 	 56.16 	 56.02 	 51.06
['りょこう'] 		 nan 	 nan 	 nan 	 nan
['最上'] 		 100.00 	 51.70 	 46.43 	 43.45
['ひまな'] 		 nan 	 nan 	 nan 	 nan
['容器'] 		 52.35 	 47.49 	 41.10 	 38.42
['ひげ'] 		 nan 	 nan 	 nan 	 nan
['内包'] 		 50.58 	 45.33 	 41.73 	 40.52
['考察'] 		 52.48 	 48.10 	 43.42 	 40.41
['個別'] 		 52.19 	 42.70 	 39.09 	 38.04
['アイスクリーム'] 		 66.16 	 53.02 	 43.89 	 39.31
['背'] 		 100.00 	 43.12 	 36.96 	 36.96
['一段'] 		 46.67 	 45.53 	 42.68 	 39.52
['一体'] 		 56.19 	 56.19 	 48.21 	 41.45
['ぶる'] 		 41.95 	 40.90 	 39.59 	 37.82
['試す'] 		 47.43 	 46.89 	 41.66 	 39.67
['事'] 		 71.67 	 61.93 	 56.97 	 55.02
['好く'] 		 48.61 	 45.28 	 39.10 	 36.40
['弁明'] 		 53.04 	 47.56 	 41.57 	 39.52
['手帳'] 		 44.66

['破産'] 		 100.00 	 50.38 	 41.72 	 38.63
['作業'] 		 100.00 	 46.37 	 39.47 	 37.36
['信号'] 		 100.00 	 42.17 	 37.48 	 36.37
['話し合い'] 		 52.95 	 47.37 	 39.82 	 37.58
['新鮮'] 		 49.55 	 46.15 	 41.65 	 38.87
['材料'] 		 48.66 	 42.85 	 38.32 	 36.45
['鮮明'] 		 43.77 	 41.93 	 37.35 	 35.43
['かかり'] 		 52.78 	 46.32 	 38.99 	 36.78
['根性'] 		 49.58 	 43.44 	 38.74 	 37.15
['負け'] 		 66.21 	 48.19 	 42.44 	 39.94
['貿易'] 		 100.00 	 49.83 	 45.52 	 43.34
['玩具'] 		 46.87 	 44.34 	 39.73 	 38.45
['辞書'] 		 46.44 	 44.45 	 39.96 	 35.96
['持参'] 		 53.36 	 48.39 	 42.59 	 39.87
['滑る'] 		 50.01 	 45.55 	 39.41 	 37.03
['同居'] 		 49.54 	 47.38 	 42.53 	 39.27
['運送'] 		 57.08 	 51.29 	 45.95 	 42.19
['でんち'] 		 nan 	 nan 	 nan 	 nan
['神'] 		 100.00 	 100.00 	 43.83 	 42.03
['謙虚'] 		 52.56 	 46.87 	 41.30 	 38.68
['フィルム'] 		 65.52 	 45.92 	 41.89 	 40.17
['多大'] 		 48.36 	 43.97 	 40.49 	 39.39
['食物'] 		 51.92 	 49.30 	 41.89 	 40.43
['発売'] 		 57.64 	 51.79 	 45.29 	 43.82
['死ぬ'] 		 55.19 	 49.79 	 42.52 	 40.

['音量'] 		 47.66 	 43.56 	 40.97 	 36.23
['運転'] 		 100.00 	 51.66 	 42.25 	 39.36
['床'] 		 48.96 	 42.47 	 37.48 	 34.87
['電気'] 		 100.00 	 100.00 	 45.57 	 42.01
['てんき'] 		 36.55 	 36.00 	 33.66 	 30.54
['システム'] 		 100.00 	 53.29 	 43.95 	 42.28
['王子'] 		 100.00 	 46.10 	 39.87 	 35.94
['男女'] 		 100.00 	 50.24 	 41.57 	 40.45
['手早い'] 		 46.84 	 41.27 	 35.03 	 32.95
['雷'] 		 100.00 	 50.41 	 40.65 	 39.52
['対比'] 		 51.34 	 50.70 	 47.77 	 43.79
['可也'] 		 34.21 	 31.29 	 28.45 	 27.72
['直る'] 		 48.32 	 44.06 	 37.61 	 35.85
['配置'] 		 52.74 	 46.82 	 41.50 	 39.34
['経験'] 		 52.89 	 46.50 	 41.36 	 38.69
['妊娠'] 		 59.30 	 51.06 	 41.47 	 37.87
['収める'] 		 49.57 	 42.81 	 39.52 	 37.93
['工事'] 		 55.59 	 52.23 	 45.23 	 42.21
['熱意'] 		 61.94 	 51.13 	 42.57 	 40.00
['実現'] 		 51.60 	 48.33 	 44.72 	 43.56
['太陽'] 		 100.00 	 100.00 	 48.88 	 43.74
['とけい'] 		 47.53 	 37.83 	 34.28 	 32.88
['不思議'] 		 100.00 	 50.57 	 44.38 	 41.94
['昼食'] 		 56.42 	 44.84 	 37.20 	 35.60
['笑顔'] 		 100.00 	 52.77 

['意見'] 		 52.03 	 47.20 	 43.43 	 41.12
['草'] 		 100.00 	 46.47 	 41.21 	 37.79
['ひも'] 		 64.27 	 43.40 	 35.50 	 32.48
['手順'] 		 52.86 	 46.85 	 40.32 	 37.58
['規定'] 		 51.28 	 48.67 	 42.28 	 40.01
['探す'] 		 48.40 	 40.43 	 36.29 	 34.64
['原理'] 		 56.23 	 51.08 	 43.80 	 40.12
['賞品'] 		 51.25 	 45.52 	 37.89 	 34.84
['表紙'] 		 54.07 	 46.89 	 42.94 	 39.04
['あまり'] 		 58.97 	 58.97 	 57.57 	 57.57
['付近'] 		 56.43 	 49.29 	 45.58 	 43.46
['客'] 		 53.50 	 52.84 	 41.60 	 39.59
['相違'] 		 51.98 	 48.84 	 43.54 	 40.96
['郊外'] 		 56.63 	 50.62 	 45.69 	 44.01
['短い'] 		 65.95 	 46.73 	 42.76 	 40.81
['にち'] 		 nan 	 nan 	 nan 	 nan
['沸騰'] 		 47.68 	 41.81 	 35.61 	 33.44
['誠実'] 		 50.44 	 48.05 	 42.81 	 39.64
['降りる'] 		 53.21 	 46.56 	 40.86 	 37.68
['読み'] 		 100.00 	 47.37 	 40.51 	 35.80
['純粋'] 		 57.08 	 49.84 	 44.02 	 42.09
['酷い'] 		 56.48 	 51.81 	 45.50 	 42.69
['指輪'] 		 54.37 	 45.64 	 40.51 	 37.19
['水準'] 		 49.80 	 44.05 	 40.67 	 38.36
['範囲'] 		 47.51 	 45.13 	 39.13 	 37.25
['親子']

['うちゅう'] 		 nan 	 nan 	 nan 	 nan
['兎'] 		 53.87 	 51.59 	 43.01 	 40.18
['産業'] 		 100.00 	 100.00 	 51.76 	 45.81
['清掃'] 		 52.11 	 46.80 	 40.64 	 38.50
['司会'] 		 53.56 	 51.25 	 43.67 	 41.44
['公平'] 		 58.73 	 45.38 	 36.51 	 34.42
['みせ'] 		 56.14 	 52.94 	 50.76 	 49.48
['折る'] 		 56.05 	 44.15 	 36.97 	 34.25
['休み'] 		 52.44 	 47.97 	 40.79 	 38.75
['表面'] 		 100.00 	 46.73 	 39.57 	 37.48
['承知'] 		 50.44 	 49.01 	 46.01 	 44.02
['20'] 		 100.00 	 73.58 	 72.07 	 69.82
['女'] 		 100.00 	 65.68 	 46.21 	 43.03
['寄る'] 		 51.83 	 47.36 	 42.58 	 38.60
['背景'] 		 45.45 	 43.49 	 41.74 	 39.54
['刺激'] 		 45.75 	 42.66 	 39.01 	 36.95
['遠回り'] 		 45.03 	 41.78 	 38.12 	 36.03
['各地'] 		 62.44 	 62.44 	 62.44 	 49.33
['合格'] 		 58.21 	 48.44 	 39.57 	 35.79
['経路'] 		 58.78 	 44.74 	 40.17 	 38.54
['作戦'] 		 51.56 	 48.19 	 40.18 	 37.34
['入門'] 		 49.08 	 46.40 	 39.54 	 37.19
['勧める'] 		 50.86 	 48.03 	 42.88 	 40.96
['教訓'] 		 46.79 	 42.26 	 39.35 	 36.83
['定食'] 		 50.89 	 48.01 	 41.49 	 37.98
[

['鈍い'] 		 48.76 	 44.17 	 38.97 	 36.74
['ご'] 		 100.00 	 100.00 	 55.41 	 39.61
['喉'] 		 50.62 	 46.71 	 39.95 	 35.62
['ええ'] 		 58.91 	 57.33 	 54.77 	 49.53
['供給'] 		 51.43 	 50.09 	 44.35 	 41.15
['低い'] 		 54.05 	 48.95 	 47.09 	 43.92
['ケーキ'] 		 55.29 	 52.93 	 45.14 	 41.04
['本番'] 		 45.07 	 41.81 	 37.98 	 37.21
['ガラス'] 		 100.00 	 49.96 	 41.86 	 38.16
['卒業'] 		 100.00 	 56.26 	 45.66 	 44.00
['花見'] 		 53.58 	 49.88 	 38.87 	 36.27
['承認'] 		 53.92 	 45.57 	 40.97 	 39.27
['捕まえる'] 		 53.86 	 47.16 	 41.70 	 38.86
['多様'] 		 52.16 	 47.87 	 42.74 	 40.12
['ストレス'] 		 49.32 	 42.84 	 38.99 	 36.24
['強弱'] 		 44.58 	 40.26 	 34.39 	 32.95
['埋める'] 		 47.75 	 41.83 	 39.06 	 36.97
['反対'] 		 51.92 	 47.78 	 41.64 	 39.33
['物価'] 		 51.25 	 45.94 	 41.69 	 39.18
['成功'] 		 62.63 	 47.54 	 41.33 	 38.80
['ペット'] 		 100.00 	 51.13 	 40.58 	 37.98
['前回'] 		 57.77 	 50.84 	 47.04 	 45.58
['以降'] 		 57.93 	 57.93 	 56.38 	 56.38
['進める'] 		 53.24 	 46.94 	 41.67 	 39.54
['言葉'] 		 56.59 	 50.80 	 45

['そちら'] 		 52.97 	 50.28 	 44.43 	 43.45
['ピラミッド'] 		 47.08 	 47.08 	 37.55 	 36.45
['権限'] 		 52.39 	 47.01 	 40.89 	 37.87
['左右'] 		 55.36 	 43.52 	 38.00 	 35.32
['段'] 		 50.17 	 41.77 	 41.01 	 41.01
['飽き'] 		 49.39 	 45.26 	 41.59 	 38.82
['経過'] 		 49.63 	 49.63 	 39.62 	 36.92
['深まる'] 		 43.20 	 38.52 	 35.39 	 32.63
['工夫'] 		 51.23 	 49.82 	 43.99 	 41.73
['直結'] 		 42.14 	 38.36 	 36.16 	 34.46
['冷静'] 		 49.51 	 45.40 	 40.49 	 37.88
['磨く'] 		 46.94 	 40.31 	 36.27 	 33.80
['摘み'] 		 44.13 	 39.30 	 37.06 	 36.47
['痛む'] 		 51.67 	 44.19 	 40.23 	 38.24
['循環'] 		 100.00 	 40.07 	 38.30 	 35.03
['布団'] 		 55.90 	 46.67 	 38.70 	 36.06
['装置'] 		 47.92 	 47.91 	 40.63 	 38.20
['旨い'] 		 59.82 	 45.51 	 40.96 	 38.47
['政策'] 		 100.00 	 52.49 	 49.87 	 48.86
['実習'] 		 53.72 	 51.31 	 49.28 	 45.56
['暖かい'] 		 58.95 	 47.87 	 41.01 	 38.40
['試合'] 		 55.18 	 53.95 	 49.70 	 46.08
['しまい'] 		 63.75 	 63.75 	 62.12 	 62.07
['背中'] 		 57.36 	 53.56 	 42.96 	 39.28
['飲料'] 		 57.57 	 48.34 	 40.79 

['おみやげ'] 		 46.11 	 43.89 	 39.83 	 37.10
['ツイン'] 		 52.65 	 52.65 	 43.93 	 43.14
['職員'] 		 58.07 	 49.21 	 46.94 	 45.86
['芯'] 		 45.70 	 45.70 	 36.80 	 33.57
['陶器'] 		 53.86 	 48.65 	 41.52 	 38.23
['通学'] 		 60.03 	 46.85 	 45.97 	 39.68
['技術'] 		 100.00 	 52.70 	 47.91 	 44.10
['待つ'] 		 52.46 	 44.39 	 40.56 	 38.60
['興奮'] 		 53.02 	 48.20 	 42.53 	 40.16
['怠ける'] 		 40.54 	 36.92 	 33.41 	 31.87
['成熟'] 		 55.62 	 45.38 	 38.82 	 36.25
['小鳥'] 		 54.16 	 54.16 	 40.65 	 38.38
['敬意'] 		 49.15 	 45.49 	 40.04 	 36.28
['泣く'] 		 66.83 	 54.29 	 41.65 	 38.03
['最初'] 		 59.03 	 59.03 	 55.90 	 55.90
['まじき'] 		 44.65 	 44.65 	 37.62 	 35.13
['老人'] 		 100.00 	 51.16 	 43.68 	 38.30
['悪意'] 		 51.63 	 48.11 	 41.96 	 40.11
['伝達'] 		 48.30 	 45.31 	 39.43 	 37.92
['柔らかい'] 		 55.50 	 48.53 	 42.78 	 39.80
['けいぐ'] 		 nan 	 nan 	 nan 	 nan
['しょくよく'] 		 nan 	 nan 	 nan 	 nan
['重荷'] 		 46.94 	 42.91 	 38.85 	 36.31
['続き'] 		 45.97 	 43.37 	 37.12 	 34.90
['手品'] 		 46.62 	 46.34 	 39.00 	 36.69
['放す

['かぜ'] 		 44.04 	 41.67 	 37.61 	 35.51
['デザイン'] 		 100.00 	 53.12 	 44.92 	 42.68
['28'] 		 75.15 	 74.68 	 73.28 	 70.94
['数学'] 		 100.00 	 55.91 	 48.10 	 45.32
['夢'] 		 100.00 	 100.00 	 47.09 	 40.14
['映像'] 		 100.00 	 54.13 	 49.39 	 46.06
['続々'] 		 50.98 	 46.33 	 46.33 	 46.33
['保証'] 		 47.06 	 42.01 	 34.42 	 32.42
['学習'] 		 100.00 	 54.29 	 46.55 	 41.83
['曇'] 		 48.33 	 48.12 	 39.61 	 33.47
['セーター'] 		 50.94 	 49.34 	 39.82 	 35.78
['釣り'] 		 100.00 	 54.26 	 43.46 	 39.37
['42'] 		 74.85 	 72.91 	 71.51 	 68.46
['後'] 		 100.00 	 100.00 	 53.12 	 51.91
['紹介'] 		 51.92 	 48.51 	 45.80 	 42.25
['前日'] 		 56.16 	 53.23 	 51.45 	 49.15
['高校'] 		 100.00 	 68.06 	 56.98 	 51.84
['多発'] 		 56.63 	 45.42 	 39.59 	 38.48
['模型'] 		 57.92 	 51.46 	 42.23 	 38.94
['報告'] 		 50.17 	 45.99 	 41.24 	 38.83
['一人'] 		 100.00 	 45.86 	 41.89 	 39.78
['売れる'] 		 47.17 	 43.04 	 38.02 	 35.79
['花屋'] 		 49.60 	 49.60 	 49.60 	 39.46
['取り付く'] 		 39.62 	 36.51 	 33.73 	 31.67
['思う'] 		 69.06 	 66.54 	

['救済'] 		 50.72 	 47.03 	 41.16 	 38.54
['辿り着く'] 		 nan 	 nan 	 nan 	 nan
['奪う'] 		 53.37 	 48.83 	 42.70 	 40.55
['たい'] 		 100.00 	 62.41 	 53.01 	 50.25
['商業'] 		 100.00 	 51.54 	 44.63 	 43.92
['加速'] 		 53.92 	 47.74 	 42.59 	 39.50
['世代'] 		 47.85 	 44.63 	 39.16 	 39.16
['促す'] 		 55.49 	 50.22 	 47.57 	 44.11
['気持ち'] 		 58.19 	 48.03 	 45.56 	 43.44
['下手'] 		 52.59 	 45.47 	 42.08 	 40.10
['組み込む'] 		 53.71 	 48.96 	 44.85 	 42.67
['必着'] 		 40.60 	 33.52 	 30.30 	 28.46
['鏡'] 		 52.37 	 43.26 	 34.83 	 33.43
['濡れる'] 		 53.02 	 53.02 	 38.48 	 35.61
['れきし'] 		 44.72 	 44.72 	 31.18 	 29.02
['意思'] 		 59.97 	 46.50 	 43.17 	 40.62
['含む'] 		 52.93 	 49.63 	 45.27 	 45.05
['神経'] 		 100.00 	 100.00 	 44.85 	 39.30
['改革'] 		 57.29 	 47.35 	 44.18 	 41.93
['19'] 		 75.54 	 74.91 	 73.50 	 70.30
['あぶない'] 		 51.23 	 45.86 	 37.68 	 35.53
['感謝'] 		 51.71 	 45.62 	 40.58 	 38.16
['先月'] 		 62.92 	 56.93 	 53.20 	 52.86
['転換'] 		 47.13 	 44.22 	 40.23 	 37.47
['差し上げる'] 		 49.29 	 43.61 	 41.07 	

['笑い'] 		 100.00 	 55.64 	 39.27 	 36.18
['財産'] 		 56.70 	 49.05 	 41.74 	 39.57
['開始'] 		 56.94 	 52.45 	 46.99 	 44.81
['崩す'] 		 49.49 	 43.24 	 38.93 	 37.11
['概念'] 		 53.88 	 51.29 	 45.65 	 42.71
['原料'] 		 50.54 	 47.64 	 42.78 	 40.01
['発散'] 		 47.94 	 43.42 	 37.26 	 35.27
['図'] 		 48.29 	 42.19 	 35.47 	 32.67
['業績'] 		 49.26 	 42.08 	 39.32 	 35.63
['焦り'] 		 52.29 	 46.88 	 42.28 	 39.60
['濃厚'] 		 46.51 	 44.32 	 38.81 	 37.43
['母'] 		 100.00 	 69.29 	 48.95 	 44.84
['違う'] 		 57.94 	 52.61 	 47.58 	 45.61
['添う'] 		 44.03 	 41.28 	 35.27 	 34.64
['女優'] 		 62.10 	 54.60 	 48.80 	 43.87
['靴下'] 		 50.65 	 46.49 	 41.15 	 36.15
['強気'] 		 49.67 	 45.09 	 40.34 	 38.19
['シャツ'] 		 53.98 	 49.15 	 41.13 	 36.82
['天気'] 		 53.07 	 48.84 	 39.38 	 36.83
['形成'] 		 52.39 	 49.41 	 43.91 	 41.74
['広告'] 		 100.00 	 45.91 	 39.54 	 36.10
['小物'] 		 52.41 	 48.52 	 41.05 	 37.74
['視界'] 		 45.32 	 39.27 	 34.93 	 32.82
['賛成'] 		 49.93 	 44.39 	 38.15 	 35.32
['高い'] 		 52.21 	 49.41 	 48.44 	 45.5

In [118]:
target_word_list.index(['やくそく'])

49

In [120]:
sum(target_vector_list[49])

0.0