In [2]:
import numpy as np
from tqdm import tqdm


In [1]:
! pip install -q pyspark spark-nlp

In [4]:
!pyspark --version

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.3.0
      /_/
                        
Using Scala version 2.12.15, Java HotSpot(TM) 64-Bit Server VM, 14.0.2
Branch HEAD
Compiled by user ubuntu on 2022-06-09T19:58:58Z
Revision f74867bddfbcdd4d08076db36851e88b15e66556
Url https://github.com/apache/spark
Type --help for more information.


In [6]:
import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline

sparknlp.start()

In [12]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Obtain the session through the builder rather than constructing SparkSession
# by hand: getOrCreate() hands back the session that already exists (the one
# sparknlp.start() launched earlier in this notebook) or creates one if none
# is running, so both handles below refer to the same Spark application.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [147]:
# Spark NLP pipeline: raw "text" column -> document -> sentence -> token -> embeddings.
# The three .pretrained(...) calls download their models on first use.

# "text" column -> "document" annotations
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)

# "document" -> "sentence"
sentence = (
    SentenceDetector()
    .setInputCols(["document"])
    .setOutputCol("sentence")
)

# "sentence" -> "token", using the pretrained Japanese word segmenter
word_segmenter = (
    WordSegmenterModel.pretrained("wordseg_gsd_ud", "ja")
    .setInputCols(["sentence"])
    .setOutputCol("token")
)

# "token" -> "lemma"; built here but currently left out of the pipeline below
lemmatizer = (
    LemmatizerModel.pretrained("lemma", "ja")
    .setInputCols(["token"])
    .setOutputCol("lemma")
)

# ("sentence", "token") -> "embeddings", using the pretrained 300d Japanese vectors.
# NOTE: the name `embeddings` is rebound to a different object by later cells.
embeddings = (
    WordEmbeddingsModel.pretrained("japanese_cc_300d", "ja")
    .setInputCols("sentence", "token")
    .setOutputCol("embeddings")
)

pipeline = Pipeline().setStages(
    [
        documentAssembler,
        sentence,
        word_segmenter,
        # lemmatizer,
        embeddings,
    ]
)

wordseg_gsd_ud download started this may take some time.
Approximate size to download 979 KB
[OK!]
lemma download started this may take some time.
Approximate size to download 3.4 MB
[OK!]
japanese_cc_300d download started this may take some time.
Approximate size to download 1.2 GB
[OK!]


In [175]:
data = spark.createDataFrame([["憂鬱"], ["限界"], ["パン"]]).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
result.selectExpr("explode(arrays_zip(embeddings.result, embeddings.embeddings))")

DataFrame[col: struct<0:string,1:array<float>>]

In [179]:
result.select('embeddings').collect()[1].embeddings

[Row(annotatorType='word_embeddings', begin=0, end=1, result='限界', metadata={'sentence': '0', 'isOOV': 'false', 'isWordStart': 'true', 'pieceId': '-1', 'token': '限界'}, embeddings=[-0.21699999272823334, 0.07559999823570251, 0.32330000400543213, 0.19480000436306, -0.24480000138282776, -0.04659999907016754, 0.06129999831318855, 0.5189999938011169, 0.11190000176429749, 0.1386999934911728, 0.12860000133514404, 0.20399999618530273, 0.16439999639987946, 0.5054000020027161, -0.11140000075101852, -0.14249999821186066, 0.09790000319480896, 0.04520000144839287, -0.2387000024318695, -0.020999999716877937, 0.06840000301599503, 0.15649999678134918, 0.017100000753998756, 0.1891999989748001, -0.1858000010251999, 0.12720000743865967, 0.11580000072717667, 0.15109999477863312, -0.23420000076293945, -0.12860000133514404, -0.06560000032186508, -0.062300000339746475, -0.18000000715255737, -0.4424999952316284, 0.2214999943971634, 0.15539999306201935, -0.16169999539852142, 0.03539999946951866, -0.220400005578

In [176]:
len(result.select('embeddings').collect()[0].embeddings)

1

In [149]:
for res in result.select('embeddings').collect()[0].embeddings:
    print(res.result, res.embeddings)

憂鬱 [-0.19429999589920044, 0.12530000507831573, 0.3095000088214874, 0.22609999775886536, -0.27320000529289246, -0.20010000467300415, 0.30709999799728394, 0.1941000074148178, 0.15809999406337738, 0.5088000297546387, 0.07450000196695328, 0.08169999718666077, 0.07090000063180923, -0.11980000138282776, -0.3709999918937683, 0.2476000040769577, -0.15369999408721924, 0.011300000362098217, -0.24660000205039978, -0.007300000172108412, -0.2856000065803528, -0.08169999718666077, -0.1851000040769577, -0.11209999769926071, 0.20029999315738678, 0.19539999961853027, 0.2078000009059906, -0.0689999982714653, -0.043800000101327896, -0.15629999339580536, -0.05559999868273735, 0.08460000157356262, 0.16609999537467957, -0.09830000251531601, 0.24160000681877136, -0.08609999716281891, 0.15060000121593475, -0.27090001106262207, 0.14659999310970306, -0.25360000133514404, 0.06019999831914902, 0.26030001044273376, -0.016699999570846558, 0.34599998593330383, 0.43560001254081726, -0.027899999171495438, 0.1331000030

In [204]:
# Placeholders that the vector-file loading cell overwrites.
nWords = 0   # word count read from the first line of the vector file
vecSize = 0  # vector dimensionality read from the first line of the vector file

# Word -> vector mapping. This has to be a dict (not a list): later cells
# index it by word, test membership with `in`, delete entries by key and
# iterate over .items().
embeddings = {}

In [161]:
def sigmoid(x):
    """Scale a similarity value to a percentage-like score.

    Despite the name, this is a scaled hyperbolic tangent (``100 * tanh(x)``),
    not a logistic sigmoid. An input exactly equal to 1 is special-cased to
    return 100, because ``100 * tanh(1)`` alone would only give about 76.

    Args:
        x: scalar similarity value.

    Returns:
        ``100`` when ``x == 1``, otherwise ``100 * np.tanh(x)``.
    """
    return 100 if x == 1 else 100 * np.tanh(x)

In [4]:
def test_score(a, b):
    """Score two words by the cosine similarity of their stored vectors.

    Both words are looked up in the module-level ``embeddings`` mapping, so a
    missing word raises the mapping's lookup error.

    Args:
        a: first word (key into ``embeddings``).
        b: second word (key into ``embeddings``).

    Returns:
        The cosine similarity, rounded to 6 decimals and passed through
        ``sigmoid()``.
    """
    vec_a = embeddings[a]
    vec_b = embeddings[b]
    norm_product = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    cosine = sum(vec_a * vec_b) / norm_product
    return sigmoid(round(cosine, 6))

In [162]:
def score(a, b):
    """Return the cosine similarity of two vectors, rounded to 6 decimals.

    Args:
        a: first vector (numpy array or any sequence of numbers).
        b: second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / sqrt(dot(a, a) * dot(b, b))`` rounded to 6 decimal
        places, or ``0.0`` when either vector has zero length (the ratio is
        undefined there and would otherwise come out as nan).

    The result is the raw similarity; wrap it in ``sigmoid()`` to get the
    percentage-style scale instead.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    # np.sqrt is spelled out so the function does not rely on a bare `sqrt`
    # name happening to be in scope (no visible cell imports one).
    denominator = np.sqrt(np.dot(a, a) * np.dot(b, b))
    if denominator == 0:
        return 0.0

    return round(np.dot(a, b) / denominator, 6)

In [6]:
from enum import Enum
class KeyType(Enum):
    """Classification of a vocabulary key, as returned by isValidKey()."""
    # Empty key, or a key containing a character with code point <= 128.
    INVALID = 1
    # Key whose first character is '#'.
    HASHTAG = 2
    # Non-empty key that does not start with '#' and has no character <= 128.
    VALID = 0

In [7]:
def isValidKey(key):
    """Classify a vocabulary key.

    Args:
        key: the word string read from the vector file.

    Returns:
        KeyType.INVALID for an empty key;
        KeyType.HASHTAG when the first character is '#';
        KeyType.INVALID when any character has a code point <= 128;
        KeyType.VALID otherwise.
    """
    if len(key) == 0:
        return KeyType.INVALID
    if key[0] == '#':
        return KeyType.HASHTAG

    # A single low-code-point character is enough to reject the key.
    contains_low_codepoint = any(ord(char) <= 128 for char in key)
    return KeyType.INVALID if contains_low_codepoint else KeyType.VALID

In [None]:
# clean up dataset (remove punctuations and stuff)
# remove stuff with ##...##
# remove english

In [112]:
print(ord('#'))
print(ord('自')) # ASCII covers code points 0-127; this kanji's code point is far above that range

35
33258


In [111]:
string = "123456789"
string[2:-2]

'34567'

In [10]:
# Clean up other miscellaneous characters: drop the empty key and
# punctuation-only keys from the word -> vector mapping.
# pop(key, None) is used instead of `del` so the cell can be re-run, or run
# on a subset that lacks some of these keys, without raising KeyError.
for _junk_key in ('', '、', '。', '（', '）', '「', '」', '・'):
    embeddings.pop(_junk_key, None)

In [207]:
# Read the words from the pretrained vector file into `word_list`
# (one single-element row per word, ready for spark.createDataFrame).
#
# File layout, as parsed below: the first line is "<nWords> <vecSize>";
# every following line is "<word> <v1> <v2> ...", separated by single spaces.
word_list = []

# encoding is given explicitly so the result does not depend on the platform's
# default locale -- assumes the file is UTF-8; TODO confirm.
with open('/Users/bigsad/Downloads/jawiki.all_vectors.300d.txt', encoding='utf-8') as f:
    header = f.readline()
    nWords, vecSize = (int(field) for field in header.split(' '))

    for _ in tqdm(range(nWords)):
        # Strip only an actual trailing newline (a final line without one
        # must not lose its last character).
        line = f.readline().rstrip('\n')

        # The first field is the Japanese word; the rest are vector values,
        # which this cell does not need.
        key = line.split(' ', 1)[0]

        key_type = isValidKey(key)
        if key_type == KeyType.INVALID:
            # Skip empty keys and keys containing ASCII-range characters.
            continue
        if key_type == KeyType.HASHTAG:
            # Entries of the form ##...##: drop the two marker characters
            # on each side.
            key = key[2:-2]
            if not key:
                continue

        word_list.append([key])


  0%|          | 0/1511782 [00:00<?, ?it/s][A
  0%|          | 1475/1511782 [00:00<01:42, 14744.71it/s][A
  0%|          | 2962/1511782 [00:00<01:42, 14779.65it/s][A
  0%|          | 4597/1511782 [00:00<01:39, 15216.22it/s][A
  0%|          | 6341/1511782 [00:00<01:35, 15821.27it/s][A
  1%|          | 8124/1511782 [00:00<01:31, 16373.67it/s][A
  1%|          | 9888/1511782 [00:00<01:29, 16734.01it/s][A
  1%|          | 11684/1511782 [00:00<01:27, 17083.38it/s][A
  1%|          | 13436/1511782 [00:00<01:27, 17209.88it/s][A
  1%|          | 15202/1511782 [00:00<01:26, 17342.18it/s][A
  1%|          | 16944/1511782 [00:01<01:26, 17364.46it/s][A
  1%|          | 18673/1511782 [00:01<01:26, 17341.07it/s][A
  1%|▏         | 20381/1511782 [00:01<01:26, 17192.68it/s][A
  1%|▏         | 22082/1511782 [00:01<01:27, 16957.32it/s][A
  2%|▏         | 23766/1511782 [00:01<01:34, 15727.62it/s][A
  2%|▏         | 25348/1511782 [00:01<01:40, 14726.82it/s][A
  2%|▏         | 26841/15117

 14%|█▍        | 216542/1511782 [00:13<01:17, 16769.65it/s][A
 14%|█▍        | 218333/1511782 [00:13<01:15, 17093.96it/s][A
 15%|█▍        | 220060/1511782 [00:13<01:16, 16944.57it/s][A
 15%|█▍        | 221874/1511782 [00:14<01:14, 17283.81it/s][A
 15%|█▍        | 223613/1511782 [00:14<01:21, 15761.14it/s][A
 15%|█▍        | 225223/1511782 [00:14<01:28, 14546.62it/s][A
 15%|█▍        | 226720/1511782 [00:14<01:29, 14351.42it/s][A
 15%|█▌        | 228434/1511782 [00:14<01:25, 15087.12it/s][A
 15%|█▌        | 230094/1511782 [00:14<01:22, 15510.12it/s][A
 15%|█▌        | 231729/1511782 [00:14<01:21, 15752.33it/s][A
 15%|█▌        | 233360/1511782 [00:14<01:20, 15913.55it/s][A
 16%|█▌        | 235063/1511782 [00:14<01:18, 16232.71it/s][A
 16%|█▌        | 236762/1511782 [00:14<01:17, 16452.51it/s][A
 16%|█▌        | 238600/1511782 [00:15<01:14, 16985.90it/s][A
 16%|█▌        | 240379/1511782 [00:15<01:13, 17217.99it/s][A
 16%|█▌        | 242109/1511782 [00:15<01:43, 12242.00i

 28%|██▊       | 420464/1511782 [00:27<01:00, 17951.04it/s][A
 28%|██▊       | 422261/1511782 [00:27<01:01, 17742.32it/s][A
 28%|██▊       | 424073/1511782 [00:27<01:00, 17851.70it/s][A
 28%|██▊       | 425876/1511782 [00:27<01:00, 17904.11it/s][A
 28%|██▊       | 427710/1511782 [00:27<01:00, 18032.58it/s][A
 28%|██▊       | 429526/1511782 [00:27<00:59, 18069.32it/s][A
 29%|██▊       | 431373/1511782 [00:27<00:59, 18185.91it/s][A
 29%|██▊       | 433193/1511782 [00:28<00:59, 17978.40it/s][A
 29%|██▉       | 435031/1511782 [00:28<00:59, 18096.38it/s][A
 29%|██▉       | 436884/1511782 [00:28<00:58, 18223.51it/s][A
 29%|██▉       | 438720/1511782 [00:28<00:58, 18261.15it/s][A
 29%|██▉       | 440547/1511782 [00:28<00:58, 18242.51it/s][A
 29%|██▉       | 442387/1511782 [00:28<00:58, 18288.27it/s][A
 29%|██▉       | 444217/1511782 [00:28<00:58, 18282.58it/s][A
 30%|██▉       | 446049/1511782 [00:28<00:58, 18292.10it/s][A
 30%|██▉       | 447896/1511782 [00:28<00:57, 18344.94i

 43%|████▎     | 650619/1511782 [00:40<00:47, 18008.59it/s][A
 43%|████▎     | 652422/1511782 [00:40<00:47, 17985.61it/s][A
 43%|████▎     | 654222/1511782 [00:40<00:47, 17929.61it/s][A
 43%|████▎     | 656016/1511782 [00:41<00:47, 17918.44it/s][A
 44%|████▎     | 657809/1511782 [00:41<00:47, 17921.20it/s][A
 44%|████▎     | 659640/1511782 [00:41<00:47, 18035.29it/s][A
 44%|████▍     | 661475/1511782 [00:41<00:46, 18126.03it/s][A
 44%|████▍     | 663309/1511782 [00:41<00:46, 18187.48it/s][A
 44%|████▍     | 665131/1511782 [00:41<00:46, 18195.97it/s][A
 44%|████▍     | 666967/1511782 [00:41<00:46, 18242.58it/s][A
 44%|████▍     | 668809/1511782 [00:41<00:46, 18295.33it/s][A
 44%|████▍     | 670639/1511782 [00:41<00:46, 18275.20it/s][A
 44%|████▍     | 672477/1511782 [00:41<00:45, 18304.54it/s][A
 45%|████▍     | 674308/1511782 [00:42<00:46, 18039.42it/s][A
 45%|████▍     | 676113/1511782 [00:42<00:46, 17955.32it/s][A
 45%|████▍     | 677910/1511782 [00:42<00:46, 17790.00i

 59%|█████▊    | 885820/1511782 [00:54<00:34, 18022.86it/s][A
 59%|█████▊    | 887662/1511782 [00:54<00:34, 18137.30it/s][A
 59%|█████▉    | 889477/1511782 [00:54<00:34, 18130.45it/s][A
 59%|█████▉    | 891291/1511782 [00:54<00:34, 18123.04it/s][A
 59%|█████▉    | 893104/1511782 [00:54<00:34, 18073.95it/s][A
 59%|█████▉    | 894912/1511782 [00:54<00:34, 18002.02it/s][A
 59%|█████▉    | 896757/1511782 [00:54<00:33, 18133.85it/s][A
 59%|█████▉    | 898571/1511782 [00:54<00:33, 18104.47it/s][A
 60%|█████▉    | 900389/1511782 [00:54<00:33, 18125.88it/s][A
 60%|█████▉    | 902202/1511782 [00:54<00:33, 18004.15it/s][A
 60%|█████▉    | 904039/1511782 [00:55<00:33, 18111.08it/s][A
 60%|█████▉    | 905851/1511782 [00:55<00:33, 17988.48it/s][A
 60%|██████    | 907651/1511782 [00:55<00:34, 17720.72it/s][A
 60%|██████    | 909425/1511782 [00:55<00:34, 17628.36it/s][A
 60%|██████    | 911271/1511782 [00:55<00:33, 17868.27it/s][A
 60%|██████    | 913080/1511782 [00:55<00:33, 17933.04i

 74%|███████▍  | 1119231/1511782 [01:07<00:21, 18018.04it/s][A
 74%|███████▍  | 1121045/1511782 [01:07<00:21, 18053.01it/s][A
 74%|███████▍  | 1122851/1511782 [01:07<00:21, 17824.37it/s][A
 74%|███████▍  | 1124661/1511782 [01:07<00:21, 17904.25it/s][A
 75%|███████▍  | 1126453/1511782 [01:07<00:21, 17871.04it/s][A
 75%|███████▍  | 1128241/1511782 [01:07<00:21, 17803.46it/s][A
 75%|███████▍  | 1130050/1511782 [01:07<00:21, 17886.40it/s][A
 75%|███████▍  | 1131876/1511782 [01:07<00:21, 17994.32it/s][A
 75%|███████▍  | 1133676/1511782 [01:08<00:21, 17913.21it/s][A
 75%|███████▌  | 1135496/1511782 [01:08<00:20, 17996.06it/s][A
 75%|███████▌  | 1137327/1511782 [01:08<00:20, 18086.96it/s][A
 75%|███████▌  | 1139140/1511782 [01:08<00:20, 18098.23it/s][A
 75%|███████▌  | 1140951/1511782 [01:08<00:20, 18091.88it/s][A
 76%|███████▌  | 1142761/1511782 [01:08<00:20, 17968.05it/s][A
 76%|███████▌  | 1144581/1511782 [01:08<00:20, 18035.95it/s][A
 76%|███████▌  | 1146401/1511782 [01:08<

 89%|████████▊ | 1340930/1511782 [01:20<00:09, 17705.76it/s][A
 89%|████████▉ | 1342703/1511782 [01:20<00:09, 17445.84it/s][A
 89%|████████▉ | 1344451/1511782 [01:20<00:09, 17450.80it/s][A
 89%|████████▉ | 1346271/1511782 [01:20<00:09, 17666.58it/s][A
 89%|████████▉ | 1348040/1511782 [01:20<00:09, 17630.49it/s][A
 89%|████████▉ | 1349805/1511782 [01:20<00:09, 17364.26it/s][A
 89%|████████▉ | 1351605/1511782 [01:21<00:09, 17549.18it/s][A
 90%|████████▉ | 1353363/1511782 [01:21<00:09, 17556.10it/s][A
 90%|████████▉ | 1355126/1511782 [01:21<00:08, 17577.11it/s][A
 90%|████████▉ | 1356921/1511782 [01:21<00:08, 17685.93it/s][A
 90%|████████▉ | 1358721/1511782 [01:21<00:08, 17777.98it/s][A
 90%|████████▉ | 1360552/1511782 [01:21<00:08, 17932.84it/s][A
 90%|█████████ | 1362379/1511782 [01:21<00:08, 18030.37it/s][A
 90%|█████████ | 1364183/1511782 [01:21<00:08, 18002.87it/s][A
 90%|█████████ | 1365984/1511782 [01:21<00:08, 17671.73it/s][A
 90%|█████████ | 1367753/1511782 [01:21<

In [302]:
ord('・')

12539

In [207]:
len(embeddings)

24

In [63]:
'残酷' in embeddings

True

In [191]:
# Word pairs that should be similar (the first two compare a word with itself).
for left_word, right_word in [
    ('思う', '思う'),
    ('ショッピング', 'ショッピング'),
    ('思う', '考える'),
    ('ショッピング', '買い物'),
    ('死ぬ', '殺す'),
]:
    print(test_score(left_word, right_word))

print()

# these are weirdly low
for left_word, right_word in [
    ('俺', '私'),
    ('落ち込む', '憂鬱'),
    ('逃げる', '逃す'),
    ('シャワー', '浴びる'),
]:
    print(test_score(left_word, right_word))

100
100
53.67549083071811
57.97712064543794
61.578751615132255

33.74366778104572
37.481291582473865
37.27895559084609
35.02678245052315


In [192]:
# Word pairs that should be different.
for left_word, right_word in [
    ('家事', 'みかん'),
    ('最高', '走る'),
    ('鬼', '不健康'),
    ('刀', '柱'),  # why is this so high
    ('電柱', '絶望'),
    ('寿命', '命令'),
    ('自然', '突然'),
    ('党', '塔'),
]:
    print(test_score(left_word, right_word))

26.48898637886783
14.887525527629219
24.46507394775458
37.68987753237022
27.53781179869285
22.651983068439485
9.764884191778691
21.20682488094399


- Similarity between two words: take the dot product of their vectors (normalized, i.e. cosine similarity).
- For each target word we need an ordered map of its top ~1000 most similar words.
- Pre-computing the top 1000 for every word looks too expensive in space: the dataset contains about 1.5 million words, so that would be roughly 1.5 billion entries to store.

In [219]:
from sortedcontainers import SortedDict

'''sorted dict testing'''

In [260]:
sd = SortedDict()

In [263]:
sd['c'] = 1
sd['a'] = 3
sd['b'] = 2


In [264]:
sd

SortedDict({'a': 3, 'b': 2, 'c': 1})

In [None]:
#conclusion: sortedDict orders by key oh well

In [279]:
test_top_1000 = SortedDict() #create a reverse dictionary mapping scores to words

In [280]:
# Keep the 10 best-scoring words relative to the first embedding.
# test_top_1000 is a SortedDict keyed by score, so index 0 is always the
# current minimum; each insert/delete is logarithmic in the number kept.
# NOTE: because the score is the key, two words with the same rounded score
# overwrite each other.
first_key = list(embeddings)[0]
first_value = list(embeddings.values())[0]
for word, vec in tqdm(embeddings.items()):
    similarity = score(first_value, vec)
    if len(test_top_1000) >= 10:
        # Already full: admit the candidate only if it beats the weakest entry.
        weakest = test_top_1000.peekitem(0)[0]
        if not similarity > weakest:
            continue
        del test_top_1000[weakest]
    test_top_1000[similarity] = word

100%|██████████| 24/24 [00:00<00:00, 9392.86it/s]


In [281]:
print(test_top_1000)

SortedDict({63.9170484449445: 'で', 64.17679506795967: 'を', 64.49328747700123: 'た', 65.67585055637353: 'と', 66.11173851634045: 'に', 66.1399879134437: 'が', 67.04521351811081: '。', 67.46227727153806: 'は', 68.83036316627944: 'の', 100: '、'})


In [None]:
######################################################## generate top 1000s for all words ##########################################

In [307]:
# For every word, collect up to 1000 best-scoring words into a SortedDict
# keyed by score (index 0 = current minimum). This compares every word with
# every other word, so the work grows quadratically with the vocabulary.
# NOTE: because the score is the key, words with equal rounded scores
# overwrite each other.
top_1000s = {}

for tword, tvec in tqdm(embeddings.items()):
    top_1000_for_target = SortedDict()

    for word, vec in embeddings.items():
        similarity = score(tvec, vec)
        if len(top_1000_for_target) >= 1000:
            # Already full: admit the candidate only if it beats the weakest entry.
            weakest = top_1000_for_target.peekitem(0)[0]
            if not similarity > weakest:
                continue
            del top_1000_for_target[weakest]
        top_1000_for_target[similarity] = word

    top_1000s[tword] = top_1000_for_target

100%|██████████| 24/24 [00:00<00:00, 425.37it/s]


In [313]:
# TESTING: for the first 10 target words, print the first 10 entries of each
# per-word SortedDict. Iteration is in ascending key (score) order, so these
# are the 10 lowest-scoring entries that were kept, not the 10 highest.
for c1, (key, val) in enumerate(top_1000s.items()):
    if c1 == 10:
        break
    print(f"{key}")
    for c2, (k, v) in enumerate(val.items()):
        if c2 == 10:
            break
        print(f"{k}: {v}")
    print()

、
40.13750764190077: .
43.68700370624287: (
50.57712359710557: ^
50.57995148549023: -
58.38031680588522: れ
58.623684332563556: さ
60.16223696138103: いる
60.19113277304079: 」
60.837913129219515: 「
61.77559583729595: ・

の
39.811256694589886: .
44.11976840367658: (
47.733872776199085: -
51.37233541225735: ^
57.72989679356676: （
58.06846192430002: ）
59.35031724266171: ・
62.59730919894266: さ
62.72624952833649: れ
63.88515770841449: 」

。
44.354384813969006: .
45.378853043753296: (
51.34067662318563: -
54.763784863207775: ・
55.460041476814624: （
55.73025881465422: ）
56.95455882489146: ^
60.13243163002346: れ
60.43119170625077: さ
62.072336386613244: いる

に
34.53778574869036: .
40.68044686712311: (
43.43615406203638: -
47.121035287452976: ^
52.86487916352364: ）
52.95150384526238: （
54.46189557627267: ・
61.12101953158623: 」
61.422501670121775: 「
64.28635078588492: さ

を
32.03777411278231: .
38.56611412938088: (
40.32131589828974: -
42.882089308151826: ^
49.92360030838841: ）
50.19099525584308: （
52.796

In [None]:
#web scraping test

In [76]:
from bs4 import BeautifulSoup
import requests as req

url = "https://japanesetest4you.com/jlpt-n1-vocabulary-list/"

content = req.get(url).text
soup = BeautifulSoup(content, 'lxml')


In [78]:
count = 1
for div in soup.find_all('div', class_='entry clearfix'):
    for p in div.find_all('p'):
        print(p.text)
        if count == 30:
            break
        count += 1

This is the list of words you need to study for the Japanese Language Proficiency Test Level N1.

This list is being updated. Click on each word to see example sentences and download flashcards.
赤字 (akaji): deficit
悪事 (akuji): evil deed, crime
圧倒 (attou): overwhelm, overpower
扱い (atsukai): treatment, service​
圧力 (atsuryoku): pressure, stress​
過ち (ayamachi): fault, error, indiscretion
買収 (baishuu): acquisition, buy-out, takeover
爆弾 (bakudan): bomb
弁護 (bengo): defence, pleading, advocacy​
弁解 (benkai): justification, explanation
弁明 (benmei): explanation, excuse
貧乏 (binbou): poverty, destitute, poor​
敏感 (binkan): sensibility, susceptibility
微笑 (bishou): smile
膨張 (bouchou): expansion, swelling, increase
防衛 (bouei): defense, protection
暴力 (bouryoku): violence, mayhem
侮辱 (bujoku): insult, affront, slight
部下 (buka): subordinate person
分配 (bunpai): division, splitting, sharing
分散 (bunsan): dispersion, breakup
文書 (bunsho): document, writing, paperwork
無礼 (burei): impolite, rude​
武装 (busou): arms

In [79]:
target_words = []

In [80]:
for div in soup.find_all('div', class_='entry clearfix'):
    for p in tqdm(div.find_all('p')):
        word = p.text.split(' ')[0]
        if word in embeddings:
            target_words.append(word)

100%|██████████| 442/442 [00:00<00:00, 205416.33it/s]


In [81]:
len(target_words)

437

In [83]:
url = "https://japanesetest4you.com/jlpt-n3-vocabulary-list/"

content = req.get(url).text
soup = BeautifulSoup(content, 'lxml')
for div in soup.find_all('div', class_='entry clearfix'):
    for p in div.find_all('p'):
        print(p.text)

This is the list of words you need to study for the Japanese Language Proficiency Test Level N3.

Click on each word to see example sentences and download flashcards.
The link to download JTest4You’s N3 vocabulary ebook (3,375 pages) can be found here.
油 (abura): oil
愛 (ai): love, affection, care
愛情 (aijou): love, affection
相変わらず (aikawarazu): as ever, as usual, the same
生憎 (ainiku): unfortunately; sorry, but…
愛する (aisuru): to love
相手 (aite): companion, partner
合図 (aizu): sign, signal
明かり (akari): light, illumination, glow
明ける (akeru): to dawn, to end (of a period, season)
空き (aki): space, room, emptiness
空き家 (akiya): vacant house, unoccupied house​
明らか (akiraka): obvious, evident, clear
諦める (akirameru): to give up, to abandon
飽きる (akiru): to get tired of
悪魔 (akuma): demon, evil
握手 (akushu): handshake
余る (amaru): to remain, to be left over
編む (amu): to knit, to braid​
案 (an): idea, plan, thought
汗 (ase): sweat, perspiration
穴 (ana): hole, deficit, vacancy
暗記 (anki): memorization
安定 (an

容器 (youki): container, vessel
用心 (youjin): care, precaution
陽気 (youki): weather, cheerful
要求 (youkyuu): demand, firm request
用紙 (youshi): blank form​
要素 (youso): element, factor, component
様子 (yousu): state, appearance
要点 (youten): gist, main point
ようやく: finally, at last
余裕 (yoyuu): surplus, margin
唯一 (yuiitsu): only, sole, unique
ゆるい: loose
床 (yuka): floor
愉快 (yukai): pleasant, happy
行き (yuki): bound for, going to
行き先 (yukisaki): destination, whereabouts
許す (yurusu): to permit, to allow
豊か (yutaka): abundant, wealthy
郵便 (yuubin): mail service
遊園地 (yuuenchi): amusement park
夕方 (yuugata): evening, dusk​
友人 (yuujin): friend
友情 (yuujou): friendship, fellowship
勇気 (yuuki): courage, bravery
有効 (yuukou): validity, effectiveness
友好 (yuukou): friendship
有能 (yuunou): capable, efficient
有利 (yuuri): advantageous, profitable
有料 (yuuryou): fee-charging, paid
優勝 (yuushou): championship, overall victory
優秀 (yuushuu): superiority, excellence
郵送 (yuusou): mailing, posting​
譲る (yuzuru): to hand over, to

In [None]:
#generate target words

In [193]:
from bs4 import BeautifulSoup
import requests as req

# JLPT vocabulary and grammar list pages (levels N1-N5) to scrape.
urls = [
        "https://japanesetest4you.com/jlpt-n1-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n2-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n3-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n4-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n5-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n1-grammar-list/",
        "https://japanesetest4you.com/jlpt-n2-grammar-list/",
        "https://japanesetest4you.com/jlpt-n3-grammar-list/",
        "https://japanesetest4you.com/jlpt-n4-grammar-list/",
        "https://japanesetest4you.com/jlpt-n5-grammar-list/",
       ]
target_words = set()
word_list = []

for url in urls:
    # Fail fast on a hung connection or an HTTP error page instead of
    # silently parsing whatever came back.
    response = req.get(url, timeout=30)
    response.raise_for_status()
    content = response.text
    soup = BeautifulSoup(content, 'lxml')

    for div in soup.find_all('div', class_='entry clearfix'):
        for p in tqdm(div.find_all('p')):
            # Entries look like "<word> (<reading>): <meaning>"; keep the
            # text before the first space.
            word = p.text.split(' ')[0]

            # Blank paragraphs yield an empty string; there is nothing to
            # embed, so keep them out of the DataFrame.
            if word:
                word_list.append([word])

# Run every scraped candidate through the Spark NLP pipeline.
data = spark.createDataFrame(word_list).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)
result = result.select('embeddings').collect()

# Keep only candidates for which the pipeline produced exactly one
# embedding annotation, and store that annotation's text.
for res in tqdm(result):
    if len(res.embeddings) == 1:
        target_words.add(res.embeddings[0].result)

100%|██████████| 442/442 [00:00<00:00, 276080.77it/s]
100%|██████████| 1621/1621 [00:00<00:00, 389068.20it/s]
100%|██████████| 1731/1731 [00:00<00:00, 425526.92it/s]
100%|██████████| 544/544 [00:00<00:00, 256977.29it/s]
100%|██████████| 562/562 [00:00<00:00, 309628.12it/s]
100%|██████████| 224/224 [00:00<00:00, 209388.03it/s]
100%|██████████| 209/209 [00:00<00:00, 202871.91it/s]
100%|██████████| 130/130 [00:00<00:00, 148774.77it/s]
100%|██████████| 115/115 [00:00<00:00, 163395.99it/s]
100%|██████████| 60/60 [00:00<00:00, 139654.96it/s]
100%|██████████| 5638/5638 [00:00<00:00, 249656.74it/s]


In [194]:
len(target_words)

3879

In [200]:
# https://kyoan.u-biq.org/tangosearch.html has a pretty comprehensive list too, so include it as well.
url = "https://kyoan.u-biq.org/tangosearch.html"

# Fetch the page once, with a timeout and an HTTP status check.
response = req.get(url, timeout=30)
response.raise_for_status()

# Hand the raw bytes to BeautifulSoup so it detects the page's real encoding
# itself. This replaces the `.text.encode('latin1')` round-trip, which only
# recovers the raw bytes when requests happened to decode the body as latin-1.
content = response.content
soup = BeautifulSoup(content, 'lxml')

# for t in soup.find_all('table', class_='hyou'):
#     for tr in t.tbody.find_all('tr'):
#         for td in tr.find_all('td'):
#             print(td)

In [201]:
table = soup.find_all('table', class_='hyou')

In [202]:
# Collect one candidate word per table cell of the 'hyou' tables.
word_list = []

for t in soup.find_all('table', class_='hyou'):
    for td in tqdm(t.tbody.find_all('td')):
        cell_text = td.text
        # Cells starting with "(": keep the segment that follows the first ")".
        # Every other cell is used verbatim.
        word = cell_text.split(')')[1] if cell_text.startswith('(') else cell_text
        word_list.append([word])

# Run the candidates through the Spark NLP pipeline.
data = spark.createDataFrame(word_list).toDF("text")
model = pipeline.fit(data)
result = model.transform(data).select('embeddings').collect()

# Keep only rows with exactly one embedding annotation and store its text.
target_words.update(
    res.embeddings[0].result
    for res in tqdm(result)
    if len(res.embeddings) == 1
)

100%|██████████| 6014/6014 [00:00<00:00, 153582.51it/s]
100%|██████████| 6014/6014 [00:00<00:00, 353780.42it/s]


In [208]:
len(target_words)

4537

In [27]:
for i, word in enumerate(target_words):
    if len(word) > 50:
        print(f"{i}, {word}")

In [171]:
# Compare two single words by running each through the pipeline and scoring
# the resulting vectors against each other.

# First word: one-row DataFrame -> pipeline -> collected rows.
data = spark.createDataFrame([['動物']]).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)

# Take the vector of the first embedding annotation of the first row and
# convert it to a float64 numpy array.
daily_word_vector = np.array(result.select('embeddings').collect()[0].embeddings[0].embeddings).astype('float64')

# Second word: same steps; `data`, `model` and `result` are rebound here.
data = spark.createDataFrame([['猫']]).toDF("text")
model = pipeline.fit(data)
result = model.transform(data)

target_word_vector = np.array(result.select('embeddings').collect()[0].embeddings[0].embeddings).astype('float64')

# Similarity of the two vectors as computed by score().
vscore = score(daily_word_vector, target_word_vector)

In [172]:
vscore

49.25048724873456

In [None]:
#use a KD Tree to find nearest neighbors

In [209]:
from scipy import spatial
A = [[0,1,2,3,4], [4,3,2,1,0], [2,5,3,7,1], [1,0,1,0,1]]
tree = spatial.KDTree(A)

In [214]:
tree.query([0.5,0.5,0.5,0.5,0.5], k=3)[1]

array([3, 0, 1])

In [216]:
import hnswlib
def fit_hnsw_index(features, ef=100, M=16, save_index_file=False, space='l2'):
    """Build an hnswlib HNSW index over ``features``.

    Args:
        features: sequence of equal-length embedding vectors (list of lists
            or 2-D array). Must contain at least one vector.
        ef: used both as ``ef_construction`` when building the graph and as
            the query-time ``ef``; it should be larger than the ``k`` later
            passed to ``knn_query``.
        M: HNSW graph parameter forwarded to ``init_index``.
        save_index_file: path to save the index to; any falsy value (the
            default ``False``) skips saving.
        space: distance space handed to ``hnswlib.Index`` -- ``'l2'``
            (default, as before), ``'cosine'`` or ``'ip'``.

    Returns:
        The populated ``hnswlib.Index``; item labels are ``0 .. len(features)-1``
        in input order.

    Raises:
        ValueError: if ``features`` is empty, since the vector dimension
            cannot be determined.
    """
    num_elements = len(features)
    if num_elements == 0:
        raise ValueError("features must contain at least one vector")

    embedding_size = len(features[0])
    labels_index = np.arange(num_elements)

    # Declare the index; the maximum number of elements must be known up front.
    index = hnswlib.Index(space=space, dim=embedding_size)
    index.init_index(max_elements=num_elements, ef_construction=ef, M=M)

    # Insert all vectors, labelled by their position in `features`.
    index.add_items(features, labels_index)

    # Query-time ef controls recall.
    index.set_ef(ef)

    if save_index_file:
        index.save_index(save_index_file)

    return index


In [217]:
# Build an index over the first 100 vectors and query each of them for its k nearest neighbours.
# NOTE(review): `vector_list` is not defined in any visible cell, and the recorded output of this
# cell is a NameError -- presumably it should hold the embedding vectors; TODO define it first.
# NOTE(review): k = 1000 neighbours are requested from an index holding only 100 items -- confirm
# hnswlib accepts k larger than the number of indexed elements.
k = 1000
p = fit_hnsw_index(vector_list[:100], ef=k*10)
ann_neighbor_indices, ann_distances = p.knn_query(vector_list[:100], k)

NameError: name 'vector_list' is not defined

In [219]:
from datetime import datetime
import time
start = datetime.now()
time.sleep(5)
end = datetime.now()
print((end - start).total_seconds())

5.003276
