In [2]:
import numpy as np
from tqdm import tqdm

In [3]:
nWords = 0
vecSize = 0
embeddings = {}

In [4]:
def sigmoid(x):
    if x == 1:
        return 100
    return 100 * np.tanh(x)

In [5]:
def test_score(a, b):
    x = embeddings[a] * embeddings[b]
    x = sum(x) / (np.linalg.norm(embeddings[a]) * np.linalg.norm(embeddings[b]))
    x = round(x, 6)
    return sigmoid(x)

In [6]:
def score(a, b):
    x = a * b
    x = sum(x) / (np.linalg.norm(a) * np.linalg.norm(b))
    x = round(x, 6)
    return sigmoid(x)

In [7]:
from enum import Enum
class KeyType(Enum):
    INVALID = 1
    HASHTAG = 2
    VALID = 0

In [8]:
def isValidKey(key): 
    if len(key) == 0:
        return KeyType.INVALID
    elif key[0] == '#':
        return KeyType.HASHTAG
    for char in key:
        if ord(char) <= 128:
            return KeyType.INVALID
    else:
        return KeyType.VALID

In [None]:
# clean up dataset (remove punctuations and stuff)
# remove stuff with ##...##
# remove english

In [112]:
print(ord('#'))
print(ord('自')) # pretty sure ascii only goes up to 128

35
33258


In [111]:
string = "123456789"
string[2:-2]

'34567'

In [9]:
# read data into dict
with open('/Users/bigsad/Downloads/jawiki.all_vectors.300d.txt') as f:
# with open('test dataset.txt') as f:
    line = f.readline()
    nWords, vecSize = line.split(' ')
    nWords = int(nWords)
    vecSize = int(vecSize)
    with tqdm(total=nWords) as pbar:
        while line:
            line = f.readline()
            line = line[:-1] # remove newline
            arr = np.array(line.split(' '))

            # first element is the japanese word and the rest are the vector values
            key = arr[0] 
            if isValidKey(key) == KeyType.INVALID:
                pass
            elif isValidKey(key) == KeyType.HASHTAG:
                key = key[2:-2]
            
            vec = arr[1:]
            vec = vec.astype('float64')
            embeddings[key] = vec
            
            pbar.update(1)

1511783it [04:36, 5470.47it/s]                             


In [10]:
#clean up other miscel characters
del embeddings['']
del embeddings['、']
del embeddings['。']
del embeddings['（']
del embeddings['）']
del embeddings['「']
del embeddings['」']
del embeddings['・']

In [302]:
ord('・')

12539

In [207]:
len(embeddings)

24

In [63]:
'残酷' in embeddings

True

In [191]:
#words that should be similar
print(test_score('思う','思う'))
print(test_score('ショッピング','ショッピング'))
print(test_score('思う','考える'))
print(test_score('ショッピング','買い物'))
print(test_score('死ぬ','殺す'))

print()

print(test_score('俺','私')) # these are weirdly low
print(test_score('落ち込む','憂鬱'))
print(test_score('逃げる','逃す'))
print(test_score('シャワー','浴びる'))

100
100
53.67549083071811
57.97712064543794
61.578751615132255

33.74366778104572
37.481291582473865
37.27895559084609
35.02678245052315


In [192]:
#words that should be different
print(test_score('家事','みかん'))
print(test_score('最高','走る'))
print(test_score('鬼','不健康'))
print(test_score('刀','柱')) #why is this so high
print(test_score('電柱','絶望'))
print(test_score('寿命','命令'))
print(test_score('自然','突然'))
print(test_score('党','塔'))

26.48898637886783
14.887525527629219
24.46507394775458
37.68987753237022
27.53781179869285
22.651983068439485
9.764884191778691
21.20682488094399


- dot vectors together
- need an ordered map of the first 1000? most similar words to the target word
- seems like a lot of space to pre-compute every pair of the top 1000 (given that the dataset contains 1.5 mil words... that would be a billion items to contain)

In [219]:
from sortedcontainers import SortedDict

'''sorted dict testing'''

In [260]:
sd = SortedDict()

In [263]:
sd['c'] = 1
sd['a'] = 3
sd['b'] = 2


In [264]:
sd

SortedDict({'a': 3, 'b': 2, 'c': 1})

In [None]:
#conclusion: sortedDict orders by key oh well

In [279]:
test_top_1000 = SortedDict() #create a reverse dictionary mapping scores to words

In [280]:
#generate the top 1000 is nlogn
first_key = list(embeddings.keys())[0]
first_value = list(embeddings.values())[0]
for word, vec in tqdm(embeddings.items()):
    similarity = score(first_value, vec)
#     print(f"{word},{similarity}")
#     print(test_top_1000.keys())
    if not test_top_1000 or len(test_top_1000) < 10:
        test_top_1000[similarity] = word
    elif similarity > test_top_1000.keys()[0]:
        del test_top_1000[test_top_1000.keys()[0]]
        test_top_1000[similarity] = word

100%|██████████| 24/24 [00:00<00:00, 9392.86it/s]


In [281]:
print(test_top_1000)

SortedDict({63.9170484449445: 'で', 64.17679506795967: 'を', 64.49328747700123: 'た', 65.67585055637353: 'と', 66.11173851634045: 'に', 66.1399879134437: 'が', 67.04521351811081: '。', 67.46227727153806: 'は', 68.83036316627944: 'の', 100: '、'})


In [None]:
######################################################## generate top 1000s for all words ##########################################

In [307]:
top_1000s = dict()

for tword, tvec in tqdm(embeddings.items()):
    top_1000_for_target = SortedDict()
    
    for word, vec in embeddings.items():
        similarity = score(tvec, vec)
        if not top_1000_for_target or len(top_1000_for_target) < 1000:
            top_1000_for_target[similarity] = word
        elif similarity > top_1000_for_target.keys()[0]:
            del top_1000_for_target[top_1000_for_target.keys()[0]]
            top_1000_for_target[similarity] = word
    
    top_1000s[tword] = top_1000_for_target

100%|██████████| 24/24 [00:00<00:00, 425.37it/s]


In [313]:
#TESTING: print top 10s for first 10 words
c1 = 0

for key, val in top_1000s.items():
    if c1 == 10:
        break
    print(f"{key}")
    c2 = 0
    for k, v in val.items():
        if c2 == 10:
            break
        print(f"{k}: {v}")
        c2 += 1
    print()
    c1 += 1

、
40.13750764190077: .
43.68700370624287: (
50.57712359710557: ^
50.57995148549023: -
58.38031680588522: れ
58.623684332563556: さ
60.16223696138103: いる
60.19113277304079: 」
60.837913129219515: 「
61.77559583729595: ・

の
39.811256694589886: .
44.11976840367658: (
47.733872776199085: -
51.37233541225735: ^
57.72989679356676: （
58.06846192430002: ）
59.35031724266171: ・
62.59730919894266: さ
62.72624952833649: れ
63.88515770841449: 」

。
44.354384813969006: .
45.378853043753296: (
51.34067662318563: -
54.763784863207775: ・
55.460041476814624: （
55.73025881465422: ）
56.95455882489146: ^
60.13243163002346: れ
60.43119170625077: さ
62.072336386613244: いる

に
34.53778574869036: .
40.68044686712311: (
43.43615406203638: -
47.121035287452976: ^
52.86487916352364: ）
52.95150384526238: （
54.46189557627267: ・
61.12101953158623: 」
61.422501670121775: 「
64.28635078588492: さ

を
32.03777411278231: .
38.56611412938088: (
40.32131589828974: -
42.882089308151826: ^
49.92360030838841: ）
50.19099525584308: （
52.796

In [None]:
#web scraping test

In [76]:
from bs4 import BeautifulSoup
import requests as req

url = "https://japanesetest4you.com/jlpt-n1-vocabulary-list/"

content = req.get(url).text
soup = BeautifulSoup(content, 'lxml')


In [78]:
count = 1
for div in soup.find_all('div', class_='entry clearfix'):
    for p in div.find_all('p'):
        print(p.text)
        if count == 30:
            break
        count += 1

This is the list of words you need to study for the Japanese Language Proficiency Test Level N1.

This list is being updated. Click on each word to see example sentences and download flashcards.
赤字 (akaji): deficit
悪事 (akuji): evil deed, crime
圧倒 (attou): overwhelm, overpower
扱い (atsukai): treatment, service​
圧力 (atsuryoku): pressure, stress​
過ち (ayamachi): fault, error, indiscretion
買収 (baishuu): acquisition, buy-out, takeover
爆弾 (bakudan): bomb
弁護 (bengo): defence, pleading, advocacy​
弁解 (benkai): justification, explanation
弁明 (benmei): explanation, excuse
貧乏 (binbou): poverty, destitute, poor​
敏感 (binkan): sensibility, susceptibility
微笑 (bishou): smile
膨張 (bouchou): expansion, swelling, increase
防衛 (bouei): defense, protection
暴力 (bouryoku): violence, mayhem
侮辱 (bujoku): insult, affront, slight
部下 (buka): subordinate person
分配 (bunpai): division, splitting, sharing
分散 (bunsan): dispersion, breakup
文書 (bunsho): document, writing, paperwork
無礼 (burei): impolite, rude​
武装 (busou): arms

In [79]:
target_words = []

In [80]:
for div in soup.find_all('div', class_='entry clearfix'):
    for p in tqdm(div.find_all('p')):
        word = p.text.split(' ')[0]
        if word in embeddings:
            target_words.append(word)

100%|██████████| 442/442 [00:00<00:00, 205416.33it/s]


In [81]:
len(target_words)

437

In [83]:
url = "https://japanesetest4you.com/jlpt-n3-vocabulary-list/"

content = req.get(url).text
soup = BeautifulSoup(content, 'lxml')
for div in soup.find_all('div', class_='entry clearfix'):
    for p in div.find_all('p'):
        print(p.text)

This is the list of words you need to study for the Japanese Language Proficiency Test Level N3.

Click on each word to see example sentences and download flashcards.
The link to download JTest4You’s N3 vocabulary ebook (3,375 pages) can be found here.
油 (abura): oil
愛 (ai): love, affection, care
愛情 (aijou): love, affection
相変わらず (aikawarazu): as ever, as usual, the same
生憎 (ainiku): unfortunately; sorry, but…
愛する (aisuru): to love
相手 (aite): companion, partner
合図 (aizu): sign, signal
明かり (akari): light, illumination, glow
明ける (akeru): to dawn, to end (of a period, season)
空き (aki): space, room, emptiness
空き家 (akiya): vacant house, unoccupied house​
明らか (akiraka): obvious, evident, clear
諦める (akirameru): to give up, to abandon
飽きる (akiru): to get tired of
悪魔 (akuma): demon, evil
握手 (akushu): handshake
余る (amaru): to remain, to be left over
編む (amu): to knit, to braid​
案 (an): idea, plan, thought
汗 (ase): sweat, perspiration
穴 (ana): hole, deficit, vacancy
暗記 (anki): memorization
安定 (an

容器 (youki): container, vessel
用心 (youjin): care, precaution
陽気 (youki): weather, cheerful
要求 (youkyuu): demand, firm request
用紙 (youshi): blank form​
要素 (youso): element, factor, component
様子 (yousu): state, appearance
要点 (youten): gist, main point
ようやく: finally, at last
余裕 (yoyuu): surplus, margin
唯一 (yuiitsu): only, sole, unique
ゆるい: loose
床 (yuka): floor
愉快 (yukai): pleasant, happy
行き (yuki): bound for, going to
行き先 (yukisaki): destination, whereabouts
許す (yurusu): to permit, to allow
豊か (yutaka): abundant, wealthy
郵便 (yuubin): mail service
遊園地 (yuuenchi): amusement park
夕方 (yuugata): evening, dusk​
友人 (yuujin): friend
友情 (yuujou): friendship, fellowship
勇気 (yuuki): courage, bravery
有効 (yuukou): validity, effectiveness
友好 (yuukou): friendship
有能 (yuunou): capable, efficient
有利 (yuuri): advantageous, profitable
有料 (yuuryou): fee-charging, paid
優勝 (yuushou): championship, overall victory
優秀 (yuushuu): superiority, excellence
郵送 (yuusou): mailing, posting​
譲る (yuzuru): to hand over, to

In [None]:
#generate target words

In [85]:
from bs4 import BeautifulSoup
import requests as req

urls = [
        "https://japanesetest4you.com/jlpt-n1-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n2-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n3-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n4-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n5-vocabulary-list/",
        "https://japanesetest4you.com/jlpt-n1-grammar-list/",
        "https://japanesetest4you.com/jlpt-n2-grammar-list/",
        "https://japanesetest4you.com/jlpt-n3-grammar-list/",
        "https://japanesetest4you.com/jlpt-n4-grammar-list/",
        "https://japanesetest4you.com/jlpt-n5-grammar-list/",
       ]
target_words = set()

for url in urls:
    content = req.get(url).text
    soup = BeautifulSoup(content, 'lxml')
    
    for div in soup.find_all('div', class_='entry clearfix'):
        for p in tqdm(div.find_all('p')):
            word = p.text.split(' ')[0]
            if word in embeddings:
                target_words.add(word)

100%|██████████| 442/442 [00:00<00:00, 232753.59it/s]
100%|██████████| 1621/1621 [00:00<00:00, 230817.72it/s]
100%|██████████| 1731/1731 [00:00<00:00, 199082.51it/s]
100%|██████████| 544/544 [00:00<00:00, 222388.05it/s]
100%|██████████| 562/562 [00:00<00:00, 244980.13it/s]
100%|██████████| 224/224 [00:00<00:00, 214356.40it/s]
100%|██████████| 209/209 [00:00<00:00, 179706.75it/s]
100%|██████████| 130/130 [00:00<00:00, 112563.90it/s]
100%|██████████| 115/115 [00:00<00:00, 118687.24it/s]
100%|██████████| 60/60 [00:00<00:00, 96940.77it/s]


In [86]:
len(target_words)

4677

In [117]:
#https://kyoan.u-biq.org/tangosearch.html has a pretty comprehensive list too so I'll include that
url = "https://kyoan.u-biq.org/tangosearch.html"
content = req.get(url)
content = req.get(url).text
content = content.encode('latin1')
soup = BeautifulSoup(content, 'lxml')

# for t in soup.find_all('table', class_='hyou'):
#     for tr in t.tbody.find_all('tr'):
#         for td in tr.find_all('td'):
#             print(td)

In [110]:
table = soup.find_all('table', class_='hyou')

In [96]:
import unicodedata


def unicode_normalize(s):
    return unicodedata.normalize('NFKD', s)

In [121]:
for t in soup.find_all('table', class_='hyou'):
    for td in t.tbody.find_all('td'):
        if td.text and td.text[0] == '(':
            word = td.text.split(')')[1]
        else:
            word = td.text
        
        if word in embeddings:
            target_words.add(word)

In [122]:
len(target_words)

5818