In [1]:
import os
import pandas as pd

### Single-kanji Dataset

In [2]:
# read JSON file and remove all unnecessary columns
df = pd.read_json('kanji-individual.json') \
    .transpose() \
    .reset_index(names='kanji') \
    .drop(columns=[
        'strokes', 'grade', 'freq', 'jlpt_old', 'jlpt_new', 'meanings', 'wk_level',
        'wk_meanings', 'wk_radicals', 'wk_readings_on', 'wk_readings_kun'
    ])

In [3]:
df.sample(5)

Unnamed: 0,kanji,readings_on,readings_kun
4428,瀾,[らん],[なみ]
11916,鰢,"[ば, め]","[うみえび, つくら, すばしり]"
8802,熅,"[うん, おん]","[うずみび, いき.れ]"
6458,侗,"[とう, つ, ず]","[おろ.か, いた.む, かたち, なおい, まこと, つつし.む]"
9066,瓚,[さん],[]


In [4]:
# Confirm that the resulting dataset has no nulls
df.isna().sum()

kanji           0
readings_on     0
readings_kun    0
dtype: int64

### Word Dataset

In [233]:
ranges = [
    {'from': ord(u'\u3040'), 'to': ord(u'\u309f')},     # Hiragana
    {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")},     # Katakana
]

def is_hira_kata(char):
    return any([range["from"] <= ord(char) <= range["to"] for range in ranges])

def is_kanji(char):
    return not is_hira_kata(char)

In [None]:
files = ['vocab-1-raw.xlsx', 'vocab-2-raw.xlsx', 'vocab-3-raw.xlsx']
pages = ['重要度順語彙リスト60894語', '想定既知語彙リスト（固有名詞など）', '使用範囲狭小語彙リスト（想定既知語彙除く）']
allWords = pd.DataFrame()

for file, page in zip(files, pages):
    wordsFile = pd.ExcelFile(file)
    words = pd.read_excel(wordsFile, page)
    words = words[['見出し語彙素\nLexeme', '標準的（新聞）表記\nStandard (Newspaper) Orthography', '標準的読み方（カタカナ）\nStandard Reading (Katakana)']]
    words.columns = ['word', 'orthography', 'reading']

    allWords = pd.concat([allWords, words])

# Make separate rows where 'word' and 'orthography' do not match
allWords['combined'] = list(zip(allWords.word, allWords.orthography))
wordsExpanded = allWords \
    .explode('combined', ignore_index=True) \
    .drop(columns=['word', 'orthography']) \
    .dropna(subset='reading') \
    .drop_duplicates() \
    .reset_index(drop=True) \
    .rename(columns={'combined': 'word'}) \
    [['word', 'reading']]

# Remove rows with number in the 'reading' column
wordsExpanded = wordsExpanded[
    (wordsExpanded.reading.str.isnumeric() == False) & \
    (wordsExpanded.word.str.isnumeric() == False)
]

# Remove rows with * in the 'word' column
wordsExpanded = wordsExpanded[wordsExpanded.word.str.contains('＊') == False]

# Remove rows with non-Kanji characters in the 'word' column
wordsExpanded = wordsExpanded[~wordsExpanded.word.apply(lambda x: all([is_hira_kata(char) for char in x]))]
wordsExpanded = wordsExpanded[~wordsExpanded.word.apply(lambda x: any([ord(char) in range(ord('a'), ord('z')) for char in x]))]

In [None]:
# Credit to SmoothKen from StackOverflow for the Hiragana-Katakana conversion
# https://stackoverflow.com/questions/4877139/how-can-i-convert-all-japanese-hiragana-to-katakana-characters-in-python

hira_start = int("3041", 16)
hira_end = int("3096", 16)
kata_start = int("30a1", 16)

kata_to_hira = dict()
for i in range(hira_start, hira_end+1):
    kata_to_hira[chr(i-hira_start+kata_start)] = chr(i)

In [316]:
# Convert katakana to hiragana
wordsExpanded['reading_hira'] = wordsExpanded.reading.apply(lambda x: ''.join([kata_to_hira.get(char, char) for char in x]))
wordsExpanded = wordsExpanded.drop(columns='reading')

In [317]:
wordsExpanded.to_csv('vocab-expanded.csv', index=False)

In [333]:
wordsExpanded.sample(20)

Unnamed: 0,word,reading_hira
12421,占有,せんゆう
40213,御酒,みき
39116,切り出し,きりだし
14415,捏造,ねつぞう
92153,ｎｏａａ,のあ
37305,反軍,はんぐん
21971,漢語,かんご
33782,物入れ,ものいれ
56062,気孔,きこう
134679,蟹玉,かにたま
