In [1]:
# Author: Haru
# Last edited: Feb 14, 2022

from sudachipy import tokenizer # Apache-2.0 License
from sudachipy import dictionary # Apache-2.0 License
import os # MIT License
import nltk # Apache-2.0 License
import pandas as pd # BSD License
import warnings # MIT License
import jaconv # MIT License
import re # Apache-2.0 License
import regex # Apache-2.0 License
import pickle # MIT License
import romkan # BSD License
from pandas.core.common import flatten

# Ignore deprecation warnings and user warnings.
# These filters are installed globally, so they apply to every later cell.
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

In [2]:
# LOAD IN YOMI DICTIONARY NO OKURIGANA
# Reads the pickled dictionary from the current working directory into the
# module-level name `yomi_dictionary`, which the functions below look up.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open('yomi_dictionary_no_okurigana.pkl', 'rb') as f:
    yomi_dictionary = pickle.load(f)


In [3]:
# CHECK IF DICTIONARY IS LOADED
# The displayed mapping goes from a kanji (optionally followed by a variation
# selector) to a list of katakana readings.
# NOTE(review): the bare expression renders the whole dictionary; consider
# showing only a small sample to keep the notebook output short.
yomi_dictionary

{'㐀': ['キュウ'],
 '㐁': ['テン'],
 '㐂': ['キ'],
 '㐄󠄁': ['カ', 'ケ', 'クワ'],
 '㐄󠄀': ['カ', 'ケ'],
 '㐅': ['ゴ'],
 '㐌': ['イ'],
 '㐖': ['ケツ', 'ゲチ'],
 '㐜': ['キュウ', 'グ'],
 '㐡': ['ジュ'],
 '㐤': ['タン', 'キュウ'],
 '㐧': ['ダイ', 'テイ'],
 '㐨': ['ショ', 'ジョ'],
 '㐩': ['キョウ', 'ゴ'],
 '㐪': ['ガイ'],
 '㐪󠄁': ['ガイ'],
 '㐬': ['リュウ', 'ル', 'コウ', 'トツ'],
 '㐭': ['リン'],
 '㐮': ['ジョウ', 'ショウ', 'ソウ'],
 '㐮󠄂': ['ジョウ'],
 '㐮󠄃': ['ジョウ', 'ショウ'],
 '㐯': ['ヨウ'],
 '㐰': ['シン'],
 '㐱': ['シン'],
 '㐲': ['フク', 'タイ', 'ダイ'],
 '㐳': ['ゴツ'],
 '㐴': ['ハン'],
 '㐸': ['ケン'],
 '㐹': ['キツ'],
 '㐺': ['ギン', 'シュウ'],
 '㐻': ['ダイ', 'ネ'],
 '㐼': ['ショウ'],
 '㑁󠄀': ['シュツ', 'シュチ', 'チュツ', 'チュチ'],
 '㑁': ['チュツ'],
 '㑂󠄀': ['ホウ'],
 '㑂': ['ホウ'],
 '㑃': ['オウ'],
 '㑄': ['ブ'],
 '㑅': ['サク'],
 '㑉': ['シュク'],
 '㑊': ['エキ', 'ヤク'],
 '㑋': ['キュウ', 'ク'],
 '㑌': ['オウ'],
 '㑍': ['ライ', 'レ'],
 '㑎': ['ドウ', 'ノウ'],
 '㑐': ['シュク'],
 '㑑': ['サン'],
 '㑒': ['セン'],
 '㑓': ['シュン'],
 '㑗': ['シン'],
 '㑘': ['カイ'],
 '㑙': ['チョウ'],
 '㑚': ['ダ'],
 '㑛': ['ソク', 'ショク'],
 '㑜': ['テイ', 'エイ', 'セツ', 'セチ'],
 '㑞': ['ヨウ'],
 '㑟': ['ホウ', 'ヒョウ'],


In [4]:
# SUDACHI SETTINGS
# Module-level tokenizer and split mode (Sudachi split mode C).
# NOTE(review): sudachi_kana_convert and furigana_machine each create their
# own local `tokenizer_obj` and `mode`, so these two names are shadowed there.
tokenizer_obj = dictionary.Dictionary().create()
mode = tokenizer.Tokenizer.SplitMode.C


In [5]:
# HELPER FUNCTIONS
def split(word):
    """Return the characters of ``word`` as a list, one element per character."""
    return list(word)

def is_hiragana(text):
    """Tell whether ``text`` is made up exclusively of hiragana characters.

    Args:
        text: String to test.

    Returns:
        True if ``text`` is non-empty and every character lies in the range
        U+3041..U+309F; False otherwise (including for the empty string).
    """
    # re.fullmatch yields None when the whole string does not match, so an
    # identity check against None gives the boolean directly.
    return re.fullmatch('[\u3041-\u309F]+', text) is not None
    

In [6]:
def sudachi_kana_convert(text):
    """Return the Sudachi reading of ``text`` as a single concatenated string.

    A new Sudachi tokenizer is created on every call. The text is tokenized in
    split mode C and the ``reading_form()`` of each token is joined in order.
    """
    sudachi = dictionary.Dictionary().create()
    split_mode = tokenizer.Tokenizer.SplitMode.C
    return ''.join(
        morpheme.reading_form()
        for morpheme in sudachi.tokenize(text, split_mode)
    )

In [46]:
def furigana_machine(text):
    """Annotate the kanji in ``text`` with hiragana readings in parentheses.

    The text is tokenized with Sudachi (split mode C). Every token that is not
    pure hiragana is handed, together with its Sudachi reading, to
    ``chouon_convert_token_new``, which returns the readings it could match
    for the kanji of that token. The collected readings are then attached, in
    order, to the kanji of the original text.

    Args:
        text: Japanese string to annotate.

    Returns:
        ``text`` with ``(reading)`` inserted after each annotated kanji, e.g.
        '道(どう)路(ろ)が閉(し)まっている'. Characters that are hiragana, that are
        not keys of ``yomi_dictionary`` (katakana, punctuation, digits, ...),
        or for which no reading is left are copied through unchanged.
    """
    tokenizer_obj = dictionary.Dictionary().create()
    mode = tokenizer.Tokenizer.SplitMode.C

    # Readings for all non-hiragana tokens, in text order.
    readings = []
    for token in tokenizer_obj.tokenize(text, mode):
        surface = token.surface()
        if is_hiragana(surface):
            continue
        readings.extend(chouon_convert_token_new(surface, token.reading_form()))

    pieces = []
    next_reading = 0
    for char in text:
        # chouon_convert_token_new only produces readings for characters that
        # are keys of yomi_dictionary, so only those may consume one. The
        # bounds check keeps a missing reading from raising IndexError.
        annotate = (
            not is_hiragana(char)
            and char in yomi_dictionary
            and next_reading < len(readings)
        )
        if annotate:
            pieces.append(char + '(' + readings[next_reading] + ')')
            next_reading += 1
        else:
            pieces.append(char)

    return ''.join(pieces)
            
            



In [47]:
def chouon_convert_token_new(token_string_x, sudachi_token_x):
    """Split a token's reading into hiragana readings for its kanji.

    The token reading is romanised and walked left to right. For every
    character of the token that is a key of ``yomi_dictionary``, the first
    dictionary reading whose romanisation matches the reading at the current
    position is taken, and the position advances past it.

    Args:
        token_string_x: Surface form of the token.
        sudachi_token_x: Reading of the token as returned by Sudachi.

    Returns:
        List of hiragana readings, one per kanji that could be matched, in
        token order. Kanji without a matching dictionary reading contribute
        nothing, so the list can be shorter than the number of kanji.
    """
    # Small ッ is replaced by full-size ツ before romanisation.
    # NOTE(review): presumably so geminated readings line up with dictionary
    # readings ending in ツ -- confirm.
    reading_romaji = romkan.to_roma(sudachi_token_x.replace('ッ', 'ツ'))

    # NOTE(review): characters that are not dictionary keys (e.g. okurigana)
    # do not advance `offset`, so a kanji that follows kana inside the same
    # token will not match.
    offset = 0
    matched_romaji = []
    for char in token_string_x:
        if char not in yomi_dictionary:
            continue
        for yomi in yomi_dictionary[char]:
            candidate = romkan.to_roma(yomi)
            if candidate == reading_romaji[offset:offset + len(candidate)]:
                matched_romaji.append(candidate)
                offset += len(candidate)
                # Stop at the first match: each kanji gets at most one
                # reading, otherwise later kanji are paired with the wrong one.
                break

    return [romkan.to_hiragana(romaji) for romaji in matched_romaji]
    

In [48]:
furigana_machine('道路が閉まっている')

'道(どう)路(ろ)が閉(し)まっている'

In [49]:
furigana_machine('公園に行って遊ぶ')

'公(こう)園(えん)に行(い)って遊(あそ)ぶ'

In [50]:
furigana_machine('電車に乗って帰る')

'電(でん)車(しゃ)に乗(の)って帰(かえ)る'

In [52]:
furigana_machine('母は病院で働いている')

'母(はは)は病(びょう)院(いん)で働(はたら)いている'

In [54]:
furigana_machine('海外の大学に行って勉強する')

'海(かい)外(がい)の大(だい)学(がく)に行(い)って勉(べん)強(きょう)する'