## Character-based tokenizer
常用漢字、ひらがな、カタカナ、アルファベット、数字、記号を1文字単位でtokenizeするtokenizerを作成する。

In [1]:
# !wget https://www.bunka.go.jp/kokugo_nihongo/sisaku/joho/joho/kijun/naikaku/kanji/joyokanjisakuin/index.html

In [2]:
# Read the downloaded HTML page; the file is decoded as cp932.
with open("index.html", encoding="cp932") as html_file:
    html = html_file.read()
# Preview the first 300 characters of the page.
html[:300]

'<!DOCTYPE html PUBLIC "-//W3C//Dtd XHTML 1.0 Transitional//EN" "http://www.w3.org/tr/xhtml1/Dtd/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja"><!-- InstanceBegin template="/Templates/bottom.dwt" codeOutsideHTMLIsLocked="false" -->\n<head>\n<meta http-equiv="Content-'

In [3]:
from IPython.display import HTML
# A small sample table using the same id ("urlist") that the parsing cell below
# looks up. Columns: kanji, on/kun readings, example words, notes.
# Rendering it inline shows the row layout (<font> inside the first <td>,
# <br /> separators inside the reading/example cells).
table_sample = '''
<table id="urlist" class="display" border="1" cellspacing="0"
summary="kanji" cellpadding="2" width="100%">
  <thead>
  <tr>
    <th bgcolor="#cc9999">漢字</th>
    <th bgcolor="#cc9999">音訓</th>
    <th bgcolor="#cc9999">例</th>
    <th bgcolor="#cc9999">備考</th>
  </tr>
  </thead>
  <tbody>

<tr><td><font size="7">亜</font><font size="6">（亞）</font></td><td>ア</td><td>亜流，亜麻，亜熱帯</td><td>　</td></tr>
<tr><td><font size="7">哀</font></td><td>アイ　<br />あわれ　<br />あわれむ</td><td>哀愁，哀願，悲哀<br />哀れ，哀れな話，哀れがる<br />哀れむ，哀れみ</td><td>　</td></tr>
<tr><td><font size="7">挨</font></td><td>アイ</td><td>挨拶</td><td>　</td></tr>
<tr><td><font size="7">愛</font></td><td>アイ</td><td>愛情，愛読，恋愛</td><td>愛媛（えひめ）県</td></tr>  
  </tbody>
</table>
'''
# Last expression of the cell: display the sample as rendered HTML.
HTML(table_sample)

漢字,音訓,例,備考
亜（亞）,ア,亜流，亜麻，亜熱帯,
哀,アイ あわれ あわれむ,哀愁，哀願，悲哀 哀れ，哀れな話，哀れがる 哀れむ，哀れみ,
挨,アイ,挨拶,
愛,アイ,愛情，愛読，恋愛,愛媛（えひめ）県


In [4]:
from bs4 import BeautifulSoup
# Parse the page and pick the table whose id is "urlist".
soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id="urlist")

# kanji -> {"音訓": readings, "例": examples, "備考": notes}
joyo_kanji_info = {}
for row in table.find_all("tr"):
    # Rows that contain a <th> are header rows and carry no kanji entry.
    if row.th:
        continue
    cols = row.find_all("td")
    # Only the first <font> element of the first cell is used as the key.
    kanji = cols[0].find("font").text
    on_kun, examples, note = cols[1].text, cols[2].text, cols[3].text
    joyo_kanji_info[kanji] = {"音訓": on_kun, "例": examples, "備考": note}

print(len(joyo_kanji_info))

2136


In [5]:
# Preview: the first ten kanji keys (dict insertion order) joined into one string.
"".join(list(joyo_kanji_info)[:10])

'亜哀挨愛曖悪握圧扱宛'

In [6]:
import string
# ASCII letters, decimal digits and ASCII punctuation, one class per line.
for ascii_chars in (string.ascii_letters, string.digits, string.punctuation):
    print(ascii_chars)
# Whitespace characters (space, tab, newlines, ...) are shown through repr()
# so that they are visible in the output.
print(repr(string.whitespace))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
0123456789
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
' \t\n\r\x0b\x0c'


In [7]:
# Code points taken from https://www.unicode.org/charts/nameslist/
# NOTE: range(start, end) stops before `end`, so the code point `end` itself
# is not part of any of the strings built below.

# Hiragana
start, end = 0x3041, 0x309F
hiragana = "".join(map(chr, range(start, end)))
print(hiragana)

# Katakana
start, end = 0x30A1, 0x30FF
katakana = "".join(map(chr, range(start, end)))
print(katakana)

# CJK Symbols and Punctuation
start, end = 0x3000, 0x303F
cjk_symbols_punctuation = "".join(map(chr, range(start, end)))
print(cjk_symbols_punctuation)

# Halfwidth and Fullwidth Forms
start, end = 0xFF00, 0xFFEF
halfwidth_fullwidth_forms = "".join(map(chr, range(start, end)))
print(halfwidth_fullwidth_forms)

ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ゗゘゙゚゛゜ゝゞ
ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ・ーヽヾ
　、。〃〄々〆〇〈〉《》「」『』【】〒〓〔〕〖〗〘〙〚〛〜〝〞〟〠〡〢〣〤〥〦〧〨〩〪〭〮〯〫〬〰〱〲〳〴〵〶〷〸〹〺〻〼〽〾
＀！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ｛｜｝～｟｠｡｢｣､･ｦｧｨｩｪｫｬｭｮｯｰｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜﾝﾞﾟﾠﾡﾢﾣﾤﾥﾦﾧﾨﾩﾪﾫﾬﾭﾮﾯﾰﾱﾲﾳﾴﾵﾶﾷﾸﾹﾺﾻﾼﾽﾾ﾿￀￁ￂￃￄￅￆￇ￈￉ￊￋￌￍￎￏ￐￑ￒￓￔￕￖￗ￘￙ￚￛￜ￝￞￟￠￡￢￣￤￥￦￧￨￩￪￫￬￭￮


In [8]:
class CharTokenizer:
    """Character-level tokenizer: one character of text becomes one integer id.

    Characters that are not in the vocabulary are encoded as ``unknown_idx``
    (-1), and that id decodes back to the literal string ``"<UNK>"``.
    """

    def __init__(self, vocab=None):
        """Create the tokenizer from a ``{character: id}`` mapping.

        Args:
            vocab: Mapping from characters to integer ids. ``None`` (or no
                argument) gives a vocabulary that only contains ``"<UNK>"``.
        """
        self.unknown_idx = -1
        # Work on a private copy so that registering "<UNK>" does not modify
        # the dict owned by the caller.
        self.vocab = dict(vocab) if vocab is not None else {}
        self.vocab["<UNK>"] = self.unknown_idx
        # Built once, after "<UNK>" is registered, and reused by decode().
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text: str) -> list[int]:
        """Return one id per character of ``text``; unknown characters map to ``unknown_idx``."""
        return [self.vocab.get(token, self.unknown_idx) for token in text]

    def decode(self, tokens: list[int]) -> str:
        """Join the vocabulary entries for ``tokens`` into a string.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        # Use the cached inverse table instead of rebuilding it on every call.
        return "".join(self.inv_vocab[token] for token in tokens)

In [9]:
# Every character source for the vocabulary: ASCII letters, digits,
# punctuation and whitespace, the kana / symbol ranges, and the joyo kanji.
char_groups = [
    string.ascii_letters,
    string.digits,
    string.punctuation,
    string.whitespace,
    hiragana,
    katakana,
    cjk_symbols_punctuation,
    halfwidth_fullwidth_forms,
    *joyo_kanji_info.keys(),
]
# Remove duplicates and sort, so each character gets a deterministic id
# equal to its position in the sorted string.
entire_text = "".join(sorted(set("".join(char_groups))))
vocab = {char: i for i, char in enumerate(entire_text)}
tokenizer = CharTokenizer(vocab)

In [10]:
# Round-trip a sentence through encode() and decode(), printing the ids
# first and the decoded text second.
text = "僕の名前は原田です。"
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
print(encoded, decoded, sep="\n")

[489, 208, 655, 560, 209, 626, 1618, 201, 187, 102]
僕の名前は原田です。


In [11]:
# Same round trip, this time with an old-form kanji at the start and an emoji
# at the end of the sentence.
text = '亞僕の名前は原田です。😄'
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
print(encoded, decoded, sep="\n")

[-1, 489, 208, 655, 560, 209, 626, 1618, 201, 187, 102, -1]
<UNK>僕の名前は原田です。<UNK>


In [12]:
class CharUTF8Tokenizer:
    """Character-level tokenizer with a UTF-8 byte fallback.

    Characters found in the vocabulary are encoded as their own id. Any other
    character is encoded as the sequence of its UTF-8 bytes, each byte being
    represented by one of 256 special tokens ``<utf8_0>`` ... ``<utf8_255>``.
    """

    def __init__(self, vocab=None):
        """Register the 256 byte tokens and build the lookup tables.

        The byte tokens are added to ``vocab`` in place, so a dict passed in by
        the caller also receives them (which lets it be saved afterwards).
        Byte tokens that already exist in ``vocab`` keep their ids, so wrapping
        a previously saved vocabulary reproduces the same token ids.

        Args:
            vocab: Mapping from characters to integer ids. ``None`` (or no
                argument) starts from an empty vocabulary.
        """
        if vocab is None:
            vocab = {}
        self.vocab = vocab
        byte_tokens = [f'<utf8_{i}>' for i in range(256)]
        # The id offset is derived from the number of non-byte entries only, so
        # it is the same whether or not the byte tokens are already present.
        char_count = len(vocab) - sum(token in vocab for token in byte_tokens)
        vocab_size = char_count * 1000
        for i, token in enumerate(byte_tokens):
            vocab.setdefault(token, vocab_size + i)
        # Built after the byte tokens are registered so they are included.
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        # Token id -> byte value, used by decode() to reassemble characters.
        self._byte_of_id = {vocab[token]: i for i, token in enumerate(byte_tokens)}

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Characters missing from the vocabulary contribute one id per UTF-8 byte.
        """
        result = []
        for char in text:
            if char in self.vocab:
                result.append(self.vocab[char])
            else:
                result.extend(self.vocab[f'<utf8_{num}>'] for num in char.encode("utf-8"))
        return result

    def decode(self, tokens, errors="strict"):
        """Turn token ids back into text.

        Args:
            tokens: Iterable of token ids.
            errors: Error handler passed to the UTF-8 decoding of byte-token
                runs (``"strict"``, ``"replace"``, ``"ignore"``, ...). The
                default ``"strict"`` raises on byte runs that are not valid
                UTF-8.

        Returns:
            A tuple ``(raw, text)``: ``raw`` is the plain concatenation of the
            vocabulary entries (byte tokens appear literally as ``<utf8_N>``),
            ``text`` has each run of consecutive byte tokens decoded as UTF-8.

        Raises:
            KeyError: If an id is not present in the vocabulary.
            UnicodeDecodeError: With ``errors="strict"``, if a run of byte
                tokens is not valid UTF-8.
        """
        raw_pieces = []
        text_pieces = []
        pending = bytearray()  # bytes of the byte-token run currently being collected
        for token_id in tokens:
            piece = self.inv_vocab[token_id]
            raw_pieces.append(piece)
            byte_value = self._byte_of_id.get(token_id)
            if byte_value is not None:
                pending.append(byte_value)
                continue
            # A regular token ends the current byte run: flush it first.
            if pending:
                text_pieces.append(pending.decode("utf-8", errors=errors))
                pending = bytearray()
            text_pieces.append(piece)
        if pending:
            text_pieces.append(pending.decode("utf-8", errors=errors))
        return "".join(raw_pieces), "".join(text_pieces)

In [13]:
# Concatenate all character sources (ASCII classes, kana and symbol ranges,
# joyo kanji), then deduplicate and sort them.
entire_text = (
    string.ascii_letters
    + string.digits
    + string.punctuation
    + string.whitespace
    + hiragana
    + katakana
    + cjk_symbols_punctuation
    + halfwidth_fullwidth_forms
    + "".join(joyo_kanji_info.keys())
)
entire_text = "".join(sorted(set(entire_text)))

# Each character's id is its position in the sorted string.
char_utf8_vocab = {}
for char_id, char in enumerate(entire_text):
    char_utf8_vocab.setdefault(char, char_id)
tokenizer = CharUTF8Tokenizer(char_utf8_vocab)

In [14]:
# Encode a sentence, then decode it; decode() returns both the raw token
# string and the text with byte tokens reassembled.
text = '亞僕の名前は原田です。😄'
encoded = tokenizer.encode(text)
print(encoded)
decoded_with_utf, decoded = tokenizer.decode(encoded)
print(decoded_with_utf, decoded, sep="\n")

[2726228, 2726186, 2726158, 489, 208, 655, 560, 209, 626, 1618, 201, 187, 102, 2726240, 2726159, 2726152, 2726132]
<utf8_228><utf8_186><utf8_158>僕の名前は原田です。<utf8_240><utf8_159><utf8_152><utf8_132>
亞僕の名前は原田です。😄


In [15]:
import json
# Save (kept disabled): ensure_ascii=False writes the characters themselves,
# so the file encoding is fixed to UTF-8 instead of the platform default.
# with open("char_utf8_vocab.json", "w", encoding="utf-8") as f:
#     json.dump(char_utf8_vocab, f, ensure_ascii=False, indent=2)

# Load: read with the same explicit UTF-8 encoding so the result does not
# depend on the platform's default encoding.
# NOTE(review): if an existing char_utf8_vocab.json was written on a machine
# whose default encoding is not UTF-8, regenerate it with the save cell above.
with open("char_utf8_vocab.json", "r", encoding="utf-8") as f:
    char_utf8_vocab = json.load(f)
tokenizer = CharUTF8Tokenizer(char_utf8_vocab)
tokenizer.encode('hello')

[77, 74, 81, 81, 84]

In [16]:
# Show the UTF-8 bytes of an ASCII letter and of a hiragana character in binary.
for sample_char in ('a', 'あ'):
    print([bin(byte) for byte in sample_char.encode("utf-8")])

['0b1100001']
['0b11100011', '0b10000001', '0b10000010']


In [17]:
# !wget https://www.mext.go.jp/a_menu/shotou/new-cs/__icsFiles/afieldfile/2017/05/15/1385768.pdf

In [18]:
# !pip install camelot-py[cv]