## Character base tokenizer 
常用漢字、ひらがな、カタカナ、アルファベット、記号を1文字単位でtokenizeするtokenizerを作成する

In [1]:
# !wget https://www.bunka.go.jp/kokugo_nihongo/sisaku/joho/joho/kijun/naikaku/kanji/joyokanjisakuin/index.html

In [2]:
# HTMLファイルの読み込み
with open("index.html", encoding="cp932") as f:
    html = f.read()
html[:300]

'<!DOCTYPE html PUBLIC "-//W3C//Dtd XHTML 1.0 Transitional//EN" "http://www.w3.org/tr/xhtml1/Dtd/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="ja"><!-- InstanceBegin template="/Templates/bottom.dwt" codeOutsideHTMLIsLocked="false" -->\n<head>\n<meta http-equiv="Content-'

In [3]:
from IPython.display import HTML
table_sample = '''
<table id="urlist" class="display" border="1" cellspacing="0"
summary="kanji" cellpadding="2" width="100%">
  <thead>
  <tr>
    <th bgcolor="#cc9999">漢字</th>
    <th bgcolor="#cc9999">音訓</th>
    <th bgcolor="#cc9999">例</th>
    <th bgcolor="#cc9999">備考</th>
  </tr>
  </thead>
  <tbody>

<tr><td><font size="7">亜</font><font size="6">（亞）</font></td><td>ア</td><td>亜流，亜麻，亜熱帯</td><td>　</td></tr>
<tr><td><font size="7">哀</font></td><td>アイ　<br />あわれ　<br />あわれむ</td><td>哀愁，哀願，悲哀<br />哀れ，哀れな話，哀れがる<br />哀れむ，哀れみ</td><td>　</td></tr>
<tr><td><font size="7">挨</font></td><td>アイ</td><td>挨拶</td><td>　</td></tr>
<tr><td><font size="7">愛</font></td><td>アイ</td><td>愛情，愛読，恋愛</td><td>愛媛（えひめ）県</td></tr>  
  </tbody>
</table>
'''
HTML(table_sample)

漢字,音訓,例,備考
亜（亞）,ア,亜流，亜麻，亜熱帯,
哀,アイ あわれ あわれむ,哀愁，哀願，悲哀 哀れ，哀れな話，哀れがる 哀れむ，哀れみ,
挨,アイ,挨拶,
愛,アイ,愛情，愛読，恋愛,愛媛（えひめ）県


In [4]:
from bs4 import BeautifulSoup
joyo_kanji_info = dict()

soup = BeautifulSoup(html, "html.parser")
table = soup.find("table", id="urlist")

for row in table.find_all("tr"):
    if row.th:
        continue
    cols = row.find_all("td")
    kanji = cols[0].find("font").text
    on_kun = cols[1].text
    examples = cols[2].text
    note = cols[3].text
    joyo_kanji_info[kanji] = {"音訓": on_kun, "例": examples, "備考": note}
print(len(joyo_kanji_info))

2136


In [5]:
"".join(list(joyo_kanji_info.keys())[:10])

'亜哀挨愛曖悪握圧扱宛'

In [6]:
import string
# アルファベット
print(string.ascii_letters)
# 数字
print(string.digits)
# 句読点
print(string.punctuation)
# 改行など空白文字
print(repr(string.whitespace))

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ
0123456789
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
' \t\n\r\x0b\x0c'


In [7]:
# https://www.unicode.org/charts/nameslist/
# ひらがな
start = 0x3041
end = 0x309F
hiragana = "".join(chr(i) for i in range(start, end))
print(hiragana)

# カタカナ
start = 0x30A1
end = 0x30FF
katakana = "".join(chr(i) for i in range(start, end))
print(katakana)

# CJK Symbols and Punctuation
start = 0x3000
end = 0x303F
cjk_symbols_punctuation = "".join(chr(i) for i in range(start, end))
print(cjk_symbols_punctuation)

# Halfwidth and Fullwidth Forms
start = 0xFF00
end = 0xFFEF
halfwidth_fullwidth_forms = "".join(chr(i) for i in range(start, end))
print(halfwidth_fullwidth_forms)

ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ゗゘゙゚゛゜ゝゞ
ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶヷヸヹヺ・ーヽヾ
　、。〃〄々〆〇〈〉《》「」『』【】〒〓〔〕〖〗〘〙〚〛〜〝〞〟〠〡〢〣〤〥〦〧〨〩〪〭〮〯〫〬〰〱〲〳〴〵〶〷〸〹〺〻〼〽〾
＀！＂＃＄％＆＇（）＊＋，－．／０１２３４５６７８９：；＜＝＞？＠ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ［＼］＾＿｀ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ｛｜｝～｟｠｡｢｣､･ｦｧｨｩｪｫｬｭｮｯｰｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜﾝﾞﾟﾠﾡﾢﾣﾤﾥﾦﾧﾨﾩﾪﾫﾬﾭﾮﾯﾰﾱﾲﾳﾴﾵﾶﾷﾸﾹﾺﾻﾼﾽﾾ﾿￀￁ￂￃￄￅￆￇ￈￉ￊￋￌￍￎￏ￐￑ￒￓￔￕￖￗ￘￙ￚￛￜ￝￞￟￠￡￢￣￤￥￦￧￨￩￪￫￬￭￮


In [8]:
class CharTokenizer:
    def __init__(self, vocab):
        self.unknown_idx = -1
        if vocab is None:
            vocab = {}
        self.vocab = vocab
        vocab["<UNK>"] = self.unknown_idx
        self.inv_vocab = {v: k for k, v in self.vocab.items()}

    def encode(self, text: str) -> list[int]:
        return [self.vocab.get(token, self.unknown_idx) for token in text]
    
    def decode(self, tokens: list[int]) -> str:
        inv_vocab = {v: k for k, v in self.vocab.items()}
        return "".join([inv_vocab[token] for token in tokens])

In [9]:
vocab = dict()
entire_text = ""
entire_text += string.ascii_letters
entire_text += string.digits
entire_text += string.punctuation
entire_text += string.whitespace
entire_text += hiragana
entire_text += katakana
entire_text += cjk_symbols_punctuation
entire_text += halfwidth_fullwidth_forms
for kanji in joyo_kanji_info.keys():
    entire_text += kanji
entire_text = "".join(sorted(list(set(entire_text))))
for i, char in enumerate(entire_text):
    if char not in vocab:
        vocab[char] = i
tokenizer = CharTokenizer(vocab)

In [10]:
text = "僕の名前は原田です。"
encoded = tokenizer.encode(text)
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

[489, 208, 655, 560, 209, 626, 1618, 201, 187, 102]
僕の名前は原田です。


In [11]:
text = '亞僕の名前は原田です。😄'
encoded = tokenizer.encode(text)
print(encoded)
decoded = tokenizer.decode(encoded)
print(decoded)

[-1, 489, 208, 655, 560, 209, 626, 1618, 201, 187, 102, -1]
<UNK>僕の名前は原田です。<UNK>


In [100]:
class CharUTF8Tokenizer:
    def __init__(self, vocab):
        if vocab is None:
            vocab = {}
        vocab_size = len(vocab)
        for i in range(256):
            vocab[f'<utf8_{i}>'] = vocab_size + i
        self.vocab = vocab

    def encode(self, text):
        result = []
        for char in text:
            if char not in self.vocab:
                utf_8_num = list(char.encode("utf-8"))
                for num in utf_8_num:
                    result.append(self.vocab[f'<utf8_{num}>'])
            else:
                result.append(self.vocab[char])
        return result
    
    def decode_with_utf_token(self, tokens):
        inv_vocab = {v: k for k, v in self.vocab.items()}
        decoded_with_utf_token = [inv_vocab[token] for token in tokens]
        return "".join(decoded_with_utf_token)
    
    def decode(self, tokens):
        inv_vocab = {v: k for k, v in self.vocab.items()}
        decoded_with_utf_token = [inv_vocab[token] for token in tokens]
        decoded_postprocess_utf = []
        utf_tokens = []
        for token in decoded_with_utf_token:
            if token.startswith("<utf8_"):
                utf_num = int(token.replace("<utf8_", "").replace(">", ""))
                utf_tokens.append(utf_num)
            else:
                if utf_tokens:
                    decoded_postprocess_utf.append(bytes(utf_tokens).decode("utf-8"))
                    utf_tokens = []
                decoded_postprocess_utf.append(token)
        if utf_tokens:
            decoded_postprocess_utf.append(bytes(utf_tokens).decode("utf-8"))
            utf_tokens = []
        return "".join(decoded_postprocess_utf)

In [101]:
char_utf8_vocab = dict()
entire_text = ""
entire_text += string.ascii_letters
entire_text += string.digits
entire_text += string.punctuation
entire_text += string.whitespace
entire_text += hiragana
entire_text += katakana
entire_text += cjk_symbols_punctuation
entire_text += halfwidth_fullwidth_forms
for kanji in joyo_kanji_info.keys():
    entire_text += kanji
entire_text = "".join(sorted(list(set(entire_text))))
for i, char in enumerate(entire_text):
    if char not in char_utf8_vocab:
        char_utf8_vocab[char] = i
tokenizer = CharUTF8Tokenizer(char_utf8_vocab)

In [102]:
text = '亞僕の名前は原田です。😄'
encoded = tokenizer.encode(text)
print(encoded)
decoded_with_utf, decoded = tokenizer.decode_with_utf_token(encoded), tokenizer.decode(encoded)
print(decoded_with_utf)
print(decoded)

[2954, 2912, 2884, 489, 208, 655, 560, 209, 626, 1618, 201, 187, 102, 2966, 2885, 2878, 2858]
<utf8_228><utf8_186><utf8_158>僕の名前は原田です。<utf8_240><utf8_159><utf8_152><utf8_132>
亞僕の名前は原田です。😄


In [103]:
import json
# with open("char_utf8_vocab.json", "w") as f:
#     json.dump(char_utf8_vocab, f, ensure_ascii=False, indent=2)

# load
with open("char_utf8_vocab.json", "r") as f:
    char_utf8_vocab = json.load(f)
tokenizer = CharUTF8Tokenizer(char_utf8_vocab)
tokenizer.encode('hello')

[77, 74, 81, 81, 84]

In [104]:
# # https://www.mediawiki.org/wiki/API:Etiquette/ja
# import requests

# # APIエンドポイントとヘッダーの設定
# API_ENDPOINT = "https://ja.wikipedia.org/w/api.php"
# your_email_address = "example@example.com"
# HEADERS = {
#     "User-Agent": f"LMProject/1.0 ({your_email_address}) requests/2.32.2"
# }

# # 検索するページのタイトル
# title = "大規模言語モデル"

# # APIリクエストのパラメータを設定
# params = {
#     "action": "query",
#     "format": "json",
#     "prop": "extracts",
#     "explaintext": True,
#     "titles": title
# }

# # APIリクエストを送信
# response = requests.get(API_ENDPOINT, headers=HEADERS, params=params)

# # JSON形式で結果を取得
# data = response.json()

# # ページの内容を抽出
# pages = data.get("query", {}).get("pages", {})
# content_text = ""
# for page_id, page_data in pages.items():
#     content = page_data.get("extract", "ページが見つかりませんでした。")
#     if content == "ページが見つかりませんでした。":
#         continue
#     content_text += content
# with open("wiki_text_llm.txt", "w") as f:
#     f.write(content_text)

In [105]:
with open("wiki_text_llm.txt", "r") as f:
    wiki_text = f.read()
wiki_text[:100]

'大規模言語モデル（だいきぼげんごモデル、英: large language model、LLM）は、多数のパラメータ（数千万から数十億）を持つ人工ニューラルネットワークで構成されるコンピュータ言語モデ'

In [106]:
print(tokenizer.encode(wiki_text[:100]))
print(tokenizer.decode(tokenizer.encode(wiki_text[:100])))

[784, 2057, 1379, 2066, 2102, 322, 295, 331, 2494, 194, 166, 175, 222, 180, 245, 182, 322, 295, 331, 101, 1973, 31, 5, 81, 70, 87, 76, 74, 5, 81, 70, 83, 76, 90, 70, 76, 74, 5, 82, 84, 73, 74, 81, 101, 49, 49, 50, 2495, 209, 101, 781, 1225, 208, 305, 329, 321, 348, 287, 2494, 1225, 605, 354, 173, 235, 1225, 604, 493, 2495, 244, 1158, 198, 397, 933, 299, 325, 348, 329, 331, 301, 291, 296, 335, 348, 271, 201, 1375, 1104, 183, 238, 237, 275, 339, 308, 325, 348, 287, 2066, 2102, 322, 295]
大規模言語モデル（だいきぼげんごモデル、英: large language model、LLM）は、多数のパラメータ（数千万から数十億）を持つ人工ニューラルネットワークで構成されるコンピュータ言語モデ


In [117]:
ids = tokenizer.encode(wiki_text)

pair_counts = dict()
for i in range(len(ids) - 1):
    pair = (tokenizer.decode_with_utf_token([ids[i]]), tokenizer.decode_with_utf_token([ids[i + 1]]))
    if pair not in pair_counts:
        pair_counts[pair] = 0
    pair_counts[pair] += 1
pair_counts = {k: v for k, v in sorted(pair_counts.items(), key=lambda x: x[1], reverse=True)}
pair_counts

{(' ', ' '): 3587,
 ('\n', ' '): 566,
 (' ', '\n'): 328,
 ('る', '。'): 185,
 ('す', 'る'): 155,
 ('=', '='): 122,
 ('は', '、'): 119,
 ('さ', 'れ'): 104,
 ('モ', 'デ'): 99,
 ('デ', 'ル'): 99,
 ('。', '\n'): 92,
 ('\n', '\n'): 91,
 ('あ', 'る'): 88,
 ('L', 'M'): 82,
 ('L', 'L'): 80,
 ('こ', 'と'): 78,
 ('訓', '練'): 76,
 ('ー', 'タ'): 73,
 ('し', 'て'): 73,
 ('で', 'あ'): 65,
 ('ス', 'ト'): 61,
 ('e', ' '): 60,
 ('る', 'こ'): 60,
 ('ー', 'ク'): 59,
 ('れ', 'る'): 58,
 ('言', '語'): 56,
 ('{', '\\'): 55,
 ('ッ', 'ト'): 53,
 (' ', '='): 53,
 ('い', 'る'): 51,
 ('て', 'い'): 50,
 ('し', 'た'): 50,
 ('れ', 'た'): 49,
 ('=', '\n'): 49,
 (' ', '{'): 47,
 ('ン', 'グ'): 46,
 ('デ', 'ー'): 46,
 ('s', 't'): 45,
 ('た', '。'): 43,
 ('よ', 'う'): 43,
 ('語', 'モ'): 42,
 ('キ', 'ス'): 42,
 ('i', 's'): 42,
 ('ト', 'ー'): 42,
 ('規', '模'): 41,
 ('テ', 'キ'): 41,
 ('ス', 'ク'): 41,
 ('l', 'e'): 41,
 ('で', 'き'): 40,
 ('l', 'a'): 39,
 ('な', 'い'): 39,
 ('=', ' '): 38,
 ('d', 'i'): 38,
 ('ュ', 'ー'): 37,
 ('タ', 'ス'): 37,
 ('プ', 'ロ'): 37,
 ('っ', 'て'): 36,
 ('\n', '='): 3

In [118]:
merged_ids = []
new_merge_key = ''.join(list(pair_counts.keys())[0])
print(repr(new_merge_key))
new_merge_token_id = len(tokenizer.vocab)
tokenizer.vocab[new_merge_key] = new_merge_token_id
ids = tokenizer.encode(wiki_text)
for i in range(len(ids) - 1):
    if ''.join((tokenizer.decode_with_utf_token([ids[i]]), tokenizer.decode_with_utf_token([ids[i + 1]]))) == new_merge_key:
        merged_ids.append(new_merge_token_id)
        i += 2
    else:
        merged_ids.append(ids[i])

pair_counts = dict()
for i in range(len(merged_ids) - 1):
    pair = (tokenizer.decode_with_utf_token([merged_ids[i]]), tokenizer.decode_with_utf_token([merged_ids[i + 1]]))
    if pair not in pair_counts:
        pair_counts[pair] = 0
    pair_counts[pair] += 1
pair_counts = {k: v for k, v in sorted(pair_counts.items(), key=lambda x: x[1], reverse=True)}
pair_counts

'  '


{('  ', '  '): 3044,
 ('\n', '  '): 543,
 ('  ', ' '): 543,
 (' ', '\n'): 328,
 ('る', '。'): 185,
 ('す', 'る'): 155,
 ('=', '='): 122,
 ('は', '、'): 119,
 ('さ', 'れ'): 104,
 ('モ', 'デ'): 99,
 ('デ', 'ル'): 99,
 ('。', '\n'): 92,
 ('\n', '\n'): 91,
 ('あ', 'る'): 88,
 ('L', 'M'): 82,
 ('L', 'L'): 80,
 ('こ', 'と'): 78,
 ('訓', '練'): 76,
 ('ー', 'タ'): 73,
 ('し', 'て'): 73,
 ('で', 'あ'): 65,
 ('ス', 'ト'): 61,
 ('e', ' '): 60,
 ('る', 'こ'): 60,
 ('ー', 'ク'): 59,
 ('れ', 'る'): 58,
 ('言', '語'): 56,
 ('{', '\\'): 55,
 ('ッ', 'ト'): 53,
 (' ', '='): 53,
 ('い', 'る'): 51,
 ('て', 'い'): 50,
 ('し', 'た'): 50,
 ('れ', 'た'): 49,
 ('=', '\n'): 49,
 (' ', '{'): 47,
 ('ン', 'グ'): 46,
 ('デ', 'ー'): 46,
 ('s', 't'): 45,
 ('た', '。'): 43,
 ('よ', 'う'): 43,
 ('語', 'モ'): 42,
 ('キ', 'ス'): 42,
 ('i', 's'): 42,
 ('ト', 'ー'): 42,
 ('規', '模'): 41,
 ('テ', 'キ'): 41,
 ('ス', 'ク'): 41,
 ('l', 'e'): 41,
 ('で', 'き'): 40,
 ('l', 'a'): 39,
 ('な', 'い'): 39,
 ('=', ' '): 38,
 ('d', 'i'): 38,
 ('ュ', 'ー'): 37,
 ('タ', 'ス'): 37,
 ('プ', 'ロ'): 37,
 ('っ', 'て'

In [130]:
ids = tokenizer.encode(wiki_text)
len(ids)

22623

In [140]:
with open("char_utf8_vocab.json", "r") as f:
    char_utf8_vocab = json.load(f)
tokenizer = CharUTF8Tokenizer(char_utf8_vocab)
new_vocab_size = len(tokenizer.vocab) + 20
ids = tokenizer.encode(wiki_text)
print('before total token num', len(ids))
while len(tokenizer.vocab) < new_vocab_size:
    pair_counts = dict()
    while True:
        new_ids = []
        idx = 0
        merged = False
        while idx <= len(ids) - 2:
            pair_idx_padding = 1
            while idx + pair_idx_padding < len(ids):
                pair = (tokenizer.decode_with_utf_token([ids[idx]]), tokenizer.decode_with_utf_token([ids[idx + pair_idx_padding]]))
                pair_key = ''.join(pair)
                if pair_key in tokenizer.vocab:
                    ids[idx] = tokenizer.vocab[pair_key]
                    pair_idx_padding += 1
                    merged = True
                else:
                    new_ids.append(ids[idx])
                    break
            idx = idx + pair_idx_padding
            if idx == len(ids) - 1:
                new_ids.append(ids[idx])
        if not merged:
            break
        ids = new_ids
    for i in range(len(ids) - 1):
        pair = (tokenizer.decode_with_utf_token([ids[i]]), tokenizer.decode_with_utf_token([ids[i + 1]]))
        if pair not in pair_counts:
            pair_counts[pair] = 0
        pair_counts[pair] += 1
    pair_counts = {k: v for k, v in sorted(pair_counts.items(), key=lambda x: x[1], reverse=True)}
    new_merge_key = ''.join(list(pair_counts.keys())[0])
    print('new_merge_key:', repr(new_merge_key), 'token_id:', len(tokenizer.vocab))
    new_merge_token_id = len(tokenizer.vocab)
    tokenizer.vocab[new_merge_key] = new_merge_token_id
    print('after total token num', len(new_ids))

before total token num 22623
new_merge_key: '  ' token_id: 2982
after total token num 22623
new_merge_key: '    ' token_id: 2983
after total token num 20558
new_merge_key: '\n    ' token_id: 2984
after total token num 19630
new_merge_key: '\n        ' token_id: 2985
after total token num 19155
new_merge_key: 'る。' token_id: 2986
after total token num 18850
new_merge_key: 'する' token_id: 2987
after total token num 18665
new_merge_key: '==' token_id: 2988
after total token num 18531
new_merge_key: 'は、' token_id: 2989
after total token num 18457
new_merge_key: 'され' token_id: 2990
after total token num 18338
new_merge_key: 'モデ' token_id: 2991
after total token num 18234
new_merge_key: 'モデル' token_id: 2992
after total token num 18135
new_merge_key: '\n\n' token_id: 2993
after total token num 18036
new_merge_key: 'LM' token_id: 2994
after total token num 17981
new_merge_key: 'LLM' token_id: 2995
after total token num 17899
new_merge_key: '        ' token_id: 2996
after total token num 17819
ne

KeyboardInterrupt: 

In [18]:
# !wget https://www.mext.go.jp/a_menu/shotou/new-cs/__icsFiles/afieldfile/2017/05/15/1385768.pdf

In [19]:
# !pip install camelot-py[cv]