In [1]:
import json
import emoji
import os

In [2]:
dataset_path = "./pretrained-data/afriberta_train.txt"
char2freq = {}
# Tally how often every character occurs across the whole corpus file.
with open(dataset_path, "r", encoding="utf-8") as file:
    for line in file:
        # Leading/trailing whitespace (including the newline) is stripped,
        # so only whitespace *inside* a line contributes to the counts.
        text = line.strip()
        for char in text:
            char2freq[char] = char2freq.get(char, 0) + 1
# (character, count) pairs, most frequent first. The sort is stable, so
# characters with equal counts stay in first-seen order.
sorted_chars = sorted(char2freq.items(), key=lambda pair: pair[1], reverse=True)

In [3]:


# Keep only characters whose repr() does not start with a \u.... or \x..
# escape (Python renders non-printable characters that way).
# NOTE(review): escapes of the form \U........ are not matched by this
# check -- confirm that is intended.
filtered_chars = [
    (c, freq)
    for c, freq in sorted_chars
    if not repr(c).startswith(("'\\u", "'\\x"))
]

# Split the surviving (char, freq) pairs into emojis and everything else,
# preserving the frequency ordering inside each list.
emojis = []
true_chars = []
for pair in filtered_chars:
    if emoji.is_emoji(pair[0]):
        emojis.append(pair)
    else:
        true_chars.append(pair)

In [4]:
# Vocabulary characters: frequency ranks 1..1000 of the non-emoji characters
# plus every emoji, sorted by code point.
# NOTE(review): rank 0 is skipped -- presumably the space character, which is
# registered separately as the "wpad" special; confirm against the corpus.
valid_chars = sorted(ch for ch, _freq in true_chars[1:1001] + emojis)

In [5]:
# Placeholder characters for the special tokens: take the non-emoji characters
# at frequency ranks 1002..1099, sort them by code point and keep the first 7.
# NOTE(review): rank 1001 is covered by neither this slice nor the
# vocabulary slice [1:1001] -- confirm whether that gap is intentional.
special_chars = sorted(ch for ch, _freq in true_chars[1002:1100])[:7]

In [6]:
# Special-token name -> placeholder character.
# "wpad" is always the plain space; the remaining names take the entries of
# `special_chars` in order (index 0 -> "unk", ..., index 6 -> "pad").
special_chars_mapping = {
    "wpad": " ",
    **{
        name: special_chars[position]
        for position, name in enumerate(
            ("unk", "bos", "eos", "sep", "cls", "mask", "pad")
        )
    },
}
# Reverse lookup: placeholder character -> special-token name.
special_char2name = {char: name for name, char in special_chars_mapping.items()}

In [7]:
special_char2name

{' ': 'wpad',
 'µ': 'unk',
 'Î': 'bos',
 'Ñ': 'eos',
 'Ô': 'sep',
 'Õ': 'cls',
 'ŏ': 'mask',
 'ő': 'pad'}

In [8]:
# Assign consecutive integer ids: regular vocabulary characters first (in
# their sorted order), then the special-token placeholder characters.
valid_chars_mapping = {c: i for i, c in enumerate(valid_chars)}
for _name, special_char in special_chars_mapping.items():
    # Only append placeholders that are not already in the vocabulary.
    # Assigning unconditionally would re-number an existing character to
    # len(mapping) without growing the dict, so the *next* placeholder would
    # receive the same id and the inverse table would silently drop an entry.
    if special_char not in valid_chars_mapping:
        valid_chars_mapping[special_char] = len(valid_chars_mapping)

# Character -> id, with entries ordered by id.
valid_char2id = {k: v for k, v in sorted(valid_chars_mapping.items(), key=lambda item: item[1])}
# Id -> character (inverse of the table above).
valid_id2char = {v: k for k, v in valid_char2id.items()}

# Every id must map back to exactly one character.
assert len(valid_id2char) == len(valid_char2id), "duplicate ids in valid_char2id"

In [9]:
# Bracketed token string -> placeholder character used to represent it.
# '[WPAD]' must use the "wpad" entry (the space); it previously pointed at
# the "pad" entry, which made '[WPAD]' and '[PAD]' indistinguishable.
special_token2char = {
    '[WPAD]': special_chars_mapping['wpad'],
    '[UNK]': special_chars_mapping['unk'],
    '[BOS]': special_chars_mapping['bos'],
    '[EOS]': special_chars_mapping['eos'],
    '[SEP]': special_chars_mapping['sep'],
    '[CLS]': special_chars_mapping['cls'],
    '[MASK]': special_chars_mapping['mask'],
    '[PAD]': special_chars_mapping['pad'],
}

In [11]:
# Bundle every lookup table into a single JSON document.
# Note: JSON object keys are always strings, so the integer keys of
# `valid_id2char` are written as strings and must be converted back to int
# by whatever loads this file.
charset = {
    "valid_char2id": valid_char2id,
    "valid_id2char": valid_id2char,
    "special_char2name": special_char2name,
    "special_name2char": special_chars_mapping,
    "special_token2char": special_token2char,
}
# exist_ok=True already makes this a no-op when the directory exists, so no
# separate os.path.exists() check is needed.
os.makedirs("tokenizer", exist_ok=True)
# ensure_ascii=False keeps the characters themselves readable in the file
# instead of \uXXXX escapes.
with open("tokenizer/charset.json", "w", encoding="utf-8") as f:
    json.dump(charset, f, ensure_ascii=False, indent=4)