In [2]:
from datasets import *
from transformers import AutoTokenizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from huggingface_hub import notebook_login


In [3]:
# Load the previously saved dataset from a directory relative to the notebook's
# working directory. The preview below shows a single "train" split with
# 'clean' and 'dotless' text columns.
raw_datasets = load_from_disk("AR-dotted-mediumPlus-arrow")

In [4]:
# Show the split/column layout, followed by a blank spacer line, then the first
# training record so both text columns can be compared side by side.
print(raw_datasets, '\n')
print(raw_datasets['train'][0])

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 2980842
    })
}) 

{'clean': 'زكريا محيي الدين يحيى بن شرف النووي المتوفى ه ج ص', 'dotless': 'ركرىا محىى الدىن ىحىى بن سرف النووى المبوفى ه ح ص'}


In [5]:
# Pretrained Arabic BERT checkpoint; its tokenizer is the template that the new
# tokenizer is trained from later in the notebook.
checkpoint = "asafaya/bert-base-arabic"
old_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# Vocabulary size of the pretrained tokenizer, for comparison with the
# retrained tokenizer's size shown further down.
old_tokenizer.vocab_size

32000

In [7]:
def get_training_corpus(dataset=None, batch_size=1000, column="clean"):
    """Yield one text column of a dataset in consecutive batches.

    The dataset is sliced in windows of ``batch_size`` rows and the ``column``
    entry of each slice is yielded, so only one batch is sliced at a time.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to that slice's values.
            Defaults to ``raw_datasets["train"]``, resolved when iteration
            starts rather than when the generator is created.
        batch_size: Number of rows per yielded batch; must be positive.
        column: Name of the column to read from each slice.

    Yields:
        The ``column`` entry of each slice; the final batch may be shorter
        than ``batch_size``.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is not positive.
            (A negative step would otherwise silently yield nothing.)
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")
    if dataset is None:
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        yield dataset[start_idx : start_idx + batch_size][column]


# A generator can be consumed only once: re-run this cell (or call
# get_training_corpus() again) before starting another pass over the corpus.
training_corpus = get_training_corpus()



In [8]:
# Train a new 30000-entry vocabulary starting from the pretrained tokenizer.
# A fresh generator is created at the call site instead of reusing
# `training_corpus`: that generator is exhausted after one training pass, so
# re-running this cell on its own would otherwise train on an empty corpus.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=30000)







In [9]:
# Confirm the retrained tokenizer ended up with the requested vocabulary size.
tokenizer.vocab_size

30000

In [10]:
# Sample Arabic sentence ("Ahmad went to the mosque") used to sanity-check the
# retrained tokenizer in the cells below.
test_c = "ذهب أحمد إلى المسجد"

In [11]:
# Encode the sample sentence, then print three views one per line: the raw
# encoding, the token string for each input id, and the original text.
inputs_c = tokenizer(test_c)
tokens_c = tokenizer.tokenize(test_c)
print(
    inputs_c,
    tokenizer.convert_ids_to_tokens(inputs_c["input_ids"]),
    test_c,
    sep="\n",
)



{'input_ids': [2, 4028, 838, 109, 3439, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}
['[CLS]', 'ذهب', 'احمد', 'الى', 'المسجد', '[SEP]']
ذهب أحمد إلى المسجد


In [12]:
# Encode the sample again and print, one per line: the encoding itself, its
# token strings, and the word index each token maps back to.
encoding = tokenizer(test_c)
print(encoding, encoding.tokens(), encoding.word_ids(), sep="\n")

{'input_ids': [2, 4028, 838, 109, 3439, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}
['[CLS]', 'ذهب', 'احمد', 'الى', 'المسجد', '[SEP]']
[None, 0, 1, 2, 3, None]


In [14]:
# Write the retrained tokenizer's files to a local directory; the returned
# tuple (shown as the cell output) lists the files that were written.
tokenizer.save_pretrained("AR-dotted-tokenizer")


('AR-dotted-tokenizer/tokenizer_config.json',
 'AR-dotted-tokenizer/special_tokens_map.json',
 'AR-dotted-tokenizer/vocab.txt',
 'AR-dotted-tokenizer/added_tokens.json',
 'AR-dotted-tokenizer/tokenizer.json')

In [31]:
# Upload the tokenizer to the Hugging Face Hub under this repository name.
# NOTE(review): `notebook_login` is imported but never called in the visible
# cells — presumably authentication happened in a cell that was removed or via
# a cached token; confirm before relying on Restart & Run All.
tokenizer.push_to_hub("AR-dotted-tokenizer")


CommitInfo(commit_url='https://huggingface.co/dot-ammar/AR-dotted-tokenizer/commit/d99ea860813654ccfc8c0eb78022c1d785015ba5', commit_message='Upload tokenizer', commit_description='', oid='d99ea860813654ccfc8c0eb78022c1d785015ba5', pr_url=None, pr_revision=None, pr_num=None)