In [48]:
from datasets import *
from transformers import AutoTokenizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from huggingface_hub import notebook_login


In [22]:
# Load the dataset stored on disk under this relative directory.
# `load_from_disk` is brought into scope by the wildcard `from datasets import *`.
raw_datasets = load_from_disk("AR-dotless-mediumPlus-arrow")

In [23]:
# Dataset summary (splits, features, row counts) followed by an empty line,
# then the first row of the training split as a sanity check.
print(f"{raw_datasets} \n")
first_train_row = raw_datasets["train"][0]
print(first_train_row)

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 4446330
    })
}) 

{'clean': 'القتالية بأسرع وقت ممكن واستئناف العملية التفاوضية في', 'dotless': 'الفبالىه باسرع وفب ممكن واسبىناف العملىه البفاوصىه فى'}


In [24]:
# Pretrained checkpoint whose tokenizer serves as the template: the new
# tokenizers below are trained from it via `train_new_from_iterator`.
checkpoint = "google/t5-v1_1-small"
# Fetch the tokenizer associated with that checkpoint.
old_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [25]:
def get_training_corpus(column, dataset=None, batch_size=1000):
    """Yield successive batches of values from one column of a dataset.

    Args:
        column: Name of the column to read (this notebook uses 'dotless'
            and 'clean').
        dataset: Object supporting ``len()`` and slice indexing, where a
            slice returns a mapping from column name to a list of values.
            When omitted, ``raw_datasets["train"]`` is used; the lookup
            happens on first iteration, not when the generator is created.
        batch_size: Number of rows per yielded batch. Must be positive.

    Yields:
        The ``column`` values for each consecutive slice of ``batch_size``
        rows. The final batch may be shorter.

    Raises:
        ValueError: If ``batch_size`` is not a positive number (raised on
            first iteration, since this is a generator).
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if dataset is None:
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        yield samples[column]


def d_get_training_corpus():
    """Yield batches of 1000 texts from the 'dotless' column of the train split."""
    yield from get_training_corpus("dotless")


def c_get_training_corpus():
    """Yield batches of 1000 texts from the 'clean' column of the train split."""
    yield from get_training_corpus("clean")


# Generators are single-use: once iterated to the end they yield nothing more.
# Call the functions above again to obtain a fresh pass over the data.
d_training_corpus = d_get_training_corpus()
c_training_corpus = c_get_training_corpus()


In [26]:
# Train a new tokenizer with the same configuration as `old_tokenizer` on the
# 'dotless' texts, targeting a 30000-entry vocabulary.
# A fresh generator is created here rather than reusing `d_training_corpus`:
# generators are single-use, so re-running this cell on its own would otherwise
# train on an already exhausted iterator.
dotless_tokenizer = old_tokenizer.train_new_from_iterator(
    d_get_training_corpus(), vocab_size=30000
)






In [27]:
# Train a new tokenizer with the same configuration as `old_tokenizer` on the
# 'clean' texts, targeting a 30000-entry vocabulary.
# A fresh generator is created here rather than reusing `c_training_corpus`:
# generators are single-use, so re-running this cell on its own would otherwise
# train on an already exhausted iterator.
clean_tokenizer = old_tokenizer.train_new_from_iterator(
    c_get_training_corpus(), vocab_size=30000
)






In [28]:
# Sample sentences for a manual check of the trained tokenizers:
# `test_d` is passed to `dotless_tokenizer` and `test_c` to `clean_tokenizer`
# in the cells below.
# NOTE(review): these look like the dotless and dotted spellings of the same
# sentence — confirm.
test_d = "ىا رب معبرفا بان ابامى فد طمب فوف"
test_c = "يا رب معترفا بأن أثامى قد طمت فوق"

In [44]:
# --- Dotless tokenizer on the dotless sample ---
# Encoded inputs, the token strings behind the ids, then the raw sentence.
tokens_d = dotless_tokenizer.tokenize(test_d)
inputs_d = dotless_tokenizer(test_d)
print(
    inputs_d,
    dotless_tokenizer.convert_ids_to_tokens(inputs_d["input_ids"]),
    test_d,
    sep="\n",
)

# Two empty lines between the two reports.
print("\n")

# --- Clean tokenizer on the clean sample ---
tokens_c = clean_tokenizer.tokenize(test_c)
inputs_c = clean_tokenizer(test_c)
print(
    inputs_c,
    clean_tokenizer.convert_ids_to_tokens(inputs_c["input_ids"]),
    test_c,
    sep="\n",
)



{'input_ids': [107, 108, 105, 1086, 18092, 105, 202, 915, 2847, 162, 107, 18001, 708, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ى', 'ا', '▁رب', '▁معبرف', 'ا', '▁بان', '▁اب', 'امى', '▁فد', '▁', 'طمب', '▁فوف', '</s>']
ىا رب معبرفا بان ابامى فد طمب فوف


{'input_ids': [231, 1237, 19958, 106, 247, 175, 14943, 141, 172, 23083, 113, 693, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁يا', '▁رب', '▁معترف', 'ا', '▁بأن', '▁أ', 'ثام', 'ى', '▁قد', '▁طم', 'ت', '▁فوق', '</s>']
يا رب معترفا بأن أثامى قد طمت فوق


In [55]:
# Encode the dotless sample and show, one per line: the encoding itself,
# its token strings, and the word index each token maps to.
encoding = dotless_tokenizer(test_d)
print(encoding, encoding.tokens(), encoding.word_ids(), sep="\n")

{'input_ids': [107, 108, 105, 1086, 18092, 105, 202, 915, 2847, 162, 107, 18001, 708, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ى', 'ا', '▁رب', '▁معبرف', 'ا', '▁بان', '▁اب', 'امى', '▁فد', '▁', 'طمب', '▁فوف', '</s>']
[0, 0, 0, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, None]


In [45]:
# Write the dotless tokenizer's files into this local directory.
# Left as the cell's last expression so the returned file paths are displayed.
dotless_tokenizer.save_pretrained("AR-dotless-dotless-tokenizer")


('AR-dotless-dotless-tokenizer/tokenizer_config.json',
 'AR-dotless-dotless-tokenizer/special_tokens_map.json',
 'AR-dotless-dotless-tokenizer/tokenizer.json')

In [46]:
# Write the clean tokenizer's files into this local directory.
# Left as the cell's last expression so the returned file paths are displayed.
clean_tokenizer.save_pretrained("AR-dotless-clean-tokenizer")


('AR-dotless-clean-tokenizer/tokenizer_config.json',
 'AR-dotless-clean-tokenizer/special_tokens_map.json',
 'AR-dotless-clean-tokenizer/tokenizer.json')

In [51]:
# Upload the dotless tokenizer to the Hugging Face Hub under this repo name.
# NOTE(review): this needs an authenticated session; `notebook_login` is
# imported but never called in the visible cells — confirm how login happens.
dotless_tokenizer.push_to_hub("AR-dotless-dotless-tokenizer")


CommitInfo(commit_url='https://huggingface.co/dot-ammar/AR-dotless-dotless-tokenizer/commit/1831a3a7d5276ddd2b2531d6b2366ffffb11b213', commit_message='Upload tokenizer', commit_description='', oid='1831a3a7d5276ddd2b2531d6b2366ffffb11b213', pr_url=None, pr_revision=None, pr_num=None)

In [50]:
# Upload the clean tokenizer to the Hugging Face Hub under this repo name.
# NOTE(review): this needs an authenticated session; `notebook_login` is
# imported but never called in the visible cells — confirm how login happens.
clean_tokenizer.push_to_hub("AR-dotless-clean-tokenizer")

CommitInfo(commit_url='https://huggingface.co/dot-ammar/AR-dotless-clean-tokenizer/commit/a1adfda1f3a664f4c8cc2ac96086e74c8bda74d8', commit_message='Upload tokenizer', commit_description='', oid='a1adfda1f3a664f4c8cc2ac96086e74c8bda74d8', pr_url=None, pr_revision=None, pr_num=None)