In [1]:
from datasets import *
from transformers import AutoTokenizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from huggingface_hub import notebook_login


In [2]:
# Load the previously saved dataset from a directory relative to the working
# directory. `load_from_disk` is not imported by name; it presumably comes in
# through the wildcard `from datasets import *`.
raw_datasets = load_from_disk("AR-dotless-mediumPlus-arrow")

In [3]:
# Show the overall dataset layout (splits, columns, row counts) followed by a
# blank line, then one concrete record from the training split.
print(raw_datasets, '\n')
first_record = raw_datasets['train'][0]
print(first_record)

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 4446330
    })
}) 

{'clean': 'القتالية بأسرع وقت ممكن واستئناف العملية التفاوضية في', 'dotless': 'الفبالىه باسرع وفب ممكن واسبىناف العملىه البفاوصىه فى'}


In [4]:
# Load the tokenizer that ships with an existing checkpoint. It is not used to
# tokenize the corpus directly; it is the object on which
# `train_new_from_iterator` is called later to train a new tokenizer.
checkpoint = "google/t5-v1_1-small"
old_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield batches of text for tokenizer training.

    The dataset is walked in slices of ``batch_size`` rows. For every slice
    the ``'clean'`` column is yielded first and the ``'dotless'`` column
    second, so both spellings of each sentence reach the trainer.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping that has ``'clean'`` and ``'dotless'`` entries.
            Defaults to ``raw_datasets["train"]``.
        batch_size: Number of rows per slice; must be a positive integer.

    Yields:
        The ``'clean'`` values of a slice, then the ``'dotless'`` values of
        the same slice.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration, not at
            call time.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if dataset is None:
        # Resolved lazily so that merely creating the generator never touches
        # the global dataset.
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        for column in ("clean", "dotless"):
            yield samples[column]


# NOTE: a generator can be consumed only once. Call get_training_corpus()
# again to obtain a fresh iterator before re-running anything that consumes it.
training_corpus = get_training_corpus()



In [6]:
# Train a new 32000-entry tokenizer from `old_tokenizer` on our corpus.
# A fresh generator is created right here on purpose: generators are
# single-use, so passing the module-level `training_corpus` would hand the
# trainer an already-exhausted iterator whenever this cell is executed a
# second time without re-running the cell that defines it.
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=32000)






In [14]:
# Sanity check: display the vocabulary size of the trained tokenizer
# (32000 was requested when training it).
tokenizer.vocab_size

32000

In [15]:
# One sample sentence in two spellings, used below to compare tokenizations.
# Presumably `test_d` / `test_c` mirror the dataset's 'dotless' / 'clean'
# columns: `test_d` is written without the dots and hamzas that `test_c` carries.
test_d = "ىا رب معبرفا بان ابامى فد طمب فوف"
test_c = "يا رب معترفا بأن أثامى قد طمت فوق"

In [16]:
def _show_encoding(text):
    """Print the encoding of ``text``, its token strings and ``text`` itself.

    Returns the encoding so the caller can keep it.
    """
    encoded = tokenizer(text)
    print(encoded)
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
    print(text)
    return encoded


# Report for the dotless sample.
inputs_d = _show_encoding(test_d)
tokens_d = tokenizer.tokenize(test_d)

# Two empty lines between the two reports.
print("\n")

# Report for the dotted sample.
inputs_c = _show_encoding(test_c)
tokens_c = tokenizer.tokenize(test_c)



{'input_ids': [103, 111, 106, 971, 4908, 139, 106, 268, 103, 128, 2785, 212, 103, 21230, 1344, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ى', 'ا', '▁رب', '▁معبر', 'ف', 'ا', '▁بان', '▁', 'اب', 'امى', '▁فد', '▁', 'طمب', '▁فوف', '</s>']
ىا رب معبرفا بان ابامى فد طمب فوف


{'input_ids': [341, 971, 133, 123, 997, 106, 367, 20063, 2785, 228, 103, 195, 117, 123, 1473, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁يا', '▁رب', '▁مع', 'ت', 'رف', 'ا', '▁بأن', '▁أث', 'امى', '▁قد', '▁', 'ط', 'م', 'ت', '▁فوق', '</s>']
يا رب معترفا بأن أثامى قد طمت فوق


In [17]:
# Encode the dotless sample once, then print three views of that one result:
# the raw encoding, its token strings, and the word index assigned to each token.
encoding = tokenizer(test_d)
for view in (encoding, encoding.tokens(), encoding.word_ids()):
    print(view)

{'input_ids': [103, 111, 106, 971, 4908, 139, 106, 268, 103, 128, 2785, 212, 103, 21230, 1344, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ى', 'ا', '▁رب', '▁معبر', 'ف', 'ا', '▁بان', '▁', 'اب', 'امى', '▁فد', '▁', 'طمب', '▁فوف', '</s>']
[0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 4, 5, 6, 6, 7, None]


In [18]:
# Write the trained tokenizer's files into the local "AR-dotless-tokenizer"
# directory; the call's return value (the written file paths) is displayed.
tokenizer.save_pretrained("AR-dotless-tokenizer")


('AR-dotless-tokenizer/tokenizer_config.json',
 'AR-dotless-tokenizer/special_tokens_map.json',
 'AR-dotless-tokenizer/tokenizer.json')

In [19]:
# Upload the tokenizer to the Hugging Face Hub under the repo name
# "AR-dotless-tokenizer".
# NOTE(review): no login call (e.g. the imported `notebook_login`) appears in
# the cells shown — presumably credentials were configured beforehand; confirm.
tokenizer.push_to_hub("AR-dotless-tokenizer")


CommitInfo(commit_url='https://huggingface.co/dot-ammar/AR-dotless-tokenizer/commit/21526c32df797ca0d1ce7129b1f1c4556475d199', commit_message='Upload tokenizer', commit_description='', oid='21526c32df797ca0d1ce7129b1f1c4556475d199', pr_url=None, pr_revision=None, pr_num=None)