In [1]:
from datasets import *
from transformers import AutoTokenizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from huggingface_hub import notebook_login


In [3]:
# Directory (relative to the notebook) holding the dataset saved on disk.
dataset_dir = "AR-dotless-medium-mixed-arrow"
raw_datasets = load_from_disk(dataset_dir)

In [4]:
# Show the split layout first, then the first training row as a sample.
print(raw_datasets, "\n")
train_split = raw_datasets["train"]
print(train_split[0])

DatasetDict({
    train: Dataset({
        features: ['clean', 'dotless'],
        num_rows: 4446330
    })
}) 

{'clean': 'التشكيلات الحضارية المشرقية الفارسية التركية الهندية المغولية في', 'dotless': 'التشكيلات الحصارىه المسرفىه الفارسىه التركية الهندية المعولىه في'}


In [5]:
# Checkpoint whose tokenizer is loaded here; later cells retrain it on the
# dataset loaded above.
checkpoint = "google/t5-v1_1-small"
old_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint,
)

tokenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
def get_training_corpus(dataset=None, batch_size=1000, columns=("clean", "dotless")):
    """Yield batches of text for tokenizer training.

    The dataset is walked in consecutive slices of ``batch_size`` rows. For
    every slice, one item is yielded per entry in ``columns`` (in that order),
    so each requested column is fed to the trainer.

    Args:
        dataset: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping from column name to that column's values.
            When omitted, ``raw_datasets["train"]`` is used; the lookup
            happens when iteration starts, not when the generator is created.
        batch_size: Number of rows per slice. Must be a positive integer.
        columns: Names of the columns to emit from every slice.

    Yields:
        The value stored under one column name for one slice.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration step.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    if dataset is None:
        # Default to the notebook-level training split.
        dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx : start_idx + batch_size]
        for column in columns:
            yield samples[column]


# A generator can only be consumed once; call get_training_corpus() again
# whenever a fresh pass over the data is needed.
training_corpus = get_training_corpus()



In [8]:
# Vocabulary size requested for the retrained tokenizer.
VOCAB_SIZE = 32000

# Build a fresh generator in this cell rather than reusing `training_corpus`:
# a generator is exhausted after one pass, so re-running this cell on its own
# would otherwise hand the trainer an empty iterator.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=VOCAB_SIZE,
)






In [9]:
# Sanity check: display the vocabulary size of the newly trained tokenizer so
# it can be compared with the size requested at training time.
tokenizer.vocab_size

32000

In [10]:
# Sample sentence pair used to spot-check the tokenizer in the cells below.
# `test_d` is written without the distinguishing letter dots and `test_c`
# appears to be the same sentence with them (presumably d = dotless and
# c = clean, mirroring the dataset's 'dotless' / 'clean' columns).
test_d = "ىا رب معبرفا بان ابامى فد طمب فوف"
test_c = "يا رب معترفا بأن أثامى قد طمت فوق"

In [14]:
def describe_tokenization(text):
    """Encode ``text``, print the encoding, its token strings and the text.

    Returns:
        A ``(encoded, pieces)`` pair: the result of calling the tokenizer on
        ``text`` and the result of ``tokenizer.tokenize(text)``.
    """
    encoded = tokenizer(text)
    pieces = tokenizer.tokenize(text)
    print(encoded)
    print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
    print(text)
    return encoded, pieces


inputs_d, tokens_d = describe_tokenization(test_d)

# Two blank lines between the two reports.
print("\n")

inputs_c, tokens_c = describe_tokenization(test_c)



{'input_ids': [103, 119, 104, 1028, 6358, 141, 104, 455, 826, 5161, 478, 103, 203, 117, 122, 6035, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ى', 'ا', '▁رب', '▁معبر', 'ف', 'ا', '▁بان', '▁اب', 'امى', '▁فد', '▁', 'ط', 'م', 'ب', '▁فوف', '</s>']
ىا رب معبرفا بان ابامى فد طمب فوف


{'input_ids': [103, 109, 104, 1028, 23397, 104, 260, 15397, 5161, 178, 103, 203, 117, 112, 775, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ي', 'ا', '▁رب', '▁معترف', 'ا', '▁بأن', '▁أث', 'امى', '▁قد', '▁', 'ط', 'م', 'ت', '▁فوق', '</s>']
يا رب معترفا بأن أثامى قد طمت فوق


In [15]:
# Encode the dotless sample, then show the raw encoding, its tokens, and the
# word index assigned to each token, one per line.
encoding = tokenizer(test_d)
print(encoding, encoding.tokens(), encoding.word_ids(), sep="\n")

{'input_ids': [103, 119, 104, 1028, 6358, 141, 104, 455, 826, 5161, 478, 103, 203, 117, 122, 6035, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'ى', 'ا', '▁رب', '▁معبر', 'ف', 'ا', '▁بان', '▁اب', 'امى', '▁فد', '▁', 'ط', 'م', 'ب', '▁فوف', '</s>']
[0, 0, 0, 1, 2, 2, 2, 3, 4, 4, 5, 6, 6, 6, 6, 7, None]


In [16]:
# Write the tokenizer files to a local directory; the call stays the last
# expression so the returned file paths are displayed.
output_dir = "AR-dotless-tokenizer-mixed"
tokenizer.save_pretrained(output_dir)


('AR-dotless-tokenizer-mixed/tokenizer_config.json',
 'AR-dotless-tokenizer-mixed/special_tokens_map.json',
 'AR-dotless-tokenizer-mixed/tokenizer.json')

In [17]:
# Upload the tokenizer to the Hugging Face Hub under this repository name;
# the call stays the last expression so the commit info is displayed.
hub_repo_name = "AR-dotless-tokenizer-mixed"
tokenizer.push_to_hub(hub_repo_name)


CommitInfo(commit_url='https://huggingface.co/dot-ammar/AR-dotless-tokenizer-mixed/commit/0aad0a4aa1b2ebed143d650ae6ec925bef83a7c3', commit_message='Upload tokenizer', commit_description='', oid='0aad0a4aa1b2ebed143d650ae6ec925bef83a7c3', pr_url=None, pr_revision=None, pr_num=None)