# Tokenization

In [1]:
!pip install -r requirements.txt



## Loading a Turkish Pre-trained Tokenizer

In [2]:
from transformers import AutoModel, AutoTokenizer

tokenizerTUR = AutoTokenizer.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
)
print(f"VOC size is: {tokenizerTUR.vocab_size}")
print(f"The model is {type(tokenizerTUR)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


VOC size is: 32000
The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


## Loading an English Pre-trained Tokenizer

In [3]:
from transformers import AutoModel, AutoTokenizer

tokenizerEN = AutoTokenizer.from_pretrained("bert-base-uncased")
print(f"VOC size is: {tokenizerEN.vocab_size}")
print(f"The model is {type(tokenizerEN)}")

VOC size is: 30522
The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


In [4]:
word_en = "telecommunications"
print(f"is in Turkish Model ? {word_en in tokenizerTUR.vocab}")
print(f"is in English Model ? {word_en in tokenizerEN.vocab}")

is in Turkish Model ? False
is in English Model ? True


In [5]:
tokens = tokenizerTUR.tokenize(word_en)
tokens

['tel', '##eco', '##mm', '##un', '##ica', '##tions']

In [6]:
[t in tokenizerTUR.vocab for t in tokens]

[True, True, True, True, True, True]

In [7]:
tokens = tokenizerEN.tokenize(word_en)
tokens

['telecommunications']

In [8]:
long_word_tur = "Muvaffakiyetsizleştiricileştiriveremeyebileceklerimizdenmişsinizcesine"

'''
It means that “As though you happen to have been from among those whom we will not be able to easily/quickly make a maker of unsuccessful ones”
'''

## Understanding Tokenization Algorithms

### Train tokenizers from scratch

let's load Shakespeare plays from gutenberg project

In [9]:
import nltk
from nltk.corpus import gutenberg

nltk.download("gutenberg")
nltk.download("punkt")
nltk.download('punkt_tab')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [10]:
# !apt-get install tree
!tree -a /root/nltk_data

[01;34m/root/nltk_data[0m
├── [01;34mcorpora[0m
│   ├── [01;34mgutenberg[0m
│   │   ├── [00mausten-emma.txt[0m
│   │   ├── [00mausten-persuasion.txt[0m
│   │   ├── [00mausten-sense.txt[0m
│   │   ├── [00mbible-kjv.txt[0m
│   │   ├── [00mblake-poems.txt[0m
│   │   ├── [00mbryant-stories.txt[0m
│   │   ├── [00mburgess-busterbrown.txt[0m
│   │   ├── [00mcarroll-alice.txt[0m
│   │   ├── [00mchesterton-ball.txt[0m
│   │   ├── [00mchesterton-brown.txt[0m
│   │   ├── [00mchesterton-thursday.txt[0m
│   │   ├── [00medgeworth-parents.txt[0m
│   │   ├── [00mmelville-moby_dick.txt[0m
│   │   ├── [00mmilton-paradise.txt[0m
│   │   ├── [00mREADME[0m
│   │   ├── [00mshakespeare-caesar.txt[0m
│   │   ├── [00mshakespeare-hamlet.txt[0m
│   │   ├── [00mshakespeare-macbeth.txt[0m
│   │   └── [00mwhitman-leaves.txt[0m
│   └── [01;31mgutenberg.zip[0m
└── [01;34mtokenizers[0m
    ├── [01;34mpunkt[0m
    │   ├── [00mczech.pickle[0m
    │   ├── [00mdanish.pi

In [11]:
import nltk
from nltk.corpus import gutenberg

nltk.download("gutenberg")
nltk.download("punkt")
plays = ["shakespeare-macbeth.txt", "shakespeare-hamlet.txt", "shakespeare-caesar.txt"]
# for ply in plays:
#     print("ply:", ply)
#     print(gutenberg.sents(ply))
#     for s in gutenberg.sents(ply):
#         print(" ".join(s))
#     print("--------" * 3)
shakespeare = [" ".join(s) for ply in plays for s in gutenberg.sents(ply)]
shakespeare

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['[ The Tragedie of Macbeth by William Shakespeare 1603 ]',
 'Actus Primus .',
 'Scoena Prima .',
 'Thunder and Lightning .',
 'Enter three Witches .',
 '1 .',
 'When shall we three meet againe ?',
 'In Thunder , Lightning , or in Raine ?',
 '2 .',
 "When the Hurley - burley ' s done , When the Battaile ' s lost , and wonne",
 '3 .',
 'That will be ere the set of Sunne',
 '1 .',
 'Where the place ?',
 '2 .',
 'Vpon the Heath',
 '3 .',
 'There to meet with Macbeth',
 '1 .',
 'I come , Gray - Malkin',
 'All .',
 'Padock calls anon : faire is foule , and foule is faire , Houer through the fogge and filthie ayre .',
 'Exeunt .',
 'Scena Secunda .',
 'Alarum within .',
 'Enter King Malcome , Donalbaine , Lenox , with attendants , meeting a bleeding Captaine .',
 'King .',
 'What bloody man is that ?',
 'he can report , As seemeth by his plight , of the Reuolt The newest state',
 'Mal .',
 "This is the Serieant , Who like a good and hardie Souldier fought ' Gainst my Captiuitie : Haile braue

In [12]:
from tokenizers.processors import TemplateProcessing

special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
print("special_tokens:", special_tokens)

temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_tokens.index("[CLS]")),
        ("[SEP]", special_tokens.index("[SEP]")),
    ],
)
print("temp_proc:", temp_proc)

special_tokens: ['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]']
temp_proc: TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[1], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[2], tokens=["[SEP]"])})


## Training BPE

In [13]:
from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

# Instantiate BPE (Byte-Pair Encoding)
tokenizer = Tokenizer(BPE())

# a unicode normalizer, lowercasing and , replacing accents in order  :
# * Sequence : It composes multiple PreTokenizer that will be run in the given order
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Whitespace: Splits on word boundaries using the regular expression \w+|[^\w\s]+
tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
tokenizer.post_processor = temp_proc

We are ready to train the model

In [14]:
from tokenizers.trainers import BpeTrainer

trainer = BpeTrainer(vocab_size=5000, special_tokens=special_tokens)
tokenizer.train_from_iterator(shakespeare, trainer=trainer)
print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

Trained vocab size: 5000


In [15]:
sen = "Is this a dagger which I see before me, the handle toward my hand?"
sen_enc = tokenizer.encode(sen)
print(f"Output: {format(sen_enc.tokens)}")

Output: ['[CLS]', 'is', 'this', 'a', 'dagger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'hand', 'le', 'toward', 'my', 'hand', '?', '[SEP]']


In [16]:
sen_enc2 = tokenizer.encode("Macbeth and Hugging Face")

In [17]:
print(f"Output: {format(sen_enc2.tokens)}")

Output: ['[CLS]', 'macbeth', 'and', 'hu', 'gg', 'ing', 'face', '[SEP]']


In [18]:
two_enc = tokenizer.encode("I like Hugging Face!", "He likes Macbeth!")

In [19]:
print(f"Output: {format(two_enc.tokens)}")

Output: ['[CLS]', 'i', 'like', 'hu', 'gg', 'ing', 'face', '!', '[SEP]', 'he', 'likes', 'macbeth', '!', '[SEP]']


In [20]:
tokenizer.model.save(".")

['./vocab.json', './merges.txt']

In [21]:
!wc -l ./merges.txt

4948 ./merges.txt


In [22]:
!head -6 ./merges.txt

#version: 0.2
t h
o u
a n
th e
r e


In [23]:
!head -1000 ./merges.txt | tail -5

ch ance
si g
your s
ti a
po int


In [24]:
# Save and Load Tokenizer
tokenizer.save("MyBPETokenizer.json")
tokenizerFromFile = Tokenizer.from_file("MyBPETokenizer.json")
sen_enc3 = tokenizerFromFile.encode("I like HuggingFace and Macbeth")
print(f"Output: {format(sen_enc3.tokens)}")

Output: ['[CLS]', 'i', 'like', 'hu', 'gg', 'ing', 'face', 'and', 'macbeth', '[SEP]']


## Training WordPiece

In [25]:
from tokenizers.models import WordPiece
from tokenizers.decoders import WordPiece as WordPieceDecoder
from tokenizers.normalizers import BertNormalizer

# BERT normalizer includes cleaning the text, handling accents, chinese chars and lowercasing

tokenizer = Tokenizer(WordPiece())
tokenizer.normalizer = BertNormalizer()
tokenizer.pre_tokenizer = Whitespace()

tokenizer.decoder = WordPieceDecoder()

In [26]:
from tokenizers.trainers import WordPieceTrainer

trainer = WordPieceTrainer(
    vocab_size=5000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

tokenizer.train_from_iterator(shakespeare, trainer=trainer)
output = tokenizer.encode(sen)
print(output.tokens)

['is', 'this', 'a', 'dagger', 'which', 'i', 'see', 'before', 'me', ',', 'the', 'hand', '##le', 'toward', 'my', 'hand', '?']


In [27]:
tokenizer.decode(output.ids)

'is this a dagger which i see before me, the handle toward my hand?'

In [28]:
tokenizer.encode("Kralsın aslansın Macbeth!").tokens

['[UNK]', '[UNK]', 'macbeth', '!']

# Pre-made tokenizers
* CharBPETokenizer: The original BPE
* ByteLevelBPETokenizer: The byte level version of the BPE
* SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
* BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

In [29]:
from tokenizers import (
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    BertWordPieceTokenizer,
)

In [30]:
tokenizer = SentencePieceBPETokenizer()
print(tokenizer.normalizer)
print(tokenizer.pre_tokenizer)
print(tokenizer.decoder)
print(tokenizer.post_processor)

NFKC()
Metaspace(replacement="▁", prepend_scheme=always, split=True)
Metaspace(replacement="▁", prepend_scheme=always, split=True)
None


In [31]:
tokenizer = BertWordPieceTokenizer()
print(tokenizer.normalizer)
print(tokenizer.pre_tokenizer)
print(tokenizer.decoder)
print(tokenizer.post_processor)

BertNormalizer(clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True)
BertPreTokenizer()
WordPiece(prefix="##", cleanup=True)
None
