# Tokenization

In [None]:
!pip install transformers

## Loading a Turkish Pre-trained Tokenizer

In [None]:
from transformers import AutoModel, AutoTokenizer

# Hub checkpoint whose tokenizer is inspected in the following cells.
turkish_checkpoint = "dbmdz/bert-base-turkish-uncased"
tokenizerTUR = AutoTokenizer.from_pretrained(turkish_checkpoint)

# Report the vocabulary size and the concrete tokenizer class that was loaded.
turkish_vocab_size = tokenizerTUR.vocab_size
turkish_tokenizer_class = type(tokenizerTUR)
print(f"VOC size is: {turkish_vocab_size}")
print(f"The model is {turkish_tokenizer_class}")

## Loading an English Pre-trained Tokenizer

In [None]:
from transformers import AutoModel, AutoTokenizer

# Hub checkpoint of the English tokenizer used for comparison.
english_checkpoint = "bert-base-uncased"
tokenizerEN = AutoTokenizer.from_pretrained(english_checkpoint)

# Report the vocabulary size and the concrete tokenizer class that was loaded.
english_vocab_size = tokenizerEN.vocab_size
english_tokenizer_class = type(tokenizerEN)
print(f"VOC size is: {english_vocab_size}")
print(f"The model is {english_tokenizer_class}")

In [None]:
# Check whether the English word exists as a single entry in each vocabulary.
word_en = "telecommunications"
for language, candidate_tokenizer in (("Turkish", tokenizerTUR), ("English", tokenizerEN)):
    print(f"is in {language} Model ? {word_en in candidate_tokenizer.vocab}")

In [None]:
# Tokenize the English word with the Turkish tokenizer and display the result.
tokens = tokenizerTUR.tokenize(word_en)
tokens

However, the individual pieces are all in the Turkish model's vocabulary:

In [None]:
# Look the vocabulary up once, then test every piece for membership.
turkish_vocab = tokenizerTUR.vocab
[piece in turkish_vocab for piece in tokens]

In [None]:
# Tokenize the same word with the English tokenizer and display the result.
# NOTE: this rebinds `tokens`, so the pieces produced by the Turkish tokenizer
# are no longer available under that name once this cell has run.
tokens = tokenizerEN.tokenize(word_en)
tokens

In [None]:
long_word_tur = "Muvaffakiyetsizleştiricileştiriveremeyebileceklerimizdenmişsinizcesine"

# It means: “As though you happen to have been from among those whom we will not be
# able to easily/quickly make a maker of unsuccessful ones”
# (Kept as a comment: a bare string literal at the end of a cell is echoed as the
# cell's output instead of acting as documentation.)

In [None]:
# Split the very long Turkish word into pieces and print them.
long_word_pieces = tokenizerTUR.tokenize(long_word_tur)
print(long_word_pieces)

## Understanding Tokenization Algorithms

### Train tokenizers from scratch

Let's load Shakespeare's plays from the Project Gutenberg corpus that ships with NLTK.

In [None]:
import nltk
from nltk.corpus import gutenberg

# gutenberg.sents() needs the corpus itself plus a Punkt sentence tokenizer.
# Recent NLTK releases load that tokenizer from the "punkt_tab" resource rather
# than the older pickled "punkt" one, so request both to work on either version.
# nltk.download() reports a failed download by returning False (it does not raise
# by default), so asking for a resource that cannot be fetched is harmless.
for resource in ("gutenberg", "punkt", "punkt_tab"):
    nltk.download(resource)

plays = ["shakespeare-macbeth.txt", "shakespeare-hamlet.txt", "shakespeare-caesar.txt"]

# One space-joined string per sentence, taken from all three plays in order.
shakespeare = [" ".join(sentence) for play in plays for sentence in gutenberg.sents(play)]

In [None]:
# Some initial settings:
# we prepare a template for the post-processing step

In [None]:
from tokenizers.processors import TemplateProcessing

# Special tokens; each token's position in this list is used as its id below.
special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

cls_id = special_tokens.index("[CLS]")
sep_id = special_tokens.index("[SEP]")

# Template that wraps a single sequence as "[CLS] A [SEP]" and a pair as
# "[CLS] A [SEP] B [SEP]", with the second segment tagged as type id 1.
temp_proc = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

## Training BPE

In [None]:
from tokenizers import Tokenizer
from tokenizers.normalizers import Sequence, Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

# Instantiate BPE (Byte-Pair Encoding).
# unk_token names the token the model emits for input it cannot represent.
# Without it, "[UNK]" would be trained into the vocabulary as a special token
# but never actually produced when encoding.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

# Normalization: normalizers.Sequence chains several normalizers and runs them in
# the given order -- Unicode NFD decomposition, lowercasing, then accent stripping.
tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])

# Whitespace: splits on word boundaries using the regular expression \w+|[^\w\s]+
tokenizer.pre_tokenizer = Whitespace()
tokenizer.decoder = BPEDecoder()
# Wrap encoded sequences with [CLS]/[SEP] using the template defined earlier.
tokenizer.post_processor = temp_proc

We are now ready to train the model.

In [None]:
from tokenizers.trainers import BpeTrainer

# Train on the Shakespeare sentences with a 5000-entry vocabulary target,
# passing the special tokens to the trainer.
trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=5000)
tokenizer.train_from_iterator(shakespeare, trainer=trainer)

trained_vocab_size = tokenizer.get_vocab_size()
print(f"Trained vocab size: {trained_vocab_size}")

In [None]:
# Take a sentence from Macbeth

In [None]:
# Encode one sentence with the trained tokenizer and print its token strings.
sen = "Is this a dagger which I see before me, the handle toward my hand?"
sen_enc = tokenizer.encode(sen)
print(f"Output: {sen_enc.tokens}")

In [None]:
# Encode a second sentence; its tokens are printed in the next cell.
sen_enc2 = tokenizer.encode("Macbeth and Hugging Face")

In [None]:
# Print the token strings of the second encoded sentence.
print(f"Output: {sen_enc2.tokens}")

In [None]:
# Let us pass two sentences as a pair

In [None]:
# Encode two sentences in a single call; the tokens are printed in the next cell.
two_enc = tokenizer.encode("I like Hugging Face!", "He likes Macbeth!")

In [None]:
# Print the token strings of the encoded sentence pair.
print(f"Output: {two_enc.tokens}")

In [None]:
# Save the trained model's files into the current directory
# (the following cells inspect ./merges.txt).
tokenizer.model.save(".")

In [None]:
# Count the lines in the saved merges file.
!wc -l ./merges.txt

In [None]:
# Show the first 6 lines of the merges file.
!head -6 ./merges.txt

In [None]:
# Show the last 5 of the first 1000 lines of the merges file.
!head -1000 ./merges.txt| tail -5

In [None]:
# Save and Load Tokenizer

In [None]:
# Save the tokenizer to a JSON file, load it back, and encode with the reloaded copy.
tokenizer_json_path = "MyBPETokenizer.json"
tokenizer.save(tokenizer_json_path)
tokenizerFromFile = Tokenizer.from_file(tokenizer_json_path)

sen_enc3 = tokenizerFromFile.encode("I like HuggingFace and Macbeth")
print(f"Output: {sen_enc3.tokens}")

## Training WordPiece

In [None]:
from tokenizers.models import WordPiece
from tokenizers.decoders import WordPiece as WordPieceDecoder
from tokenizers.normalizers import BertNormalizer

# Start a new pipeline around a WordPiece model (this rebinds `tokenizer`).
wordpiece_model = WordPiece()
tokenizer = Tokenizer(wordpiece_model)

# The three components are assigned independently of one another.
tokenizer.decoder = WordPieceDecoder()
tokenizer.pre_tokenizer = Whitespace()
# BERT normalizer includes cleaning the text, handling accents, chinese chars and lowercasing
tokenizer.normalizer = BertNormalizer()

In [None]:
from tokenizers.trainers import WordPieceTrainer

# Reuse the `special_tokens` list defined with the post-processing template instead
# of repeating the literal, so the BPE and WordPiece trainers cannot drift apart.
trainer = WordPieceTrainer(vocab_size=5000, special_tokens=special_tokens)

# Train on the same Shakespeare sentences, then encode the sample sentence `sen`.
tokenizer.train_from_iterator(shakespeare, trainer=trainer)
output = tokenizer.encode(sen)
print(output.tokens)

In [None]:
# Let us use the WordPiece decoder to turn the encoded ids back into a properly formatted sentence.

In [None]:
# Decode the ids of the encoded sample sentence back into text and display it.
tokenizer.decode(output.ids)

In [None]:
# Force the model to produce [UNK] tokens

In [None]:
# Encode the sentence, then display its token strings.
unk_demo_enc = tokenizer.encode("Kralsın aslansın Macbeth!")
unk_demo_enc.tokens

# Pre-made tokenizers 
* CharBPETokenizer: The original BPE
* ByteLevelBPETokenizer: The byte level version of the BPE
* SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
* BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

In [None]:
# Fast Tokenizers optimized for Research and Production

In [None]:
from tokenizers import (
    ByteLevelBPETokenizer,
    CharBPETokenizer,
    SentencePieceBPETokenizer,
    BertWordPieceTokenizer,
)

In [None]:
# Print the pipeline components of a freshly constructed SentencePieceBPETokenizer.
tokenizer = SentencePieceBPETokenizer()
for component_name in ("normalizer", "pre_tokenizer", "decoder", "post_processor"):
    print(getattr(tokenizer, component_name))

In [None]:
# Print the pipeline components of a freshly constructed BertWordPieceTokenizer.
tokenizer = BertWordPieceTokenizer()
for component_name in ("normalizer", "pre_tokenizer", "decoder", "post_processor"):
    print(getattr(tokenizer, component_name))