In [1]:
from datasets import load_dataset

# BookCorpus is distributed with a loading script, so `datasets` stops and asks
# for confirmation unless `trust_remote_code=True` is passed. Passing it keeps
# "Restart & Run All" non-interactive.
# NOTE: this executes code from the dataset repository
# (https://hf.co/datasets/bookcorpus) -- review it before running.
ds = load_dataset('bookcorpus', split='all', trust_remote_code=True)
ds

Downloading builder script:   0%|          | 0.00/3.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/18.5k [00:00<?, ?B/s]

The repository for bookcorpus contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bookcorpus.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/74004228 [00:00<?, ? examples/s]

Dataset({
    features: ['text'],
    num_rows: 74004228
})

In [2]:
# Slice the rows first and then select the column: `ds['text']` reads the whole
# 74M-row column just to keep the first six entries.
ds[:6]['text']

['usually , he would be tearing around the living room , playing with his toys .',
 'but just one look at a minion sent him practically catatonic .',
 "that had been megan 's plan when she got him dressed earlier .",
 "he 'd seen the movie almost by mistake , considering he was a little young for the pg cartoon , but with older cousins , along with her brothers , mason was often exposed to things that were older .",
 'she liked to think being surrounded by adults and older kids was one reason why he was a such a good talker for his age .',
 "`` are n't you being a good boy ? ''"]

In [3]:
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE

In [4]:
from tokenizers import Tokenizer

# Untrained BPE model; "[UNK]" is the token used for pieces it cannot encode.
model = BPE(unk_token="[UNK]")
tokenizer = Tokenizer(model)

# Text pipeline that runs before the model: lowercase the input and
# pre-split it with the Whitespace pre-tokenizer.
tokenizer.pre_tokenizer = Whitespace()
tokenizer.normalizer = Lowercase()

In [5]:
from tokenizers.trainers import BpeTrainer

# Target a 32k-entry vocabulary. The special tokens are listed first and end up
# with the lowest ids ([PAD] -> 0, [UNK] -> 1); word-internal pieces are
# marked with a "##" prefix.
trainer = BpeTrainer(
    vocab_size=32_000,
    special_tokens=["[PAD]", "[UNK]"],
    continuing_subword_prefix="##",
)

In [6]:
def get_examples(batch_size=1_000, dataset=None):
    """Yield successive batches of raw text for tokenizer training.

    Args:
        batch_size: Number of rows per yielded batch; the final batch may be
            shorter. Must be at least 1.
        dataset: Object supporting ``len()`` and slice indexing that returns a
            mapping with a ``'text'`` entry. Defaults to the notebook-level
            ``ds`` so existing calls keep working.

    Yields:
        The ``'text'`` values of each consecutive ``batch_size``-row slice.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (raised when iteration
            starts, since this is a generator).
    """
    if batch_size < 1:
        # A negative step would silently produce no batches at all.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    if dataset is None:
        dataset = ds  # fall back to the corpus loaded at the top of the notebook
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]['text']

In [11]:
# Train the BPE model on the whole corpus, streamed in batches.
# `length` passes the total number of rows to the trainer.
ret = tokenizer.train_from_iterator(
    get_examples(),
    trainer=trainer,
    length=len(ds),
)






In [13]:
from pathlib import Path

# `model.save` does not create the target directory and fails with
# "No such file or directory" when it is missing, so create it first.
Path('model').mkdir(parents=True, exist_ok=True)
tokenizer.model.save('model', prefix='hopper')

In [14]:
!mkdir model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Write the trained BPE model into ./model with the "hopper" file prefix;
# the call returns the paths of the vocab and merges files it wrote.
tokenizer.model.save('model', prefix='hopper')

['model/hopper-vocab.json', 'model/hopper-merges.txt']

In [16]:
# First lines of the merges file: a version header followed by the earliest learned merges.
!head -n10 model/hopper-merges.txt

#version: 0.2
##h ##e
t ##he
##i ##n
##e ##r
##e ##d
##o ##u
##n ##d
##in ##g
t ##o


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
# Last ten merges, printed in reverse order (`tac`), i.e. the final merge first.
!tail -n10 model/hopper-merges.txt | tac

mel ##anthe
black ##er
ad ##ject
v ##ang
betroth ##al
tiptoe ##ing
restroom ##s
consol ##ing
esp ##ionage
influ ##x


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# Line count of the merges file (the version header plus one merge per line).
!wc -l model/hopper-merges.txt

31871 model/hopper-merges.txt


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
# Should equal the vocab_size requested from the trainer (32_000).
tokenizer.get_vocab_size()

32000

In [21]:
# Fetch the vocabulary as a token -> id mapping and confirm its Python type.
vocab = tokenizer.get_vocab()
type(vocab)

dict

In [22]:
# Order the (token, id) pairs by id so the lowest ids are shown first.
vocab_sorted = sorted(vocab.items(), key=lambda pair: pair[1])
vocab_sorted[:10]

[('[PAD]', 0),
 ('[UNK]', 1),
 ('\x13', 2),
 ('\x14', 3),
 ('\x18', 4),
 ('\x19', 5),
 ('\x1c', 6),
 ('\x1d', 7),
 ('\x1f', 8),
 ('!', 9)]

## Encoder and Decoder

In [23]:
# Use the first sentence of the corpus as the running example.
sample = ds[0]['text']
sample

'usually , he would be tearing around the living room , playing with his toys .'

In [24]:
# Encode one sentence; printing the Encoding shows its token count and
# the attributes it exposes.
encoding = tokenizer.encode(sample)
print(encoding)

Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [26]:
# Pull the per-token fields off the Encoding for inspection below.
token_ids, tokens = encoding.ids, encoding.tokens
type_ids, attention_mask = encoding.type_ids, encoding.attention_mask

In [27]:
from tokenizers.tools import EncodingVisualizer
# Visualise how `sample` is split into tokens by the trained tokenizer.
viz = EncodingVisualizer(tokenizer=tokenizer)
viz(text=sample)

In [28]:
import pandas as pd
# Tabulate the encoding: one row per token, one column per Encoding field.
out_dict = {
    'tokens': tokens,
    'ids': token_ids,
    'type_ids': type_ids,
    'attention_mask': attention_mask,
}
df = pd.DataFrame(out_dict)
df

Unnamed: 0,tokens,ids,type_ids,attention_mask
0,usually,2462,0,1
1,",",19,0,1
2,he,149,0,1
3,would,277,0,1
4,be,162,0,1
5,tearing,6456,0,1
6,around,422,0,1
7,the,131,0,1
8,living,1559,0,1
9,room,536,0,1


In [30]:
# Id of the padding token; this is the value passed as `pad_id` when padding is enabled.
vocab['[PAD]']

0

In [32]:
# Encode the first four sentences in one call; with padding not yet enabled,
# each Encoding keeps its own length.
samples = ds[:4]['text']
batch_enc = tokenizer.encode_batch(samples)
batch_enc

[Encoding(num_tokens=16, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=14, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [33]:
# Padding has not been applied yet: the encodings above have different lengths (16, 14, 14 and 42 tokens).

In [34]:
# Pad on the right with [PAD] (id 0). With length=None no fixed length is
# set, so each batch is padded to its longest member.
tokenizer.enable_padding(
    direction='right',
    pad_id=0,
    pad_type_id=0,
    pad_token='[PAD]',
    length=None,
    pad_to_multiple_of=None,
)
# Cap every encoding at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [36]:
# Re-encode the same batch now that padding is enabled: every Encoding is
# padded to the length of the longest sentence in the batch.
tokenizer.encode_batch(samples)

[Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [37]:
# Sanity check on unseen input: mixed case is lowercased, and characters that
# are missing from the vocabulary come out as [UNK].
text = "All this is so simple to do in HF இ😊."
enc = tokenizer.encode(text)
print(enc.tokens)

['all', 'this', 'is', 'so', 'simple', 'to', 'do', 'in', 'h', '##f', '[UNK]', '[UNK]', '##.']


In [39]:
# Serialise the whole tokenizer (normalizer, pre-tokenizer, model and the
# padding/truncation settings) into a single JSON file.
tokenizer.save('hopper.json')

In [40]:
import json

In [41]:
# JSON is UTF-8 by specification; read it with an explicit encoding so the
# result does not depend on the platform's default locale.
with open('hopper.json', 'r', encoding='utf-8') as fin:
    data = json.load(fin)
data

{'version': '1.0',
 'truncation': {'direction': 'Right',
  'max_length': 512,
  'strategy': 'LongestFirst',
  'stride': 0},
 'padding': {'strategy': 'BatchLongest',
  'direction': 'Right',
  'pad_to_multiple_of': None,
  'pad_id': 0,
  'pad_type_id': 0,
  'pad_token': '[PAD]'},
 'added_tokens': [{'id': 0,
   'content': '[PAD]',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': False,
   'special': True},
  {'id': 1,
   'content': '[UNK]',
   'single_word': False,
   'lstrip': False,
   'rstrip': False,
   'normalized': False,
   'special': True}],
 'normalizer': {'type': 'Lowercase'},
 'pre_tokenizer': {'type': 'Whitespace'},
 'post_processor': None,
 'decoder': None,
 'model': {'type': 'BPE',
  'dropout': None,
  'unk_token': '[UNK]',
  'continuing_subword_prefix': '##',
  'end_of_word_suffix': None,
  'fuse_unk': False,
  'byte_fallback': False,
  'ignore_merges': False,
  'vocab': {'[PAD]': 0,
   '[UNK]': 1,
   '\x13': 2,
   '\x14': 3,
   '\x18': 4,
 

In [43]:
# Rebuild the tokenizer from the saved JSON file and check that it encodes
# `text` exactly as the in-memory tokenizer did.
t2 = Tokenizer.from_file('hopper.json')
enc = t2.encode(text)
print(enc.tokens)

['all', 'this', 'is', 'so', 'simple', 'to', 'do', 'in', 'h', '##f', '[UNK]', '[UNK]', '##.']


In [44]:
# BERT-style variant: the same lowercase + whitespace BPE pipeline, trained
# with the extra [CLS], [SEP] and [MASK] special tokens.
bert_tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
bert_tokenizer.pre_tokenizer = Whitespace()
bert_tokenizer.normalizer = Lowercase()
bert_trainer = BpeTrainer(
    vocab_size=32_000,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    continuing_subword_prefix='##',
)

In [45]:
from tokenizers.processors import TemplateProcessing

In [46]:
# Wrap encoded sequences in BERT-style markers. The hard-coded ids (2 and 3)
# must match the positions of [CLS] and [SEP] in `bert_trainer`'s
# special_tokens list, which is what assigns those ids during training.
# NOTE(review): the pair template stops after $B with no closing [SEP];
# confirm this is intended.
bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $0 [SEP]",
    pair="[CLS] $A [SEP] $B:1",
    special_tokens=[("[CLS]", 2), ("[SEP]", 3)]
)
# Train on the full corpus, streamed in batches of 10_000 rows.
bert_tokenizer.train_from_iterator(get_examples(batch_size=10_000),  trainer=bert_trainer, length=len(ds))






In [49]:
from pprint import pprint

In [52]:
# Single-sequence encoding: the post-processor puts [CLS] (id 2) first and
# [SEP] (id 3) last.
text = "All these are so simple to do in HF. Let's do more"
enc = bert_tokenizer.encode(text)
pprint({'ids': enc.ids, 'tokens': enc.tokens}, depth=2, compact=True)

{'ids': [2, 270, 956, 336, 231, 2534, 141, 206, 157, 56, 102, 24, 462, 17, 67,
         206, 387, 3],
 'tokens': ['[CLS]', 'all', 'these', 'are', 'so', 'simple', 'to', 'do', 'in',
            'h', '##f', '.', 'let', "'", 's', 'do', 'more', '[SEP]']}


In [53]:
# Sentence-pair encoding: both texts are passed as separate arguments and
# joined by the post-processor's pair template.
pair = [text, "We have a long way to go!"]
enc = bert_tokenizer.encode(*pair)
pprint(
    {'ids': enc.ids, 'tokens': enc.tokens},
    depth=2,
    compact=True,
)

{'ids': [2, 270, 956, 336, 231, 2534, 141, 206, 157, 56, 102, 24, 462, 17, 67,
         206, 387, 3, 214, 250, 49, 490, 415, 141, 260, 12],
 'tokens': ['[CLS]', 'all', 'these', 'are', 'so', 'simple', 'to', 'do', 'in',
            'h', '##f', '.', 'let', "'", 's', 'do', 'more', '[SEP]', 'we',
            'have', 'a', 'long', 'way', 'to', 'go', '!']}


In [54]:
# Decoding with no decoder configured: tokens are joined with spaces, the
# special tokens are dropped, and the '##' prefixes remain in the text.
bert_tokenizer.decode(enc.ids)

"all these are so simple to do in h ##f . let ' s do more we have a long way to go !"

In [55]:
from tokenizers.decoders import WordPiece
# With a WordPiece decoder, '##'-prefixed pieces are merged back onto the
# preceding token when decoding.
bert_tokenizer.decoder = WordPiece(prefix='##')

bert_tokenizer.decode(enc.ids)

"all these are so simple to do in hf. let ' s do more we have a long way to go!"

## Pretrained Tokenizers

In [56]:
from transformers import PreTrainedTokenizerFast

In [58]:
# Wrap the saved tokenizer file for use with transformers. Passing a single
# file path to `from_pretrained` is deprecated; `tokenizer_file=` is the
# documented way to load a `tokenizers` JSON file.
pt_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='hopper.json',
    unk_token='[UNK]',
    pad_token='[PAD]',
    model_input_names=['input_ids', 'token_type_ids', 'attention_mask'],
)



In [60]:
# Calling the wrapper returns the fields listed in `model_input_names`.
# No [CLS]/[SEP] appear: hopper.json is the first tokenizer, saved without a post-processor.
pprint(pt_tokenizer(text), compact=True)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [267, 953, 333, 228, 2531, 138, 203, 154, 53, 92, 21, 459, 14, 64,
               203, 384],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [62]:
# Sentence pair: tokens of the second text get token_type_ids of 1.
pprint(pt_tokenizer(text, text_pair=pair[-1]), compact=True)

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1],
 'input_ids': [267, 953, 333, 228, 2531, 138, 203, 154, 53, 92, 21, 459, 14, 64,
               203, 384, 211, 247, 46, 487, 412, 138, 257, 9],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
                    1, 1, 1, 1]}


In [64]:
# Batch call without padding: each row keeps its own length.
batch = [
    'I like the book The Psychology of Money',
    'I enjoyed watching the Transformers movie',
    'oh! thanks for this',
]
enc = pt_tokenizer(batch)
pprint(enc, compact=True)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1],
                    [1, 1, 1, 1, 1]],
 'input_ids': [[54, 281, 131, 1701, 131, 19478, 153, 1564],
               [54, 4096, 1443, 131, 7744, 307, 3760],
               [772, 9, 1767, 200, 254]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0]]}


In [65]:
# padding=True pads every row to the longest one in the batch with [PAD] (id 0)
# and sets the attention mask to 0 at the padded positions.
enc = pt_tokenizer(batch, padding=True)
pprint(enc, compact=True)

{'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0],
                    [1, 1, 1, 1, 1, 0, 0, 0]],
 'input_ids': [[54, 281, 131, 1701, 131, 19478, 153, 1564],
               [54, 4096, 1443, 131, 7744, 307, 3760, 0],
               [772, 9, 1767, 200, 254, 0, 0, 0]],
 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0],
                    [0, 0, 0, 0, 0, 0, 0, 0]]}
