In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast

import srsly

  from .autonotebook import tqdm as notebook_tqdm


# Chars

In [2]:
texts = list(srsly.read_json("../data/IIIT5K/IIIT5K/train.json").values())

In [3]:
charset = set(" ".join(texts))

In [4]:
list(charset)

['F',
 'A',
 'J',
 'Y',
 'O',
 '0',
 '6',
 'S',
 '5',
 '4',
 'W',
 '1',
 'T',
 '2',
 'D',
 'B',
 'U',
 'K',
 'Q',
 'C',
 '9',
 'M',
 'H',
 'E',
 'I',
 'L',
 'V',
 'P',
 'R',
 'X',
 'G',
 ' ',
 '7',
 '8',
 'N',
 '3',
 'Z']

In [5]:
with open("iiit5k-vocab.txt", "w") as f:
    f.write("\n".join(sorted(list(charset))))

In [6]:
vocab = {
    # "<blank>": 0,
    "<pad>": 0,
    "</s>": 1,
    "<s>": 2,
    "<unk>": 3,
    "<mask>": 4,
}
vocab.update({c: i + 6 for i, c in enumerate(charset)})

In [7]:
srsly.write_json("iiit5k-vocab.json", vocab)

# Tokenizer

In [8]:
tokenizer = Tokenizer(BPE(vocab, unk_token="<unk>", merges=[]))
tokenizer.decoder = ByteLevel()
tokenizer.post_processor = TemplateProcessing(
    single="$A </s>",
    special_tokens=[
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

In [9]:
tokenizer.decode(tokenizer.encode("hello world".upper()).ids)

'HELLO WORLD</s>'

In [10]:
tokenizer.encode_batch(["hello world", "hello world"])

[Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [11]:
tokenizer.add_special_tokens(["<pad>", "</s>"])

0

In [13]:
tokenizer.save("../trainer/iiit5k-tokenizers/tokenizer-pad0.json")

The OrderedVocab you are attempting to save contains a hole for index 5, your vocabulary could be corrupted !


In [14]:
tok = PreTrainedTokenizerFast(tokenizer_file="../trainer/iiit5k-tokenizers/tokenizer-pad0.json")

In [15]:
tok.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

0

In [16]:
inputs = tok.batch_encode_plus(["hello world".upper(), "hello world ausha".upper()], padding="longest", return_tensors="pt")

In [17]:
tok.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELLO WORLD', 'HELLO WORLD AUSHA']

In [18]:
tok.save_pretrained("../trainer/iiit5k-tokenizers/tokenizer-pad0")

The OrderedVocab you are attempting to save contains a hole for index 5, your vocabulary could be corrupted !


('../trainer/iiit5k-tokenizers/tokenizer-pad0/tokenizer_config.json',
 '../trainer/iiit5k-tokenizers/tokenizer-pad0/special_tokens_map.json',
 '../trainer/iiit5k-tokenizers/tokenizer-pad0/tokenizer.json')

In [19]:
tok.pad_token_id, tok.eos_token_id

(0, 1)