In [1]:
from PIL import Image
import os

import srsly

In [2]:
ROOT_DIR = "../data/synth/mnt/"
IMAGES_DIR = ROOT_DIR + "90kDICT32px/"
train_file = ROOT_DIR + "annotation_train.txt"

In [3]:
def labels_from_file(file):
    with open(file, "r") as f:
        lines = f.read().splitlines()

    image_files = [line.split(" ")[0] for line in lines]

    labels = [line.split("_")[1] for line in lines]
    return labels

In [4]:
labels = labels_from_file(train_file)

In [5]:
chars = set()
for label in labels:
    for char in label:
        chars.add(char)

In [6]:
chars = sorted(list(chars))

In [7]:
with open("synth-vocab.txt", "w") as f:
    f.write("\n".join(chars))

In [13]:
vocab = {
    # "<blank>": 0,
    "<pad>": 0,
    "</s>": 1,
    "<s>": 2,
    "<unk>": 3,
    "<mask>": 4,
    " ": 5,
}
offset = len(vocab) + 1
vocab.update({c: i + offset for i, c in enumerate(chars)})

In [14]:
srsly.write_json("synth-vocab.json", vocab)

In [15]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast

import srsly

In [16]:
tokenizer = Tokenizer(BPE(vocab, unk_token="<unk>", merges=[]))
tokenizer.decoder = ByteLevel()
tokenizer.post_processor = TemplateProcessing(
    single="$A" #</s>",
    # special_tokens=[
    #     ("</s>", tokenizer.token_to_id("</s>")),
    # ],
)

In [17]:
tokenizer.decode(tokenizer.encode("hello world".upper()).ids)

'HELLO WORLD'

In [18]:
tokenizer.encode_batch(["hello world", "hello world"])

[Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [19]:
tokenizer.add_special_tokens(["<pad>", "</s>"])

0

In [20]:
tokenizer.save("../trainer/synth-tokenizers/tokenizer-pad0.json")

The OrderedVocab you are attempting to save contains a hole for index 6, your vocabulary could be corrupted !


In [21]:
tok = PreTrainedTokenizerFast(tokenizer_file="../trainer/synth-tokenizers/tokenizer-pad0.json")

In [22]:
tok.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

0

In [23]:
inputs = tok.batch_encode_plus(["hello world".upper(), "hello world ausha".upper()], padding="longest", return_tensors="pt")

In [24]:
tok.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELLO WORLD', 'HELLO WORLD AUSHA']

In [25]:
tok.save_pretrained("../trainer/synth-tokenizers/tokenizer-pad0")

The OrderedVocab you are attempting to save contains a hole for index 6, your vocabulary could be corrupted !


('../trainer/synth-tokenizers/tokenizer-pad0/tokenizer_config.json',
 '../trainer/synth-tokenizers/tokenizer-pad0/special_tokens_map.json',
 '../trainer/synth-tokenizers/tokenizer-pad0/tokenizer.json')

In [None]:
tok.pad_token_id, tok.eos_token_id

(0, 1)