In [2]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast, Wav2Vec2CTCTokenizer

import srsly

  from .autonotebook import tqdm as notebook_tqdm


# Chars

In [28]:
# Read the IAM annotation file and keep its "tr" split
# (presumably the training split - confirm against the dataset export).
trdata = srsly.read_json("../data/IAM/data/data.json")["tr"]

# Gather the transcription of every entry. Entries are not filtered on their
# "error" field (that filter was left disabled).
texts = []
for entry in trdata.values():
    texts.append(entry["text"])

In [29]:
# Every distinct character found in the transcriptions. The texts are joined
# with single spaces first, so the space character is part of the set whenever
# there is more than one text.
joined_text = " ".join(texts)
charset = {ch for ch in joined_text}

In [30]:
# Show the collected characters. `charset` is a set, so the order shown here is
# arbitrary rather than sorted.
list(charset)

['O',
 'g',
 'i',
 'e',
 'u',
 'y',
 'x',
 't',
 'N',
 '#',
 'b',
 'Z',
 'j',
 '.',
 'S',
 'C',
 'K',
 'U',
 'f',
 'k',
 'd',
 '/',
 'G',
 '1',
 ' ',
 '5',
 'm',
 'w',
 'W',
 '4',
 'D',
 '8',
 'L',
 ':',
 'Q',
 'h',
 '(',
 '*',
 '?',
 'R',
 '-',
 '9',
 'n',
 ';',
 's',
 'a',
 'V',
 'r',
 '!',
 'z',
 'M',
 'E',
 'T',
 'B',
 "'",
 '6',
 'q',
 'v',
 'A',
 'o',
 '+',
 'Y',
 'c',
 '3',
 'F',
 'P',
 '0',
 'J',
 'I',
 '&',
 ')',
 '7',
 ',',
 'l',
 '"',
 '2',
 'H',
 'p',
 'X']

In [31]:
# Number of distinct characters collected from the transcriptions.
len(charset)

79

In [32]:
# Write the character set to a plain-text file, one character per line, in
# sorted (code point) order. The encoding is passed explicitly so the file
# content does not depend on the platform's default text encoding.
with open("iam-vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(charset)))

In [33]:
# Token -> id mapping. The special tokens take the lowest ids, with "<pad>" at
# id 0 (a "<blank>" entry at id 0 was tried and left disabled).
vocab = {
    "<pad>": 0,
    "</s>": 1,
    "<s>": 2,
    "<unk>": 3,
    "<mask>": 4,
}
offset = len(vocab)
# Character ids follow the special tokens. The characters are enumerated in
# sorted order: iterating the set directly gives an order that can differ
# between kernel restarts (string hashing is randomized per process), which
# would assign different ids to the same character on every run.
vocab.update({c: i + offset for i, c in enumerate(sorted(charset))})

In [34]:
# Persist the token -> id mapping as JSON.
srsly.write_json("iam-vocab.json", vocab)

# Tokenizer

In [35]:
# BPE model over the character vocabulary with an empty merge list, so no
# multi-character tokens are ever formed; "<unk>" is the fallback token.
char_level_model = BPE(vocab, merges=[], unk_token="<unk>")
tokenizer = Tokenizer(char_level_model)

# NOTE(review): a ByteLevel decoder is attached although no ByteLevel
# pre-tokenizer is configured. The round trip in the next cell works for plain
# ASCII text; confirm the behavior for non-ASCII characters before reusing this.
tokenizer.decoder = ByteLevel()

# Append "</s>" to every single-sequence encoding.
eos_token_id = tokenizer.token_to_id("</s>")
tokenizer.post_processor = TemplateProcessing(
    single="$A </s>",
    special_tokens=[("</s>", eos_token_id)],
)

In [36]:
# Round-trip check: encode a sample string and decode its ids back to text.
# The "</s>" appended by the post-processor shows up in the decoded string.
tokenizer.decode(tokenizer.encode("hello world").ids)

'hello world</s>'

In [37]:
# Encode two identical samples as a batch; one Encoding is returned per input.
tokenizer.encode_batch(["hello world", "hello world"])

[Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [38]:
# Mark "<pad>" and "</s>" as special tokens. Both are already entries of `vocab`,
# which is consistent with the recorded return value of 0 (no new tokens created).
tokenizer.add_special_tokens(["<pad>", "</s>"])

0

In [39]:
# Serialize the tokenizer to a single JSON file.
# Assumes the ../trainer/iam-tokenizers/ directory already exists - TODO confirm.
tokenizer.save("../trainer/iam-tokenizers/tokenizer-pad0.json")

In [40]:
# Reload the saved tokenizer file through the transformers fast-tokenizer wrapper.
tok = PreTrainedTokenizerFast(tokenizer_file="../trainer/iam-tokenizers/tokenizer-pad0.json")

In [41]:
# Register the pad and end-of-sequence tokens on the wrapper, so that padding
# can be requested when encoding and tok.pad_token_id / tok.eos_token_id are set.
tok.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

0

In [42]:
# Encode two upper-cased samples of different length, padding to the longest
# one and returning PyTorch tensors. The tokenizer is called directly because
# `batch_encode_plus` is deprecated in favor of `__call__`.
inputs = tok(
    ["hello world".upper(), "hello world ausha".upper()],
    padding="longest",
    return_tensors="pt",
)

In [43]:
# Decode the padded batch back to strings, leaving out special tokens.
tok.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELLO WORLD', 'HELLO WORLD AUSHA']

In [44]:
# Save the wrapper as a directory (tokenizer config, special-tokens map and
# tokenizer.json, as listed in the output below).
tok.save_pretrained("../trainer/iam-tokenizers/tokenizer-pad0")

('../trainer/iam-tokenizers/tokenizer-pad0/tokenizer_config.json',
 '../trainer/iam-tokenizers/tokenizer-pad0/special_tokens_map.json',
 '../trainer/iam-tokenizers/tokenizer-pad0/tokenizer.json')

In [45]:
# Sanity check of the ids assigned to the pad and end-of-sequence tokens.
tok.pad_token_id, tok.eos_token_id

(0, 1)

# Wav2Vec2CTCTokenizer - NOPE (decoding collapses repeated characters)

In [18]:
# Try the CTC tokenizer from transformers, with "|" as the word delimiter token.
# NOTE(review): this reads "vocab.json", while the vocab cell in this notebook
# writes "iam-vocab.json" - confirm which file is intended.
# NOTE(review): this rebinds `tokenizer`, which earlier cells use for the
# tokenizers.Tokenizer instance.
tokenizer = Wav2Vec2CTCTokenizer(vocab_file="vocab.json", word_delimiter_token="|")

In [19]:
# Encode the same two upper-cased samples with the CTC tokenizer, padding to
# the longest one and returning PyTorch tensors. The tokenizer is called
# directly because `batch_encode_plus` is deprecated in favor of `__call__`.
inputs = tokenizer(
    ["hello world".upper(), "hello world ausha".upper()],
    padding="longest",
    return_tensors="pt",
)

In [20]:
# Decode the batch. The recorded output reads "HELO": the doubled "L" was
# merged into one character, so the text does not round-trip.
# Presumably this is CTC-style grouping of repeated tokens during decoding -
# confirm against the Wav2Vec2CTCTokenizer documentation.
tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELO WORLD', 'HELO WORLD AUSHA']