In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast, Wav2Vec2CTCTokenizer

import srsly

  from .autonotebook import tqdm as notebook_tqdm


# Chars

In [4]:
# Read the JSON file and keep only its values (the keys are discarded).
texts = [*srsly.read_json("../trainer/SROIETask2/data.json").values()]

In [5]:
# Every distinct character that occurs in the texts; the " " used to join
# them is part of the joined string, so it can end up in the set as well.
charset = {ch for ch in " ".join(texts)}

In [7]:
# Display the collected characters. A set has no defined order, so the order
# of this listing carries no meaning.
[*charset]

['9',
 'S',
 'E',
 'O',
 '1',
 'D',
 ',',
 '0',
 'V',
 '=',
 "'",
 ']',
 'W',
 '}',
 'B',
 '/',
 '`',
 ';',
 '*',
 'l',
 '"',
 'P',
 '_',
 'J',
 '|',
 '~',
 '[',
 '3',
 '5',
 '>',
 '\\',
 '@',
 '·',
 '<',
 '2',
 '!',
 'H',
 '+',
 '?',
 '^',
 ' ',
 '&',
 '6',
 'K',
 '4',
 '{',
 '#',
 'r',
 'N',
 'I',
 '8',
 'Z',
 'M',
 '(',
 'R',
 ':',
 'C',
 '7',
 'G',
 'T',
 '%',
 '.',
 'U',
 'A',
 ')',
 'L',
 'F',
 'Q',
 '$',
 '-',
 'X',
 'Y']

In [8]:
# Write the characters to vocab.txt, one per line, in sorted order.
# The encoding is pinned to UTF-8 because the charset contains non-ASCII
# characters (e.g. '·'); without it the platform default encoding is used,
# which may fail or produce a file that cannot be read back elsewhere.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(charset)))

In [10]:
# Special tokens occupy the first ids.
vocab = {
    "<blank>": 0,
    "<pad>": 1,
    "<unk>": 2,
    "<s>": 3,
    "</s>": 4,
    "<mask>": 5,
}
# Characters take the ids that follow the special tokens. Iterate in sorted
# order: the iteration order of a set of strings changes between interpreter
# runs (string hash randomisation), which would otherwise give every run a
# different character-to-id mapping and make saved vocab files incompatible.
vocab.update({c: i for i, c in enumerate(sorted(charset), start=len(vocab))})

In [11]:
# Persist the token-to-id mapping as vocab.json (path is relative to the
# notebook's working directory).
srsly.write_json("vocab.json", vocab)

# Tokenizer

In [21]:
# Character-level tokenizer: a BPE model over the character vocab with an
# empty merge list; tokens missing from the vocab map to <unk>.
tokenizer = Tokenizer(BPE(vocab, merges=[], unk_token="<unk>"))

# NOTE(review): ByteLevel is the decoder meant for byte-level BPE vocabularies,
# while this vocab holds raw characters. Confirm that non-ASCII entries such
# as '·' survive an encode/decode round trip.
tokenizer.decoder = ByteLevel()

# Append the end-of-sequence token to every single-sequence encoding.
tokenizer.post_processor = TemplateProcessing(
    special_tokens=[("</s>", tokenizer.token_to_id("</s>"))],
    single="$A </s>",
)

In [22]:
# Round-trip check: encode an upper-cased sample, then decode its ids back to
# text. Special tokens are not skipped here, so </s> shows up in the output.
tokenizer.decode(tokenizer.encode("hello world".upper()).ids)

'HELLO WORLD</s>'

In [23]:
# Batch-encoding check. The samples are upper-cased like every other check in
# this notebook: the character vocab is almost entirely upper-case, so
# lower-case input would mostly be encoded as <unk>.
tokenizer.encode_batch(["hello world".upper(), "hello world".upper()])

[Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [24]:
# Flag <pad> and </s> as special tokens on the tokenizer. Both are already
# vocab entries, which is consistent with the 0 shown as this cell's output.
tokenizer.add_special_tokens(["<pad>", "</s>"])

0

In [25]:
# Serialize the tokenizer to a single JSON file under ../trainer.
tokenizer.save("../trainer/tokenizer.json")

In [26]:
# Reload the saved JSON file through the transformers fast-tokenizer wrapper.
tok = PreTrainedTokenizerFast(tokenizer_file="../trainer/tokenizer.json")

In [27]:
# Register which vocab entries the wrapper should use as its padding and
# end-of-sequence tokens.
tok.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

0

In [30]:
# Encode a two-sample batch, padded to the longest sample, as PyTorch tensors.
# Calling the tokenizer directly is the documented replacement for the
# deprecated batch_encode_plus() and takes the same arguments.
inputs = tok(
    ["hello world".upper(), "hello world ausha".upper()],
    padding="longest",
    return_tensors="pt",
)

In [31]:
# Decode the padded batch back to text with special tokens left out.
tok.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELLO WORLD', 'HELLO WORLD AUSHA']

In [32]:
# Write the transformers-format tokenizer files into ../trainer/tokenizer.
tok.save_pretrained("../trainer/tokenizer")

('../trainer/tokenizer/tokenizer_config.json',
 '../trainer/tokenizer/special_tokens_map.json',
 '../trainer/tokenizer/tokenizer.json')

# Wav2Vec2CTCTokenizer - NOPE (decoding collapses repeated characters: HELLO -> HELO)

In [18]:
# Alternative that was tried: the CTC tokenizer from transformers, built from
# the vocab.json file.
# NOTE(review): this rebinds `tokenizer`, replacing the tokenizers.Tokenizer
# object of the same name. Also, "|" is itself a character in the charset —
# confirm it is safe to use as the word delimiter.
tokenizer = Wav2Vec2CTCTokenizer(vocab_file="vocab.json", word_delimiter_token="|")

In [19]:
# Encode the same two-sample batch with the CTC tokenizer, padded to the
# longest sample, as PyTorch tensors. Calling the tokenizer directly is the
# documented replacement for the deprecated batch_encode_plus().
inputs = tokenizer(
    ["hello world".upper(), "hello world ausha".upper()],
    padding="longest",
    return_tensors="pt",
)

In [20]:
# Decode the batch back to text. The recorded output shows the doubled "L" of
# HELLO collapsed into one, presumably the reason this approach was dropped.
tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELO WORLD', 'HELO WORLD AUSHA']