In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast, Wav2Vec2CTCTokenizer

import srsly

  from .autonotebook import tqdm as notebook_tqdm


# Chars

In [2]:
texts = list(srsly.read_json("../SROIETask2/data.json").values())

In [3]:
charset = set(" ".join(texts))

In [4]:
list(charset)

['=',
 '7',
 '8',
 '}',
 'K',
 '9',
 'G',
 'Y',
 '_',
 '#',
 'L',
 ']',
 'A',
 'Â·',
 '0',
 'O',
 '?',
 'P',
 ')',
 '6',
 '@',
 'W',
 'l',
 ',',
 "'",
 '\\',
 ' ',
 '4',
 'D',
 'J',
 '"',
 '~',
 '^',
 'V',
 'C',
 'Q',
 '/',
 '-',
 'S',
 '`',
 'B',
 '3',
 '$',
 '>',
 'M',
 '&',
 '(',
 'T',
 'I',
 '2',
 'F',
 ';',
 '[',
 '.',
 ':',
 '!',
 '<',
 '5',
 '{',
 'U',
 'R',
 '+',
 'H',
 '|',
 'N',
 '*',
 'Z',
 'X',
 'E',
 '1',
 'r',
 '%']

In [5]:
with open("vocab.txt", "w") as f:
    f.write("\n".join(sorted(list(charset))))

In [6]:
vocab = {
    # "<blank>": 0,
    "<pad>": 0,
    "</s>": 1,
    "<s>": 2,
    "<unk>": 3,
    "<mask>": 4,
}
vocab.update({c: i + 6 for i, c in enumerate(charset)})

In [7]:
srsly.write_json("vocab.json", vocab)

# Tokenizer

In [8]:
tokenizer = Tokenizer(BPE(vocab, unk_token="<unk>", merges=[]))
tokenizer.decoder = ByteLevel()
tokenizer.post_processor = TemplateProcessing(
    single="$A </s>",
    special_tokens=[
        ("</s>", tokenizer.token_to_id("</s>")),
    ],
)

In [11]:
tokenizer.decode(tokenizer.encode("hello world".upper()).ids)

'HELLO WORLD</s>'

In [12]:
tokenizer.encode_batch(["hello world", "hello world"])

[Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=12, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [13]:
tokenizer.add_special_tokens(["<pad>", "</s>"])

0

In [14]:
tokenizer.save("../trainer/tokenizer-pad0.json")

The OrderedVocab you are attempting to save contains a hole for index 5, your vocabulary could be corrupted !


In [15]:
tok = PreTrainedTokenizerFast(tokenizer_file="../trainer/tokenizer-pad0.json")

In [16]:
tok.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

0

In [17]:
inputs = tok.batch_encode_plus(["hello world".upper(), "hello world ausha".upper()], padding="longest", return_tensors="pt")

In [18]:
tok.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELLO WORLD', 'HELLO WORLD AUSHA']

In [19]:
tok.save_pretrained("../trainer/tokenizer-pad0")

The OrderedVocab you are attempting to save contains a hole for index 5, your vocabulary could be corrupted !


('../trainer/tokenizer-pad0/tokenizer_config.json',
 '../trainer/tokenizer-pad0/special_tokens_map.json',
 '../trainer/tokenizer-pad0/tokenizer.json')

In [21]:
tok.pad_token_id, tok.eos_token_id

(0, 1)

# Word2VecCTCTokenizer - NOPE

In [18]:
tokenizer = Wav2Vec2CTCTokenizer(vocab_file="vocab.json", word_delimiter_token="|")

In [19]:
inputs = tokenizer.batch_encode_plus(["hello world".upper(), "hello world ausha".upper()], padding="longest", return_tensors="pt")

In [20]:
tokenizer.batch_decode(inputs.input_ids, skip_special_tokens=True)

['HELO WORLD', 'HELO WORLD AUSHA']