In [1]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel
from tokenizers.processors import TemplateProcessing

from transformers import PreTrainedTokenizerFast, Wav2Vec2CTCTokenizer

import srsly

  from .autonotebook import tqdm as notebook_tqdm


# Chars

In [3]:
# Load the JSON mapping and keep only its values (the text strings); the keys
# are not needed for building the character inventory.
texts = [*srsly.read_json("../SROIETask2/data.json").values()]

In [4]:
# Distinct characters over the whole corpus. The texts are joined with a
# single space first, so the separator itself ends up in the set whenever
# there is more than one text.
charset = {ch for ch in " ".join(texts)}

In [5]:
# Eyeball the character inventory. A set has no defined order, so the listing
# below is in arbitrary order.
[*charset]

['<',
 '9',
 'M',
 'Z',
 ':',
 '/',
 '7',
 'G',
 'Y',
 'V',
 '%',
 '1',
 '&',
 '4',
 '_',
 'T',
 '*',
 'I',
 'C',
 'B',
 '.',
 'K',
 '·',
 '-',
 ',',
 'Q',
 '+',
 'D',
 '{',
 '[',
 'R',
 '#',
 'X',
 'F',
 'W',
 '!',
 'O',
 '8',
 'U',
 '5',
 '2',
 'l',
 '@',
 'S',
 ' ',
 'N',
 '$',
 '0',
 '6',
 ';',
 'r',
 '~',
 ')',
 '}',
 '(',
 'L',
 '?',
 'A',
 '^',
 "'",
 '>',
 '|',
 'J',
 'H',
 '`',
 '"',
 'P',
 '\\',
 '3',
 'E',
 ']',
 '=']

In [6]:
# Write the character inventory to vocab.txt, one character per line, in
# sorted order.
# The encoding is pinned explicitly: the charset contains non-ASCII characters
# (e.g. '·'), and the platform default encoding is not guaranteed to be UTF-8.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(sorted(charset)))

In [7]:
# Token -> id mapping. The special tokens take the first ids.
vocab = {
    "<blank>": 0,
    "<pad>": 1,
    "<unk>": 2,
    "<s>": 3,
    "</s>": 4,
    "<mask>": 5,
}
# Characters get the ids that follow the special tokens. They are enumerated
# in sorted order on purpose: iteration order of a set of strings changes
# between interpreter runs (string hash randomization), which would give every
# kernel restart a different char -> id mapping and silently invalidate any
# previously saved vocabulary.
vocab.update({c: i for i, c in enumerate(sorted(charset), start=len(vocab))})

In [8]:
# Persist the token -> id mapping as JSON; this file is passed as `vocab_file`
# to the Wav2Vec2CTCTokenizer cell further down.
srsly.write_json("vocab.json", vocab)

# Tokenizer

In [9]:
# Character-level tokenizer built on a BPE model with an empty merge list, so
# no symbols are ever merged; text not covered by `vocab` maps to "<unk>".
tokenizer = Tokenizer(BPE(vocab=vocab, merges=[], unk_token="<unk>"))
# NOTE(review): no ByteLevel pre-tokenizer is configured, so this decoder is
# presumably only here to concatenate tokens without separators. Confirm that
# non-ASCII characters such as '·' survive an encode/decode round-trip.
tokenizer.decoder = ByteLevel()
# Post-processor that would append "</s>" to every sequence. Left disabled,
# presumably because this notebook produces the "noeos" tokenizer variant.
# tokenizer.post_processor = TemplateProcessing(
#     single="$A </s>",
#     special_tokens=[
#         ("</s>", tokenizer.token_to_id("</s>")),
#     ],
# )

In [10]:
# Round-trip sanity check: encode to ids, decode back, and expect the input
# text. The sample is upper-cased because the character inventory is almost
# entirely upper-case.
tokenizer.decode(tokenizer.encode("hello world".upper()).ids)

'HELLO WORLD'

In [11]:
# Batch encoding smoke test: one Encoding is returned per input string.
# NOTE(review): unlike the neighbouring cells the input is NOT upper-cased
# here; with the inventory shown above, most of these lowercase characters are
# presumably mapped to "<unk>". Confirm whether that is intended.
tokenizer.encode_batch(["hello world", "hello world"])

[Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [12]:
# Flag "<pad>" and "</s>" as special tokens. Both already exist in `vocab`,
# which is consistent with the displayed result of 0 (no new tokens added).
tokenizer.add_special_tokens(["<pad>", "</s>"])

0

In [13]:
# Serialize the tokenizer to a single JSON file; the PreTrainedTokenizerFast
# wrapper in the next cell is loaded from this exact path.
tokenizer.save("../trainer/tokenizer-noeos.json")

In [14]:
# Wrap the saved tokenizer file in the transformers fast-tokenizer API.
# NOTE(review): no special tokens are passed here; pad/eos are registered in
# the following cell, while "<unk>" is never registered on the wrapper.
# Confirm that is intended.
tok = PreTrainedTokenizerFast(tokenizer_file="../trainer/tokenizer-noeos.json")

In [15]:
# Tell the wrapper which vocabulary entries act as padding and end-of-sequence
# tokens. The displayed 0 is consistent with both tokens already being in the
# vocabulary (nothing new was added).
tok.add_special_tokens({"pad_token": "<pad>", "eos_token": "</s>"})

0

In [16]:
# Encode a small batch, padding every sequence to the longest one and
# returning PyTorch tensors. The tokenizer is called directly because that is
# the documented entry point; `batch_encode_plus` is marked deprecated in
# favour of it and takes the same arguments here.
inputs = tok(
    ["hello world".upper(), "hello world ausha".upper()],
    padding="longest",
    return_tensors="pt",
)

In [17]:
# Decode the padded batch back to text; special tokens (such as the padding
# added above) are stripped so the original strings should come back.
tok.batch_decode(inputs["input_ids"], skip_special_tokens=True)

['HELLO WORLD', 'HELLO WORLD AUSHA']

In [18]:
# Export the wrapped tokenizer as a directory; the returned tuple (displayed
# below) lists the files that were written.
tok.save_pretrained("../trainer/tokenizer-noeos")

('../trainer/tokenizer-noeos/tokenizer_config.json',
 '../trainer/tokenizer-noeos/special_tokens_map.json',
 '../trainer/tokenizer-noeos/tokenizer.json')

# Wav2Vec2CTCTokenizer - NOPE (decoding collapses repeated characters, e.g. HELLO -> HELO)

In [18]:
# Alternative experiment: build a Wav2Vec2CTCTokenizer from the vocab.json
# written earlier. Note that this rebinds `tokenizer`, which previously held
# the `tokenizers.Tokenizer` instance.
# NOTE(review): "|" is used as the word delimiter here, but "|" is also a real
# character in the inventory, and the vocabulary has its own " " entry.
# Presumably literal "|" characters cannot be told apart from word boundaries
# with this setup; confirm before relying on it.
tokenizer = Wav2Vec2CTCTokenizer(vocab_file="vocab.json", word_delimiter_token="|")

In [19]:
# Encode the same sample batch with the CTC tokenizer, padded to the longest
# sequence and returned as PyTorch tensors. The tokenizer is called directly
# because that is the documented entry point; `batch_encode_plus` is marked
# deprecated in favour of it and takes the same arguments here.
inputs = tokenizer(
    ["hello world".upper(), "hello world ausha".upper()],
    padding="longest",
    return_tensors="pt",
)

In [20]:
# Decode the batch back to text. As the output below shows, doubled letters
# are lost ('HELLO' comes back as 'HELO'), presumably because this tokenizer's
# decoding merges consecutive identical tokens the way CTC decoding does.
# That makes it unsuitable for plain text round-trips.
tokenizer.batch_decode(inputs["input_ids"], skip_special_tokens=True)

['HELO WORLD', 'HELO WORLD AUSHA']