In [35]:
from datasets import load_dataset

# Load the "train" split of the `timit_asr` dataset. `data_dir="timit"` is a
# relative path.
# NOTE(review): presumably the TIMIT corpus must already be present in that
# directory — confirm before a fresh run.
dataset = load_dataset("timit_asr", data_dir="timit", split="train")

def get_training_corpus():
    """Yield the training corpus in batches of up to 1000 examples.

    Each yielded item is a list of strings, one per example in the batch,
    built by joining that example's ``phonetic_detail["utterance"]`` entries
    with single spaces. Reads the notebook-level ``dataset``.
    """
    batch_size = 1000
    for start in range(0, len(dataset), batch_size):
        # Slicing returns a column-oriented batch; only one column is needed.
        batch = dataset[start : start + batch_size]
        yield [" ".join(detail["utterance"]) for detail in batch["phonetic_detail"]]

In [36]:
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

# Start from an untrained BPE model; it is trained further down via
# `tokenizer.train_from_iterator(...)`.
tokenizer = Tokenizer(models.BPE())

In [37]:
# Pad with the trained special token instead of the library defaults.
# The defaults are pad_id=0 / pad_token="[PAD]", but "[PAD]" is never added to
# this tokenizer's vocabulary, and id 0 is assigned to "<|endoftext|>" (the only
# special token given to the trainer). Naming the pad token explicitly keeps the
# pad id and the pad token string consistent; the ids produced are unchanged.
tokenizer.enable_padding(pad_id=0, pad_token="<|endoftext|>")

In [38]:
# Sample input used by the checks below: symbols separated by single spaces,
# the same layout that `get_training_corpus` yields.
test_sentence = "h# sh ix hv eh dcl jh ih dcl"

In [39]:
# Split on whitespace only, so each space-separated symbol becomes one pre-token
# (the next cell displays the result for `test_sentence`).
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

In [40]:
# Sanity check: list each pre-token together with its (start, end) character
# offsets in `test_sentence`.
tokenizer.pre_tokenizer.pre_tokenize_str(test_sentence)

[('h#', (0, 2)),
 ('sh', (3, 5)),
 ('ix', (6, 8)),
 ('hv', (9, 11)),
 ('eh', (12, 14)),
 ('dcl', (15, 18)),
 ('jh', (19, 21)),
 ('ih', (22, 24)),
 ('dcl', (25, 28))]

In [41]:
# Collect every distinct label found in any example's
# phonetic_detail["utterance"] list, then display how many there are.
tokens = {
    label
    for example in dataset
    for label in example["phonetic_detail"]["utterance"]
}
len(tokens)

61

In [42]:
# Train the BPE model on the batches produced by `get_training_corpus()`.
# "<|endoftext|>" is the only special token registered; every other trainer
# setting is left at the library default.
trainer = trainers.BpeTrainer(special_tokens=["<|endoftext|>"])
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)






In [43]:
# Encode the sample with the trained tokenizer and display the tokens; the
# recorded output shows one token per space-separated symbol.
encoding = tokenizer.encode(test_sentence)
encoding.tokens

['h#', 'sh', 'ix', 'hv', 'eh', 'dcl', 'jh', 'ih', 'dcl']

In [44]:
# NOTE(review): the pre-tokenizer configured above is WhitespaceSplit, not
# ByteLevel, so it is unclear what a ByteLevel post-processor contributes here;
# with trim_offsets=False it presumably leaves offsets untouched — confirm it is
# needed at all.
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)

In [45]:
# Re-encode after setting the post-processor and check that the offsets of the
# token at index 4 slice the matching substring out of the input.
encoding = tokenizer.encode(test_sentence)
start, end = encoding.offsets[4]
test_sentence[start:end]

'eh'

In [46]:
# Use the WordPiece decoder; the round-trip check in the next cell shows the
# decoded tokens joined by single spaces.
# NOTE(review): WordPiece decoding also applies its own prefix-stripping and
# cleanup rules — confirm none of them can alter a label in this vocabulary.
tokenizer.decoder = decoders.WordPiece()

In [47]:
# Round-trip check: decoding the ids of the last encoding should reproduce
# `test_sentence`.
tokenizer.decode(encoding.ids)

'h# sh ix hv eh dcl jh ih dcl'

In [48]:
# Write the tokenizer to `tokenizer.json` (relative path).
tokenizer.save("tokenizer.json")

In [49]:
# Also save the model's own files into the current directory; the recorded
# output lists the files written (vocab.json and merges.txt).
tokenizer.model.save(".")

['./vocab.json', './merges.txt']