Merged
11 changes: 11 additions & 0 deletions keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -225,6 +225,17 @@ def vocabulary_size(self) -> int:
"""Get the size of the tokenizer vocabulary."""
return len(self._vocab)

def id_to_token(self, id: int) -> str:
"""Convert an integer id to a string token."""
return self._vocab[id]

def token_to_id(self, token: str) -> int:
"""Convert a string token to an integer id."""
# This will be slow, but keep memory usage down compared to building a
# dict. Assuming the main use case is looking up a few special tokens
# early in the vocab, this should be fine.
return self._vocab.index(token)

def get_config(self) -> Dict[str, Any]:
config = super().get_config()
config.update(
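For reference, a minimal usage sketch of the two new accessors. The vocabulary below is illustrative, mirroring the test fixture, and assumes the tokenizer is built from an in-memory vocabulary list:

import keras_nlp

# Illustrative in-memory vocabulary; a path to a vocabulary file also works.
vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)

tokenizer.id_to_token(6)        # -> "fox"
tokenizer.token_to_id("[UNK]")  # -> 0

As the inline comment notes, token_to_id uses a linear list.index lookup, trading speed for memory; the intended use is occasional lookups of special tokens near the start of the vocabulary.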
4 changes: 4 additions & 0 deletions keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -66,6 +66,10 @@ def test_accessors(self):
            tokenizer.get_vocabulary(),
            ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"],
        )
        self.assertEqual(tokenizer.id_to_token(0), "[UNK]")
        self.assertEqual(tokenizer.id_to_token(6), "fox")
        self.assertEqual(tokenizer.token_to_id("[UNK]"), 0)
        self.assertEqual(tokenizer.token_to_id("fox"), 6)

    def test_special_tokens(self):
        input_data = ["quick brown whale"]