diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 7e89ed49fe..fab193174a 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -225,6 +225,17 @@ def vocabulary_size(self) -> int:
         """Get the size of the tokenizer vocabulary."""
         return len(self._vocab)
 
+    def id_to_token(self, id: int) -> str:
+        """Convert an integer id to a string token."""
+        return self._vocab[id]
+
+    def token_to_id(self, token: str) -> int:
+        """Convert a string token to an integer id."""
+        # This will be slow, but keep memory usage down compared to building a
+        # dict. Assuming the main use case is looking up a few special tokens
+        # early in the vocab, this should be fine.
+        return self._vocab.index(token)
+
     def get_config(self) -> Dict[str, Any]:
         config = super().get_config()
         config.update(
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index d57c542797..ab7fb1ea60 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -66,6 +66,10 @@ def test_accessors(self):
             tokenizer.get_vocabulary(),
             ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"],
         )
+        self.assertEqual(tokenizer.id_to_token(0), "[UNK]")
+        self.assertEqual(tokenizer.id_to_token(6), "fox")
+        self.assertEqual(tokenizer.token_to_id("[UNK]"), 0)
+        self.assertEqual(tokenizer.token_to_id("fox"), 6)
 
     def test_special_tokens(self):
         input_data = ["quick brown whale"]
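
For reference, a minimal usage sketch of the two new accessors. The vocabulary and the construction call below are illustrative only (they mirror the test fixture above and assume the public WordPieceTokenizer constructor accepts a vocabulary list); they are not part of this change.

    from keras_nlp.tokenizers import WordPieceTokenizer

    # Toy vocabulary matching the test fixture above.
    vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
    tokenizer = WordPieceTokenizer(vocabulary=vocab)

    # Map between integer ids and string tokens in the vocabulary.
    assert tokenizer.id_to_token(0) == "[UNK]"
    assert tokenizer.token_to_id("fox") == 6

    # Note: token_to_id() does a linear scan of the vocabulary list, so it is
    # meant for occasional lookups (e.g. special tokens early in the vocab),
    # not for per-token use in hot loops.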