From e4460bc854b5478c579230590b6247ce80e22756 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Mon, 28 Mar 2022 18:12:25 -0700
Subject: [PATCH] Add tokenizer helper to convert tokens to ids

And from ids to tokens.
---
 keras_nlp/tokenizers/word_piece_tokenizer.py      | 11 +++++++++++
 keras_nlp/tokenizers/word_piece_tokenizer_test.py |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 7e89ed49fe..fab193174a 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -225,6 +225,17 @@ def vocabulary_size(self) -> int:
         """Get the size of the tokenizer vocabulary."""
         return len(self._vocab)
 
+    def id_to_token(self, id: int) -> str:
+        """Convert an integer id to a string token."""
+        return self._vocab[id]
+
+    def token_to_id(self, token: str) -> int:
+        """Convert a string token to an integer id."""
+        # This will be slow, but keeps memory usage down compared to building
+        # a dict. Assuming the main use case is looking up a few special
+        # tokens early in the vocab, this should be fine.
+        return self._vocab.index(token)
+
     def get_config(self) -> Dict[str, Any]:
         config = super().get_config()
         config.update(
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index d57c542797..ab7fb1ea60 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -66,6 +66,10 @@ def test_accessors(self):
             tokenizer.get_vocabulary(),
             ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"],
         )
+        self.assertEqual(tokenizer.id_to_token(0), "[UNK]")
+        self.assertEqual(tokenizer.id_to_token(6), "fox")
+        self.assertEqual(tokenizer.token_to_id("[UNK]"), 0)
+        self.assertEqual(tokenizer.token_to_id("fox"), 6)
 
     def test_special_tokens(self):
         input_data = ["quick brown whale"]
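
Note (not part of the commit): a minimal usage sketch of the two new helpers, assuming a WordPieceTokenizer constructed from the same in-memory vocabulary list used in the test above.

# Usage sketch, not part of the patch. The vocabulary and expected ids
# come from the test case in word_piece_tokenizer_test.py.
import keras_nlp

vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)

print(tokenizer.token_to_id("fox"))  # 6
print(tokenizer.id_to_token(6))      # "fox"

# token_to_id is an O(n) scan over the vocabulary list, so it is best
# reserved for one-off lookups such as special tokens near the start
# of the vocab, as the patch comment notes.
for token in ["[UNK]", "the"]:
    assert tokenizer.id_to_token(tokenizer.token_to_id(token)) == token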