From e4460bc854b5478c579230590b6247ce80e22756 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Mon, 28 Mar 2022 18:12:25 -0700
Subject: [PATCH] Add tokenizer helper to convert tokens to ids

And from ids to tokens.
---
 keras_nlp/tokenizers/word_piece_tokenizer.py      | 11 +++++++++++
 keras_nlp/tokenizers/word_piece_tokenizer_test.py |  4 ++++
 2 files changed, 15 insertions(+)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 7e89ed49fe..fab193174a 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -225,6 +225,17 @@ def vocabulary_size(self) -> int:
         """Get the size of the tokenizer vocabulary."""
         return len(self._vocab)
 
+    def id_to_token(self, id: int) -> str:
+        """Convert an integer id to a string token."""
+        return self._vocab[id]
+
+    def token_to_id(self, token: str) -> int:
+        """Convert a string token to an integer id."""
+        # This will be slow, but keeps memory usage down compared to building
+        # a dict. Assuming the main use case is looking up a few special
+        # tokens early in the vocab, this should be fine.
+        return self._vocab.index(token)
+
     def get_config(self) -> Dict[str, Any]:
         config = super().get_config()
         config.update(
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index d57c542797..ab7fb1ea60 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -66,6 +66,10 @@ def test_accessors(self):
             tokenizer.get_vocabulary(),
             ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"],
         )
+        self.assertEqual(tokenizer.id_to_token(0), "[UNK]")
+        self.assertEqual(tokenizer.id_to_token(6), "fox")
+        self.assertEqual(tokenizer.token_to_id("[UNK]"), 0)
+        self.assertEqual(tokenizer.token_to_id("fox"), 6)
 
     def test_special_tokens(self):
         input_data = ["quick brown whale"]
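
Note (not part of the commit): a minimal usage sketch of the two new helpers, assuming a WordPieceTokenizer constructed from the same in-memory vocabulary list used in the test above.

# Usage sketch, not part of the patch. The vocabulary and expected ids
# come from the test case in word_piece_tokenizer_test.py.
import keras_nlp

vocab = ["[UNK]", "the", "qu", "##ick", "br", "##own", "fox"]
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)

print(tokenizer.token_to_id("fox"))  # 6
print(tokenizer.id_to_token(6))      # "fox"

# token_to_id is an O(n) scan over the vocabulary list, so it is best
# reserved for one-off lookups such as special tokens near the start
# of the vocab, as the patch comment notes.
for token in ["[UNK]", "the"]:
    assert tokenizer.id_to_token(tokenizer.token_to_id(token)) == token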