From 12087a8900d7f497ad7133931e9ce2531dcd3242 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 00:04:24 -0700
Subject: [PATCH 1/4] Add a split_input argument for word piece

This way we can have three flags for our pre-tokenization, which should
allow for a more uniform experience.

We can also remove the large regexes directly for the input arguments,
which simplifies our rendered documentation quite a bit.
---
 keras_nlp/tokenizers/word_piece_tokenizer.py | 23 +++++++++++++------
 .../tokenizers/word_piece_tokenizer_test.py  |  4 ++--
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 900692d877..2b4ad8ef38 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -80,7 +80,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
 
     If a more custom pre-tokenization step is desired, the layer can be
     configured to apply only the strict WordPiece algorithm by passing
-    `lowercase=False`, `strip_accents=False` and `split_pattern=None`. In
+    `lowercase=False`, `strip_accents=False` and `split_input=False`. In
     this case, inputs should be pre-split string tensors or ragged tensors.
 
     By default, the layer will output a `tf.RaggedTensor` where the last
@@ -101,10 +101,11 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             tokenization.
         strip_accents: If true, all accent marks will be removed from text
             before tokenization.
-        split_pattern: A regex pattern to match delimiters to split, or None
-            indicating that the input is pre-split and no splitting should be
-            performed. By default, all whitespace and punctuation marks will
-            be split on.
+        split_input: If true, input will be split according to `split_pattern`
+            and `keep_pattern`. If false, input should be split before calling
+            the layer.
+        split_pattern: A regex pattern to match delimiters to split. By default,
+            all whitespace and punctuation marks will be split on.
         keep_pattern: A regex pattern of delimiters contained in the
             `split_pattern` of delimeters that should be kept as independent
            tokens. By default, all punctuation marks will be kept as tokens.
@@ -167,8 +168,9 @@ def __init__(
         sequence_length: int = None,
         lowercase: bool = True,
         strip_accents: bool = True,
-        split_pattern: str = WHITESPACE_AND_PUNCTUATION_REGEX,
-        keep_pattern: str = PUNCTUATION_REGEX,
+        split_input: bool = True,
+        split_pattern: str = None,
+        keep_pattern: str = None,
         suffix_indicator: str = "##",
         oov_token: str = "[UNK]",
         **kwargs,
@@ -201,9 +203,16 @@ def __init__(
         if oov_token is None:
             raise ValueError("`oov_token` cannot be None.")
 
+        if split_pattern is None:
+            split_pattern = WHITESPACE_AND_PUNCTUATION_REGEX
+
+        if keep_pattern is None:
+            keep_pattern = PUNCTUATION_REGEX
+
         self.sequence_length = sequence_length
         self.lowercase = lowercase
         self.strip_accents = strip_accents
+        self.split_input = split_input
         self.split_pattern = split_pattern
         self.keep_pattern = keep_pattern
         self.suffix_indicator = suffix_indicator
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index c7430ae380..b4096a7f3a 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -136,7 +136,7 @@ def test_no_spliting(self):
         input_data = ["t o k e n", "m i s s i n g", "t o k e n"]
         vocab_data = ["[UNK]", "t o k e n"]
         tokenizer = WordPieceTokenizer(
-            vocabulary=vocab_data, split_pattern=None
+            vocabulary=vocab_data, split_input=False
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 0, 1])
@@ -148,7 +148,7 @@ def test_word_piece_only(self):
             vocabulary=vocab_data,
             lowercase=False,
             strip_accents=False,
-            split_pattern=None,
+            split_input=False,
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 2, 3, 4, 5, 6])

From 8eaef56c194e24688893f403a0eb607a5c75ed01 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 09:12:33 -0700
Subject: [PATCH 2/4] fix

---
 keras_nlp/tokenizers/word_piece_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 2b4ad8ef38..7f2bac41be 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -289,7 +289,7 @@ def tokenize(self, inputs):
             inputs = tf_text.normalize_utf8(inputs, "NFD")
             # Remove the accent marks.
             inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
-        if self.split_pattern:
+        if self.split_input:
             inputs = tf_text.regex_split(
                 inputs,
                 delim_regex_pattern=self.split_pattern,

From 8c8ecade4c3d945e4b88305b0c78901bb66fd4b9 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 09:22:43 -0700
Subject: [PATCH 3/4] Another fix

---
 keras_nlp/tokenizers/word_piece_tokenizer.py      | 11 ++++++-----
 keras_nlp/tokenizers/word_piece_tokenizer_test.py |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 7f2bac41be..cc1766ecb1 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -80,7 +80,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
 
     If a more custom pre-tokenization step is desired, the layer can be
     configured to apply only the strict WordPiece algorithm by passing
-    `lowercase=False`, `strip_accents=False` and `split_input=False`. In
+    `lowercase=False`, `strip_accents=False` and `split=False`. In
     this case, inputs should be pre-split string tensors or ragged tensors.
 
     By default, the layer will output a `tf.RaggedTensor` where the last
@@ -101,7 +101,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             tokenization.
         strip_accents: If true, all accent marks will be removed from text
             before tokenization.
-        split_input: If true, input will be split according to `split_pattern`
+        split: If true, input will be split according to `split_pattern`
             and `keep_pattern`. If false, input should be split before calling
             the layer.
         split_pattern: A regex pattern to match delimiters to split. By default,
@@ -168,7 +168,7 @@ def __init__(
         sequence_length: int = None,
         lowercase: bool = True,
         strip_accents: bool = True,
-        split_input: bool = True,
+        split: bool = True,
         split_pattern: str = None,
         keep_pattern: str = None,
         suffix_indicator: str = "##",
@@ -212,7 +212,7 @@ def __init__(
         self.sequence_length = sequence_length
         self.lowercase = lowercase
         self.strip_accents = strip_accents
-        self.split_input = split_input
+        self.split = split
         self.split_pattern = split_pattern
         self.keep_pattern = keep_pattern
         self.suffix_indicator = suffix_indicator
@@ -266,6 +266,7 @@ def get_config(self) -> Dict[str, Any]:
                 "sequence_length": self.sequence_length,
                 "lowercase": self.lowercase,
                 "strip_accents": self.strip_accents,
+                "split": self.split,
                 "split_pattern": self.split_pattern,
                 "keep_pattern": self.keep_pattern,
                 "suffix_indicator": self.suffix_indicator,
@@ -289,7 +290,7 @@ def tokenize(self, inputs):
             inputs = tf_text.normalize_utf8(inputs, "NFD")
             # Remove the accent marks.
             inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
-        if self.split_input:
+        if self.split:
             inputs = tf_text.regex_split(
                 inputs,
                 delim_regex_pattern=self.split_pattern,
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index b4096a7f3a..6852415d33 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -136,7 +136,7 @@ def test_no_spliting(self):
         input_data = ["t o k e n", "m i s s i n g", "t o k e n"]
         vocab_data = ["[UNK]", "t o k e n"]
         tokenizer = WordPieceTokenizer(
-            vocabulary=vocab_data, split_input=False
+            vocabulary=vocab_data, split=False
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 0, 1])
@@ -148,7 +148,7 @@ def test_word_piece_only(self):
             vocabulary=vocab_data,
             lowercase=False,
             strip_accents=False,
-            split_input=False,
+            split=False,
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 2, 3, 4, 5, 6])

From 3c68497556e6c2e8328d45264cc5310620384d21 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 09:27:01 -0700
Subject: [PATCH 4/4] format fix

---
 keras_nlp/tokenizers/word_piece_tokenizer_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index 6852415d33..fc3e6237e4 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -135,9 +135,7 @@ def test_custom_spliting(self):
     def test_no_spliting(self):
         input_data = ["t o k e n", "m i s s i n g", "t o k e n"]
         vocab_data = ["[UNK]", "t o k e n"]
-        tokenizer = WordPieceTokenizer(
-            vocabulary=vocab_data, split=False
-        )
+        tokenizer = WordPieceTokenizer(vocabulary=vocab_data, split=False)
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 0, 1])
 
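
Usage sketch: a minimal example of configuring the tokenizer with the `split`
flag that this series lands on, assuming a keras_nlp build that includes these
patches. The vocabulary and inputs below are illustrative only and are not
taken from the test suite.

    import keras_nlp

    # A toy vocabulary; index 0 is the OOV token, as in the tests above.
    vocab = ["[UNK]", "the", "qu", "##ick", "fox", "."]

    # Default behavior: the layer lowercases, strips accents, and splits on
    # whitespace and punctuation before running the WordPiece algorithm.
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)
    print(tokenizer(["The quick fox."]))

    # Strict WordPiece only: disable all pre-tokenization and pass pre-split
    # inputs, mirroring the updated docstring and test_word_piece_only.
    strict_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab,
        lowercase=False,
        strip_accents=False,
        split=False,
    )
    print(strict_tokenizer(["the", "quick", "fox"]))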