From 12087a8900d7f497ad7133931e9ce2531dcd3242 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 00:04:24 -0700
Subject: [PATCH 1/4] Add a split_input argument for word piece

This way we can have three flags for our pre-tokenization, which should
allow for a more uniform experience.

We can also remove the large regexes directly for the input arguments,
which simplifies our rendered documentation quite a bit.
---
 keras_nlp/tokenizers/word_piece_tokenizer.py | 23 +++++++++++++------
 .../tokenizers/word_piece_tokenizer_test.py  |  4 ++--
 2 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 900692d877..2b4ad8ef38 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -80,7 +80,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
 
     If a more custom pre-tokenization step is desired, the layer can be
     configured to apply only the strict WordPiece algorithm by passing
-    `lowercase=False`, `strip_accents=False` and `split_pattern=None`. In
+    `lowercase=False`, `strip_accents=False` and `split_input=False`. In
     this case, inputs should be pre-split string tensors or ragged tensors.
 
     By default, the layer will output a `tf.RaggedTensor` where the last
@@ -101,10 +101,11 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             tokenization.
         strip_accents: If true, all accent marks will be removed from text
             before tokenization.
-        split_pattern: A regex pattern to match delimiters to split, or None
-            indicating that the input is pre-split and no splitting should be
-            performed. By default, all whitespace and punctuation marks will
-            be split on.
+        split_input: If true, input will be split according to `split_pattern`
+            and `keep_pattern`. If false, input should be split before calling
+            the layer.
+        split_pattern: A regex pattern to match delimiters to split. By default,
+            all whitespace and punctuation marks will be split on.
         keep_pattern: A regex pattern of delimiters contained in the
             `split_pattern` of delimeters that should be kept as independent
            tokens. By default, all punctuation marks will be kept as tokens.
@@ -167,8 +168,9 @@ def __init__(
         sequence_length: int = None,
         lowercase: bool = True,
         strip_accents: bool = True,
-        split_pattern: str = WHITESPACE_AND_PUNCTUATION_REGEX,
-        keep_pattern: str = PUNCTUATION_REGEX,
+        split_input: bool = True,
+        split_pattern: str = None,
+        keep_pattern: str = None,
         suffix_indicator: str = "##",
         oov_token: str = "[UNK]",
         **kwargs,
@@ -201,9 +203,16 @@ def __init__(
         if oov_token is None:
             raise ValueError("`oov_token` cannot be None.")
 
+        if split_pattern is None:
+            split_pattern = WHITESPACE_AND_PUNCTUATION_REGEX
+
+        if keep_pattern is None:
+            keep_pattern = PUNCTUATION_REGEX
+
         self.sequence_length = sequence_length
         self.lowercase = lowercase
         self.strip_accents = strip_accents
+        self.split_input = split_input
         self.split_pattern = split_pattern
         self.keep_pattern = keep_pattern
         self.suffix_indicator = suffix_indicator
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index c7430ae380..b4096a7f3a 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -136,7 +136,7 @@ def test_no_spliting(self):
         input_data = ["t o k e n", "m i s s i n g", "t o k e n"]
         vocab_data = ["[UNK]", "t o k e n"]
         tokenizer = WordPieceTokenizer(
-            vocabulary=vocab_data, split_pattern=None
+            vocabulary=vocab_data, split_input=False
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 0, 1])
@@ -148,7 +148,7 @@ def test_word_piece_only(self):
             vocabulary=vocab_data,
             lowercase=False,
             strip_accents=False,
-            split_pattern=None,
+            split_input=False,
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 2, 3, 4, 5, 6])

From 8eaef56c194e24688893f403a0eb607a5c75ed01 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 09:12:33 -0700
Subject: [PATCH 2/4] fix

---
 keras_nlp/tokenizers/word_piece_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 2b4ad8ef38..7f2bac41be 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -289,7 +289,7 @@ def tokenize(self, inputs):
             inputs = tf_text.normalize_utf8(inputs, "NFD")
             # Remove the accent marks.
             inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
-        if self.split_pattern:
+        if self.split_input:
             inputs = tf_text.regex_split(
                 inputs,
                 delim_regex_pattern=self.split_pattern,

From 8c8ecade4c3d945e4b88305b0c78901bb66fd4b9 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 09:22:43 -0700
Subject: [PATCH 3/4] Another fix

---
 keras_nlp/tokenizers/word_piece_tokenizer.py      | 11 ++++++-----
 keras_nlp/tokenizers/word_piece_tokenizer_test.py |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer.py b/keras_nlp/tokenizers/word_piece_tokenizer.py
index 7f2bac41be..cc1766ecb1 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer.py
@@ -80,7 +80,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
 
     If a more custom pre-tokenization step is desired, the layer can be
     configured to apply only the strict WordPiece algorithm by passing
-    `lowercase=False`, `strip_accents=False` and `split_input=False`. In
+    `lowercase=False`, `strip_accents=False` and `split=False`. In
     this case, inputs should be pre-split string tensors or ragged tensors.
 
     By default, the layer will output a `tf.RaggedTensor` where the last
@@ -101,7 +101,7 @@ class WordPieceTokenizer(tokenizer.Tokenizer):
             tokenization.
         strip_accents: If true, all accent marks will be removed from text
             before tokenization.
-        split_input: If true, input will be split according to `split_pattern`
+        split: If true, input will be split according to `split_pattern`
             and `keep_pattern`. If false, input should be split before calling
             the layer.
         split_pattern: A regex pattern to match delimiters to split. By default,
@@ -168,7 +168,7 @@ def __init__(
         sequence_length: int = None,
         lowercase: bool = True,
         strip_accents: bool = True,
-        split_input: bool = True,
+        split: bool = True,
         split_pattern: str = None,
         keep_pattern: str = None,
         suffix_indicator: str = "##",
@@ -212,7 +212,7 @@ def __init__(
         self.sequence_length = sequence_length
         self.lowercase = lowercase
         self.strip_accents = strip_accents
-        self.split_input = split_input
+        self.split = split
         self.split_pattern = split_pattern
         self.keep_pattern = keep_pattern
         self.suffix_indicator = suffix_indicator
@@ -266,6 +266,7 @@ def get_config(self) -> Dict[str, Any]:
                 "sequence_length": self.sequence_length,
                 "lowercase": self.lowercase,
                 "strip_accents": self.strip_accents,
+                "split": self.split,
                 "split_pattern": self.split_pattern,
                 "keep_pattern": self.keep_pattern,
                 "suffix_indicator": self.suffix_indicator,
@@ -289,7 +290,7 @@ def tokenize(self, inputs):
             inputs = tf_text.normalize_utf8(inputs, "NFD")
             # Remove the accent marks.
             inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
-        if self.split_input:
+        if self.split:
             inputs = tf_text.regex_split(
                 inputs,
                 delim_regex_pattern=self.split_pattern,
diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index b4096a7f3a..6852415d33 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -136,7 +136,7 @@ def test_no_spliting(self):
         input_data = ["t o k e n", "m i s s i n g", "t o k e n"]
         vocab_data = ["[UNK]", "t o k e n"]
         tokenizer = WordPieceTokenizer(
-            vocabulary=vocab_data, split_input=False
+            vocabulary=vocab_data, split=False
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 0, 1])
@@ -148,7 +148,7 @@ def test_word_piece_only(self):
             vocabulary=vocab_data,
             lowercase=False,
             strip_accents=False,
-            split_input=False,
+            split=False,
         )
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 2, 3, 4, 5, 6])

From 3c68497556e6c2e8328d45264cc5310620384d21 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Thu, 28 Apr 2022 09:27:01 -0700
Subject: [PATCH 4/4] format fix

---
 keras_nlp/tokenizers/word_piece_tokenizer_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/keras_nlp/tokenizers/word_piece_tokenizer_test.py b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
index 6852415d33..fc3e6237e4 100644
--- a/keras_nlp/tokenizers/word_piece_tokenizer_test.py
+++ b/keras_nlp/tokenizers/word_piece_tokenizer_test.py
@@ -135,9 +135,7 @@ def test_custom_spliting(self):
     def test_no_spliting(self):
         input_data = ["t o k e n", "m i s s i n g", "t o k e n"]
         vocab_data = ["[UNK]", "t o k e n"]
-        tokenizer = WordPieceTokenizer(
-            vocabulary=vocab_data, split=False
-        )
+        tokenizer = WordPieceTokenizer(vocabulary=vocab_data, split=False)
         call_output = tokenizer(input_data)
         self.assertAllEqual(call_output, [1, 0, 1])
 
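
Usage sketch: a minimal example of configuring the tokenizer with the `split`
flag that this series lands on, assuming a keras_nlp build that includes these
patches. The vocabulary and inputs below are illustrative only and are not
taken from the test suite.

    import keras_nlp

    # A toy vocabulary; index 0 is the OOV token, as in the tests above.
    vocab = ["[UNK]", "the", "qu", "##ick", "fox", "."]

    # Default behavior: the layer lowercases, strips accents, and splits on
    # whitespace and punctuation before running the WordPiece algorithm.
    tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab)
    print(tokenizer(["The quick fox."]))

    # Strict WordPiece only: disable all pre-tokenization and pass pre-split
    # inputs, mirroring the updated docstring and test_word_piece_only.
    strict_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
        vocabulary=vocab,
        lowercase=False,
        strip_accents=False,
        split=False,
    )
    print(strict_tokenizer(["the", "quick", "fox"]))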