31 changes: 31 additions & 0 deletions keras_nlp/tokenizers/unicode_character_tokenizer.py
@@ -45,6 +45,10 @@ class UnicodeCharacterTokenizer(tokenizer.Tokenizer):
The encoding of the input text. Defaults to "UTF-8".
output_encoding: One of ("UTF-8", "UTF-16-BE", or "UTF-32-BE").
The encoding of the output text. Defaults to "UTF-8".
vocabulary_size: Set the vocabulary size, clamping all code points to
the range [0, vocabulary_size). Effectively this makes
`vocabulary_size - 1` the OOV id. Defaults to `None` (no clamping).

Examples:

@@ -130,6 +134,19 @@ class UnicodeCharacterTokenizer(tokenizer.Tokenizer):
numpy=array([[ 105, 32, 108, 105, 107],
[2350, 2376, 2306, 32, 2325]], dtype=int32)>

Tokenization with vocabulary_size.
>>> latin_ext_cutoff = 592
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
... vocabulary_size=latin_ext_cutoff)
>>> tokenizer("¿Cómo estás?")
<tf.Tensor: shape=(12,), dtype=int32,
numpy=array([191, 99, 243, 109, 111, 32, 101, 115, 116, 225, 115, 63],
dtype=int32)>
>>> tokenizer("आप कैसे हैं")
<tf.Tensor: shape=(11,), dtype=int32,
numpy=array([591, 591, 32, 591, 591, 591, 591, 32, 591, 591, 591],
dtype=int32)>

Detokenization.
>>> inputs = tf.constant([110, 105, 110, 106, 97], dtype=tf.int32)
>>> tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer()
@@ -168,6 +185,7 @@ def __init__(
replacement_char: int = 65533,
input_encoding: str = "UTF-8",
output_encoding: str = "UTF-8",
vocabulary_size: int = None,
**kwargs,
) -> None:
# Check dtype and provide a default.
@@ -213,6 +231,7 @@ def __init__(
self.replacement_char = replacement_char
self.input_encoding = input_encoding
self.output_encoding = output_encoding
# Stored privately so it does not shadow the vocabulary_size() method.
self._vocabulary_size = vocabulary_size

def get_config(self) -> Dict[str, Any]:
config = super().get_config()
@@ -225,10 +244,16 @@ def get_config(self) -> Dict[str, Any]:
"replacement_char": self.replacement_char,
"input_encoding": self.input_encoding,
"output_encoding": self.output_encoding,
"vocabulary_size": self._vocabulary_size,
}
)
return config

def vocabulary_size(self) -> int:
"""Get the size of the tokenizer vocabulary. `None` means no
vocabulary size was provided."""
return self._vocabulary_size

def tokenize(self, inputs):
if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)):
inputs = tf.convert_to_tensor(inputs)
@@ -260,6 +285,12 @@ def tokenize(self, inputs):

if scalar_input:
tokens = tf.squeeze(tokens, 0)

# Optionally clamp the output code point values to the range
# [0, vocabulary_size), making vocabulary_size - 1 the OOV id.
if self._vocabulary_size:
tokens = tf.clip_by_value(tokens, 0, self._vocabulary_size - 1)

return tokens

def detokenize(self, inputs):
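For reviewers, a minimal standalone sketch of the clamping behavior the new `vocabulary_size` argument introduces; it assumes only TensorFlow is installed and reproduces just the `tf.clip_by_value` step above, not the tokenizer's full preprocessing.

import tensorflow as tf

# Decode a string to Unicode code points, then clip every code point into
# [0, vocabulary_size) so that vocabulary_size - 1 acts as the OOV id.
vocabulary_size = 592  # Latin Extended cutoff, as in the docstring example
codepoints = tf.strings.unicode_decode("आप कैसे हैं", input_encoding="UTF-8")
clamped = tf.clip_by_value(codepoints, 0, vocabulary_size - 1)
print(clamped.numpy())  # Devanagari code points collapse to 591; spaces stay 32
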
46 changes: 46 additions & 0 deletions keras_nlp/tokenizers/unicode_character_tokenizer_test.py
@@ -59,6 +59,46 @@ def test_dense_output(self):
],
)

def test_tokenize_scalar_with_vocabulary_size(self):
input_data = "ninja"
tokenizer = UnicodeCharacterTokenizer(vocabulary_size=105)
call_output = tokenizer(input_data)
tokenize_output = tokenizer.tokenize(input_data)

self.assertAllEqual(call_output, [104, 104, 104, 104, 97])
self.assertAllEqual(tokenize_output, [104, 104, 104, 104, 97])

def test_tokenize_dense_with_vocabulary_size(self):
input_data = tf.constant(["ninja", "samurai", "▀▁▂▃"])
tokenizer = UnicodeCharacterTokenizer(
sequence_length=10, vocabulary_size=105
)
call_output = tokenizer(input_data)
self.assertIsInstance(call_output, tf.Tensor)
self.assertAllEqual(
call_output,
[
[104, 104, 104, 104, 97, 0, 0, 0, 0, 0],
[104, 97, 104, 104, 104, 97, 104, 0, 0, 0],
[104, 104, 104, 104, 0, 0, 0, 0, 0, 0],
],
)

def test_tokenize_ragged_with_vocabulary_size(self):
input_data = tf.constant(["ninja", "samurai", "▀▁▂▃"])
tokenizer = UnicodeCharacterTokenizer(vocabulary_size=105)
call_output = tokenizer(input_data)
tokenize_output = tokenizer.tokenize(input_data)
self.assertIsInstance(call_output, tf.RaggedTensor)
exp_outputs = [
[104, 104, 104, 104, 97],
[104, 97, 104, 104, 104, 97, 104],
[104, 104, 104, 104],
]
for i in range(call_output.shape[0]):
self.assertAllEqual(call_output[i], exp_outputs[i])
self.assertAllEqual(tokenize_output[i], exp_outputs[i])

def test_detokenize(self):
input_data = tf.ragged.constant(
[
@@ -232,6 +272,7 @@ def test_load_model_with_config(self):
sequence_length=11,
normalization_form="NFC",
errors="strict",
vocabulary_size=None,
)
cloned_tokenizer = UnicodeCharacterTokenizer.from_config(
original_tokenizer.get_config()
@@ -255,6 +296,7 @@ def test_config(self):
normalization_form="NFC",
errors="ignore",
replacement_char=0,
vocabulary_size=100,
)
exp_config = {
"dtype": "int32",
@@ -267,6 +309,7 @@ def test_config(self):
"input_encoding": "UTF-8",
"output_encoding": "UTF-8",
"trainable": True,
"vocabulary_size": 100,
}
self.assertEqual(tokenizer.get_config(), exp_config)

@@ -278,6 +321,7 @@ def test_config(self):
replacement_char=0,
input_encoding="UTF-16",
output_encoding="UTF-16",
vocabulary_size=None,
)
exp_config_different_encoding = {
"dtype": "int32",
@@ -290,6 +334,7 @@ def test_config(self):
"input_encoding": "UTF-16",
"output_encoding": "UTF-16",
"trainable": True,
"vocabulary_size": None,
}
self.assertEqual(
tokenize_different_encoding.get_config(),
@@ -305,6 +350,7 @@ def test_saving(self):
sequence_length=20,
normalization_form="NFKC",
errors="replace",
vocabulary_size=None,
)
inputs = keras.Input(dtype="string", shape=())
outputs = tokenizer(inputs)
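Beyond the unit tests, a hedged sketch of how downstream code might rely on the clamped id range; it assumes keras_nlp with this change is installed, and the layer sizes are purely illustrative.

import tensorflow as tf
import keras_nlp

# Because token ids are clamped to [0, vocabulary_size), an embedding table
# with exactly vocabulary_size rows is always large enough; its last row
# (vocabulary_size - 1) serves as the shared OOV slot.
vocab_size = 592
tokenizer = keras_nlp.tokenizers.UnicodeCharacterTokenizer(
    sequence_length=16, vocabulary_size=vocab_size
)
embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=32)
token_ids = tokenizer(tf.constant(["¿Cómo estás?", "आप कैसे हैं"]))
embedded = embedding(token_ids)  # shape (2, 16, 32), no out-of-range ids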