keras-team · mattdangerw · Jul 10, 2023 · Jul 10, 2023
diff --git a/keras_nlp/models/albert/albert_backbone.py b/keras_nlp/models/albert/albert_backbone.py
@@ -76,13 +76,9 @@ class AlbertBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "segment_ids": tf.constant(
-            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
     # Randomly initialized ALBERT encoder

diff --git a/keras_nlp/models/albert/albert_classifier.py b/keras_nlp/models/albert/albert_classifier.py
@@ -85,13 +85,9 @@ class AlbertClassifier(Task):
     Preprocessed integer data.
     ```python
     features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype="int64"),
-        "segment_ids": tf.constant(
-            [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        "token_ids": np.ones(shape=(2, 12), dtype="int32"),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2),
     }
     labels = [0, 3]
 

diff --git a/keras_nlp/models/albert/albert_masked_lm.py b/keras_nlp/models/albert/albert_masked_lm.py
@@ -81,14 +81,10 @@ class AlbertMaskedLM(Task):
     ```python
     # Create preprocessed batch where 0 is the mask token.
     features = {
-        "token_ids": tf.constant(
-            [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
-        ),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)
-        ),
-        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)),
-        "segment_ids": tf.constant([[0, 0, 0, 0, 0, 0, 0, 0]] * 2, shape=(2, 8))
+        "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2),
+        "mask_positions": np.array([[2, 4]] * 2),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 0, 0, 0]] * 2),
     }
     # Labels are the original masked values.
     labels = [[3, 5]] * 2

diff --git a/keras_nlp/models/bart/bart_backbone.py b/keras_nlp/models/bart/bart_backbone.py
@@ -65,13 +65,13 @@ class BartBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "encoder_token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "encoder_padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
+        "encoder_token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "encoder_padding_mask": np.array(
+            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]
         ),
-        "decoder_token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "decoder_padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], shape=(1, 12)
+        "decoder_token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "decoder_padding_mask": np.array(
+            [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]
         ),
     }
 

diff --git a/keras_nlp/models/bart/bart_seq_2_seq_lm.py b/keras_nlp/models/bart/bart_seq_2_seq_lm.py
@@ -109,12 +109,12 @@ class BartSeq2SeqLM(GenerativeTask):
     # "The quick brown fox", and the decoder inputs to "The fast". Use
     # `"padding_mask"` to indicate values that should not be overridden.
     prompt = {
-        "encoder_token_ids": tf.constant([[0, 133, 2119, 6219, 23602, 2, 1, 1]]),
-        "encoder_padding_mask": tf.constant(
+        "encoder_token_ids": np.array([[0, 133, 2119, 6219, 23602, 2, 1, 1]]),
+        "encoder_padding_mask": np.array(
             [[True, True, True, True, True, True, False, False]]
         ),
-        "decoder_token_ids": tf.constant([[2, 0, 133, 1769, 2, 1, 1]]),
-        "decoder_padding_mask": tf.constant([[True, True, True, True, False, False]])
+        "decoder_token_ids": np.array([[2, 0, 133, 1769, 2, 1, 1]]),
+        "decoder_padding_mask": np.array([[True, True, True, True, False, False, False]])
     }
 
     bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset(
@@ -137,13 +137,13 @@ class BartSeq2SeqLM(GenerativeTask):
     Call `fit()` without preprocessing.
     ```python
     x = {
-        "encoder_token_ids": tf.constant([[0, 133, 2119, 2, 1]] * 2),
-        "encoder_padding_mask": tf.constant([[1, 1, 1, 1, 0]] * 2),
-        "decoder_token_ids": tf.constant([[2, 0, 133, 1769, 2]] * 2),
-        "decoder_padding_mask": tf.constant([[1, 1, 1, 1, 1]] * 2),
+        "encoder_token_ids": np.array([[0, 133, 2119, 2, 1]] * 2),
+        "encoder_padding_mask": np.array([[1, 1, 1, 1, 0]] * 2),
+        "decoder_token_ids": np.array([[2, 0, 133, 1769, 2]] * 2),
+        "decoder_padding_mask": np.array([[1, 1, 1, 1, 1]] * 2),
     }
-    y = tf.constant([[0, 133, 1769, 2, 1]] * 2)
-    sw = tf.constant([[1, 1, 1, 1, 0]] * 2)
+    y = np.array([[0, 133, 1769, 2, 1]] * 2)
+    sw = np.array([[1, 1, 1, 1, 0]] * 2)
 
     bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset(
         "bart_base_en",

diff --git a/keras_nlp/models/bert/bert_backbone.py b/keras_nlp/models/bert/bert_backbone.py
@@ -65,13 +65,9 @@ class BertBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "segment_ids": tf.constant(
-            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
     # Pretrained BERT encoder.

diff --git a/keras_nlp/models/bert/bert_classifier.py b/keras_nlp/models/bert/bert_classifier.py
@@ -86,13 +86,9 @@ class BertClassifier(Task):
     Preprocessed integer data.
     ```python
     features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype="int64"),
-        "segment_ids": tf.constant(
-            [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        "token_ids": np.ones(shape=(2, 12), dtype="int32"),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2),
     }
     labels = [0, 3]
 

diff --git a/keras_nlp/models/bert/bert_masked_lm.py b/keras_nlp/models/bert/bert_masked_lm.py
@@ -80,14 +80,10 @@ class BertMaskedLM(Task):
     ```python
     # Create preprocessed batch where 0 is the mask token.
     features = {
-        "token_ids": tf.constant(
-            [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
-        ),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)
-        ),
-        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)),
-        "segment_ids": tf.constant([[0, 0, 0, 0, 0, 0, 0, 0]] * 2, shape=(2, 8))
+        "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2),
+        "mask_positions": np.array([[2, 4]] * 2),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 0, 0, 0]] * 2),
     }
     # Labels are the original masked values.
     labels = [[3, 5]] * 2

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py
@@ -73,9 +73,8 @@ class DebertaV3Backbone(Backbone):
     Example usage:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
     # Pretrained DeBERTa encoder.

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py
@@ -95,10 +95,8 @@ class DebertaV3Classifier(Task):
     Preprocessed integer data.
     ```python
     features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        "token_ids": np.ones(shape=(2, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2),
     }
     labels = [0, 3]
 

diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py
@@ -84,13 +84,9 @@ class DebertaV3MaskedLM(Task):
     ```python
     # Create preprocessed batch where 0 is the mask token.
     features = {
-        "token_ids": tf.constant(
-            [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
-        ),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)
-        ),
-        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)),
+        "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2),
+        "mask_positions": np.array([[2, 4]] * 2),
     }
     # Labels are the original masked values.
     labels = [[3, 5]] * 2

diff --git a/keras_nlp/models/distil_bert/distil_bert_backbone.py b/keras_nlp/models/distil_bert/distil_bert_backbone.py
@@ -68,10 +68,8 @@ class DistilBertBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
     # Pretrained DistilBERT encoder.

diff --git a/keras_nlp/models/distil_bert/distil_bert_classifier.py b/keras_nlp/models/distil_bert/distil_bert_classifier.py
@@ -97,10 +97,8 @@ class DistilBertClassifier(Task):
     Preprocessed integer data.
     ```python
     features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        "token_ids": np.ones(shape=(2, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2),
     }
     labels = [0, 3]
 

diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py
@@ -84,13 +84,9 @@ class DistilBertMaskedLM(Task):
     ```python
     # Create preprocessed batch where 0 is the mask token.
     features = {
-        "token_ids": tf.constant(
-            [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
-        ),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)
-        ),
-        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2))
+        "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2),
+        "mask_positions": np.array([[2, 4]] * 2),
     }
     # Labels are the original masked values.
     labels = [[3, 5]] * 2

diff --git a/keras_nlp/models/f_net/f_net_backbone.py b/keras_nlp/models/f_net/f_net_backbone.py
@@ -70,10 +70,8 @@ class FNetBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "segment_ids": tf.constant(
-            [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
-    # Pretrained BERT encoder.
+    # Pretrained FNet encoder.

diff --git a/keras_nlp/models/f_net/f_net_classifier.py b/keras_nlp/models/f_net/f_net_classifier.py
@@ -87,10 +87,8 @@ class FNetClassifier(Task):
     Preprocessed integer data.
     ```python
     features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype="int64"),
-        "segment_ids": tf.constant(
-            [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        "token_ids": np.ones(shape=(2, 12), dtype="int32"),
+        "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2),
     }
     labels = [0, 3]
 

diff --git a/keras_nlp/models/f_net/f_net_masked_lm.py b/keras_nlp/models/f_net/f_net_masked_lm.py
@@ -79,13 +79,9 @@ class FNetMaskedLM(Task):
     ```python
     # Create a preprocessed dataset where 0 is the mask token.
     features = {
-        "token_ids": tf.constant(
-            [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
-        ),
-        "segment_ids": tf.constant(
-            [[0, 0, 0, 1, 1, 1, 0, 0]] * 2, shape=(2, 8)
-        ),
-        "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2))
+        "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2),
+        "segment_ids": np.array([[0, 0, 0, 1, 1, 1, 0, 0]] * 2),
+        "mask_positions": np.array([[2, 4]] * 2),
     }
     # Labels are the original masked values.
     labels = [[3, 5]] * 2

diff --git a/keras_nlp/models/gpt2/gpt2_backbone.py b/keras_nlp/models/gpt2/gpt2_backbone.py
@@ -69,10 +69,8 @@ class GPT2Backbone(Backbone):
     Example usage:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
     # Pretrained GPT-2 decoder.

diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm.py b/keras_nlp/models/gpt2/gpt2_causal_lm.py
@@ -99,8 +99,8 @@ class GPT2CausalLM(GenerativeTask):
     # Prompt the model with `5338, 318` (the token ids for `"Who is"`).
     # Use `"padding_mask"` to indicate values that should not be overridden.
     prompt = {
-        "token_ids": tf.constant([[5338, 318, 0, 0, 0]] * 2),
-        "padding_mask": tf.constant([[1, 1, 0, 0, 0]] * 2),
+        "token_ids": np.array([[5338, 318, 0, 0, 0]] * 2),
+        "padding_mask": np.array([[1, 1, 0, 0, 0]] * 2),
     }
 
     gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
@@ -120,11 +120,11 @@ class GPT2CausalLM(GenerativeTask):
     Call `fit()` without preprocessing.
     ```python
     x = {
-        "token_ids": tf.constant([[50256, 1, 2, 3, 4]] * 2),
-        "padding_mask": tf.constant([[1, 1, 1, 1, 1]] * 2),
+        "token_ids": np.array([[50256, 1, 2, 3, 4]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1]] * 2),
     }
-    y = tf.constant([[1, 2, 3, 4, 50256]] * 2)
-    sw = tf.constant([[1, 1, 1, 1, 1]] * 2)
+    y = np.array([[1, 2, 3, 4, 50256]] * 2)
+    sw = np.array([[1, 1, 1, 1, 1]] * 2)
 
     gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
         "gpt2_base_en",

diff --git a/keras_nlp/models/opt/opt_backbone.py b/keras_nlp/models/opt/opt_backbone.py
@@ -67,10 +67,8 @@ class OPTBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
-        ),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 
     # Pretrained OPT decoder

diff --git a/keras_nlp/models/opt/opt_causal_lm.py b/keras_nlp/models/opt/opt_causal_lm.py
@@ -99,8 +99,8 @@ class OPTCausalLM(GenerativeTask):
     # Prompt the model with `5338, 318` (the token ids for `"Who is"`).
     # Use `"padding_mask"` to indicate values that should not be overridden.
     prompt = {
-        "token_ids": tf.constant([[5338, 318, 0, 0, 0]] * 2),
-        "padding_mask": tf.constant([[1, 1, 0, 0, 0]] * 2),
+        "token_ids": np.array([[5338, 318, 0, 0, 0]] * 2),
+        "padding_mask": np.array([[1, 1, 0, 0, 0]] * 2),
     }
 
     opt_lm = keras_nlp.models.OPTCausalLM.from_preset(
@@ -120,11 +120,11 @@ class OPTCausalLM(GenerativeTask):
     Call `fit()` without preprocessing.
     ```python
     x = {
-        "token_ids": tf.constant([[1, 2, 3, 4, 5]] * 2),
-        "padding_mask": tf.constant([[1, 1, 1, 1, 1]] * 2),
+        "token_ids": np.array([[1, 2, 3, 4, 5]] * 2),
+        "padding_mask": np.array([[1, 1, 1, 1, 1]] * 2),
     }
-    y = tf.constant([[2, 3, 4, 5, 0]] * 2)
-    sw = tf.constant([[1, 1, 1, 1, 1]] * 2)
+    y = np.array([[2, 3, 4, 5, 0]] * 2)
+    sw = np.array([[1, 1, 1, 1, 1]] * 2)
 
     opt_lm = keras_nlp.models.OPTCausalLM.from_preset(
         "opt_base_en",

diff --git a/keras_nlp/models/roberta/roberta_backbone.py b/keras_nlp/models/roberta/roberta_backbone.py
@@ -67,8 +67,7 @@ class RobertaBackbone(Backbone):
     Examples:
     ```python
     input_data = {
-        "token_ids": tf.ones(shape=(1, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
     }
 

diff --git a/keras_nlp/models/roberta/roberta_classifier.py b/keras_nlp/models/roberta/roberta_classifier.py
@@ -87,10 +87,8 @@ class RobertaClassifier(Task):
     Preprocessed integer data.
     ```python
     features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype="int64"),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        "token_ids": np.ones(shape=(2, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2),
     }
     labels = [0, 3]