diff --git a/keras_nlp/models/albert/albert_backbone.py b/keras_nlp/models/albert/albert_backbone.py index a65df56453..36424faf85 100644 --- a/keras_nlp/models/albert/albert_backbone.py +++ b/keras_nlp/models/albert/albert_backbone.py @@ -76,13 +76,9 @@ class AlbertBackbone(Backbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "segment_ids": tf.constant( - [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Randomly initialized ALBERT encoder diff --git a/keras_nlp/models/albert/albert_classifier.py b/keras_nlp/models/albert/albert_classifier.py index fe2bd7c7ff..e884d05d88 100644 --- a/keras_nlp/models/albert/albert_classifier.py +++ b/keras_nlp/models/albert/albert_classifier.py @@ -85,13 +85,9 @@ class AlbertClassifier(Task): Preprocessed integer data. ```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "segment_ids": tf.constant( - [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2), } labels = [0, 3] diff --git a/keras_nlp/models/albert/albert_masked_lm.py b/keras_nlp/models/albert/albert_masked_lm.py index 9843282c5a..fa04b83b09 100644 --- a/keras_nlp/models/albert/albert_masked_lm.py +++ b/keras_nlp/models/albert/albert_masked_lm.py @@ -81,14 +81,10 @@ class AlbertMaskedLM(Task): ```python # Create preprocessed batch where 0 is the mask token. 
features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)), - "segment_ids": tf.constant([[0, 0, 0, 0, 0, 0, 0, 0]] * 2, shape=(2, 8)) + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2), + "mask_positions": np.array([[2, 4]] * 2), + "segment_ids": np.array([[0, 0, 0, 0, 0, 0, 0, 0]] * 2), } # Labels are the original masked values. labels = [[3, 5]] * 2 diff --git a/keras_nlp/models/bart/bart_backbone.py b/keras_nlp/models/bart/bart_backbone.py index 3e6020421e..594f2467a0 100644 --- a/keras_nlp/models/bart/bart_backbone.py +++ b/keras_nlp/models/bart/bart_backbone.py @@ -65,13 +65,13 @@ class BartBackbone(Backbone): Examples: ```python input_data = { - "encoder_token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "encoder_padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) + "encoder_token_ids": np.ones(shape=(1, 12), dtype="int32"), + "encoder_padding_mask": np.array( + [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] ), - "decoder_token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "decoder_padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], shape=(1, 12) + "decoder_token_ids": np.ones(shape=(1, 12), dtype="int32"), + "decoder_padding_mask": np.array( + [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]] ), } diff --git a/keras_nlp/models/bart/bart_seq_2_seq_lm.py b/keras_nlp/models/bart/bart_seq_2_seq_lm.py index 50868c4810..17c02e8e80 100644 --- a/keras_nlp/models/bart/bart_seq_2_seq_lm.py +++ b/keras_nlp/models/bart/bart_seq_2_seq_lm.py @@ -109,12 +109,12 @@ class BartSeq2SeqLM(GenerativeTask): # "The quick brown fox", and the decoder inputs to "The fast". Use # `"padding_mask"` to indicate values that should not be overridden. 
prompt = { - "encoder_token_ids": tf.constant([[0, 133, 2119, 6219, 23602, 2, 1, 1]]), - "encoder_padding_mask": tf.constant( + "encoder_token_ids": np.array([[0, 133, 2119, 6219, 23602, 2, 1, 1]]), + "encoder_padding_mask": np.array( [[True, True, True, True, True, True, False, False]] ), - "decoder_token_ids": tf.constant([[2, 0, 133, 1769, 2, 1, 1]]), - "decoder_padding_mask": tf.constant([[True, True, True, True, False, False]]) + "decoder_token_ids": np.array([[2, 0, 133, 1769, 2, 1, 1]]), + "decoder_padding_mask": np.array([[True, True, True, True, False, False, False]]) } bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset( @@ -137,13 +137,13 @@ class BartSeq2SeqLM(GenerativeTask): Call `fit()` without preprocessing. ```python x = { - "encoder_token_ids": tf.constant([[0, 133, 2119, 2, 1]] * 2), - "encoder_padding_mask": tf.constant([[1, 1, 1, 1, 0]] * 2), - "decoder_token_ids": tf.constant([[2, 0, 133, 1769, 2]] * 2), - "decoder_padding_mask": tf.constant([[1, 1, 1, 1, 1]] * 2), + "encoder_token_ids": np.array([[0, 133, 2119, 2, 1]] * 2), + "encoder_padding_mask": np.array([[1, 1, 1, 1, 0]] * 2), + "decoder_token_ids": np.array([[2, 0, 133, 1769, 2]] * 2), + "decoder_padding_mask": np.array([[1, 1, 1, 1, 1]] * 2), } - y = tf.constant([[0, 133, 1769, 2, 1]] * 2) - sw = tf.constant([[1, 1, 1, 1, 0]] * 2) + y = np.array([[0, 133, 1769, 2, 1]] * 2) + sw = np.array([[1, 1, 1, 1, 0]] * 2) bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset( "bart_base_en", diff --git a/keras_nlp/models/bert/bert_backbone.py b/keras_nlp/models/bert/bert_backbone.py index 056086fe28..8bb6057acb 100644 --- a/keras_nlp/models/bert/bert_backbone.py +++ b/keras_nlp/models/bert/bert_backbone.py @@ -65,13 +65,9 @@ class BertBackbone(Backbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "segment_ids": tf.constant( - [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], 
shape=(1, 12) - ), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained BERT encoder. diff --git a/keras_nlp/models/bert/bert_classifier.py b/keras_nlp/models/bert/bert_classifier.py index 3dff53ed8d..719dab5dfc 100644 --- a/keras_nlp/models/bert/bert_classifier.py +++ b/keras_nlp/models/bert/bert_classifier.py @@ -86,13 +86,9 @@ class BertClassifier(Task): Preprocessed integer data. ```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "segment_ids": tf.constant( - [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2), } labels = [0, 3] diff --git a/keras_nlp/models/bert/bert_masked_lm.py b/keras_nlp/models/bert/bert_masked_lm.py index 1b1cd4853f..f99314e91b 100644 --- a/keras_nlp/models/bert/bert_masked_lm.py +++ b/keras_nlp/models/bert/bert_masked_lm.py @@ -80,14 +80,10 @@ class BertMaskedLM(Task): ```python # Create preprocessed batch where 0 is the mask token. features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)), - "segment_ids": tf.constant([[0, 0, 0, 0, 0, 0, 0, 0]] * 2, shape=(2, 8)) + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2), + "mask_positions": np.array([[2, 4]] * 2), + "segment_ids": np.array([[0, 0, 0, 0, 0, 0, 0, 0]] * 2) } # Labels are the original masked values. 
labels = [[3, 5]] * 2 diff --git a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py index 1c889fc36c..1707e6d3fd 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_backbone.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_backbone.py @@ -73,9 +73,8 @@ class DebertaV3Backbone(Backbone): Example usage: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained DeBERTa encoder. diff --git a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py index 2d15a95ab9..dc2e8b1db8 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_classifier.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_classifier.py @@ -95,10 +95,8 @@ class DebertaV3Classifier(Task): Preprocessed integer data. ```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2), } labels = [0, 3] diff --git a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py index 8fe6e0c919..d79453a4e1 100644 --- a/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py +++ b/keras_nlp/models/deberta_v3/deberta_v3_masked_lm.py @@ -84,13 +84,9 @@ class DebertaV3MaskedLM(Task): ```python # Create preprocessed batch where 0 is the mask token. 
features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)), + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2), + "mask_positions": np.array([[2, 4]] * 2), } # Labels are the original masked values. labels = [[3, 5]] * 2 diff --git a/keras_nlp/models/distil_bert/distil_bert_backbone.py b/keras_nlp/models/distil_bert/distil_bert_backbone.py index 87097acb2b..23f9cfbfc2 100644 --- a/keras_nlp/models/distil_bert/distil_bert_backbone.py +++ b/keras_nlp/models/distil_bert/distil_bert_backbone.py @@ -68,10 +68,8 @@ class DistilBertBackbone(Backbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained DistilBERT encoder. diff --git a/keras_nlp/models/distil_bert/distil_bert_classifier.py b/keras_nlp/models/distil_bert/distil_bert_classifier.py index 1ae94cb2fa..22a68dda8a 100644 --- a/keras_nlp/models/distil_bert/distil_bert_classifier.py +++ b/keras_nlp/models/distil_bert/distil_bert_classifier.py @@ -97,10 +97,8 @@ class DistilBertClassifier(Task): Preprocessed integer data. 
```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2) } labels = [0, 3] diff --git a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py index 1bc7f0dbf1..b307ea1031 100644 --- a/keras_nlp/models/distil_bert/distil_bert_masked_lm.py +++ b/keras_nlp/models/distil_bert/distil_bert_masked_lm.py @@ -84,13 +84,9 @@ class DistilBertMaskedLM(Task): ```python # Create preprocessed batch where 0 is the mask token. features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)) + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2), + "mask_positions": np.array([[2, 4]] * 2) } # Labels are the original masked values. labels = [[3, 5]] * 2 diff --git a/keras_nlp/models/f_net/f_net_backbone.py b/keras_nlp/models/f_net/f_net_backbone.py index 2965676066..166dae56c8 100644 --- a/keras_nlp/models/f_net/f_net_backbone.py +++ b/keras_nlp/models/f_net/f_net_backbone.py @@ -70,10 +70,8 @@ class FNetBackbone(Backbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "segment_ids": tf.constant( - [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained BERT encoder. 
diff --git a/keras_nlp/models/f_net/f_net_classifier.py b/keras_nlp/models/f_net/f_net_classifier.py index ca15a52392..3bb812090a 100644 --- a/keras_nlp/models/f_net/f_net_classifier.py +++ b/keras_nlp/models/f_net/f_net_classifier.py @@ -87,10 +87,8 @@ class FNetClassifier(Task): Preprocessed integer data. ```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "segment_ids": tf.constant( - [[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "segment_ids": np.array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2), } labels = [0, 3] diff --git a/keras_nlp/models/f_net/f_net_masked_lm.py b/keras_nlp/models/f_net/f_net_masked_lm.py index f6ecd549d8..cc8d44eb62 100644 --- a/keras_nlp/models/f_net/f_net_masked_lm.py +++ b/keras_nlp/models/f_net/f_net_masked_lm.py @@ -79,13 +79,9 @@ class FNetMaskedLM(Task): ```python # Create a preprocessed dataset where 0 is the mask token. features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "segment_ids": tf.constant( - [[0, 0, 0, 1, 1, 1, 0, 0]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)) + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "segment_ids": np.array([[0, 0, 0, 1, 1, 1, 0, 0]] * 2), + "mask_positions": np.array([[2, 4]] * 2) } # Labels are the original masked values. 
labels = [[3, 5]] * 2 diff --git a/keras_nlp/models/gpt2/gpt2_backbone.py b/keras_nlp/models/gpt2/gpt2_backbone.py index 40aefda88b..6a00e6ecc5 100644 --- a/keras_nlp/models/gpt2/gpt2_backbone.py +++ b/keras_nlp/models/gpt2/gpt2_backbone.py @@ -69,10 +69,8 @@ class GPT2Backbone(Backbone): Example usage: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained GPT-2 decoder. diff --git a/keras_nlp/models/gpt2/gpt2_causal_lm.py b/keras_nlp/models/gpt2/gpt2_causal_lm.py index 2bc67088e7..bd99128f00 100644 --- a/keras_nlp/models/gpt2/gpt2_causal_lm.py +++ b/keras_nlp/models/gpt2/gpt2_causal_lm.py @@ -99,8 +99,8 @@ class GPT2CausalLM(GenerativeTask): # Prompt the model with `5338, 318` (the token ids for `"Who is"`). # Use `"padding_mask"` to indicate values that should not be overridden. prompt = { - "token_ids": tf.constant([[5338, 318, 0, 0, 0]] * 2), - "padding_mask": tf.constant([[1, 1, 0, 0, 0]] * 2), + "token_ids": np.array([[5338, 318, 0, 0, 0]] * 2), + "padding_mask": np.array([[1, 1, 0, 0, 0]] * 2), } gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset( @@ -120,11 +120,11 @@ class GPT2CausalLM(GenerativeTask): Call `fit()` without preprocessing. 
```python x = { - "token_ids": tf.constant([[50256, 1, 2, 3, 4]] * 2), - "padding_mask": tf.constant([[1, 1, 1, 1, 1]] * 2), + "token_ids": np.array([[50256, 1, 2, 3, 4]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1]] * 2), } - y = tf.constant([[1, 2, 3, 4, 50256]] * 2) - sw = tf.constant([[1, 1, 1, 1, 1]] * 2) + y = np.array([[1, 2, 3, 4, 50256]] * 2) + sw = np.array([[1, 1, 1, 1, 1]] * 2) gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset( "gpt2_base_en", diff --git a/keras_nlp/models/opt/opt_backbone.py b/keras_nlp/models/opt/opt_backbone.py index fb7bb0bd70..996511f66d 100644 --- a/keras_nlp/models/opt/opt_backbone.py +++ b/keras_nlp/models/opt/opt_backbone.py @@ -67,10 +67,8 @@ class OPTBackbone(Backbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12) - ), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained OPT decoder diff --git a/keras_nlp/models/opt/opt_causal_lm.py b/keras_nlp/models/opt/opt_causal_lm.py index 0bdedf1ed9..06730f9010 100644 --- a/keras_nlp/models/opt/opt_causal_lm.py +++ b/keras_nlp/models/opt/opt_causal_lm.py @@ -99,8 +99,8 @@ class OPTCausalLM(GenerativeTask): # Prompt the model with `5338, 318` (the token ids for `"Who is"`). # Use `"padding_mask"` to indicate values that should not be overridden. prompt = { - "token_ids": tf.constant([[5338, 318, 0, 0, 0]] * 2), - "padding_mask": tf.constant([[1, 1, 0, 0, 0]] * 2), + "token_ids": np.array([[5338, 318, 0, 0, 0]] * 2), + "padding_mask": np.array([[1, 1, 0, 0, 0]] * 2), } opt_lm = keras_nlp.models.OPTCausalLM.from_preset( @@ -120,11 +120,11 @@ class OPTCausalLM(GenerativeTask): Call `fit()` without preprocessing. 
```python x = { - "token_ids": tf.constant([[1, 2, 3, 4, 5]] * 2), - "padding_mask": tf.constant([[1, 1, 1, 1, 1]] * 2), + "token_ids": np.array([[1, 2, 3, 4, 5]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1]] * 2), } - y = tf.constant([[2, 3, 4, 5, 0]] * 2) - sw = tf.constant([[1, 1, 1, 1, 1]] * 2) + y = np.array([[2, 3, 4, 5, 0]] * 2) + sw = np.array([[1, 1, 1, 1, 1]] * 2) opt_lm = keras_nlp.models.OPTCausalLM.from_preset( "opt_base_en", diff --git a/keras_nlp/models/roberta/roberta_backbone.py b/keras_nlp/models/roberta/roberta_backbone.py index 0166fb66c0..f8440b0c02 100644 --- a/keras_nlp/models/roberta/roberta_backbone.py +++ b/keras_nlp/models/roberta/roberta_backbone.py @@ -67,8 +67,7 @@ class RobertaBackbone(Backbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } diff --git a/keras_nlp/models/roberta/roberta_classifier.py b/keras_nlp/models/roberta/roberta_classifier.py index 3f838c1d4c..c5e9f53de1 100644 --- a/keras_nlp/models/roberta/roberta_classifier.py +++ b/keras_nlp/models/roberta/roberta_classifier.py @@ -87,10 +87,8 @@ class RobertaClassifier(Task): Preprocessed integer data. 
```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2), } labels = [0, 3] diff --git a/keras_nlp/models/roberta/roberta_masked_lm.py b/keras_nlp/models/roberta/roberta_masked_lm.py index 659e6c2e00..37f5dc3920 100644 --- a/keras_nlp/models/roberta/roberta_masked_lm.py +++ b/keras_nlp/models/roberta/roberta_masked_lm.py @@ -82,13 +82,9 @@ class RobertaMaskedLM(Task): ```python # Create a preprocessed dataset where 0 is the mask token. features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)) + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2), + "mask_positions": np.array([[2, 4]] * 2) } # Labels are the original masked values. 
labels = [[3, 5]] * 2 diff --git a/keras_nlp/models/whisper/whisper_backbone.py b/keras_nlp/models/whisper/whisper_backbone.py index 1f68853ba1..d13961f1d8 100644 --- a/keras_nlp/models/whisper/whisper_backbone.py +++ b/keras_nlp/models/whisper/whisper_backbone.py @@ -78,10 +78,10 @@ class WhisperBackbone(Backbone): ```python input_data = { - "encoder_features": tf.ones(shape=(1, 12, 80), dtype="int64"), - "decoder_token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "decoder_padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0], shape=(1, 12) + "encoder_features": np.ones(shape=(1, 12, 80), dtype="int32"), + "decoder_token_ids": np.ones(shape=(1, 12), dtype="int32"), + "decoder_padding_mask": np.array( + [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] ), } diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py index bd25322394..81a069cb18 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py @@ -58,9 +58,8 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone): Examples: ```python input_data = { - "token_ids": tf.ones(shape=(1, 12), dtype="int64"), - "padding_mask": tf.constant( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)), + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), } # Pretrained XLM-R encoder. diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py index 344d19b8a6..1eec0588e7 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py @@ -89,10 +89,8 @@ class XLMRobertaClassifier(Task): Preprocessed integer data. 
```python features = { - "token_ids": tf.ones(shape=(2, 12), dtype="int64"), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12) - ), + "token_ids": np.ones(shape=(2, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2), } labels = [0, 3] diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py index eeeb051366..80715624f5 100644 --- a/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py +++ b/keras_nlp/models/xlm_roberta/xlm_roberta_masked_lm.py @@ -85,13 +85,9 @@ class XLMRobertaMaskedLM(Task): ```python # Create a preprocessed dataset where 0 is the mask token. features = { - "token_ids": tf.constant( - [[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8) - ), - "padding_mask": tf.constant( - [[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8) - ), - "mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)) + "token_ids": np.array([[1, 2, 0, 4, 0, 6, 7, 8]] * 2), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1]] * 2), + "mask_positions": np.array([[2, 4]] * 2) } # Labels are the original masked values. labels = [[3, 5]] * 2 diff --git a/keras_nlp/samplers/beam_sampler.py b/keras_nlp/samplers/beam_sampler.py index 0e4ac64668..077f250b91 100644 --- a/keras_nlp/samplers/beam_sampler.py +++ b/keras_nlp/samplers/beam_sampler.py @@ -53,14 +53,14 @@ class BeamSampler(Sampler): def next(prompt, cache, index): prompt_batch_size = tf.shape(prompt)[0] - hidden_states = tf.ones((prompt_batch_size, 10)) + hidden_states = np.ones((prompt_batch_size, 10)) # A uniform distribution over our alphabet. 
- logits = tf.ones((prompt_batch_size, vocab_size)) + logits = np.ones((prompt_batch_size, vocab_size)) return logits, hidden_states, cache output = keras_nlp.samplers.BeamSampler()( next=next, - prompt=tf.fill((batch_size, length), char_lookup["z"]), + prompt=np.full((batch_size, length), char_lookup["z"], dtype="int32"), index=5, ) print(["".join([int_lookup[i] for i in s]) for s in output.numpy()]) @@ -76,14 +76,14 @@ def next(prompt, cache, index): def next(prompt, cache, index): prompt_batch_size = tf.shape(prompt)[0] - hidden_states = tf.ones((prompt_batch_size, 10)) + hidden_states = np.ones((prompt_batch_size, 10)) # A uniform distribution over our alphabet. - logits = tf.ones((batch_size, vocab_size)) + logits = np.ones((batch_size, vocab_size)) return logits, hidden_states, cache beams, probs = keras_nlp.samplers.BeamSampler(return_all_beams=True)( next=next, - prompt=tf.fill((batch_size, length,), char_lookup['z']), + prompt=np.full((batch_size, length,), char_lookup['z'], dtype="int32"), index=5, ) diff --git a/keras_nlp/samplers/contrastive_sampler.py b/keras_nlp/samplers/contrastive_sampler.py index ea5d668c54..5a5f1fd543 100644 --- a/keras_nlp/samplers/contrastive_sampler.py +++ b/keras_nlp/samplers/contrastive_sampler.py @@ -56,16 +56,16 @@ class ContrastiveSampler(Sampler): def next(prompt, cache, index): prompt_batch_size = tf.shape(prompt)[0] - hidden_states = tf.ones((prompt_batch_size, hidden_size)) + hidden_states = np.ones((prompt_batch_size, hidden_size)) # A uniform distribution over our alphabet. 
- logits = tf.ones((prompt_batch_size, vocab_size)) + logits = np.ones((prompt_batch_size, vocab_size)) return logits, hidden_states, cache output = keras_nlp.samplers.ContrastiveSampler()( next=next, - prompt=tf.fill((batch_size, length), char_lookup["z"]), + prompt=np.full((batch_size, length), char_lookup["z"], dtype="int32"), index=index, - hidden_states=tf.ones([batch_size, index, hidden_size]), + hidden_states=np.ones([batch_size, index, hidden_size]), ) print(["".join([int_lookup[i] for i in s]) for s in output.numpy()]) # >>> "zzzzzeeeeeee" diff --git a/keras_nlp/samplers/greedy_sampler.py b/keras_nlp/samplers/greedy_sampler.py index ed087f4c4f..62bbb06ed3 100644 --- a/keras_nlp/samplers/greedy_sampler.py +++ b/keras_nlp/samplers/greedy_sampler.py @@ -39,14 +39,14 @@ class GreedySampler(Sampler): batch_size, length, vocab_size = 1, 12, len(int_lookup) def next(prompt, cache, index): - hidden_states = tf.ones((batch_size, 10)) + hidden_states = np.ones((batch_size, 10)) # A uniform distribution over our alphabet. - logits = tf.ones((batch_size, vocab_size)) + logits = np.ones((batch_size, vocab_size)) return logits, hidden_states, cache output = keras_nlp.samplers.GreedySampler()( next=next, - prompt=tf.fill((batch_size, length,), char_lookup['z']), + prompt=np.full((batch_size, length,), char_lookup['z'], dtype="int32"), index=5, ) print(["".join([int_lookup[i] for i in s]) for s in output.numpy()]) diff --git a/keras_nlp/samplers/random_sampler.py b/keras_nlp/samplers/random_sampler.py index baf13cf875..e18de34f2d 100644 --- a/keras_nlp/samplers/random_sampler.py +++ b/keras_nlp/samplers/random_sampler.py @@ -44,14 +44,14 @@ class RandomSampler(Sampler): batch_size, length, vocab_size = 1, 12, len(int_lookup) def next(prompt, state, index): - hidden_states = tf.ones((batch_size, 10)) + hidden_states = np.ones((batch_size, 10)) # A uniform distribution over our alphabet. 
- logits = tf.ones((batch_size, vocab_size)) + logits = np.ones((batch_size, vocab_size)) return logits, hidden_states, state output = keras_nlp.samplers.RandomSampler()( next=next, - prompt=tf.fill((batch_size, length,), char_lookup['z']), + prompt=np.full((batch_size, length,), char_lookup['z'], dtype="int32"), index=5, ) print(["".join([int_lookup[i] for i in s]) for s in output.numpy()]) diff --git a/keras_nlp/samplers/top_k_sampler.py b/keras_nlp/samplers/top_k_sampler.py index 39d50d2ad4..bceacd0a00 100644 --- a/keras_nlp/samplers/top_k_sampler.py +++ b/keras_nlp/samplers/top_k_sampler.py @@ -45,14 +45,14 @@ class TopKSampler(Sampler): batch_size, length, vocab_size = 1, 12, len(int_lookup) def next(prompt, cache, index): - hidden_states = tf.ones((batch_size, 10)) + hidden_states = np.ones((batch_size, 10)) # A uniform distribution over our alphabet. - logits = tf.ones((batch_size, vocab_size)) + logits = np.ones((batch_size, vocab_size)) return logits, hidden_states, cache output = keras_nlp.samplers.TopKSampler(k=3)( next=next, - prompt=tf.fill((batch_size, length,), char_lookup['z']), + prompt=np.full((batch_size, length,), char_lookup['z'], dtype="int32"), index=5, ) print(["".join([int_lookup[i] for i in s]) for s in output.numpy()]) diff --git a/keras_nlp/samplers/top_p_sampler.py b/keras_nlp/samplers/top_p_sampler.py index 8ff3147b72..608f227121 100644 --- a/keras_nlp/samplers/top_p_sampler.py +++ b/keras_nlp/samplers/top_p_sampler.py @@ -53,14 +53,14 @@ class TopPSampler(Sampler): batch_size, length, vocab_size = 1, 12, len(int_lookup) def next(prompt, cache, index): - hidden_states = tf.ones((batch_size, 10)) + hidden_states = np.ones((batch_size, 10)) # A uniform distribution over our alphabet. 
- logits = tf.ones((batch_size, vocab_size)) + logits = np.ones((batch_size, vocab_size)) return logits, hidden_states, cache output = keras_nlp.samplers.TopPSampler(p=0.1)( next=next, - prompt=tf.fill((batch_size, length,), char_lookup['z']), + prompt=np.full((batch_size, length,), char_lookup['z'], dtype="int32"), index=5, ) print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])