
Commit ba982e2

Fix tests and docstrings
1 parent 3001685 commit ba982e2

12 files changed with 458 additions and 698 deletions

keras_nlp/models/gpt2/gpt2_causal_lm.py

Lines changed: 43 additions & 21 deletions
@@ -17,15 +17,15 @@

 import tensorflow as tf

-import keras_nlp
+from keras_nlp import samplers
 from keras_nlp.api_export import keras_nlp_export
 from keras_nlp.models.gpt2.gpt2_backbone import GPT2Backbone
 from keras_nlp.models.gpt2.gpt2_causal_lm_preprocessor import (
     GPT2CausalLMPreprocessor,
 )
 from keras_nlp.models.gpt2.gpt2_presets import backbone_presets
 from keras_nlp.models.task import Task
-from keras_nlp.samplers import serialize
+from keras_nlp.utils.keras_utils import is_xla_compatible
 from keras_nlp.utils.python_utils import classproperty
 from keras_nlp.utils.tf_utils import truncate_at

@@ -37,8 +37,12 @@ class GPT2CausalLM(Task):
     A causal language model (LM) predicts the next token based on previous
     tokens, which is the way GPT2 gets pretrained. You can finetune
     `GPT2CausalLM` to generate text similar to
-    the custom dataset. `GPT2CausalLM` also has a method `generate()`, which
-    generates text based on given prompt.
+    the custom dataset.
+
+    `GPT2CausalLM` has a method `generate()`, which generates text based on a
+    prompt. The generation strategy used is controlled by an additional
+    `sampler` argument on `compile()`. You can recompile the model with
+    different samplers to control generation.

     This model can optionally be configured with a `preprocessor` layer, in
     which case it will automatically apply preprocessing to raw inputs during

@@ -67,15 +71,13 @@ class GPT2CausalLM(Task):
     gpt2_lm.generate(["This is a", "Where are you"], max_length=30)
     ```

-    Use a custom sampler for text generation.
+    Compile the `generate()` function with custom samplers.
     ```python
     gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en")
+    gpt2_lm.compile(sampler="top_p")
+    gpt2_lm.generate("I want to say", max_length=30)

-    # Use string identifier to set sampler.
-    gpt2_lm.generate("I want to say", max_length=30, sampler="top_p")
-
-    # Construct a sampler instance.
-    sampler = keras_nlp.samplers.BeamSampler(num_beams=2)
-    gpt2_lm.generate("I want to say", max_length=30, sampler=sampler)
+    gpt2_lm.compile(sampler=keras_nlp.samplers.BeamSampler(num_beams=2))
+    gpt2_lm.generate("I want to say", max_length=30)
     ```

@@ -189,8 +191,8 @@ def __init__(

         self.backbone = backbone
         self.preprocessor = preprocessor
-        self.sampler = None
         self.generate_function = None
+        self.sampler = samplers.get("top_k")

     @classproperty
     def presets(cls):

@@ -260,12 +262,30 @@ def build_empty_cache(self, batch_size, max_length):
         shape = [batch_size, num_layers, 2, max_length, num_heads, head_dim]
         return tf.zeros(shape)

-    def make_generate_function(self, sampler):
+    def compile(
+        self,
+        *args,
+        run_eagerly=False,
+        jit_compile=True,
+        sampler="top_k",
+        **kwargs,
+    ):
+        jit_compile = jit_compile and is_xla_compatible(self)
+        jit_compile = jit_compile and not run_eagerly
+        super().compile(
+            *args,
+            run_eagerly=run_eagerly,
+            jit_compile=jit_compile,
+            **kwargs,
+        )
+        # Clear the compiled generate function.
+        self.generate_function = None
+        self.sampler = samplers.get(sampler)
+
+    def make_generate_function(self):
         """Create or return the compiled generation function."""
-        # If our sampler has not changed, re-use the compiled function.
-        if self.sampler and serialize(self.sampler) == serialize(sampler):
+        if self.generate_function is not None:
             return self.generate_function
-        self.sampler = sampler

         def fn(prompt, input_mask, min_length, max_length):
             batch_size = tf.shape(prompt)[0]

@@ -284,9 +304,9 @@ def next(prompt, state, index):
                 )
                 return tf.squeeze(probs, axis=1), state

-            return sampler(
-                prompt=prompt,
+            return self.sampler(
                 next=next,
+                prompt=prompt,
                 state=cache,
                 index=min_length,
                 mask=input_mask,

@@ -306,7 +326,6 @@ def generate(
         self,
         prompt,
         max_length,
-        sampler="top_k",
     ):
         """Generate text.

@@ -327,9 +346,11 @@ def generate(
                 "`self.preprocessor` is `None`, please make sure "
                 "`preprocessor` is set before calling `generate`."
             )
-        sampler = keras_nlp.samplers.get(sampler)

         # Tokenize.
+        prompt = tf.convert_to_tensor(prompt)
+        input_is_scalar = prompt.shape.rank == 0
+        prompt = prompt[tf.newaxis] if input_is_scalar else prompt
         prompt = self.preprocessor.tokenizer(prompt)

         # Pad ragged to dense tensors.

@@ -339,12 +360,13 @@ def generate(
         prompt = prompt.to_tensor(shape=padded_shape)

         # Run the (possibly compiled) generate function on dense inputs.
-        generate_function = self.make_generate_function(sampler)
+        generate_function = self.make_generate_function()
         output = generate_function(prompt, input_mask, min_length, max_length)

         # Truncate back to ragged to account for end of sequence ids.
         end_token_id = self.preprocessor.tokenizer.end_token_id
         output = truncate_at(output, end_token_id, input_mask)

         # Detokenize.
-        return self.preprocessor.tokenizer.detokenize(output)
+        output = self.preprocessor.tokenizer.detokenize(output)
+        return tf.squeeze(output, 0) if input_is_scalar else output
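
Taken together, these changes move sampler configuration from `generate()` onto `compile()`. A minimal sketch of the resulting workflow, assuming only what the diff above shows (the `gpt2_base_en` preset and the `"top_p"`/`"top_k"` string identifiers):

```python
import keras_nlp

gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en")

# `compile()` now owns the sampling strategy. Each call clears the cached
# generate function, so the next `generate()` traces a fresh function.
gpt2_lm.compile(sampler="top_p")
gpt2_lm.generate("I want to say", max_length=30)

# Recompile with a sampler instance to switch strategies.
gpt2_lm.compile(sampler=keras_nlp.samplers.BeamSampler(num_beams=2))
gpt2_lm.generate("I want to say", max_length=30)
```

Between compiles, repeated `generate()` calls reuse the cached function: `make_generate_function()` now returns early whenever `self.generate_function` is already set.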

keras_nlp/models/gpt2/gpt2_causal_lm_test.py

Lines changed: 8 additions & 6 deletions
@@ -122,25 +122,27 @@ def test_gpt2_causal_lm_fit_no_preprocessing(self, jit_compile):
         self.causal_lm_no_preprocessing.fit(self.preprocessed_dataset)

     @parameterized.named_parameters(
-        ("non_jit_compile_cache", False, True),
-        ("non_jit_compile_non_cache", False, False),
-        ("jit_compile_non_cache", True, False),
+        ("jit_compile_false", False), ("jit_compile_true", True)
     )
-    def test_gpt2_causal_lm_generate(self, jit_compile, use_cache):
+    def test_compilation(self, jit_compile):
+        # Tensor input.
         self.causal_lm.compile(jit_compile=jit_compile)
         self.causal_lm.generate(
             self.raw_batch,
             max_length=10,
         )
-
-        # String input
+        first_fn = self.causal_lm.generate_function
+        # String input.
         prompt = " airplane"
         generated = self.causal_lm.generate(
             prompt,
             max_length=10,
         )
         generated = generated.numpy().decode("utf-8")
         self.assertTrue(prompt in generated)
+        second_fn = self.causal_lm.generate_function
+        # Assert we did not recompile.
+        self.assertEqual(first_fn, second_fn)

     @parameterized.named_parameters(
         ("tf_format", "tf", "model"),

keras_nlp/samplers/beam_sampler.py

Lines changed: 36 additions & 43 deletions
@@ -42,30 +42,23 @@ class BeamSampler(Sampler):

     Examples:
     ```python
-    VOCAB_SIZE = 10
-
-    # Create a dummy model to predict the next token.
-    model = keras.Sequential(
-        [
-            keras.Input(shape=[None]),
-            keras.layers.Embedding(
-                input_dim=VOCAB_SIZE,
-                output_dim=16,
-            ),
-            keras.layers.Dense(VOCAB_SIZE, activation="softmax"),
-        ]
+    # Use a simple alphabet of lowercase characters to [0, 26).
+    int_lookup = {i: chr(i + ord('a')) for i in range(26)}
+    char_lookup = {v: k for k, v in int_lookup.items()}
+    batch_size, length, vocab_size = 1, 12, len(int_lookup)
+
+    def next(prompt, state, index):
+        # A uniform distribution over our alphabet.
+        probs = tf.ones((batch_size, vocab_size))
+        return probs, state
+
+    output = keras_nlp.samplers.BeamSampler()(
+        next=next,
+        prompt=tf.fill((batch_size, length,), char_lookup['z']),
+        index=5,
     )
-
-    # Define a function that outputs the next token's probability for each
-    # token in the input sequence.
-    def token_probability_fn(inputs, mask):
-        return model(inputs)
-
-    prompt = tf.fill((8, 1), 1)
-
-    sampler = keras_nlp.samplers.BeamSampler(num_beams=3)
-    # Print the generated sequence (token ids).
-    print(sampler(prompt, token_probability_fn, max_length=10))
+    print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])
+    # >>> "zzzzzaaaaaaa"
     ```
     """

@@ -78,8 +71,8 @@ def __init__(

     def __call__(
         self,
-        prompt,
         next,
+        prompt,
         index=0,
         state=None,
         mask=None,

@@ -99,6 +92,17 @@ def unflatten(x):
             unflat_shape = [batch_size, self.num_beams] + x.shape.as_list()[1:]
             return tf.reshape(x, shape=unflat_shape)

+        mask = tf.zeros_like(prompt, dtype=tf.bool) if mask is None else mask
+        # `tf.while_loop` will not accept `None` as a value for `loop_vars`.
+        state = () if state is None else state
+        # Add extra sequences for each beam.
+        prompt, mask = add_beams(prompt), add_beams(mask)
+        state = tf.nest.map_structure(add_beams, state)
+        # Setup the initial beam log-likelihoods.
+        # On the first loop, make sure only the original beam is considered.
+        beam_probs = tf.constant([[0.0] + [-1e9] * (self.num_beams - 1)])
+        beam_probs = flatten(tf.repeat(beam_probs, batch_size, axis=0))
+
         def cond(prompt, state, index, beam_probs):
             if end_token_id is None:
                 return True

@@ -127,13 +131,13 @@ def body(prompt, state, index, beam_probs):
             # We need `ensure_shape` as `top_k` will change the static shape.
             beam_probs = tf.ensure_shape(flatten(next_probs), beam_probs.shape)

-            # Gather the correct prompt and state beams.
-            prompt = unflatten(prompt)
-            state = tf.nest.map_structure(unflatten, state)
-            prompt = tf.gather(prompt, beam_indices, axis=1, batch_dims=1)
-            state = tf.gather(state, beam_indices, axis=1, batch_dims=1)
-            prompt = flatten(prompt)
-            state = tf.nest.map_structure(flatten, state)
+            def gather_beams(x):
+                x = unflatten(x)
+                x = tf.gather(x, beam_indices, axis=1, batch_dims=1)
+                return flatten(x)
+
+            prompt = gather_beams(prompt)
+            state = tf.nest.map_structure(gather_beams, state)

             # Update each beam with the next token.
             next_token = tf.cast(next_token, prompt.dtype)

@@ -145,25 +149,14 @@ def body(prompt, state, index, beam_probs):
             # Return the iteration of the loop state.
             return (prompt, state, index + 1, beam_probs)

-        mask = tf.zeros_like(prompt, dtype=tf.bool) if mask is None else mask
-        # `tf.while_loop` will not accept `None` as a value for `loop_vars`.
-        state = () if state is None else state
-        # Add extra sequences for each beam.
-        prompt, mask = add_beams(prompt), add_beams(mask)
-        state = tf.nest.map_structure(add_beams, state)
-        # Setup the initial beam log-likelihoods.
-        # On the first loop, make sure only the original beam is considered.
-        beam_probs = tf.constant([[0.0] + [-1e9] * (self.num_beams - 1)])
-        beam_probs = flatten(tf.repeat(beam_probs, batch_size, axis=0))
-
         prompt, _, _, beam_probs = tf.while_loop(
             cond=cond,
             body=body,
             loop_vars=(prompt, state, index, beam_probs),
             maximum_iterations=(max_length - index),
         )

-        # Gather the top beams for each batch index.
+        # Gather the top beam at each batch index.
         prompt, beam_probs = unflatten(prompt), unflatten(beam_probs)
         top_beams = tf.math.argmax(beam_probs, axis=-1)[:, tf.newaxis]
         prompt = tf.gather(prompt, top_beams, axis=1, batch_dims=1)
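
For reference, the new docstring example made self-contained (only the imports are added; everything else is as committed). The callback contract is `next(prompt, state, index) -> (probs, state)`, and `next` now precedes `prompt` in the call signature, matching the caller in `gpt2_causal_lm.py`:

```python
import tensorflow as tf

import keras_nlp

# Use a simple alphabet of lowercase characters mapped to [0, 26).
int_lookup = {i: chr(i + ord("a")) for i in range(26)}
char_lookup = {v: k for k, v in int_lookup.items()}
batch_size, length, vocab_size = 1, 12, len(int_lookup)

def next(prompt, state, index):
    # A uniform distribution over our alphabet.
    probs = tf.ones((batch_size, vocab_size))
    return probs, state

output = keras_nlp.samplers.BeamSampler()(
    next=next,
    prompt=tf.fill((batch_size, length), char_lookup["z"]),
    index=5,
)
print(["".join([int_lookup[i] for i in s]) for s in output.numpy()])
# >>> "zzzzzaaaaaaa"
```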
