2 changes: 1 addition & 1 deletion keras_nlp/models/gpt2/gpt2_backbone.py
@@ -60,7 +60,7 @@ class GPT2Backbone(Backbone):
a two-layer feedforward network for each transformer.
dropout: float. Dropout probability for the Transformer encoder.
max_sequence_length: int. The maximum sequence length that this encoder
can consume. If None, `max_sequence_length` uses the value from
can consume. If `None`, `max_sequence_length` uses the value from
sequence length. This determines the variable shape for positional
embeddings.

7 changes: 4 additions & 3 deletions keras_nlp/models/gpt2/gpt2_causal_lm.py
@@ -37,9 +37,10 @@ class GPT2CausalLM(Task):
"""An end-to-end GPT2 model for causal langauge modeling.

A causal language model (LM) predicts the next token based on previous
tokens the next token based on previous tokens, which is the way GPT2 gets
pretrained. You can finetune `GPT2CausalLM` to generate text similar to
the custom dataset.
tokens. This task setup can be used to train the model unsupervised on
plain text input, or to autoregressively generate plain text similar to
the data used for training. This task can be used for pre-training or
fine-tuning a GPT-2 model, simply by calling `fit()`.

This model has a `generate()` method, which generates text based on a
prompt. The generation strategy used is controlled by an additional
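For context, the pre-train / fine-tune / generate workflow the new docstring describes looks roughly like the sketch below. The preset name and exact call signatures are assumptions about the keras_nlp API at this point, not part of the diff.

    import keras_nlp

    # Load a GPT-2 backbone, preprocessor, and LM head from a preset.
    gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset("gpt2_base_en")

    # Autoregressively generate text from a prompt.
    print(gpt2_lm.generate("The weather today is", max_length=50))

    # Fine-tune on plain text; the attached preprocessor derives the
    # next-token labels and sample weights automatically.
    features = ["I love basketball.", "The sky is blue."]
    gpt2_lm.fit(x=features, batch_size=2)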
6 changes: 3 additions & 3 deletions keras_nlp/models/gpt2/gpt2_causal_lm_preprocessor.py
@@ -29,19 +29,19 @@ class GPT2CausalLMPreprocessor(GPT2Preprocessor):
`keras_nlp.models.GPT2CausalLM`. By default, it will take in batches of
strings, and return outputs in a `(x, y, sample_weight)` format, where the
`y` label is the next token id in the `x` sequence. For use with generation,
pass `return_labels=False` in which case the output will simply be the
pass `return_labels=False`, in which case the output will simply be the
encoded string features.

Args:
tokenizer: A `keras_nlp.models.GPT2Tokenizer` instance.
sequence_length: The length of the packed inputs.
add_start_token: If true, the preprocessor will append the tokenizer
add_start_token: If true, the preprocessor will prepend the tokenizer
start token to each input sequence.
add_end_token: If true, the preprocessor will append the tokenizer
end token to each input sequence.

Call arguments:
x: A string `tf.Tensor` or list of python strings.
x: A string, `tf.Tensor` or list of python strings.
y: Label data. Should always be `None` as the layer generates labels.
sample_weight: Label weights. Should always be `None` as the layer
generates label weights.
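A minimal sketch of the two output modes this docstring describes. The preset name is an assumption, and the `return_labels` call argument is taken from the wording above rather than verified against the code.

    import keras_nlp

    preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
        "gpt2_base_en", sequence_length=128
    )

    # Training mode: (x, y, sample_weight), where y is x shifted by one token.
    x, y, sample_weight = preprocessor("airplane at airport")

    # Generation mode: only the encoded string features are returned.
    x = preprocessor("airplane at airport", return_labels=False)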
10 changes: 5 additions & 5 deletions keras_nlp/models/gpt2/gpt2_preprocessor.py
@@ -34,7 +34,7 @@ class GPT2Preprocessor(Preprocessor):

This preprocessing layer will do 2 things:

- Tokenize the input using the `tokenizer`.
- Tokenize the inputs using the `tokenizer`.
- Construct a dictionary with keys `"token_ids"`, `"padding_mask"`, that can
be passed directly to a `keras_nlp.models.GPT2Backbone`.

@@ -57,20 +57,20 @@ class GPT2Preprocessor(Preprocessor):
Args:
tokenizer: A `keras_nlp.models.GPT2Tokenizer` instance.
sequence_length: The length of the packed inputs.
add_start_token: If true, the preprocessor will append the tokenizer
add_start_token: If true, the preprocessor will prepend the tokenizer
start token to each input sequence.
add_end_token: If true, the preprocessor will append the tokenizer
end token to each input sequence.

Call arguments:
x: A string `tf.Tensor` or list of python strings.
x: A string, `tf.Tensor` or list of python strings.
y: Any label data. Will be passed through unaltered.
sample_weight: Any label weight data. Will be passed through unaltered.
sequence_length: Pass to override the configured `sequence_length` of
the layer.
add_start_token: Pass to override the configure value of
add_start_token: Pass to override the configured value of
`add_start_token` on the layer.
add_end_token: Pass to override the configure value of
add_end_token: Pass to override the configured value of
`add_end_token` on the layer.

Examples:
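The two steps listed in this docstring produce a feature dict that a `GPT2Backbone` can consume directly; a small hedged example (preset name assumed):

    import keras_nlp

    preprocessor = keras_nlp.models.GPT2Preprocessor.from_preset("gpt2_base_en")

    # Tokenize and pack a batch of strings into backbone-ready inputs.
    features = preprocessor(["a quick fox.", "a fox quick."])
    print(features["token_ids"].shape)     # (2, sequence_length)
    print(features["padding_mask"].shape)  # (2, sequence_length)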
2 changes: 1 addition & 1 deletion keras_nlp/models/opt/opt_backbone.py
@@ -58,7 +58,7 @@ class OPTBackbone(Backbone):
a two-layer feedforward network for each transformer decoder layer.
dropout: float. Dropout probability for the Transformer decoder.
max_sequence_length: int. The maximum sequence length that this decoder
can consume. If None, `max_sequence_length` uses the value from
can consume. If `None`, `max_sequence_length` uses the value from
sequence length. This determines the variable shape for positional
embeddings.

7 changes: 4 additions & 3 deletions keras_nlp/models/opt/opt_causal_lm.py
@@ -37,9 +37,10 @@ class OPTCausalLM(Task):
"""An end-to-end OPT model for causal langauge modeling.

A causal language model (LM) predicts the next token based on previous
tokens the next token based on previous tokens, which is the way OPT gets
pretrained. You can finetune `OPTCausalLM` to generate text similar to
the custom dataset.
tokens. This task setup can be used to train the model unsupervised on
plain text input, or to autoregressively generate plain text similar to
the data used for training. This task can be used for pre-training or
fine-tuning an OPT model, simply by calling `fit()`.

This model has a `generate()` method, which generates text based on a
prompt. The generation strategy used is controlled by an additional
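The workflow mirrors the GPT-2 sketch above; only the task class and preset change (the preset name here is an assumption):

    import keras_nlp

    opt_lm = keras_nlp.models.OPTCausalLM.from_preset("opt_125m_en")
    print(opt_lm.generate("The weather today is", max_length=50))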
4 changes: 2 additions & 2 deletions keras_nlp/models/opt/opt_causal_lm_preprocessor.py
@@ -35,13 +35,13 @@ class OPTCausalLMPreprocessor(OPTPreprocessor):
Args:
tokenizer: A `keras_nlp.models.OPTTokenizer` instance.
sequence_length: The length of the packed inputs.
add_start_token: If true, the preprocessor will append the tokenizer
add_start_token: If true, the preprocessor will prepend the tokenizer
start token to each input sequence.
add_end_token: If true, the preprocessor will append the tokenizer
end token to each input sequence.

Call arguments:
x: A string `tf.Tensor` or list of python strings.
x: A string, `tf.Tensor` or list of python strings.
y: Label data. Should always be `None` as the layer generates labels.
sample_weight: Label weights. Should always be `None` as the layer
generates label weights.
2 changes: 1 addition & 1 deletion keras_nlp/models/opt/opt_preprocessor.py
@@ -63,7 +63,7 @@ class OPTPreprocessor(Preprocessor):
end token to each input sequence.

Call arguments:
x: A string `tf.Tensor` or list of python strings.
x: A string, `tf.Tensor` or list of python strings.
y: Any label data. Will be passed through unaltered.
sample_weight: Any label weight data. Will be passed through unaltered.
sequence_length: Pass to override the configured `sequence_length` of
14 changes: 8 additions & 6 deletions keras_nlp/samplers/sampler.py
@@ -31,15 +31,17 @@
cache: Optional. A tensor or nested structure of tensors that will be
updated by each call to `next`. This can be used to cache
computations from early iterations of the generative loop.
index: Optional. The first index to start sampling at.
index: Optional. The first index of `prompt` to start sampling at.
Usually this is set as the length of the shortest non-padding
sequence in `prompt`.
mask: Optional. A 2D integer tensor with the same shape as `prompt`.
Locations which are `True` in the mask are never updated during
sampling. Often this will mark all ids in `prompt` which were
present in the original input.
sampling. Usually used to mark all locations in the dense prompt
tensor which were present in a user input.
end_token_id: Optional. The token marking the end of the sequence. If
specified, sampling will stop as soon as all sequences in the prompt
produce an `end_token_id` in a location where `mask` is `False`.
"""
"""


@format_docstring(call_args=call_args_docstring)
Expand All @@ -60,8 +62,8 @@ class Sampler:

- Override the `get_next_token()` method, which computes the next token
based on a probability distribution over all possible vocab entries.
- Override `__call__`, if the sampling method need additional cache beyond
the next tokens probability distribution to sample a sequence.
- Override `__call__`, if the sampling method needs additional information
beyond the next token's probability distribution to sample a sequence.

Please check available subclass samplers for examples.

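As a hedged illustration of the first override path described above, a greedy sampler can be written by subclassing `Sampler` and overriding `get_next_token()`. The argument shape and the TensorFlow-only implementation are assumptions for this version of the library; keras_nlp already ships an equivalent built-in sampler.

    import tensorflow as tf
    import keras_nlp

    class MyGreedySampler(keras_nlp.samplers.Sampler):
        """Pick the most probable token at every generation step."""

        def get_next_token(self, probabilities):
            # probabilities: (batch_size, vocab_size) distribution over the vocab.
            return tf.argmax(probabilities, axis=-1, output_type=tf.int32)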