From 98258000b574a2c388a015df51780a441eec6b56 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Mon, 20 Mar 2023 04:05:59 +0200
Subject: [PATCH 1/4] Rework docstring of XLMRoberta

---
 .../xlm_roberta/xlm_roberta_backbone.py       |  38 +++--
 .../xlm_roberta/xlm_roberta_classifier.py     | 132 ++++++++--------
 .../xlm_roberta/xlm_roberta_preprocessor.py   | 141 +++++++++---------
 .../xlm_roberta/xlm_roberta_tokenizer.py      |  35 +++--
 4 files changed, 166 insertions(+), 180 deletions(-)

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
index 6bef8e2c0a..840ae31f07 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
@@ -24,17 +24,16 @@
 @keras_nlp_export("keras_nlp.models.XLMRobertaBackbone")
 class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
-    """XLM-RoBERTa encoder.
+    """An XLM-RoBERTa encoder.
 
-    This network implements a bi-directional Transformer-based encoder as
-    described in
-    ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
-    It includes the embedding lookups and transformer layers, but does not
+    This class implements a bi-directional Transformer-based encoder as
+    described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
+    It includes the embedding lookups and transformer layers, but it does not
     include the masked language modeling head used during pretraining.
 
     The default constructor gives a fully customizable, randomly initialized
-    RoBERTa encoder with any number of layers, heads, and embedding
-    dimensions. To load preset architectures and weights, use the `from_preset`
+    RoBERTa encoder with any number of layers, heads, and embedding dimensions.
+    To load preset architectures and weights, use the `from_preset()`
     constructor.
 
     Disclaimer: Pre-trained models are provided on an "as is" basis, without
@@ -53,9 +52,10 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
         dropout: float. Dropout probability for the Transformer encoder.
         max_sequence_length: int. The maximum sequence length this encoder
             can consume. The sequence length of the input must be less than
-            `max_sequence_length`.
+            `max_sequence_length`. This determines the variable
+            shape for positional embeddings.
 
-    Example usage:
+    Examples:
     ```python
     input_data = {
         "token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
@@ -63,24 +63,22 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
             [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
     }
 
-    # Pretrained XLM-R encoder
+    # Pretrained XLM-R encoder.
     model = keras_nlp.models.XLMRobertaBackbone.from_preset(
         "xlm_roberta_base_multi",
     )
-    output = model(input_data)
+    model(input_data)
 
-    # Randomly initialized XLM-R model with custom config
+    # Randomly initialized XLM-R model with custom config.
     model = keras_nlp.models.XLMRobertaBackbone(
         vocabulary_size=250002,
-        num_layers=12,
-        num_heads=12,
-        hidden_dim=768,
-        intermediate_dim=3072,
-        max_sequence_length=12
+        num_layers=4,
+        num_heads=4,
+        hidden_dim=256,
+        intermediate_dim=512,
+        max_sequence_length=128
     )
-
-    # Call the model on the input data.
-    output = model(input_data)
+    model(input_data)
     ```
     """

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
index 3ff68b00b3..8096bb7433 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -34,9 +34,9 @@ class XLMRobertaClassifier(Task):
     """An end-to-end XLM-RoBERTa model for classification tasks.
 
     This model attaches a classification head to a
-    `keras_nlp.model.XLMRobertaBackbone` model, mapping from the backbone
-    outputs to logit output suitable for a classification task. For usage of
-    this model with pre-trained weights, see the `from_preset()` method.
+    `keras_nlp.models.XLMRobertaBackbone` instance, mapping from the backbone
+    outputs to logits suitable for a classification task. For usage of
+    this model with pre-trained weights, see the `from_preset()` constructor.
 
     This model can optionally be configured with a `preprocessor` layer, in
     which case it will automatically apply preprocessing to raw inputs during
@@ -49,7 +49,7 @@ class XLMRobertaClassifier(Task):
     [here](https://github.com/facebookresearch/fairseq).
 
     Args:
-        backbone: A `keras_nlp.models.XLMRoberta` instance.
+        backbone: A `keras_nlp.models.XLMRobertaBackbone` instance.
         num_classes: int. Number of classes to predict.
         hidden_dim: int. The size of the pooler layer.
         dropout: float. The dropout probability value, applied to the pooled
@@ -60,85 +60,34 @@ class XLMRobertaClassifier(Task):
 
     Examples:
 
-    Example usage.
+    Raw string data.
     ```python
-    preprocessed_features = {
-        "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
-        "padding_mask": tf.constant(
-            [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
-    }
-    labels = [0, 3]
-
-    # Randomly initialized XLM-RoBERTa encoder
-    backbone = keras_nlp.models.XLMRobertaBackbone(
-        vocabulary_size=250002,
-        num_layers=12,
-        num_heads=12,
-        hidden_dim=768,
-        intermediate_dim=3072,
-        max_sequence_length=12
-    )
-
-    # Create a XLM-RoBERTa classifier and fit your data.
-    classifier = keras_nlp.models.XLMRobertaClassifier(
-        backbone,
-        num_classes=4,
-        preprocessor=None,
-    )
-    classifier.compile(
-        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-    )
-    classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
-
-    # Access backbone programatically (e.g., to change `trainable`)
-    classifier.backbone.trainable = False
-    ```
-
-    Raw string inputs.
-    ```python
-    # Create a dataset with raw string features in an `(x, y)` format.
-    features = ["The quick brown fox jumped.", "I forgot my homework."]
+    features = ["The quick brown fox jumped.", "نسيت الواجب"]
     labels = [0, 3]
 
-    # Create a XLMRobertaClassifier and fit your data.
+    # Pretrained classifier.
     classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
         "xlm_roberta_base_multi",
         num_classes=4,
     )
-    classifier.compile(
-        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
-    )
     classifier.fit(x=features, y=labels, batch_size=2)
-    ```
+    classifier.predict(x=features, batch_size=2)
 
-    Raw string inputs with customized preprocessing.
-    ```python
-    # Create a dataset with raw string features in an `(x, y)` format.
-    features = ["The quick brown fox jumped.", "I forgot my homework."]
-    labels = [0, 3]
-
-    # Use a shorter sequence length.
-    preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
-        "xlm_roberta_base_multi",
-        sequence_length=128,
-    )
-
-    # Create a XLMRobertaClassifier and fit your data.
-    classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
-        "xlm_roberta_base_multi",
-        num_classes=4,
-        preprocessor=preprocessor,
-    )
+    # Re-compile (e.g., with a new learning rate).
     classifier.compile(
         loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+        optimizer=keras.optimizers.Adam(5e-5),
+        jit_compile=True,
     )
+    # Access backbone programmatically (e.g., to change `trainable`).
+    classifier.backbone.trainable = False
+    # Fit again.
     classifier.fit(x=features, y=labels, batch_size=2)
     ```
 
-    Preprocessed inputs.
+    Preprocessed integer data.
     ```python
-    # Create a dataset with preprocessed features in an `(x, y)` format.
-    preprocessed_features = {
+    features = {
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
             [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
@@ -146,16 +95,57 @@ class XLMRobertaClassifier(Task):
     }
     labels = [0, 3]
 
-    # Create a XLMRobertaClassifier and fit your data.
+    # Pretrained classifier without preprocessing.
     classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
         "xlm_roberta_base_multi",
         num_classes=4,
         preprocessor=None,
     )
-    classifier.compile(
-        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    classifier.fit(x=features, y=labels, batch_size=2)
+    ```
+
+    Custom backbone and vocabulary.
+    ```python
+    features = ["The quick brown fox jumped.", "نسيت الواجب"]
+    labels = [0, 3]
+
+    def train_sentencepiece(ds, vocab_size):
+        bytes_io = io.BytesIO()
+        sentencepiece.SentencePieceTrainer.train(
+            sentence_iterator=ds.as_numpy_iterator(),
+            model_writer=bytes_io,
+            vocab_size=vocab_size,
+            model_type="WORD",
+            unk_id=0,
+            bos_id=1,
+            eos_id=2,
+        )
+        return bytes_io.getvalue()
+    ds = tf.data.Dataset.from_tensor_slices(
+        ["the quick brown fox", "the earth is round"]
+    )
+    proto = train_sentencepiece(ds, vocab_size=10)
+    tokenizer = keras_nlp.models.XLMRobertaTokenizer(
+        proto=proto
+    )
+    preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
+        tokenizer,
+        sequence_length=128,
+    )
+    backbone = keras_nlp.models.XLMRobertaBackbone(
+        vocabulary_size=250002,
+        num_layers=4,
+        num_heads=4,
+        hidden_dim=256,
+        intermediate_dim=512,
+        max_sequence_length=128,
     )
-    classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
+    classifier = keras_nlp.models.XLMRobertaClassifier(
+        backbone=backbone,
+        preprocessor=preprocessor,
+        num_classes=4,
    )
+    classifier.fit(x=features, y=labels, batch_size=2)
     ```
     """

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index fa38f1e380..95adb123f3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -34,35 +34,23 @@
 @keras_nlp_export("keras_nlp.models.XLMRobertaPreprocessor")
 class XLMRobertaPreprocessor(Preprocessor):
-    """XLM-RoBERTa preprocessing layer.
+    """An XLM-RoBERTa preprocessing layer which tokenizes and packs inputs.
 
     This preprocessing layer will do three things:
 
-     - Tokenize any number of input segments using the `tokenizer`.
-     - Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
-       `"<pad>"` tokens, i.e., adding a single `"<s>"` at the start of the
-       entire sequence, `"</s></s>"` at the end of each segment, save the last
-       and a `"</s>"` at the end of the entire sequence.
-     - Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
-       that can be passed directly to a XLM-RoBERTa model.
+    1. Tokenize any number of input segments using the `tokenizer`.
+    2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`
+       with the appropriate `"<s>"`, `"</s>"` and `"<pad>"` tokens, i.e., adding
+       a single `"<s>"` at the start of the entire sequence, `"</s></s>"` at the
+       end of each segment, save the last and a `"</s>"` at the end of the
+       entire sequence.
+    3. Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
+       that can be passed directly to an XLM-RoBERTa model.
 
     This layer can be used directly with `tf.data.Dataset.map` to preprocess
     string data in the `(x, y, sample_weight)` format used by `keras.Model.fit`.
 
-    The call method of this layer accepts three arguments, `x`, `y`, and
-    `sample_weight`. `x` can be a python string or tensor representing a single
-    segment, a list of python strings representing a batch of single segments,
-    or a list of tensors representing multiple segments to be packed together.
-    `y` and `sample_weight` are both optional, can have any format, and will be
-    passed through unaltered.
-
-    Special care should be taken when using `tf.data` to map over an unlabeled
-    tuple of string segments. `tf.data.Dataset.map` will unpack this tuple
-    directly into the call arguments of this layer, rather than forward all
-    argument to `x`. To handle this case, it is recommended to explicitly call
-    the layer, e.g. `ds.map(lambda seg1, seg2: preprocessor(x=(seg1, seg2)))`.
-
     Args:
         tokenizer: A `keras_nlp.tokenizers.XLMRobertaTokenizer` instance.
         sequence_length: The length of the packed inputs.
@@ -77,77 +65,84 @@ class XLMRobertaPreprocessor(Preprocessor):
             left-to-right manner and fills up the buckets until we run out of
             budget. It supports an arbitrary number of segments.
 
+    Call arguments:
+        x: A tensor of single string sequences, or a tuple of multiple
+            tensor sequences to be packed together. Inputs may be batched or
+            unbatched. For single sequences, raw python inputs will be converted
+            to tensors. For multiple sequences, pass tensors directly.
+        y: Any label data. Will be passed through unaltered.
+        sample_weight: Any label weight data. Will be passed through unaltered.
+
     Examples:
     ```python
-    # Load the preprocessor from a preset.
-    preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset("xlm_roberta_base_multi")
+    preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
+        "xlm_roberta_base_multi"
+    )
 
     # Tokenize and pack a single sentence.
-    sentence = tf.constant("The quick brown fox jumped.")
-    preprocessor(sentence)
-    # Same output.
     preprocessor("The quick brown fox jumped.")
 
-    # Tokenize and a batch of single sentences.
-    sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
+    # Tokenize a batch of single sentences.
+    preprocessor(["The quick brown fox jumped.", "اسمي اسماعيل"])
+
+    # Preprocess a batch of sentence pairs.
+    # When handling multiple sequences, always convert to tensors first!
+    first = tf.constant(["The quick brown fox jumped.", "اسمي اسماعيل"])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    preprocessor((first, second))
+
+    # Custom vocabulary.
+    def train_sentencepiece(ds, vocab_size):
+        bytes_io = io.BytesIO()
+        sentencepiece.SentencePieceTrainer.train(
+            sentence_iterator=ds.as_numpy_iterator(),
+            model_writer=bytes_io,
+            vocab_size=vocab_size,
+            model_type="WORD",
+            unk_id=0,
+            bos_id=1,
+            eos_id=2,
+        )
+        return bytes_io.getvalue()
+    ds = tf.data.Dataset.from_tensor_slices(
+        ["the quick brown fox", "the earth is round"]
     )
-    preprocessor(sentences)
-    # Same output.
-    preprocessor(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
+    proto = train_sentencepiece(ds, vocab_size=10)
+    tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
+    preprocessor = keras_nlp.models.XLMRobertaPreprocessor(tokenizer)
+    preprocessor("The quick brown fox jumped.")
+    ```
+
+    Mapping with `tf.data.Dataset`.
+    ```python
+    preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
+        "xlm_roberta_base_multi"
     )
 
-    # Tokenize and pack a sentence pair.
-    first_sentence = tf.constant("The quick brown fox jumped.")
-    second_sentence = tf.constant("The fox tripped.")
-    preprocessor((first_sentence, second_sentence))
+    first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    label = tf.constant([1, 1])
 
-    # Map a dataset to preprocess a single sentence.
-    features = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    labels = tf.constant([0, 1])
-    ds = tf.data.Dataset.from_tensor_slices((features, labels))
+    # Map labeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices((first, label))
     ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
 
-    # Map a dataset to preprocess sentence pairs.
-    first_sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    second_sentences = tf.constant(
-        ["The fox tripped.", "Oh look, a whale."]
-    )
-    labels = tf.constant([1, 1])
-    ds = tf.data.Dataset.from_tensor_slices(
-        (
-            (first_sentences, second_sentences), labels
-        )
-    )
+    # Map unlabeled single sentences.
+    ds = tf.data.Dataset.from_tensor_slices(first)
     ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
 
-    # Map a dataset to preprocess unlabeled sentence pairs.
-    first_sentences = tf.constant(
-        ["The quick brown fox jumped.", "Call me Ishmael."]
-    )
-    second_sentences = tf.constant(
-        ["The fox tripped.", "Oh look, a whale."]
-    )
-    ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences))
+    # Map labeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
+    ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+    # Map unlabeled sentence pairs.
+    ds = tf.data.Dataset.from_tensor_slices((first, second))
     # Watch out for tf.data's default unpacking of tuples here!
     # Best to invoke the `preprocessor` directly in this case.
     ds = ds.map(
-        lambda s1, s2: preprocessor(x=(s1, s2)),
+        lambda first, second: preprocessor(x=(first, second)),
         num_parallel_calls=tf.data.AUTOTUNE,
     )
-
-    # Alternatively, you can create a preprocessor from your own vocabulary.
-    # The usage is exactly the same as above.
-    tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm")
-    preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
-        tokenizer=tokenizer,
-        sequence_length=10,
-    )
     ```
     """

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index e077a7643d..9fa173d941 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -27,19 +27,19 @@
 @keras_nlp_export("keras_nlp.models.XLMRobertaTokenizer")
 class XLMRobertaTokenizer(SentencePieceTokenizer):
-    """XLM-RoBERTa tokenizer layer based on SentencePiece.
+    """An XLM-RoBERTa using SentencePiece subword segmentation.
 
     This tokenizer class will tokenize raw strings into integer sequences and
     is based on `keras_nlp.tokenizers.SentencePieceTokenizer`.
     Unlike the underlying tokenizer, it will check for all special tokens needed
     by XLM-RoBERTa models and provides a `from_preset()` method to automatically
-    download a matching vocabulary for a XLM-RoBERTa preset.
+    download a matching vocabulary for an XLM-RoBERTa preset.
 
     The original fairseq implementation of XLM-RoBERTa modifies the indices of
     the SentencePiece tokenizer output. To preserve compatibility, we make the
     same changes, i.e., `"<s>"`, `"<pad>"`, `"</s>"` and `"<unk>"` are mapped to
-    0, 1, 2, 3, respectively, and non-special token indices are shifted right
-    by one.
+    0, 1, 2, and 3, respectively, and non-special token indices are shifted
+    right by one.
 
     If input is a batch of strings (rank > 0), the layer will output a
     `tf.RaggedTensor` where the last dimension of the output is ragged.
 
     `tf.Tensor` with static shape `[None]`.
 
     Args:
-        proto: Either a `string` path to a SentencePiece proto file, or a
+        proto: Either a `string` path to a SentencePiece proto file or a
             `bytes` object with a serialized SentencePiece proto. See the
             [SentencePiece repository](https://github.com/google/sentencepiece)
             for more details on the format.
 
     Examples:
-
     ```python
+    tokenizer = keras_nlp.models.XLMRobertaTokenizer.from_preset(
+        "xlm_roberta_base_multi",
+    )
+
+    # Unbatched inputs.
+    tokenizer("the quick brown fox")
+
+    # Batched inputs.
+    tokenizer(["the quick brown fox", "الأرض كروية"])
+
+    # Detokenization.
+    tokenizer.detokenize(tokenizer("the quick brown fox"))
+
+    # Custom vocabulary
     def train_sentencepiece(ds, vocab_size):
         bytes_io = io.BytesIO()
         sentencepiece.SentencePieceTrainer.train(
@@ -72,18 +85,8 @@ def train_sentencepiece(ds, vocab_size):
     ds = tf.data.Dataset.from_tensor_slices(
         ["the quick brown fox", "the earth is round"]
     )
-
     proto = train_sentencepiece(ds, vocab_size=10)
     tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
-
-    # Batched inputs.
-    tokenizer(["the quick brown fox", "the earth is round"])
-
-    # Unbatched inputs.
-    tokenizer("the quick brown fox")
-
-    # Detokenization.
-    tokenizer.detokenize(tf.constant([[0, 4, 9, 5, 7, 2]]))
     ```
     """

From 19f05f4decaa61c64cef974fad8125ae81ee9c77 Mon Sep 17 00:00:00 2001
From: Mohamed Abu El-Nasr <64566340+abuelnasr0@users.noreply.github.com>
Date: Mon, 20 Mar 2023 13:18:37 +0200
Subject: [PATCH 2/4] Fix typo

---
 keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index 9fa173d941..0aa483ad01 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -27,7 +27,7 @@
 @keras_nlp_export("keras_nlp.models.XLMRobertaTokenizer")
 class XLMRobertaTokenizer(SentencePieceTokenizer):
-    """An XLM-RoBERTa using SentencePiece subword segmentation.
+    """An XLM-RoBERTa tokenizer using SentencePiece subword segmentation.
 
     This tokenizer class will tokenize raw strings into integer sequences and
     is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the
From 8289b15ef062883b5298d0f18a9a78e2a2ee1c28 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Wed, 22 Mar 2023 02:28:20 +0200
Subject: [PATCH 3/4] Add arabic example and Shorten the comment about
 sentencepiece tokenizer

---
 keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py  |  8 ++++----
 .../models/xlm_roberta/xlm_roberta_classifier.py      |  2 +-
 .../models/xlm_roberta/xlm_roberta_preprocessor.py    | 10 +++++-----
 keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py | 11 +++++------
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
index 840ae31f07..db5967cb38 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
@@ -24,7 +24,7 @@
 @keras_nlp_export("keras_nlp.models.XLMRobertaBackbone")
 class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
-    """An XLM-RoBERTa encoder.
+    """An XLM-RoBERTa encoder network.
 
     This class implements a bi-directional Transformer-based encoder as
     described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
@@ -32,8 +32,8 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
     include the masked language modeling head used during pretraining.
 
     The default constructor gives a fully customizable, randomly initialized
-    RoBERTa encoder with any number of layers, heads, and embedding dimensions. 
+    RoBERTa encoder with any number of layers, heads, and embedding dimensions.
-    To load preset architectures and weights, use the `from_preset()` 
+    To load preset architectures and weights, use the `from_preset()`
     constructor.
 
     Disclaimer: Pre-trained models are provided on an "as is" basis, without
@@ -52,7 +52,7 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
         dropout: float. Dropout probability for the Transformer encoder.
         max_sequence_length: int. The maximum sequence length this encoder
             can consume. The sequence length of the input must be less than
-            `max_sequence_length`. This determines the variable 
+            `max_sequence_length`. This determines the variable
             shape for positional embeddings.
 
     Examples:

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
index 8096bb7433..d7cf1dba61 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -91,7 +91,7 @@ class XLMRobertaClassifier(Task):
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
             [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ),
+        ), xlm
     }
     labels = [0, 3]

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index 95adb123f3..6ecb5016e7 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -40,9 +40,9 @@ class XLMRobertaPreprocessor(Preprocessor):
     1. Tokenize any number of input segments using the `tokenizer`.
     2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`
-       with the appropriate `"<s>"`, `"</s>"` and `"<pad>"` tokens, i.e., adding 
-       a single `"<s>"` at the start of the entire sequence, `"</s></s>"` at the 
-       end of each segment, save the last and a `"</s>"` at the end of the 
+       with the appropriate `"<s>"`, `"</s>"` and `"<pad>"` tokens, i.e., adding
+       a single `"<s>"` at the start of the entire sequence, `"</s></s>"` at the
+       end of each segment, save the last and a `"</s>"` at the end of the
        entire sequence.
     3. Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
        that can be passed directly to an XLM-RoBERTa model.
@@ -88,7 +88,7 @@ class XLMRobertaPreprocessor(Preprocessor):
     # Preprocess a batch of sentence pairs.
     # When handling multiple sequences, always convert to tensors first!
     first = tf.constant(["The quick brown fox jumped.", "اسمي اسماعيل"])
-    second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+    second = tf.constant(["The fox tripped.", "الأسد ملك الغابة"])
     preprocessor((first, second))
 
     # Custom vocabulary.
@@ -110,7 +110,7 @@ def train_sentencepiece(ds, vocab_size):
     proto = train_sentencepiece(ds, vocab_size=10)
     tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
     preprocessor = keras_nlp.models.XLMRobertaPreprocessor(tokenizer)
-    preprocessor("The quick brown fox jumped.") 
+    preprocessor("The quick brown fox jumped.")
     ```
 
     Mapping with `tf.data.Dataset`.
     ```python

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index 0aa483ad01..d679ab9ba3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -35,11 +35,10 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
     XLM-RoBERTa models and provides a `from_preset()` method to automatically
     download a matching vocabulary for an XLM-RoBERTa preset.
 
-    The original fairseq implementation of XLM-RoBERTa modifies the indices of
-    the SentencePiece tokenizer output. To preserve compatibility, we make the
-    same changes, i.e., `"<s>"`, `"<pad>"`, `"</s>"` and `"<unk>"` are mapped to
-    0, 1, 2, and 3, respectively, and non-special token indices are shifted
-    right by one.
+    Note: If you are providing your own custom SentencePiece model, the original
+    fairseq implementation of XLM-RoBERTa re-maps some token indices from the
+    underlying sentencepiece output. To preserve compatibility, we do the same
+    re-mapping here.
 
     If input is a batch of strings (rank > 0), the layer will output a
     `tf.RaggedTensor` where the last dimension of the output is ragged.
@@ -57,7 +56,7 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
     tokenizer = keras_nlp.models.XLMRobertaTokenizer.from_preset(
         "xlm_roberta_base_multi",
     )
-    
+
     # Unbatched inputs.
     tokenizer("the quick brown fox")

From 01782be175f9e1778e0ca6604a42e153ba4d2d79 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Wed, 22 Mar 2023 02:36:16 +0200
Subject: [PATCH 4/4] Fix typo

---
 keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
index d7cf1dba61..8096bb7433 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -91,7 +91,7 @@ class XLMRobertaClassifier(Task):
         "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
         "padding_mask": tf.constant(
             [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
-        ), xlm
+        ),
     }
     labels = [0, 3]
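For quick reference, the following is a minimal sketch of the fine-tuning workflow that the reworked docstrings above describe. It only uses calls shown in the patched docstrings (`from_preset`, `fit`, `predict`); the feature strings and label values are illustrative placeholders, and it assumes `keras_nlp` with TensorFlow is installed and the `xlm_roberta_base_multi` preset can be downloaded.

```python
import keras_nlp

# Toy data mirroring the "Raw string data" example in the classifier docstring:
# one English and one Arabic sentence, each with an integer class label.
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

# Load a pretrained classifier with its matching preprocessor attached,
# then fine-tune and predict directly on raw strings.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
    "xlm_roberta_base_multi",
    num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
classifier.predict(x=features, batch_size=2)
```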