diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
index 6bef8e2c0a..db5967cb38 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
@@ -24,17 +24,16 @@
@keras_nlp_export("keras_nlp.models.XLMRobertaBackbone")
class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
- """XLM-RoBERTa encoder.
+ """An XLM-RoBERTa encoder network.
- This network implements a bi-directional Transformer-based encoder as
- described in
- ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
- It includes the embedding lookups and transformer layers, but does not
+ This class implements a bi-directional Transformer-based encoder as
+ described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
+ It includes the embedding lookups and transformer layers, but it does not
include the masked language modeling head used during pretraining.
The default constructor gives a fully customizable, randomly initialized
- RoBERTa encoder with any number of layers, heads, and embedding
- dimensions. To load preset architectures and weights, use the `from_preset`
+ RoBERTa encoder with any number of layers, heads, and embedding dimensions.
+ To load preset architectures and weights, use the `from_preset()`
constructor.
Disclaimer: Pre-trained models are provided on an "as is" basis, without
@@ -53,9 +52,10 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
dropout: float. Dropout probability for the Transformer encoder.
max_sequence_length: int. The maximum sequence length this encoder can
consume. The sequence length of the input must be less than
- `max_sequence_length`.
+ `max_sequence_length`. This determines the variable shape for
+ positional embeddings.
- Example usage:
+ Examples:
```python
input_data = {
"token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
@@ -63,24 +63,22 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
}
- # Pretrained XLM-R encoder
+ # Pretrained XLM-R encoder.
model = keras_nlp.models.XLMRobertaBackbone.from_preset(
"xlm_roberta_base_multi",
)
- output = model(input_data)
+ model(input_data)
- # Randomly initialized XLM-R model with custom config
+ # Randomly initialized XLM-R model with custom config.
model = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
- num_layers=12,
- num_heads=12,
- hidden_dim=768,
- intermediate_dim=3072,
- max_sequence_length=12
+ num_layers=4,
+ num_heads=4,
+ hidden_dim=256,
+ intermediate_dim=512,
+ max_sequence_length=128
)
-
- # Call the model on the input data.
- output = model(input_data)
+ model(input_data)
```
"""
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
index 3ff68b00b3..8096bb7433 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -34,9 +34,9 @@ class XLMRobertaClassifier(Task):
"""An end-to-end XLM-RoBERTa model for classification tasks.
This model attaches a classification head to a
- `keras_nlp.model.XLMRobertaBackbone` model, mapping from the backbone
- outputs to logit output suitable for a classification task. For usage of
- this model with pre-trained weights, see the `from_preset()` method.
+ `keras_nlp.models.XLMRobertaBackbone` instance, mapping from the backbone
+ outputs to logits suitable for a classification task. For usage of
+ this model with pre-trained weights, see the `from_preset()` constructor.
This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -49,7 +49,7 @@ class XLMRobertaClassifier(Task):
[here](https://github.com/facebookresearch/fairseq).
Args:
- backbone: A `keras_nlp.models.XLMRoberta` instance.
+ backbone: A `keras_nlp.models.XLMRobertaBackbone` instance.
num_classes: int. Number of classes to predict.
hidden_dim: int. The size of the pooler layer.
dropout: float. The dropout probability value, applied to the pooled
@@ -60,85 +60,34 @@ class XLMRobertaClassifier(Task):
Examples:
- Example usage.
+ Raw string data.
```python
- preprocessed_features = {
- "token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
- "padding_mask": tf.constant(
- [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
- }
- labels = [0, 3]
-
- # Randomly initialized XLM-RoBERTa encoder
- backbone = keras_nlp.models.XLMRobertaBackbone(
- vocabulary_size=250002,
- num_layers=12,
- num_heads=12,
- hidden_dim=768,
- intermediate_dim=3072,
- max_sequence_length=12
- )
-
- # Create a XLM-RoBERTa classifier and fit your data.
- classifier = keras_nlp.models.XLMRobertaClassifier(
- backbone,
- num_classes=4,
- preprocessor=None,
- )
- classifier.compile(
- loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
- )
- classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
-
- # Access backbone programatically (e.g., to change `trainable`)
- classifier.backbone.trainable = False
- ```
-
- Raw string inputs.
- ```python
- # Create a dataset with raw string features in an `(x, y)` format.
- features = ["The quick brown fox jumped.", "I forgot my homework."]
+ features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]
- # Create a XLMRobertaClassifier and fit your data.
+ # Pretrained classifier.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
)
- classifier.compile(
- loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
- )
classifier.fit(x=features, y=labels, batch_size=2)
- ```
+ classifier.predict(x=features, batch_size=2)
- Raw string inputs with customized preprocessing.
- ```python
- # Create a dataset with raw string features in an `(x, y)` format.
- features = ["The quick brown fox jumped.", "I forgot my homework."]
- labels = [0, 3]
-
- # Use a shorter sequence length.
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
- "xlm_roberta_base_multi",
- sequence_length=128,
- )
-
- # Create a XLMRobertaClassifier and fit your data.
- classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
- "xlm_roberta_base_multi",
- num_classes=4,
- preprocessor=preprocessor,
- )
+ # Re-compile (e.g., with a new learning rate).
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+ optimizer=keras.optimizers.Adam(5e-5),
+ jit_compile=True,
)
+ # Access backbone programmatically (e.g., to change `trainable`).
+ classifier.backbone.trainable = False
+ # Fit again.
classifier.fit(x=features, y=labels, batch_size=2)
```
- Preprocessed inputs.
+ Preprocessed integer data.
```python
- # Create a dataset with preprocessed features in an `(x, y)` format.
- preprocessed_features = {
+ features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
@@ -146,16 +95,57 @@ class XLMRobertaClassifier(Task):
}
labels = [0, 3]
- # Create a XLMRobertaClassifier and fit your data.
+ # Pretrained classifier without preprocessing.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
preprocessor=None,
)
- classifier.compile(
- loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+ classifier.fit(x=features, y=labels, batch_size=2)
+ ```
+
+ Custom backbone and vocabulary.
+ ```python
+ features = ["The quick brown fox jumped.", "نسيت الواجب"]
+ labels = [0, 3]
+
+ def train_sentencepiece(ds, vocab_size):
+ bytes_io = io.BytesIO()
+ sentencepiece.SentencePieceTrainer.train(
+ sentence_iterator=ds.as_numpy_iterator(),
+ model_writer=bytes_io,
+ vocab_size=vocab_size,
+ model_type="WORD",
+ unk_id=0,
+ bos_id=1,
+ eos_id=2,
+ )
+ return bytes_io.getvalue()
+ ds = tf.data.Dataset.from_tensor_slices(
+ ["the quick brown fox", "the earth is round"]
+ )
+ proto = train_sentencepiece(ds, vocab_size=10)
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer(
+ proto=proto
+ )
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
+ tokenizer,
+ sequence_length=128,
+ )
+ backbone = keras_nlp.models.XLMRobertaBackbone(
+ vocabulary_size=250002,
+ num_layers=4,
+ num_heads=4,
+ hidden_dim=256,
+ intermediate_dim=512,
+ max_sequence_length=128,
)
- classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
+ classifier = keras_nlp.models.XLMRobertaClassifier(
+ backbone=backbone,
+ preprocessor=preprocessor,
+ num_classes=4,
+ )
+ classifier.fit(x=features, y=labels, batch_size=2)
```
"""
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
index fa38f1e380..6ecb5016e7 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_preprocessor.py
@@ -34,35 +34,23 @@
@keras_nlp_export("keras_nlp.models.XLMRobertaPreprocessor")
class XLMRobertaPreprocessor(Preprocessor):
- """XLM-RoBERTa preprocessing layer.
+ """An XLM-RoBERTa preprocessing layer which tokenizes and packs inputs.
This preprocessing layer will do three things:
- - Tokenize any number of input segments using the `tokenizer`.
- - Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
- `"<pad>"` tokens, i.e., adding a single `"<s>"` at the start of the
- entire sequence, `"</s></s>"` at the end of each segment, save the last
- and a `"</s>"` at the end of the entire sequence.
- - Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
- that can be passed directly to a XLM-RoBERTa model.
+ 1. Tokenize any number of input segments using the `tokenizer`.
+ 2. Pack the inputs together using a `keras_nlp.layers.MultiSegmentPacker`
+ with the appropriate `"<s>"`, `"</s>"` and `"<pad>"` tokens, i.e., adding
+ a single `"<s>"` at the start of the entire sequence, `"</s></s>"` at the
+ end of each segment, save the last, and a `"</s>"` at the end of the
+ entire sequence.
+ 3. Construct a dictionary with keys `"token_ids"` and `"padding_mask"`,
+ that can be passed directly to an XLM-RoBERTa model.
This layer can be used directly with `tf.data.Dataset.map` to preprocess
string data in the `(x, y, sample_weight)` format used by
`keras.Model.fit`.
- The call method of this layer accepts three arguments, `x`, `y`, and
- `sample_weight`. `x` can be a python string or tensor representing a single
- segment, a list of python strings representing a batch of single segments,
- or a list of tensors representing multiple segments to be packed together.
- `y` and `sample_weight` are both optional, can have any format, and will be
- passed through unaltered.
-
- Special care should be taken when using `tf.data` to map over an unlabeled
- tuple of string segments. `tf.data.Dataset.map` will unpack this tuple
- directly into the call arguments of this layer, rather than forward all
- argument to `x`. To handle this case, it is recommended to explicitly call
- the layer, e.g. `ds.map(lambda seg1, seg2: preprocessor(x=(seg1, seg2)))`.
-
Args:
tokenizer: A `keras_nlp.tokenizers.XLMRobertaTokenizer` instance.
sequence_length: The length of the packed inputs.
@@ -77,77 +65,84 @@ class XLMRobertaPreprocessor(Preprocessor):
left-to-right manner and fills up the buckets until we run
out of budget. It supports an arbitrary number of segments.
+ Call arguments:
+ x: A tensor of single string sequences, or a tuple of multiple
+ tensor sequences to be packed together. Inputs may be batched or
+ unbatched. For single sequences, raw python inputs will be converted
+ to tensors. For multiple sequences, pass tensors directly.
+ y: Any label data. Will be passed through unaltered.
+ sample_weight: Any label weight data. Will be passed through unaltered.
+
Examples:
```python
- # Load the preprocessor from a preset.
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset("xlm_roberta_base_multi")
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
+ "xlm_roberta_base_multi"
+ )
# Tokenize and pack a single sentence.
- sentence = tf.constant("The quick brown fox jumped.")
- preprocessor(sentence)
- # Same output.
preprocessor("The quick brown fox jumped.")
- # Tokenize and a batch of single sentences.
- sentences = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
+ # Tokenize a batch of single sentences.
+ preprocessor(["The quick brown fox jumped.", "اسمي اسماعيل"])
+
+ # Preprocess a batch of sentence pairs.
+ # When handling multiple sequences, always convert to tensors first!
+ first = tf.constant(["The quick brown fox jumped.", "اسمي اسماعيل"])
+ second = tf.constant(["The fox tripped.", "الأسد ملك الغابة"])
+ preprocessor((first, second))
+
+ # Custom vocabulary.
+ def train_sentencepiece(ds, vocab_size):
+ bytes_io = io.BytesIO()
+ sentencepiece.SentencePieceTrainer.train(
+ sentence_iterator=ds.as_numpy_iterator(),
+ model_writer=bytes_io,
+ vocab_size=vocab_size,
+ model_type="WORD",
+ unk_id=0,
+ bos_id=1,
+ eos_id=2,
+ )
+ return bytes_io.getvalue()
+ ds = tf.data.Dataset.from_tensor_slices(
+ ["the quick brown fox", "the earth is round"]
)
- preprocessor(sentences)
- # Same output.
- preprocessor(
- ["The quick brown fox jumped.", "Call me Ishmael."]
+ proto = train_sentencepiece(ds, vocab_size=10)
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor(tokenizer)
+ preprocessor("The quick brown fox jumped.")
+ ```
+
+ Mapping with `tf.data.Dataset`.
+ ```python
+ preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
+ "xlm_roberta_base_multi"
)
- # Tokenize and pack a sentence pair.
- first_sentence = tf.constant("The quick brown fox jumped.")
- second_sentence = tf.constant("The fox tripped.")
- preprocessor((first_sentence, second_sentence))
+ first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
+ second = tf.constant(["The fox tripped.", "Oh look, a whale."])
+ label = tf.constant([1, 1])
- # Map a dataset to preprocess a single sentence.
- features = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
- )
- labels = tf.constant([0, 1])
- ds = tf.data.Dataset.from_tensor_slices((features, labels))
+ # Map labeled single sentences.
+ ds = tf.data.Dataset.from_tensor_slices((first, label))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
- # Map a dataset to preprocess sentence pairs.
- first_sentences = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
- )
- second_sentences = tf.constant(
- ["The fox tripped.", "Oh look, a whale."]
- )
- labels = tf.constant([1, 1])
- ds = tf.data.Dataset.from_tensor_slices(
- (
- (first_sentences, second_sentences), labels
- )
- )
+ # Map unlabeled single sentences.
+ ds = tf.data.Dataset.from_tensor_slices(first)
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
- # Map a dataset to preprocess unlabeled sentence pairs.
- first_sentences = tf.constant(
- ["The quick brown fox jumped.", "Call me Ishmael."]
- )
- second_sentences = tf.constant(
- ["The fox tripped.", "Oh look, a whale."]
- )
- ds = tf.data.Dataset.from_tensor_slices((first_sentences, second_sentences))
+ # Map labeled sentence pairs.
+ ds = tf.data.Dataset.from_tensor_slices(((first, second), label))
+ ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
+
+ # Map unlabeled sentence pairs.
+ ds = tf.data.Dataset.from_tensor_slices((first, second))
# Watch out for tf.data's default unpacking of tuples here!
# Best to invoke the `preprocessor` directly in this case.
ds = ds.map(
- lambda s1, s2: preprocessor(x=(s1, s2)),
+ lambda first, second: preprocessor(x=(first, second)),
num_parallel_calls=tf.data.AUTOTUNE,
)
-
- # Alternatively, you can create a preprocessor from your own vocabulary.
- # The usage is exactly the same as above.
- tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto="model.spm")
- preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
- tokenizer=tokenizer,
- sequence_length=10,
- )
```
"""
diff --git a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
index e077a7643d..d679ab9ba3 100644
--- a/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
+++ b/keras_nlp/models/xlm_roberta/xlm_roberta_tokenizer.py
@@ -27,19 +27,18 @@
@keras_nlp_export("keras_nlp.models.XLMRobertaTokenizer")
class XLMRobertaTokenizer(SentencePieceTokenizer):
- """XLM-RoBERTa tokenizer layer based on SentencePiece.
+ """An XLM-RoBERTa tokenizer using SentencePiece subword segmentation.
This tokenizer class will tokenize raw strings into integer sequences and
is based on `keras_nlp.tokenizers.SentencePieceTokenizer`. Unlike the
underlying tokenizer, it will check for all special tokens needed by
XLM-RoBERTa models and provides a `from_preset()` method to automatically
- download a matching vocabulary for a XLM-RoBERTa preset.
+ download a matching vocabulary for an XLM-RoBERTa preset.
- The original fairseq implementation of XLM-RoBERTa modifies the indices of
- the SentencePiece tokenizer output. To preserve compatibility, we make the
- same changes, i.e., `"<s>"`, `"<pad>"`, `"</s>"` and `"<unk>"` are mapped to
- 0, 1, 2, 3, respectively, and non-special token indices are shifted right
- by one.
+ Note: The original fairseq implementation of XLM-RoBERTa re-maps some token
+ indices from the underlying SentencePiece output. To preserve compatibility,
+ we do the same re-mapping here, including when you provide your own custom
+ SentencePiece model.
If input is a batch of strings (rank > 0), the layer will output a
`tf.RaggedTensor` where the last dimension of the output is ragged.
@@ -48,14 +47,27 @@ class XLMRobertaTokenizer(SentencePieceTokenizer):
`tf.Tensor` with static shape `[None]`.
Args:
- proto: Either a `string` path to a SentencePiece proto file, or a
+ proto: Either a `string` path to a SentencePiece proto file or a
`bytes` object with a serialized SentencePiece proto. See the
[SentencePiece repository](https://github.com/google/sentencepiece)
for more details on the format.
Examples:
-
```python
+ tokenizer = keras_nlp.models.XLMRobertaTokenizer.from_preset(
+ "xlm_roberta_base_multi",
+ )
+
+ # Unbatched inputs.
+ tokenizer("the quick brown fox")
+
+ # Batched inputs.
+ tokenizer(["the quick brown fox", "الأرض كروية"])
+
+ # Detokenization.
+ tokenizer.detokenize(tokenizer("the quick brown fox"))
+
+ # Custom vocabulary.
def train_sentencepiece(ds, vocab_size):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
@@ -72,18 +84,8 @@ def train_sentencepiece(ds, vocab_size):
ds = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
-
proto = train_sentencepiece(ds, vocab_size=10)
tokenizer = keras_nlp.models.XLMRobertaTokenizer(proto=proto)
-
- # Batched inputs.
- tokenizer(["the quick brown fox", "the earth is round"])
-
- # Unbatched inputs.
- tokenizer("the quick brown fox")
-
- # Detokenization.
- tokenizer.detokenize(tf.constant([[0, 4, 9, 5, 7, 2]]))
```
"""