38 changes: 18 additions & 20 deletions keras_nlp/models/xlm_roberta/xlm_roberta_backbone.py
@@ -24,17 +24,16 @@

@keras_nlp_export("keras_nlp.models.XLMRobertaBackbone")
class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
"""XLM-RoBERTa encoder.
"""An XLM-RoBERTa encoder network.

This network implements a bi-directional Transformer-based encoder as
described in
["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
It includes the embedding lookups and transformer layers, but does not
This class implements a bi-directional Transformer-based encoder as
described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116).
It includes the embedding lookups and transformer layers, but it does not
include the masked language modeling head used during pretraining.

The default constructor gives a fully customizable, randomly initialized
RoBERTa encoder with any number of layers, heads, and embedding
dimensions. To load preset architectures and weights, use the `from_preset`
RoBERTa encoder with any number of layers, heads, and embedding dimensions.
To load preset architectures and weights, use the `from_preset()`
constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
@@ -53,34 +52,33 @@ class XLMRobertaBackbone(roberta_backbone.RobertaBackbone):
dropout: float. Dropout probability for the Transformer encoder.
max_sequence_length: int. The maximum sequence length this encoder can
consume. The sequence length of the input must be less than
`max_sequence_length`.
`max_sequence_length`. This determines the variable
shape for positional embeddings.

Example usage:
Examples:
```python
input_data = {
"token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)),
}

# Pretrained XLM-R encoder
# Pretrained XLM-R encoder.
model = keras_nlp.models.XLMRobertaBackbone.from_preset(
"xlm_roberta_base_multi",
)
output = model(input_data)
model(input_data)

# Randomly initialized XLM-R model with custom config
# Randomly initialized XLM-R model with custom config.
model = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128
)

# Call the model on the input data.
output = model(input_data)
model(input_data)
```
"""

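For readers who want to try the documented backbone API end to end, the snippets in the updated docstring can be combined roughly as follows. This is a minimal sketch assembled from the docstring above, assuming a `keras_nlp` release that ships the `xlm_roberta_base_multi` preset; it is illustrative and not part of the diff.

```python
import tensorflow as tf
import keras_nlp

# Token ids plus a padding mask, shaped as the docstring describes.
input_data = {
    "token_ids": tf.ones(shape=(1, 12), dtype=tf.int64),
    "padding_mask": tf.constant(
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0], shape=(1, 12)
    ),
}

# Load the pretrained multilingual encoder and run a forward pass.
backbone = keras_nlp.models.XLMRobertaBackbone.from_preset(
    "xlm_roberta_base_multi",
)
sequence_output = backbone(input_data)  # per-token hidden states
```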
132 changes: 61 additions & 71 deletions keras_nlp/models/xlm_roberta/xlm_roberta_classifier.py
@@ -34,9 +34,9 @@ class XLMRobertaClassifier(Task):
"""An end-to-end XLM-RoBERTa model for classification tasks.

This model attaches a classification head to a
`keras_nlp.model.XLMRobertaBackbone` model, mapping from the backbone
outputs to logit output suitable for a classification task. For usage of
this model with pre-trained weights, see the `from_preset()` method.
`keras_nlp.models.XLMRobertaBackbone` instance, mapping from the backbone
outputs to logits suitable for a classification task. For usage of
this model with pre-trained weights, see the `from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -49,7 +49,7 @@ class XLMRobertaClassifier(Task):
[here](https://github.com/facebookresearch/fairseq).

Args:
backbone: A `keras_nlp.models.XLMRoberta` instance.
backbone: A `keras_nlp.models.XLMRobertaBackbone` instance.
num_classes: int. Number of classes to predict.
hidden_dim: int. The size of the pooler layer.
dropout: float. The dropout probability value, applied to the pooled
@@ -60,102 +60,92 @@

Examples:

Example usage.
Raw string data.
```python
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)),
}
labels = [0, 3]

# Randomly initialized XLM-RoBERTa encoder
backbone = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
num_layers=12,
num_heads=12,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12
)

# Create a XLM-RoBERTa classifier and fit your data.
classifier = keras_nlp.models.XLMRobertaClassifier(
backbone,
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programatically (e.g., to change `trainable`)
classifier.backbone.trainable = False
```

Raw string inputs.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

# Create a XLMRobertaClassifier and fit your data.
# Pretrained classifier.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=features, y=labels, batch_size=2)
```
classifier.predict(x=features, batch_size=2)

Raw string inputs with customized preprocessing.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

# Use a shorter sequence length.
preprocessor = keras_nlp.models.XLMRobertaPreprocessor.from_preset(
"xlm_roberta_base_multi",
sequence_length=128,
)

# Create a XLMRobertaClassifier and fit your data.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
preprocessor=preprocessor,
)
# Re-compile (e.g., with a new learning rate).
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = False
# Fit again.
classifier.fit(x=features, y=labels, batch_size=2)
```

Preprocessed inputs.
Preprocessed integer data.
```python
# Create a dataset with preprocessed features in an `(x, y)` format.
preprocessed_features = {
features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
}
labels = [0, 3]

# Create a XLMRobertaClassifier and fit your data.
# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
"xlm_roberta_base_multi",
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
classifier.fit(x=features, y=labels, batch_size=2)
```

Custom backbone and vocabulary.
```python
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

def train_sentencepiece(ds, vocab_size):
bytes_io = io.BytesIO()
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=ds.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=vocab_size,
model_type="WORD",
unk_id=0,
bos_id=1,
eos_id=2,
)
return bytes_io.getvalue()
ds = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
proto = train_sentencepiece(ds, vocab_size=10)
tokenizer = keras_nlp.models.XLMRobertaTokenizer(
proto=proto
)
preprocessor = keras_nlp.models.XLMRobertaPreprocessor(
tokenizer,
sequence_length=128,
)
backbone = keras_nlp.models.XLMRobertaBackbone(
vocabulary_size=250002,
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
classifier = keras_nlp.models.XLMRobertaClassifier(
backbone=backbone,
preprocessor=preprocessor,
num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
```
"""

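Putting the pieces of the updated classifier docstring together, a typical fine-tuning flow looks roughly like the sketch below. It is a minimal example assembled from the docstring snippets above (pretrained preset, raw string inputs, frozen backbone), not code from this PR.

```python
import keras_nlp

# Raw string features and integer labels, as in the docstring examples.
features = ["The quick brown fox jumped.", "نسيت الواجب"]
labels = [0, 3]

# Pretrained classifier; preprocessing is applied automatically to raw
# strings during fit(), predict(), and evaluate().
classifier = keras_nlp.models.XLMRobertaClassifier.from_preset(
    "xlm_roberta_base_multi",
    num_classes=4,
)

# Optionally freeze the encoder so only the classification head is trained.
classifier.backbone.trainable = False

classifier.fit(x=features, y=labels, batch_size=2)
predictions = classifier.predict(x=features, batch_size=2)
```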