17 changes: 9 additions & 8 deletions keras_nlp/models/albert/albert_backbone.py
@@ -33,11 +33,12 @@ def albert_kernel_initializer(stddev=0.02):

@keras_nlp_export("keras_nlp.models.AlbertBackbone")
class AlbertBackbone(Backbone):
"""ALBERT encoder network.
"""An ALBERT encoder network.

This class implements a bi-directional Transformer-based encoder as
described in
["ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"](https://arxiv.org/abs/1909.11942).
["ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"]
Contributor: Let's remove the linebreak here; it's okay to exceed the line-length limit if it's a hyperlink.

(https://arxiv.org/abs/1909.11942).
ALBERT is a more efficient variant of BERT, and uses parameter reduction
techniques such as cross-layer parameter sharing and factorized embedding
parameterization. This model class includes the embedding lookups and
@@ -46,7 +47,7 @@ class AlbertBackbone(Backbone):

The default constructor gives a fully customizable, randomly initialized
ALBERT encoder with any number of layers, heads, and embedding dimensions.
To load preset architectures and weights, use the `from_preset` constructor.
To load preset architectures and weights, use the `from_preset()` constructor.

Disclaimer: Pre-trained models are provided on an "as is" basis, without
warranties or conditions of any kind.
@@ -90,14 +91,14 @@ class AlbertBackbone(Backbone):
# Randomly initialized ALBERT encoder
model = keras_nlp.models.AlbertBackbone(
vocabulary_size=30000,
num_layers=12,
num_heads=12,
num_layers=4,
num_heads=4,
num_groups=1,
num_inner_repetitions=1,
embedding_dim=128,
hidden_dim=768,
intermediate_dim=3072,
max_sequence_length=12,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
output = model(input_data)
```
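As an aside, the `from_preset()` constructor mentioned in this docstring is not exercised in the excerpt above. A minimal sketch, assuming the `albert_base_en_uncased` preset name used in the classifier examples later in this PR (this snippet is not part of the diff):

```python
import keras_nlp

# Load a preset ALBERT architecture and weights (preset name assumed from the
# classifier examples later in this PR).
model = keras_nlp.models.AlbertBackbone.from_preset("albert_base_en_uncased")
```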
94 changes: 42 additions & 52 deletions keras_nlp/models/albert/albert_classifier.py
@@ -31,10 +31,10 @@
class AlbertClassifier(Task):
"""An end-to-end ALBERT model for classification tasks

This model attaches a classification head to a `keras_nlp.model.AlbertBackbone`
backbone, mapping from the backbone outputs to logit output suitable for
a classification task. For usage of this model with pre-trained weights, see
the `from_preset()` method.
This model attaches a classification head to a
`keras_nlp.model.AlbertBackbone` instance, mapping from the backbone outputs to logit output suitable for
a classification task. For usage of this model with pre-trained weights, use
the `from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case it will automatically apply preprocessing to raw inputs during
@@ -55,49 +55,8 @@ class AlbertClassifier(Task):

Examples:

Example usage.
Raw string data.
```python
# Define the preprocessed inputs.
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"segment_ids": tf.constant(
[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
}
labels = [0, 3]

# Randomly initialize a ALBERT backbone.
backbone = AlbertBackbone(
vocabulary_size=1000,
num_layers=2,
num_heads=2,
embedding_dim=8,
hidden_dim=64,
intermediate_dim=128,
max_sequence_length=128,
name="encoder",
)

# Create a ALBERT classifier and fit your data.
classifier = keras_nlp.models.AlbertClassifier(
backbone,
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programatically (e.g., to change `trainable`)
classifier.backbone.trainable = False

Raw string inputs with customized preprocessing.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

@@ -107,19 +66,25 @@ class AlbertClassifier(Task):
sequence_length=128,
)

# Create a AlbertClassifier and fit your data.
# Pretrained classifier.
classifier = keras_nlp.models.AlbertClassifier.from_preset(
"albert_base_en_uncased",
num_classes=4,
preprocessor=preprocessor,
)
classifier.fit(x=features, y=labels, batch_size=2)
classifier.predict(x=features, batch_size=2)

# Re-compile (e.g., with a new learning rate).
Member: I think you are missing the step from the BERT classifier example where we run `fit` before re-compiling.

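# Reviewer's suggestion, sketched here as an assumption (mirroring the BERT
# classifier example): run `fit` once with the default compilation before
# re-compiling. Not part of this PR's diff.
classifier.fit(x=features, y=labels, batch_size=2)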
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
classifier.fit(x=features, y=labels, batch_size=2)
```

Preprocessed inputs.
Preprocessed integer data.
```python
# Create a dataset with preprocessed features in an `(x, y)` format.
preprocessed_features = {
@@ -133,17 +98,42 @@ class AlbertClassifier(Task):
}
labels = [0, 3]

# Create a ALBERT classifier and fit your data.
# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.AlbertClassifier.from_preset(
"albert_base_en_uncased",
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
```

Custom backbone and vocabulary.
```python
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]
vocab = ["[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]"]
vocab += ["The", "quick", "brown", "fox", "jumped", "."]
tokenizer = keras_nlp.models.AlbertTokenizer(
vocabulary=vocab,
)
preprocessor = keras_nlp.models.AlbertPreprocessor(
tokenizer=tokenizer,
sequence_length=128,
)
backbone = keras_nlp.models.AlbertBackbone(
vocabulary_size=30552,
num_layers=4,
num_heads=4,
hidden_dim=256,
intermediate_dim=512,
max_sequence_length=128,
)
classifier = keras_nlp.models.AlbertClassifier(
backbone=backbone,
preprocessor=preprocessor,
num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
```
"""

def __init__(
42 changes: 16 additions & 26 deletions keras_nlp/models/albert/albert_masked_lm.py
@@ -38,7 +38,7 @@ class AlbertMaskedLM(Task):
This model will train ALBERT on a masked language modeling task.
The model will predict labels for a number of masked tokens in the
input data. For usage of this model with pre-trained weights, see the
`from_preset()` method.
`from_preset()` constructor.

This model can optionally be configured with a `preprocessor` layer, in
which case inputs can be raw string features during `fit()`, `predict()`,
@@ -57,26 +57,31 @@ class AlbertMaskedLM(Task):

Example usage:

Raw string inputs and pretrained backbone.
Raw string data.
```python
# Create a dataset with raw string features. Labels are inferred.
features = ["The quick brown fox jumped.", "I forgot my homework."]

# Create a AlbertMaskedLM with a pretrained backbone and further train
# on an MLM task.
# Pretrained language model.
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset(
"albert_base_en_uncased",
)
masked_lm.fit(x=features, batch_size=2)
# Re-compile (e.g., with a new learning rate).
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
masked_lm.backbone.trainable = False
# Fit again.
masked_lm.fit(x=features, batch_size=2)
```

Preprocessed inputs and custom backbone.
Preprocessed integer data.
```python
# Create a preprocessed dataset where 0 is the mask token.
preprocessed_features = {
# Create a preprocessed batch where 0 is the mask token.
features = {
"segment_ids": tf.constant(
[[1, 0, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
),
@@ -91,24 +96,9 @@
# Labels are the original masked values.
labels = [[3, 5]] * 2

# Randomly initialize a ALBERT encoder
backbone = keras_nlp.models.AlbertBackbone(
vocabulary_size=1000,
num_layers=2,
num_heads=2,
embedding_dim=64,
hidden_dim=64,
intermediate_dim=128,
max_sequence_length=128)

# Create a ALBERT masked LM and fit the data.
masked_lm = keras_nlp.models.AlbertMaskedLM(
backbone,
preprocessor=None,
)
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
jit_compile=True
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset(
"albert_base_en_uncased",
preprocessor=None,
)
masked_lm.fit(x=preprocessed_features, y=labels, batch_size=2)
```
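For context on the masked LM docstring above, a minimal inference sketch (not part of this diff; the preset name is taken from the examples above, and calling `predict()` on raw strings assumes the default preprocessor is attached):

```python
import keras_nlp

# Pretrained masked LM with its default preprocessor attached.
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset("albert_base_en_uncased")

# Raw strings are preprocessed (and randomly masked) automatically; the output
# is a batch of per-mask-position logits over the vocabulary.
preds = masked_lm.predict(["The quick brown fox jumped.", "I forgot my homework."])
```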
82 changes: 37 additions & 45 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor.py
@@ -31,14 +31,14 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
`keras_nlp.models.AlbertMaskedLM` task model. Preprocessing will occur in
multiple steps.

- Tokenize any number of input segments using the `tokenizer`.
- Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
1. Tokenize any number of input segments using the `tokenizer`.
2. Pack the inputs together with the appropriate `"<s>"`, `"</s>"` and
`"<pad>"` tokens, i.e., adding a single `"<s>"` at the start of the
entire sequence, `"</s></s>"` between each segment,
and a `"</s>"` at the end of the entire sequence.
- Randomly select non-special tokens to mask, controlled by
3. Randomly select non-special tokens to mask, controlled by
`mask_selection_rate`.
- Construct a `(x, y, sample_weight)` tuple suitable for training with a
4. Construct a `(x, y, sample_weight)` tuple suitable for training with a
`keras_nlp.models.AlbertMaskedLM` task model.

Args:
@@ -68,6 +68,15 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
left-to-right manner and fills up the buckets until we run
out of budget. It supports an arbitrary number of segments.

Call arguments:
Member: Remove the indent; `Call arguments:` should align with `Args:`.
Contributor: This comment has not been addressed, please fix it, thanks!

x: A tensor of single string sequences, or a tuple of multiple
tensor sequences to be packed together. Inputs may be batched or
unbatched. For single sequences, raw python inputs will be converted
to tensors. For multiple sequences, pass tensors directly.
y: Label data. Should always be `None` as the layer generates labels.
sample_weight: Label weights. Should always be `None` as the layer
generates label weights.

Examples:
```python
# Load the preprocessor from a preset.
@@ -76,54 +85,37 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
)

# Tokenize and mask a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
preprocessor(sentence)
preprocessor("The quick brown fox jumped.")

# Tokenize and mask a batch of sentences.
sentences = tf.constant(
["The quick brown fox jumped.", "Call me Ishmael."]
)
preprocessor(sentences)
preprocessor("The quick brown fox jumped.", "Call me Ishmael.")
Member: The inner args should be in a list.
Contributor: This one too: `preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])`.

# Tokenize and mask a dataset of sentences.
features = tf.constant(
["The quick brown fox jumped.", "Call me Ishmael."]
)
ds = tf.data.Dataset.from_tensor_slices((features))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

# Alternatively, you can create a preprocessor from your own vocabulary.
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
# Tokenize and mask sentence pairs.
# In this case, always convert input to tensors before calling the layer.
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
preprocessor((first, second))
```

# Creating sentencepiece tokenizer for ALBERT LM preprocessor
bytes_io = io.BytesIO()

sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]"
Mapping with `tf.data.Dataset`.
```python
preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor.from_preset(
"albert_base_en_uncased"
)

proto = bytes_io.getvalue()

tokenizer = keras_nlp.models.AlbertTokenizer(proto=proto)

preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor(
tokenizer=tokenizer
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
# Map single sentences.
Member: Take a look at the source example; you removed all the empty newlines, please add them back.

ds = tf.data.Dataset.from_tensor_slices(first)
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
# Map sentence pairs.
ds = tf.data.Dataset.from_tensor_slices((first, second))
# Watch out for tf.data's default unpacking of tuples here!
# Best to invoke the `preprocessor` directly in this case.
ds = ds.map(
lambda first, second: preprocessor(x=(first, second)),
num_parallel_calls=tf.data.AUTOTUNE,
)

```
"""
