117 changes: 60 additions & 57 deletions keras_nlp/models/albert/albert_classifier.py
@@ -58,74 +58,34 @@ class AlbertClassifier(Task):

Examples:

Example usage.
Raw string data.
```python
# Define the preprocessed inputs.
preprocessed_features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"segment_ids": tf.constant(
[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
),
}
labels = [0, 3]

# Randomly initialize an ALBERT backbone.
backbone = AlbertBackbone(
vocabulary_size=1000,
num_layers=2,
num_heads=2,
embedding_dim=8,
hidden_dim=64,
intermediate_dim=128,
max_sequence_length=128,
name="encoder",
)

# Create an ALBERT classifier and fit your data.
classifier = keras_nlp.models.AlbertClassifier(
backbone,
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)

# Access backbone programmatically (e.g., to change `trainable`)
classifier.backbone.trainable = False

Raw string inputs with customized preprocessing.
```python
# Create a dataset with raw string features in an `(x, y)` format.
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

# Use a shorter sequence length.
preprocessor = keras_nlp.models.AlbertPreprocessor.from_preset(
"albert_base_en_uncased",
sequence_length=128,
)

# Create an AlbertClassifier and fit your data.
# Pretrained classifier.
classifier = keras_nlp.models.AlbertClassifier.from_preset(
"albert_base_en_uncased",
num_classes=4,
preprocessor=preprocessor,
)
classifier.fit(x=features, y=labels, batch_size=2)
classifier.predict(x=features, batch_size=2)

# Re-compile (e.g., with a new learning rate).
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
classifier.backbone.trainable = False
# Fit again.
classifier.fit(x=features, y=labels, batch_size=2)
```

Preprocessed inputs.
Preprocessed integer data.
```python
# Create a dataset with preprocessed features in an `(x, y)` format.
preprocessed_features = {
features = {
"token_ids": tf.ones(shape=(2, 12), dtype=tf.int64),
"segment_ids": tf.constant(
[[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0]] * 2, shape=(2, 12)
@@ -136,16 +96,59 @@ class AlbertClassifier(Task):
}
labels = [0, 3]

# Create an ALBERT classifier and fit your data.
# Pretrained classifier without preprocessing.
classifier = keras_nlp.models.AlbertClassifier.from_preset(
"albert_base_en_uncased",
num_classes=4,
preprocessor=None,
)
classifier.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
classifier.fit(x=features, y=labels, batch_size=2)
```

Custom backbone and vocabulary.
```python
features = ["The quick brown fox jumped.", "I forgot my homework."]
labels = [0, 3]

bytes_io = io.BytesIO()
ds = tf.data.Dataset.from_tensor_slices(features)
sentencepiece.SentencePieceTrainer.train(
sentence_iterator=ds.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=10,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]",
)
tokenizer = keras_nlp.models.AlbertTokenizer(
proto=bytes_io.getvalue(),
)
classifier.fit(x=preprocessed_features, y=labels, batch_size=2)
preprocessor = keras_nlp.models.AlbertPreprocessor(
tokenizer=tokenizer,
sequence_length=128,
)
backbone = keras_nlp.models.AlbertBackbone(
vocabulary_size=tokenizer.vocabulary_size(),
num_layers=4,
num_heads=4,
hidden_dim=256,
embedding_dim=128,
intermediate_dim=512,
max_sequence_length=128,
)
classifier = keras_nlp.models.AlbertClassifier(
backbone=backbone,
preprocessor=preprocessor,
num_classes=4,
)
classifier.fit(x=features, y=labels, batch_size=2)
```
"""

47 changes: 18 additions & 29 deletions keras_nlp/models/albert/albert_masked_lm.py
@@ -57,60 +57,49 @@ class AlbertMaskedLM(Task):

Example usage:

Raw string inputs and pretrained backbone.
Raw string data.
```python
# Create a dataset with raw string features. Labels are inferred.
features = ["The quick brown fox jumped.", "I forgot my homework."]

# Create an AlbertMaskedLM with a pretrained backbone and further train
# on an MLM task.
# Pretrained language model.
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset(
"albert_base_en_uncased",
)
masked_lm.fit(x=features, batch_size=2)

# Re-compile (e.g., with a new learning rate).
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
optimizer=keras.optimizers.Adam(5e-5),
jit_compile=True,
)
# Access backbone programmatically (e.g., to change `trainable`).
masked_lm.backbone.trainable = False
# Fit again.
masked_lm.fit(x=features, batch_size=2)
```

Preprocessed inputs and custom backbone.
Preprocessed integer data.
```python
# Create a preprocessed dataset where 0 is the mask token.
preprocessed_features = {
"segment_ids": tf.constant(
[[1, 0, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
),
# Create preprocessed batch where 0 is the mask token.
features = {
"token_ids": tf.constant(
[[1, 2, 0, 4, 0, 6, 7, 8]] * 2, shape=(2, 8)
),
"padding_mask": tf.constant(
[[1, 1, 1, 1, 1, 1, 1, 1]] * 2, shape=(2, 8)
),
"mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2))
"mask_positions": tf.constant([[2, 4]] * 2, shape=(2, 2)),
"segment_ids": tf.constant([[0, 0, 0, 0, 0, 0, 0, 0]] * 2, shape=(2, 8))
}
# Labels are the original masked values.
labels = [[3, 5]] * 2

# Randomly initialize an ALBERT encoder
backbone = keras_nlp.models.AlbertBackbone(
vocabulary_size=1000,
num_layers=2,
num_heads=2,
embedding_dim=64,
hidden_dim=64,
intermediate_dim=128,
max_sequence_length=128)

# Create an ALBERT masked LM and fit the data.
masked_lm = keras_nlp.models.AlbertMaskedLM(
backbone,
masked_lm = keras_nlp.models.AlbertMaskedLM.from_preset(
"albert_base_en_uncased",
preprocessor=None,
)
masked_lm.compile(
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
jit_compile=True
)
masked_lm.fit(x=preprocessed_features, y=labels, batch_size=2)
masked_lm.fit(x=features, y=labels, batch_size=2)
```
"""

68 changes: 27 additions & 41 deletions keras_nlp/models/albert/albert_masked_lm_preprocessor.py
@@ -69,61 +69,47 @@ class AlbertMaskedLMPreprocessor(AlbertPreprocessor):
out of budget. It supports an arbitrary number of segments.

Examples:

Directly calling the layer on data.
```python
# Load the preprocessor from a preset.
preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor.from_preset(
"albert_base_en_uncased"
)

# Tokenize and mask a single sentence.
sentence = tf.constant("The quick brown fox jumped.")
preprocessor(sentence)

# Tokenize and mask a batch of sentences.
sentences = tf.constant(
["The quick brown fox jumped.", "Call me Ishmael."]
)
preprocessor(sentences)
preprocessor("The quick brown fox jumped.")

# Tokenize and mask a dataset of sentences.
features = tf.constant(
["The quick brown fox jumped.", "Call me Ishmael."]
)
ds = tf.data.Dataset.from_tensor_slices((features))
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
# Tokenize and mask a batch of single sentences.
preprocessor(["The quick brown fox jumped.", "Call me Ishmael."])

# Alternatively, you can create a preprocessor from your own vocabulary.
vocab_data = tf.data.Dataset.from_tensor_slices(
["the quick brown fox", "the earth is round"]
)
# Tokenize and mask sentence pairs.
# In this case, always convert input to tensors before calling the layer.
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])
preprocessor((first, second))
```

# Create a SentencePiece tokenizer for the ALBERT LM preprocessor.
bytes_io = io.BytesIO()

sentencepiece.SentencePieceTrainer.train(
sentence_iterator=vocab_data.as_numpy_iterator(),
model_writer=bytes_io,
vocab_size=12,
model_type="WORD",
pad_id=0,
unk_id=1,
bos_id=2,
eos_id=3,
pad_piece="<pad>",
unk_piece="<unk>",
bos_piece="[CLS]",
eos_piece="[SEP]",
user_defined_symbols="[MASK]"
Mapping with `tf.data.Dataset`.
```python
preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor.from_preset(
"albert_base_en_uncased"
)

proto = bytes_io.getvalue()
first = tf.constant(["The quick brown fox jumped.", "Call me Ishmael."])
second = tf.constant(["The fox tripped.", "Oh look, a whale."])

tokenizer = keras_nlp.models.AlbertTokenizer(proto=proto)
# Map single sentences.
ds = tf.data.Dataset.from_tensor_slices(first)
ds = ds.map(preprocessor, num_parallel_calls=tf.data.AUTOTUNE)

preprocessor = keras_nlp.models.AlbertMaskedLMPreprocessor(
tokenizer=tokenizer
# Map sentence pairs.
ds = tf.data.Dataset.from_tensor_slices((first, second))
# Watch out for tf.data's default unpacking of tuples here!
# Best to invoke the `preprocessor` directly in this case.
ds = ds.map(
lambda first, second: preprocessor(x=(first, second)),
num_parallel_calls=tf.data.AUTOTUNE,
)

```
"""
