Merged
4 changes: 4 additions & 0 deletions keras_nlp/models/bart/bart_backbone.py
@@ -77,6 +77,10 @@ class BartBackbone(Backbone):
),
}

# Pretrained BART encoder-decoder model.
model = keras_nlp.models.BartBackbone.from_preset("bart_base_en")
model(input_data)

# Randomly initialized BART encoder-decoder model with a custom config
model = keras_nlp.models.BartBackbone(
vocabulary_size=50265,
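The `input_data` dict referenced in the preset example is defined earlier in the docstring and is cut off in this hunk. A minimal sketch of its expected structure, assuming `BartBackbone`'s standard encoder/decoder input keys (shapes are illustrative placeholders, not taken from the diff):

```python
import numpy as np
import keras_nlp

# Illustrative inputs only: encoder and decoder token ids plus padding masks.
input_data = {
    "encoder_token_ids": np.ones(shape=(1, 12), dtype="int32"),
    "encoder_padding_mask": np.ones(shape=(1, 12), dtype="int32"),
    "decoder_token_ids": np.ones(shape=(1, 8), dtype="int32"),
    "decoder_padding_mask": np.ones(shape=(1, 8), dtype="int32"),
}

model = keras_nlp.models.BartBackbone.from_preset("bart_base_en")
# The backbone returns per-token hidden states for both the encoder and decoder.
outputs = model(input_data)
```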
2 changes: 1 addition & 1 deletion keras_nlp/models/bart/bart_preprocessor.py
@@ -67,7 +67,7 @@ class BartPreprocessor(Preprocessor):

Examples:

Directly calling the layer on data
Directly calling the layer on data.
```python
preprocessor = keras_nlp.models.BartPreprocessor.from_preset("bart_base_en")

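The example above is truncated before the preprocessor is actually called. A hedged sketch of that usage, assuming `BartPreprocessor` takes a dict of raw strings keyed by `"encoder_text"` and `"decoder_text"`:

```python
import keras_nlp

preprocessor = keras_nlp.models.BartPreprocessor.from_preset("bart_base_en")

# Sketch only: the preprocessor consumes raw strings for the encoder and
# decoder and returns packed token ids and padding masks for both.
x = preprocessor(
    {
        "encoder_text": "The fox was sleeping.",
        "decoder_text": "The fox slept.",
    }
)
```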
56 changes: 24 additions & 32 deletions keras_nlp/models/bart/bart_tokenizer.py
@@ -52,38 +52,30 @@ class BartTokenizer(BytePairTokenizer):

Examples:

Batched inputs.
>>> vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<mask>": 3}
>>> vocab = {**vocab, "a": 4, "Ġquick": 5, "Ġfox": 6}
>>> merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
>>> merges += ["Ġ f", "o x", "Ġf ox"]
>>> tokenizer = keras_nlp.models.RobertaTokenizer(
... vocabulary=vocab, merges=merges
... )
>>> tokenizer(["a quick fox", "a fox quick"])
<tf.RaggedTensor [[4, 5, 6], [4, 6, 5]]>

Unbatched input.
>>> vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<mask>": 3}
>>> vocab = {**vocab, "a": 4, "Ġquick": 5, "Ġfox": 6}
>>> merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
>>> merges += ["Ġ f", "o x", "Ġf ox"]
>>> tokenizer = keras_nlp.models.RobertaTokenizer(
... vocabulary=vocab, merges=merges
... )
>>> tokenizer("a quick fox")
<tf.Tensor: shape=(3,), dtype=int32, numpy=array([4, 5, 6], dtype=int32)>

Detokenization.
>>> vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<mask>": 3}
>>> vocab = {**vocab, "a": 4, "Ġquick": 5, "Ġfox": 6}
>>> merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
>>> merges += ["Ġ f", "o x", "Ġf ox"]
>>> tokenizer = keras_nlp.models.RobertaTokenizer(
... vocabulary=vocab, merges=merges
... )
>>> tokenizer.detokenize(tokenizer("a quick fox")).numpy().decode('utf-8')
'a quick fox'
```python
# Unbatched input.
tokenizer = keras_nlp.models.BartTokenizer.from_preset(
"bart_base_en",
)
tokenizer("The quick brown fox jumped.")

# Batched input.
tokenizer(["The quick brown fox jumped.", "The fox slept."])

# Detokenization.
tokenizer.detokenize(tokenizer("The quick brown fox jumped."))

# Custom vocabulary.
vocab = {"<s>": 0, "<pad>": 1, "</s>": 2, "<mask>": 3}
vocab = {**vocab, "a": 4, "Ġquick": 5, "Ġfox": 6}
merges = ["Ġ q", "u i", "c k", "ui ck", "Ġq uick"]
merges += ["Ġ f", "o x", "Ġf ox"]
tokenizer = keras_nlp.models.BartTokenizer(
vocabulary=vocab,
merges=merges,
)
tokenizer("The quick brown fox jumped.")
```
"""

def __init__(