Add BlenderbotTokenizerFast #13720

Merged · merged 19 commits on Oct 29, 2021
Changes from all commits
2 changes: 1 addition & 1 deletion docs/source/index.rst
@@ -379,7 +379,7 @@ Flax), PyTorch, and/or TensorFlow.
(Support-matrix table: this change flips the Blenderbot row's "Tokenizer fast" entry from ❌ to ✅; the surrounding BigBirdPegasus and BlenderbotSmall rows are unchanged.)
7 changes: 7 additions & 0 deletions docs/source/model_doc/blenderbot.rst
@@ -81,6 +81,13 @@ BlenderbotTokenizer
    :members: build_inputs_with_special_tokens


BlenderbotTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BlenderbotTokenizerFast
    :members: build_inputs_with_special_tokens


BlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

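For readers of the new docs section, a minimal usage sketch (not part of this diff; it assumes the `facebook/blenderbot-3B` checkpoint and an installed `tokenizers` backend):

```python
from transformers import BlenderbotTokenizerFast

# Load the fast tokenizer documented above.
tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

# Blenderbot only appends the EOS token (</s>); no BOS token is prepended.
ids = tokenizer(" I am a small frog.").input_ids
assert ids[-1] == tokenizer.eos_token_id

# Expected to round-trip to " I am a small frog." per the tests added later in this PR.
print(tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
```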
2 changes: 2 additions & 0 deletions src/transformers/__init__.py
@@ -398,6 +398,7 @@
_import_structure["models.barthez"].append("BarthezTokenizerFast")
_import_structure["models.bert"].append("BertTokenizerFast")
_import_structure["models.big_bird"].append("BigBirdTokenizerFast")
_import_structure["models.blenderbot"].append("BlenderbotTokenizerFast")
_import_structure["models.camembert"].append("CamembertTokenizerFast")
_import_structure["models.deberta"].append("DebertaTokenizerFast")
_import_structure["models.distilbert"].append("DistilBertTokenizerFast")
@@ -2283,6 +2284,7 @@
from .models.barthez import BarthezTokenizerFast
from .models.bert import BertTokenizerFast
from .models.big_bird import BigBirdTokenizerFast
from .models.blenderbot import BlenderbotTokenizerFast
from .models.blenderbot_small import BlenderbotSmallTokenizerFast
from .models.camembert import CamembertTokenizerFast
from .models.clip import CLIPTokenizerFast
30 changes: 30 additions & 0 deletions src/transformers/convert_slow_tokenizer.py
@@ -893,12 +893,42 @@ def converted(self) -> Tokenizer:
return tokenizer


class BlenderbotConverter(Converter):
    def converted(self) -> Tokenizer:
        ot = self.original_tokenizer
        vocab = ot.encoder
        merges = list(ot.bpe_ranks.keys())

        tokenizer = Tokenizer(
            BPE(
                vocab=vocab,
                merges=merges,
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="",
                fuse_unk=False,
            )
        )

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=ot.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"$A:0 {ot.eos_token}:0",
            special_tokens=[
                (ot.eos_token, ot.eos_token_id),
            ],
        )

        return tokenizer


SLOW_TO_FAST_CONVERTERS = {
    "AlbertTokenizer": AlbertConverter,
    "BartTokenizer": RobertaConverter,
    "BarthezTokenizer": BarthezConverter,
    "BertTokenizer": BertConverter,
    "BigBirdTokenizer": BigBirdConverter,
    "BlenderbotTokenizer": BlenderbotConverter,
    "CamembertTokenizer": CamembertConverter,
    "CLIPTokenizer": CLIPConverter,
    "ConvBertTokenizer": BertConverter,
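As a quick illustration of what the new converter produces, a hedged sketch (not part of the diff) using the public `convert_slow_tokenizer` helper; the checkpoint name is the one used elsewhere in this PR:

```python
from transformers import BlenderbotTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

# Build the Rust backend from the slow tokenizer via the converter registered above.
slow = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
backend = convert_slow_tokenizer(slow)  # a `tokenizers.Tokenizer` instance

# The TemplateProcessing post-processor appends the EOS token to single sequences.
encoding = backend.encode("Sam")
print(encoding.tokens)  # expected to end with the "</s>" token
```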
2 changes: 1 addition & 1 deletion src/transformers/models/auto/tokenization_auto.py
@@ -108,7 +108,7 @@
        ),
        ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)),
        ("blenderbot-small", ("BlenderbotSmallTokenizer", None)),
-        ("blenderbot", ("BlenderbotTokenizer", None)),
+        ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")),
        ("bart", ("BartTokenizer", "BartTokenizerFast")),
        ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
        ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)),
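With the mapping updated, `AutoTokenizer` should resolve to the fast class by default when the `tokenizers` backend is installed; a small sketch (not part of the diff):

```python
from transformers import AutoTokenizer, BlenderbotTokenizerFast

# use_fast defaults to True, so the fast tokenizer is picked when available.
tok = AutoTokenizer.from_pretrained("facebook/blenderbot-3B")
assert isinstance(tok, BlenderbotTokenizerFast)

# The slow tokenizer can still be requested explicitly.
slow_tok = AutoTokenizer.from_pretrained("facebook/blenderbot-3B", use_fast=False)
```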
8 changes: 7 additions & 1 deletion src/transformers/models/blenderbot/__init__.py
@@ -18,14 +18,17 @@

from typing import TYPE_CHECKING

-from ...file_utils import _LazyModule, is_tf_available, is_torch_available
+from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available


_import_structure = {
    "configuration_blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig"],
    "tokenization_blenderbot": ["BlenderbotTokenizer"],
}

if is_tokenizers_available():
    _import_structure["tokenization_blenderbot_fast"] = ["BlenderbotTokenizerFast"]

if is_torch_available():
    _import_structure["modeling_blenderbot"] = [
        "BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -48,6 +51,9 @@
    from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
    from .tokenization_blenderbot import BlenderbotTokenizer

    if is_tokenizers_available():
        from .tokenization_blenderbot_fast import BlenderbotTokenizerFast

    if is_torch_available():
        from .modeling_blenderbot import (
            BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,
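A minimal check of the lazy-import wiring above (a sketch, not part of the diff):

```python
import transformers

# With `tokenizers` installed, the new class is reachable from both the submodule
# and the top-level package thanks to the lazy _import_structure entry.
from transformers.models.blenderbot import BlenderbotTokenizerFast

assert transformers.BlenderbotTokenizerFast is BlenderbotTokenizerFast
```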
4 changes: 2 additions & 2 deletions src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -14,7 +14,7 @@
# limitations under the License.
"""Tokenization class for Blenderbot."""

-from typing import TYPE_CHECKING, List
+from typing import TYPE_CHECKING, List, Optional

from ...utils import logging
from ..roberta.tokenization_roberta import RobertaTokenizer
@@ -58,7 +58,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

-    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
+    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A Blenderbot sequence has the following format:
96 changes: 96 additions & 0 deletions src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -0,0 +1,96 @@
# coding=utf-8
# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization class for Blenderbot."""

from typing import TYPE_CHECKING, List, Optional

from ...utils import logging
from ..roberta.tokenization_roberta_fast import RobertaTokenizerFast
from .tokenization_blenderbot import BlenderbotTokenizer


if TYPE_CHECKING:
from transformers.pipelines.conversational import Conversation

logger = logging.get_logger(__name__)


VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
    "tokenizer_config_file": "tokenizer_config.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
    "merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
    "tokenizer_config_file": {
        "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
    },
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}


class BlenderbotTokenizerFast(RobertaTokenizerFast):
    r"""
    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.BlenderbotTokenizerFast` is nearly identical to
    :class:`~transformers.RobertaTokenizerFast` and runs end-to-end tokenization based on byte-level
    Byte-Pair-Encoding. The only difference is that it doesn't add a BOS token to the beginning of sequences.

    Refer to superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning
    parameters.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = BlenderbotTokenizer

    def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None):
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A Blenderbot sequence has the following format:

        - single sequence: `` X </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`):
                Will be ignored

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        return token_ids_0 + [self.eos_token_id]

    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
        inputs = []
        for is_user, text in conversation.iter_texts():
            if is_user:
                # We need to space prefix as it's being done within blenderbot
                inputs.append(" " + text)
            else:
                # Generated responses should contain them already.
                inputs.append(text)

        full_string = " ".join(inputs)
        input_ids = self.encode(full_string)
        if len(input_ids) > self.model_max_length:
            input_ids = input_ids[-self.model_max_length :]
            logger.warning(f"Trimmed input from conversation as it was longer than {self.model_max_length} tokens.")
        return input_ids
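A hedged illustration of the two overridden methods above (not part of the diff; the token id 5502 is taken from the ParlAI test further down):

```python
from transformers import BlenderbotTokenizerFast, Conversation

tok = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

# build_inputs_with_special_tokens: the single-sequence format is `` X </s>``,
# so only EOS is appended and token_ids_1 is ignored.
assert tok.build_inputs_with_special_tokens([5502]) == [5502, tok.eos_token_id]
assert tok.build_inputs_with_special_tokens([5502], [5502]) == [5502, tok.eos_token_id]

# _build_conversation_input_ids: user turns get a leading space before encoding,
# and inputs longer than model_max_length are trimmed from the left.
conv = Conversation("I am a small frog.")
ids = tok._build_conversation_input_ids(conv)
```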
9 changes: 9 additions & 0 deletions src/transformers/utils/dummy_tokenizers_objects.py
@@ -47,6 +47,15 @@ def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["tokenizers"])


class BlenderbotTokenizerFast:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["tokenizers"])


class BlenderbotSmallTokenizerFast:
    def __init__(self, *args, **kwargs):
        requires_backends(self, ["tokenizers"])
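For completeness, a small sketch of what this dummy buys us (an assumption about the usual `requires_backends` behavior, not part of the diff): without the `tokenizers` package, importing the name still works, but using it raises an informative error.

```python
# When the `tokenizers` backend is missing, `transformers` exposes this dummy class
# instead of the real one, so instantiation fails with a clear dependency message.
from transformers.utils.dummy_tokenizers_objects import BlenderbotTokenizerFast

try:
    BlenderbotTokenizerFast()
except ImportError as err:  # raised by requires_backends(self, ["tokenizers"])
    print(err)
```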
5 changes: 5 additions & 0 deletions tests/test_modeling_blenderbot.py
@@ -137,6 +137,11 @@ def get_config(self):
            pad_token_id=self.pad_token_id,
        )

    def get_pipeline_config(self):
        config = self.get_config()
        config.max_position_embeddings = 100
        return config

    def prepare_config_and_inputs_for_common(self):
        config, inputs_dict = self.prepare_config_and_inputs()
        return config, inputs_dict
5 changes: 5 additions & 0 deletions tests/test_pipelines_common.py
@@ -109,6 +109,11 @@ def gen_test(ModelClass, checkpoint, tiny_config, tokenizer_class, feature_extra
    def test(self):
        if ModelClass.__name__.endswith("ForCausalLM"):
            tiny_config.is_encoder_decoder = False
            if hasattr(tiny_config, "encoder_no_repeat_ngram_size"):
                # specific to Blenderbot, which supports both decoder-only and
                # encoder/decoder architectures, while the test config only
                # reflects the encoder/decoder arch
                tiny_config.encoder_no_repeat_ngram_size = 0
        if ModelClass.__name__.endswith("WithLMHead"):
            tiny_config.is_decoder = True
        try:
17 changes: 16 additions & 1 deletion tests/test_tokenization_blenderbot.py
@@ -16,22 +16,37 @@
"""Tests for Blenderbot Tokenizers, including common tests for BlenderbotSmallTokenizer."""
import unittest

+from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast
from transformers.file_utils import cached_property
-from transformers.models.blenderbot.tokenization_blenderbot import BlenderbotTokenizer


class Blenderbot3BTokenizerTests(unittest.TestCase):
    @cached_property
    def tokenizer_3b(self):
        return BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")

    @cached_property
    def rust_tokenizer_3b(self):
Contributor
(nit) maybe call this fast_tokenizer for consistency

Contributor Author
In test files, it seems to me the rust_tokenizer naming is used instead of fast_tokenizer. However, I can change it here :)
        return BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B")

    def test_encode_decode_cycle(self):
        tok = self.tokenizer_3b
        src_text = " I am a small frog."
        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        assert src_text == decoded

    def test_encode_decode_cycle_rust_tokenizer(self):
        tok = self.rust_tokenizer_3b
        src_text = " I am a small frog."
        encoded = tok([src_text], padding=False, truncation=False)["input_ids"]
        decoded = tok.batch_decode(encoded, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        assert src_text == decoded

    def test_3B_tokenization_same_as_parlai(self):
        assert self.tokenizer_3b.add_prefix_space
        assert self.tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]

    def test_3B_tokenization_same_as_parlai_rust_tokenizer(self):
        assert self.rust_tokenizer_3b.add_prefix_space
        assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]]