Add LayoutXLMTokenizer and LayoutXLMTokenizerFast #14030

Closed · wants to merge 12 commits
docs/source/model_doc/layoutxlm.rst (21 changes: 18 additions & 3 deletions)
@@ -40,17 +40,32 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like

     model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')

-Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
+Note that LayoutXLM requires a different tokenizer, :class:`~transformers.LayoutXLMTokenizer`. You can
 initialize it as follows:

 .. code-block::

-    from transformers import AutoTokenizer
+    from transformers import LayoutXLMTokenizer

-    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
+    tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')

 As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
 <layoutlmv2>` for all tips, code examples and notebooks.

 This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
 <https://github.com/microsoft/unilm>`__.
+
+
+LayoutXLMTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutXLMTokenizer
+    :members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+LayoutXLMTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutXLMTokenizerFast
+    :members: __call__
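
To make the documented API concrete: below is a minimal, hedged sketch of how the new tokenizer is meant to be called, assuming it mirrors the LayoutLMv2Tokenizer call signature of words plus word-level bounding boxes normalized to the 0-1000 range. The example words and box coordinates are invented for illustration.

```python
from transformers import LayoutXLMTokenizer

tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")

words = ["hello", "world"]                            # invented example words
boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]  # one 0-1000 box per word

encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
print(encoding["input_ids"].shape)  # token ids with special tokens added
print(encoding["bbox"].shape)       # word boxes expanded to token level
```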
src/transformers/__init__.py (6 changes: 4 additions & 2 deletions)
@@ -228,6 +228,7 @@
         "LayoutLMv2FeatureExtractor",
         "LayoutLMv2Processor",
         "LayoutLMv2Tokenizer",
+        "LayoutXLMTokenizer",
     ],
     "models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
     "models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],

@@ -400,7 +401,7 @@
     _import_structure["models.gpt2"].append("GPT2TokenizerFast")
     _import_structure["models.herbert"].append("HerbertTokenizerFast")
     _import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
-    _import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
+    _import_structure["models.layoutlmv2"].extend(["LayoutLMv2TokenizerFast", "LayoutXLMTokenizerFast"])
     _import_structure["models.led"].append("LEDTokenizerFast")
     _import_structure["models.longformer"].append("LongformerTokenizerFast")
     _import_structure["models.lxmert"].append("LxmertTokenizerFast")

@@ -2094,6 +2095,7 @@
         LayoutLMv2FeatureExtractor,
         LayoutLMv2Processor,
         LayoutLMv2Tokenizer,
+        LayoutXLMTokenizer,
     )
     from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
     from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer

@@ -2252,7 +2254,7 @@
     from .models.gpt2 import GPT2TokenizerFast
     from .models.herbert import HerbertTokenizerFast
     from .models.layoutlm import LayoutLMTokenizerFast
-    from .models.layoutlmv2 import LayoutLMv2TokenizerFast
+    from .models.layoutlmv2 import LayoutLMv2TokenizerFast, LayoutXLMTokenizerFast
     from .models.led import LEDTokenizerFast
     from .models.longformer import LongformerTokenizerFast
     from .models.lxmert import LxmertTokenizerFast
src/transformers/convert_slow_tokenizer.py (1 change: 1 addition & 0 deletions)
@@ -914,6 +914,7 @@ def converted(self) -> Tokenizer:
     "HerbertTokenizer": HerbertConverter,
     "LayoutLMTokenizer": BertConverter,
     "LayoutLMv2Tokenizer": BertConverter,
+    "LayoutXLMTokenizer": XLMRobertaConverter,
     "LongformerTokenizer": RobertaConverter,
     "LEDTokenizer": RobertaConverter,
     "LxmertTokenizer": BertConverter,
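
The new entry maps LayoutXLMTokenizer to XLMRobertaConverter, which fits because LayoutXLM reuses XLM-RoBERTa's SentencePiece vocabulary. A minimal sketch of how this table is exercised, assuming the module-level convert_slow_tokenizer helper:

```python
from transformers import LayoutXLMTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

# The table is keyed by the slow tokenizer's class name; the lookup below
# dispatches to XLMRobertaConverter and builds a `tokenizers` backend.
slow = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
backend = convert_slow_tokenizer(slow)
print(type(backend))  # <class 'tokenizers.Tokenizer'>
```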
src/transformers/models/auto/tokenization_auto.py (1 change: 1 addition & 0 deletions)
@@ -124,6 +124,7 @@
         ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
+        ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
         (
             "dpr",
             (
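
With the "layoutxlm" entry in place, AutoTokenizer can resolve the new classes by checkpoint type. A minimal sketch, assuming the checkpoint's config reports model_type "layoutxlm":

```python
from transformers import AutoTokenizer

# Resolution depends on the checkpoint config exposing model_type "layoutxlm"
# (an assumption here); the fast class is picked when `tokenizers` is installed.
tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutxlm-base")
print(type(tokenizer).__name__)  # LayoutXLMTokenizerFast or LayoutXLMTokenizer
```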
src/transformers/models/layoutlmv2/__init__.py (4 changes: 4 additions & 0 deletions)
@@ -24,10 +24,12 @@
 _import_structure = {
     "configuration_layoutlmv2": ["LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMv2Config"],
     "tokenization_layoutlmv2": ["LayoutLMv2Tokenizer"],
+    "tokenization_layoutxlm": ["LayoutXLMTokenizer"],
 }

 if is_tokenizers_available():
     _import_structure["tokenization_layoutlmv2_fast"] = ["LayoutLMv2TokenizerFast"]
+    _import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]

 if is_vision_available():
     _import_structure["feature_extraction_layoutlmv2"] = ["LayoutLMv2FeatureExtractor"]

@@ -47,9 +49,11 @@
 if TYPE_CHECKING:
     from .configuration_layoutlmv2 import LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMv2Config
     from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
+    from .tokenization_layoutxlm import LayoutXLMTokenizer

     if is_tokenizers_available():
         from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
+        from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast

     if is_vision_available():
         from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
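
Since the new modules are registered both in the lazy _import_structure and under TYPE_CHECKING, the class should be importable from the subpackage and from the top-level namespace alike; a quick sanity-check sketch:

```python
# Both paths should resolve to the same class object: at this stage of the PR,
# the LayoutXLM tokenizer modules live under models/layoutlmv2.
from transformers import LayoutXLMTokenizer as top_level
from transformers.models.layoutlmv2 import LayoutXLMTokenizer as subpackage

assert top_level is subpackage
```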
src/transformers/models/layoutlmv2/processing_layoutlmv2.py (42 changes: 28 additions & 14 deletions)
@@ -12,16 +12,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Processor class for LayoutLMv2.
-"""
+""" Processor class for LayoutLMv2. """


 from typing import List, Optional, Union

 from ...file_utils import TensorType
 from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
 from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
 from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
 from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
+from .tokenization_layoutxlm import LayoutXLMTokenizer
+from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast


 class LayoutLMv2Processor:
@@ -33,28 +35,31 @@ class LayoutLMv2Processor:

     It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
     optionally applies OCR to get words and normalized bounding boxes. These are then provided to
-    :class:`~transformers.LayoutLMv2Tokenizer` or :class:`~transformers.LayoutLMv2TokenizerFast`, which turns the words
+    :class:`~transformers.LayoutLMv2Tokenizer`, :class:`~transformers.LayoutLMv2TokenizerFast`,
+    :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
     and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
     Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
     classification tasks (such as FUNSD, CORD).

     Args:
         feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
             An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
             input.
-        tokenizer (:obj:`LayoutLMv2Tokenizer` or :obj:`LayoutLMv2TokenizerFast`):
-            An instance of :class:`~transformers.LayoutLMv2Tokenizer` or
-            :class:`~transformers.LayoutLMv2TokenizerFast`. The tokenizer is a required input.
+        tokenizer (:obj:`LayoutLMv2Tokenizer`, :obj:`LayoutLMv2TokenizerFast`, :obj:`LayoutXLMTokenizer`, or :obj:`LayoutXLMTokenizerFast`):
+            An instance of :class:`~transformers.LayoutLMv2Tokenizer`, :class:`~transformers.LayoutLMv2TokenizerFast`,
+            :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`. The tokenizer
+            is a required input.
     """

     def __init__(self, feature_extractor, tokenizer):
         if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
             raise ValueError(
                 f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
             )
-        if not isinstance(tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)):
+        if not isinstance(
+            tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast, LayoutXLMTokenizer, LayoutXLMTokenizerFast)
+        ):
             raise ValueError(
-                f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__} or {LayoutLMv2TokenizerFast.__class__}, but is {type(tokenizer)}"
+                f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__}, {LayoutLMv2TokenizerFast.__class__}, "
+                f"{LayoutXLMTokenizer.__class__}, or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}"
             )

         self.feature_extractor = feature_extractor
@@ -82,7 +87,7 @@ def save_pretrained(self, save_directory):
         self.tokenizer.save_pretrained(save_directory)

     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, use_xlm=False, **kwargs):
         r"""
         Instantiate a :class:`~transformers.LayoutLMv2Processor` from a pretrained LayoutLMv2 processor.
@@ -110,15 +115,24 @@ def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs)
             use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
                 Whether or not to instantiate a fast tokenizer.

+            use_xlm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to instantiate a LayoutXLM tokenizer instead of a LayoutLMv2 tokenizer.
+
             **kwargs
                 Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
                 :class:`~transformers.PreTrainedTokenizer`
         """
         feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         if use_fast:
-            tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_xlm:
+                tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
         else:
-            tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_xlm:
+                tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

         return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
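
A minimal end-to-end sketch of the new use_xlm flag, assuming the checkpoint hosts both feature-extractor and tokenizer files, and that Pillow (plus pytesseract, for the built-in OCR) is installed; "document.png" is a hypothetical local file:

```python
from PIL import Image

from transformers import LayoutLMv2Processor

processor = LayoutLMv2Processor.from_pretrained(
    "microsoft/layoutxlm-base", use_fast=True, use_xlm=True
)

image = Image.open("document.png").convert("RGB")  # hypothetical input file
encoding = processor(image, return_tensors="pt")   # OCR, tokenization, bbox alignment
print(list(encoding.keys()))  # e.g. input_ids, attention_mask, bbox, image
```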