Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LayoutXLMProcessor (and LayoutXLMTokenizer, LayoutXLMTokenizerFast) #14115

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 32 additions & 4 deletions docs/source/model_doc/layoutxlm.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,17 +40,45 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like

model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')

Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
initialize it as follows:
Note that LayoutXLM has its own tokenizer, based on
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as
follows:

.. code-block::

from transformers import AutoTokenizer
from transformers import LayoutXLMTokenizer

tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')

Similar to LayoutLMv2, you can use :class:`~transformers.LayoutXLMProcessor` (which internally applies
:class:`~transformers.LayoutLMv2FeatureExtractor` and
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast` in sequence) to prepare all
data for the model.

As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
<layoutlmv2>` for all tips, code examples and notebooks.

This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
<https://github.com/microsoft/unilm>`__.


LayoutXLMTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.LayoutXLMTokenizer
:members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary


LayoutXLMTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.LayoutXLMTokenizerFast
:members: __call__


LayoutXLMProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.LayoutXLMProcessor
:members: __call__
8 changes: 8 additions & 0 deletions src/transformers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@
"LayoutLMv2Processor",
"LayoutLMv2Tokenizer",
],
"models.layoutxlm": ["LayoutXLMProcessor"],
"models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
"models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
"models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"],
Expand Down Expand Up @@ -365,6 +366,7 @@
_import_structure["models.big_bird"].append("BigBirdTokenizer")
_import_structure["models.camembert"].append("CamembertTokenizer")
_import_structure["models.deberta_v2"].append("DebertaV2Tokenizer")
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizer")
_import_structure["models.m2m_100"].append("M2M100Tokenizer")
_import_structure["models.marian"].append("MarianTokenizer")
_import_structure["models.mbart"].append("MBartTokenizer")
Expand Down Expand Up @@ -411,6 +413,7 @@
_import_structure["models.herbert"].append("HerbertTokenizerFast")
_import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
_import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
_import_structure["models.layoutxlm"].append("LayoutXLMTokenizerFast")
_import_structure["models.led"].append("LEDTokenizerFast")
_import_structure["models.longformer"].append("LongformerTokenizerFast")
_import_structure["models.lxmert"].append("LxmertTokenizerFast")
Expand Down Expand Up @@ -477,6 +480,7 @@
_import_structure["models.detr"].append("DetrFeatureExtractor")
_import_structure["models.layoutlmv2"].append("LayoutLMv2FeatureExtractor")
_import_structure["models.layoutlmv2"].append("LayoutLMv2Processor")
_import_structure["models.layoutxlm"].append("LayoutXLMProcessor")
_import_structure["models.segformer"].append("SegformerFeatureExtractor")
_import_structure["models.vit"].append("ViTFeatureExtractor")
else:
Expand Down Expand Up @@ -2140,6 +2144,7 @@
LayoutLMv2Processor,
LayoutLMv2Tokenizer,
)
from .models.layoutxlm import LayoutXLMProcessor
from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer
Expand Down Expand Up @@ -2266,6 +2271,7 @@
from .models.big_bird import BigBirdTokenizer
from .models.camembert import CamembertTokenizer
from .models.deberta_v2 import DebertaV2Tokenizer
from .models.layoutxlm import LayoutXLMTokenizer
from .models.m2m_100 import M2M100Tokenizer
from .models.marian import MarianTokenizer
from .models.mbart import MBart50Tokenizer, MBartTokenizer
Expand Down Expand Up @@ -2302,6 +2308,7 @@
from .models.herbert import HerbertTokenizerFast
from .models.layoutlm import LayoutLMTokenizerFast
from .models.layoutlmv2 import LayoutLMv2TokenizerFast
from .models.layoutxlm import LayoutXLMTokenizerFast
from .models.led import LEDTokenizerFast
from .models.longformer import LongformerTokenizerFast
from .models.lxmert import LxmertTokenizerFast
Expand Down Expand Up @@ -2349,6 +2356,7 @@
from .models.deit import DeiTFeatureExtractor
from .models.detr import DetrFeatureExtractor
from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2Processor
from .models.layoutxlm import LayoutXLMProcessor
from .models.segformer import SegformerFeatureExtractor
from .models.vit import ViTFeatureExtractor
else:
Expand Down
1 change: 1 addition & 0 deletions src/transformers/convert_slow_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -944,6 +944,7 @@ def converted(self) -> Tokenizer:
"HerbertTokenizer": HerbertConverter,
"LayoutLMTokenizer": BertConverter,
"LayoutLMv2Tokenizer": BertConverter,
"LayoutXLMTokenizer": XLMRobertaConverter,
"LongformerTokenizer": RobertaConverter,
"LEDTokenizer": RobertaConverter,
"LxmertTokenizer": BertConverter,
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
ibert,
layoutlm,
layoutlmv2,
layoutxlm,
led,
longformer,
luke,
Expand Down
1 change: 1 addition & 0 deletions src/transformers/models/auto/tokenization_auto.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@
("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
(
"dpr",
(
Expand Down
54 changes: 54 additions & 0 deletions src/transformers/models/layoutxlm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# flake8: noqa
# There's no way to ignore "F401 '...' imported but unused" warnings in this
# module, but to preserve other warnings. So, don't check this module at all.

# Copyright 2021 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import TYPE_CHECKING

from ...file_utils import (
_LazyModule,
is_sentencepiece_available,
is_tokenizers_available,
is_torch_available,
is_vision_available,
)


_import_structure = {}

if is_sentencepiece_available():
_import_structure["tokenization_layoutxlm"] = ["LayoutXLMTokenizer"]

if is_tokenizers_available():
_import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]

if is_vision_available():
_import_structure["processing_layoutxlm"] = ["LayoutXLMProcessor"]

if TYPE_CHECKING:
if is_sentencepiece_available():
from .tokenization_layoutxlm import LayoutXLMTokenizer

if is_tokenizers_available():
from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast

if is_vision_available():
from .processing_layoutlmv2 import LayoutXLMProcessor

else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
Loading