Add LayoutXLMTokenizer and LayoutXLMTokenizerFast #14030

Closed · wants to merge 12 commits
docs/source/model_doc/layoutxlm.rst (21 changes: 18 additions & 3 deletions)
@@ -40,17 +40,32 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like

     model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')

-Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
+Note that LayoutXLM requires a different tokenizer, :class:`~transformers.LayoutXLMTokenizer`. You can
 initialize it as follows:

 .. code-block::

-    from transformers import AutoTokenizer
+    from transformers import LayoutXLMTokenizer

-    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
+    tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')

 As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
 <layoutlmv2>` for all tips, code examples and notebooks.

 This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
 <https://github.com/microsoft/unilm>`__.
+
+
+LayoutXLMTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutXLMTokenizer
+    :members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+LayoutXLMTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutXLMTokenizerFast
+    :members: __call__
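
To make the documented API concrete: below is a minimal, hedged sketch of how the new tokenizer is meant to be called, assuming it mirrors the LayoutLMv2Tokenizer call signature of words plus word-level bounding boxes normalized to the 0-1000 range. The example words and box coordinates are invented for illustration.

```python
from transformers import LayoutXLMTokenizer

tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")

words = ["hello", "world"]                            # invented example words
boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]  # one 0-1000 box per word

encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
print(encoding["input_ids"].shape)  # token ids with special tokens added
print(encoding["bbox"].shape)       # word boxes expanded to token level
```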
src/transformers/__init__.py (6 changes: 4 additions & 2 deletions)
@@ -228,6 +228,7 @@
         "LayoutLMv2FeatureExtractor",
         "LayoutLMv2Processor",
         "LayoutLMv2Tokenizer",
+        "LayoutXLMTokenizer",
     ],
     "models.led": ["LED_PRETRAINED_CONFIG_ARCHIVE_MAP", "LEDConfig", "LEDTokenizer"],
     "models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],

@@ -400,7 +401,7 @@
     _import_structure["models.gpt2"].append("GPT2TokenizerFast")
     _import_structure["models.herbert"].append("HerbertTokenizerFast")
     _import_structure["models.layoutlm"].append("LayoutLMTokenizerFast")
-    _import_structure["models.layoutlmv2"].append("LayoutLMv2TokenizerFast")
+    _import_structure["models.layoutlmv2"].extend(["LayoutLMv2TokenizerFast", "LayoutXLMTokenizerFast"])
     _import_structure["models.led"].append("LEDTokenizerFast")
     _import_structure["models.longformer"].append("LongformerTokenizerFast")
     _import_structure["models.lxmert"].append("LxmertTokenizerFast")

@@ -2094,6 +2095,7 @@
         LayoutLMv2FeatureExtractor,
         LayoutLMv2Processor,
         LayoutLMv2Tokenizer,
+        LayoutXLMTokenizer,
     )
     from .models.led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig, LEDTokenizer
     from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer

@@ -2252,7 +2254,7 @@
     from .models.gpt2 import GPT2TokenizerFast
     from .models.herbert import HerbertTokenizerFast
     from .models.layoutlm import LayoutLMTokenizerFast
-    from .models.layoutlmv2 import LayoutLMv2TokenizerFast
+    from .models.layoutlmv2 import LayoutLMv2TokenizerFast, LayoutXLMTokenizerFast
     from .models.led import LEDTokenizerFast
     from .models.longformer import LongformerTokenizerFast
     from .models.lxmert import LxmertTokenizerFast
src/transformers/convert_slow_tokenizer.py (1 change: 1 addition & 0 deletions)
@@ -914,6 +914,7 @@ def converted(self) -> Tokenizer:
     "HerbertTokenizer": HerbertConverter,
     "LayoutLMTokenizer": BertConverter,
     "LayoutLMv2Tokenizer": BertConverter,
+    "LayoutXLMTokenizer": XLMRobertaConverter,
     "LongformerTokenizer": RobertaConverter,
     "LEDTokenizer": RobertaConverter,
     "LxmertTokenizer": BertConverter,
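
The new entry maps LayoutXLMTokenizer to XLMRobertaConverter, which fits because LayoutXLM reuses XLM-RoBERTa's SentencePiece vocabulary. A minimal sketch of how this table is exercised, assuming the module-level convert_slow_tokenizer helper:

```python
from transformers import LayoutXLMTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

# The table is keyed by the slow tokenizer's class name; the lookup below
# dispatches to XLMRobertaConverter and builds a `tokenizers` backend.
slow = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
backend = convert_slow_tokenizer(slow)
print(type(backend))  # <class 'tokenizers.Tokenizer'>
```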
src/transformers/models/auto/tokenization_auto.py (1 change: 1 addition & 0 deletions)
@@ -124,6 +124,7 @@
         ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)),
         ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)),
+        ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)),
         (
             "dpr",
             (
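
With the "layoutxlm" entry in place, AutoTokenizer can resolve the new classes by checkpoint type. A minimal sketch, assuming the checkpoint's config reports model_type "layoutxlm":

```python
from transformers import AutoTokenizer

# Resolution depends on the checkpoint config exposing model_type "layoutxlm"
# (an assumption here); the fast class is picked when `tokenizers` is installed.
tokenizer = AutoTokenizer.from_pretrained("microsoft/layoutxlm-base")
print(type(tokenizer).__name__)  # LayoutXLMTokenizerFast or LayoutXLMTokenizer
```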
src/transformers/models/layoutlmv2/__init__.py (4 changes: 4 additions & 0 deletions)
@@ -24,10 +24,12 @@
 _import_structure = {
     "configuration_layoutlmv2": ["LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMv2Config"],
     "tokenization_layoutlmv2": ["LayoutLMv2Tokenizer"],
+    "tokenization_layoutxlm": ["LayoutXLMTokenizer"],
 }

 if is_tokenizers_available():
     _import_structure["tokenization_layoutlmv2_fast"] = ["LayoutLMv2TokenizerFast"]
+    _import_structure["tokenization_layoutxlm_fast"] = ["LayoutXLMTokenizerFast"]

 if is_vision_available():
     _import_structure["feature_extraction_layoutlmv2"] = ["LayoutLMv2FeatureExtractor"]

@@ -47,9 +49,11 @@
 if TYPE_CHECKING:
     from .configuration_layoutlmv2 import LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMv2Config
     from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
+    from .tokenization_layoutxlm import LayoutXLMTokenizer

     if is_tokenizers_available():
         from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
+        from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast

     if is_vision_available():
         from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
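
Since the new modules are registered both in the lazy _import_structure and under TYPE_CHECKING, the class should be importable from the subpackage and from the top-level namespace alike; a quick sanity-check sketch:

```python
# Both paths should resolve to the same class object: at this stage of the PR,
# the LayoutXLM tokenizer modules live under models/layoutlmv2.
from transformers import LayoutXLMTokenizer as top_level
from transformers.models.layoutlmv2 import LayoutXLMTokenizer as subpackage

assert top_level is subpackage
```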
src/transformers/models/layoutlmv2/processing_layoutlmv2.py (42 changes: 28 additions & 14 deletions)
@@ -12,16 +12,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
-Processor class for LayoutLMv2.
-"""
+""" Processor class for LayoutLMv2. """


 from typing import List, Optional, Union

 from ...file_utils import TensorType
 from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
 from .feature_extraction_layoutlmv2 import LayoutLMv2FeatureExtractor
 from .tokenization_layoutlmv2 import LayoutLMv2Tokenizer
 from .tokenization_layoutlmv2_fast import LayoutLMv2TokenizerFast
+from .tokenization_layoutxlm import LayoutXLMTokenizer
+from .tokenization_layoutxlm_fast import LayoutXLMTokenizerFast


 class LayoutLMv2Processor:
@@ -33,28 +35,31 @@ class LayoutLMv2Processor:

     It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
     optionally applies OCR to get words and normalized bounding boxes. These are then provided to
-    :class:`~transformers.LayoutLMv2Tokenizer` or :class:`~transformers.LayoutLMv2TokenizerFast`, which turns the words
+    :class:`~transformers.LayoutLMv2Tokenizer`, :class:`~transformers.LayoutLMv2TokenizerFast`,
+    :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
     and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
     Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
     classification tasks (such as FUNSD, CORD).

     Args:
         feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
             An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
             input.
-        tokenizer (:obj:`LayoutLMv2Tokenizer` or :obj:`LayoutLMv2TokenizerFast`):
-            An instance of :class:`~transformers.LayoutLMv2Tokenizer` or
-            :class:`~transformers.LayoutLMv2TokenizerFast`. The tokenizer is a required input.
+        tokenizer (:obj:`LayoutLMv2Tokenizer`, :obj:`LayoutLMv2TokenizerFast`, :obj:`LayoutXLMTokenizer`, or :obj:`LayoutXLMTokenizerFast`):
+            An instance of :class:`~transformers.LayoutLMv2Tokenizer`, :class:`~transformers.LayoutLMv2TokenizerFast`,
+            :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`. The tokenizer
+            is a required input.
     """

     def __init__(self, feature_extractor, tokenizer):
         if not isinstance(feature_extractor, LayoutLMv2FeatureExtractor):
             raise ValueError(
                 f"`feature_extractor` has to be of type {LayoutLMv2FeatureExtractor.__class__}, but is {type(feature_extractor)}"
             )
-        if not isinstance(tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast)):
+        if not isinstance(
+            tokenizer, (LayoutLMv2Tokenizer, LayoutLMv2TokenizerFast, LayoutXLMTokenizer, LayoutXLMTokenizerFast)
+        ):
             raise ValueError(
-                f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__} or {LayoutLMv2TokenizerFast.__class__}, but is {type(tokenizer)}"
+                f"`tokenizer` has to be of type {LayoutLMv2Tokenizer.__class__}, {LayoutLMv2TokenizerFast.__class__}, "
+                f"{LayoutXLMTokenizer.__class__}, or {LayoutXLMTokenizerFast.__class__}, but is {type(tokenizer)}"
             )

         self.feature_extractor = feature_extractor
@@ -82,7 +87,7 @@ def save_pretrained(self, save_directory):
         self.tokenizer.save_pretrained(save_directory)

     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, use_xlm=False, **kwargs):
         r"""
         Instantiate a :class:`~transformers.LayoutLMv2Processor` from a pretrained LayoutLMv2 processor.
@@ -110,15 +115,24 @@ def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs)
             use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
                 Whether or not to instantiate a fast tokenizer.

+            use_xlm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not to instantiate a LayoutXLM tokenizer instead of a LayoutLMv2 tokenizer.
+
             **kwargs
                 Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
                 :class:`~transformers.PreTrainedTokenizer`
         """
         feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         if use_fast:
-            tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_xlm:
+                tokenizer = LayoutXLMTokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                tokenizer = LayoutLMv2TokenizerFast.from_pretrained(pretrained_model_name_or_path, **kwargs)
         else:
-            tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            if use_xlm:
+                tokenizer = LayoutXLMTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            else:
+                tokenizer = LayoutLMv2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)

         return cls(feature_extractor=feature_extractor, tokenizer=tokenizer)
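
A minimal end-to-end sketch of the new use_xlm flag, assuming the checkpoint hosts both feature-extractor and tokenizer files, and that Pillow (plus pytesseract, for the built-in OCR) is installed; "document.png" is a hypothetical local file:

```python
from PIL import Image

from transformers import LayoutLMv2Processor

processor = LayoutLMv2Processor.from_pretrained(
    "microsoft/layoutxlm-base", use_fast=True, use_xlm=True
)

image = Image.open("document.png").convert("RGB")  # hypothetical input file
encoding = processor(image, return_tensors="pt")   # OCR, tokenization, bbox alignment
print(list(encoding.keys()))  # e.g. input_ids, attention_mask, bbox, image
```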