-
Notifications
You must be signed in to change notification settings - Fork 25.8k
/
processing_layoutlmv2.py
160 lines (140 loc) · 7.63 KB
/
processing_layoutlmv2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for LayoutLMv2.
"""
from typing import List, Optional, Union
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
class LayoutLMv2Processor(ProcessorMixin):
r"""
Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
single processor.
[`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and optionally applies OCR
to get words and normalized bounding boxes. These are then provided to [`LayoutLMv2Tokenizer`] or
[`LayoutLMv2TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
`attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
into token-level `labels` for token classification tasks (such as FUNSD, CORD).
Args:
feature_extractor (`LayoutLMv2FeatureExtractor`):
An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required input.
tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
An instance of [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
"""
feature_extractor_class = "LayoutLMv2FeatureExtractor"
tokenizer_class = ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast")
def __call__(
self,
images,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
text_pair: Optional[Union[PreTokenizedInput, List[PreTokenizedInput]]] = None,
boxes: Union[List[List[int]], List[List[List[int]]]] = None,
word_labels: Optional[Union[List[int], List[List[int]]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_offsets_mapping: bool = False,
return_length: bool = False,
verbose: bool = True,
return_tensors: Optional[Union[str, TensorType]] = None,
**kwargs
) -> BatchEncoding:
"""
This method first forwards the `images` argument to [`~LayoutLMv2FeatureExtractor.__call__`]. In case
[`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
bounding boxes along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output,
together with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
`False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images``.
Please refer to the docstring of the above two methods for more information.
"""
# verify input
if self.feature_extractor.apply_ocr and (boxes is not None):
raise ValueError(
"You cannot provide bounding boxes "
"if you initialized the feature extractor with apply_ocr set to True."
)
if self.feature_extractor.apply_ocr and (word_labels is not None):
raise ValueError(
"You cannot provide word labels if you initialized the feature extractor with apply_ocr set to True."
)
if return_overflowing_tokens is True and return_offsets_mapping is False:
raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
# first, apply the feature extractor
features = self.feature_extractor(images=images, return_tensors=return_tensors)
# second, apply the tokenizer
if text is not None and self.feature_extractor.apply_ocr and text_pair is None:
if isinstance(text, str):
text = [text] # add batch dimension (as the feature extractor always adds a batch dimension)
text_pair = features["words"]
encoded_inputs = self.tokenizer(
text=text if text is not None else features["words"],
text_pair=text_pair if text_pair is not None else None,
boxes=boxes if boxes is not None else features["boxes"],
word_labels=word_labels,
add_special_tokens=add_special_tokens,
padding=padding,
truncation=truncation,
max_length=max_length,
stride=stride,
pad_to_multiple_of=pad_to_multiple_of,
return_token_type_ids=return_token_type_ids,
return_attention_mask=return_attention_mask,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_offsets_mapping=return_offsets_mapping,
return_length=return_length,
verbose=verbose,
return_tensors=return_tensors,
**kwargs,
)
# add pixel values
images = features.pop("pixel_values")
if return_overflowing_tokens is True:
images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
encoded_inputs["image"] = images
return encoded_inputs
def get_overflowing_images(self, images, overflow_to_sample_mapping):
# in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
images_with_overflow = []
for sample_idx in overflow_to_sample_mapping:
images_with_overflow.append(images[sample_idx])
if len(images_with_overflow) != len(overflow_to_sample_mapping):
raise ValueError(
"Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
)
return images_with_overflow
def batch_decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
to the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)