Remove methods from fast tokenizer
NielsRogge committed Nov 2, 2021
1 parent 9ba1ed8 commit 20dd625
Showing 2 changed files with 2 additions and 129 deletions.
2 changes: 1 addition & 1 deletion docs/source/model_doc/layoutxlm.rst
@@ -40,7 +40,7 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
Note that LayoutXLM requires a different tokenizer, based on
Note that LayoutXLM has its own tokenizer, based on
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as
follows:

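The initialization example referenced by "You can initialize it as follows:" is collapsed in this diff view. A minimal sketch of what such an initialization looks like, assuming the `microsoft/layoutxlm-base` checkpoint shown above (an illustration, not necessarily the exact snippet from the docs):

```python
# Illustrative sketch: initializing the LayoutXLM tokenizers mentioned in the docs.
from transformers import LayoutXLMTokenizer, LayoutXLMTokenizerFast

# Slow, SentencePiece-based tokenizer
tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")

# Fast, tokenizers-backed counterpart (the class modified in this commit)
fast_tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")
```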
129 changes: 1 addition & 128 deletions src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -308,67 +308,6 @@ def _is_valid_text_input(t):
                **kwargs,
            )

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            List[TextInput],
            List[TextInputPair],
            List[PreTokenizedInput],
        ],
        is_pair: bool = None,
        boxes: Optional[List[List[List[int]]]] = None,
        word_labels: Optional[Union[List[int], List[List[int]]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """ """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._batch_encode_plus(
            batch_text_or_text_pairs=batch_text_or_text_pairs,
            is_pair=is_pair,
            boxes=boxes,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
        batched_input = [(text, pair)] if pair else [text]
        encodings = self._tokenizer.encode_batch(
@@ -377,73 +316,6 @@ def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bo

        return encodings[0].tokens

    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
    def encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[PreTokenizedInput] = None,
        boxes: Optional[List[List[int]]] = None,
        word_labels: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        """
        Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
        ``__call__`` should be used instead.
        Args:
            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
            text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                list of list of strings (words of a batch of examples).
        """

        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            verbose=verbose,
            **kwargs,
        )

        return self._encode_plus(
            text=text,
            boxes=boxes,
            text_pair=text_pair,
            word_labels=word_labels,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            **kwargs,
        )

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
@@ -468,6 +340,7 @@ def _batch_encode_plus(
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs,
    ) -> BatchEncoding:

        if not isinstance(batch_text_or_text_pairs, list):
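The removed `batch_encode_plus` and `encode_plus` overrides were thin wrappers that resolved the padding/truncation strategies and forwarded everything to `_batch_encode_plus`/`_encode_plus`; the generic implementations in the tokenizer base class follow the same path, and the `**kwargs` added to `_batch_encode_plus` above lets the remaining keyword arguments pass through it. As the removed docstring notes, `__call__` is the recommended entry point. A hedged usage sketch (the checkpoint, words, and boxes below are illustrative, not taken from this commit):

```python
from transformers import LayoutXLMTokenizerFast

tokenizer = LayoutXLMTokenizerFast.from_pretrained("microsoft/layoutxlm-base")

# Pre-tokenized words with one normalized (0-1000) bounding box per word,
# the input format LayoutXLM's tokenizer expects.
words = ["Hello", "world"]
boxes = [[48, 84, 120, 100], [130, 84, 200, 100]]

# __call__ replaces the deprecated encode_plus wrapper and dispatches to
# _encode_plus / _batch_encode_plus internally.
encoding = tokenizer(words, boxes=boxes, truncation=True, return_tensors="pt")
print(encoding.keys())  # expect keys such as input_ids, attention_mask, bbox
```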
