diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index d0ac2e9d8122c..d8a143cc0a671 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -230,19 +230,16 @@ final activations of the model.
     >>> ## PYTORCH CODE
     >>> print(pt_outputs)
-    SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833, 4.3364],
-    [ 0.0818, -0.0418]], grad_fn=), hidden_states=None, attentions=None)
+    (tensor([[-4.0833, 4.3364],
+    [ 0.0818, -0.0418]], grad_fn=),)
     >>> ## TENSORFLOW CODE
     >>> print(tf_outputs)
     (,)

-The model can return more than just the final activations, which is why the PyTorch output is a special class and the
-TensorFlow output is a tuple. Here we only asked for the final activations, so we get a tuple with one element on the
-TensorFlow side and a :class:`~transformers.modeling_outputs.SequenceClassifierOutput` with just the ``logits`` field
-filled on the PyTorch side.
-
+The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
+the final activations, so we get a tuple with one element.

 .. note::

     All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
@@ -254,7 +251,7 @@ Let's apply the SoftMax activation to get predictions.

     >>> ## PYTORCH CODE
     >>> import torch.nn.functional as F
-    >>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1)
+    >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
     >>> ## TENSORFLOW CODE
     >>> import tensorflow as tf
     >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
@@ -341,8 +338,8 @@ code is easy to access and tweak if you need to.

 In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
 using the :doc:`DistilBERT ` architecture. As
-:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
-if you are using TensorFlow)` was used, the model automatically created is then a
+:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
+if you are using TensorFlow) was used, the model automatically created is then a
 :class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details
 relevant to that specific model, or browse the source code. This is how you would directly instantiate model and
 tokenizer without the auto magic:
diff --git a/docs/source/training.rst b/docs/source/training.rst
index 7ddfcc40fb53e..799f96e94afe0 100644
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -49,7 +49,7 @@ put it in train mode.
 .. code-block:: python

     from transformers import BertForSequenceClassification
-    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
     model.train()

 This is useful because it allows us to make use of the pre-trained BERT
diff --git a/examples/README.md b/examples/README.md
index 7a330a74eff81..a298ea4ea3e6e 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,7 +1,7 @@
 # Examples

 Version 2.9 of 🤗 Transformers introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
-Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.1+.
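# Illustrative sketch, not part of the patch: the two access patterns the quicktour and
# training.rst changes above document. By default models now return plain tuples; passing
# return_dict=True opts into the ModelOutput objects with named fields.
# Assumes the quicktour checkpoint can be downloaded.
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(name)
inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")

# Default behaviour: a tuple, indexed positionally as in the updated quicktour snippet.
model = AutoModelForSequenceClassification.from_pretrained(name)
pt_outputs = model(**inputs)
pt_predictions = F.softmax(pt_outputs[0], dim=-1)

# Opt-in behaviour: a SequenceClassifierOutput with named attributes.
model = AutoModelForSequenceClassification.from_pretrained(name, return_dict=True)
pt_outputs = model(**inputs)
pt_predictions = F.softmax(pt_outputs.logits, dim=-1)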
+Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.2+. Here is the list of all our examples: - **grouped by task** (all official examples work for multiple models) diff --git a/examples/multiple-choice/utils_multiple_choice.py b/examples/multiple-choice/utils_multiple_choice.py index 2f6dd040dce05..d17dd902f6810 100644 --- a/examples/multiple-choice/utils_multiple_choice.py +++ b/examples/multiple-choice/utils_multiple_choice.py @@ -204,6 +204,8 @@ def gen(): ) def get_dataset(self): + self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features))) + return self.dataset def __len__(self): diff --git a/examples/question-answering/run_squad.py b/examples/question-answering/run_squad.py index fa11a33ca628c..faaffea50191b 100644 --- a/examples/question-answering/run_squad.py +++ b/examples/question-answering/run_squad.py @@ -199,9 +199,6 @@ def train(args, train_dataset, model, tokenizer): {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) - if isinstance(model, torch.nn.DataParallel): - inputs["return_tuple"] = True - outputs = model(**inputs) # model outputs are always tuple in transformers (see doc) loss = outputs[0] @@ -316,8 +313,6 @@ def evaluate(args, model, tokenizer, prefix=""): inputs.update( {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)} ) - if isinstance(model, torch.nn.DataParallel): - inputs["return_tuple"] = True outputs = model(**inputs) for i, feature_index in enumerate(feature_indices): diff --git a/examples/question-answering/run_tf_squad.py b/examples/question-answering/run_tf_squad.py index 1c654c32bfa59..7e90416bf5d7b 100644 --- a/examples/question-answering/run_tf_squad.py +++ b/examples/question-answering/run_tf_squad.py @@ -21,6 +21,8 @@ from dataclasses import dataclass, field from typing import Optional +import tensorflow as tf + from transformers import ( AutoConfig, AutoTokenizer, @@ -68,6 +70,7 @@ class DataTrainingArguments: data_dir: Optional[str] = field( default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."} ) + use_tfds: Optional[bool] = field(default=True, metadata={"help": "If TFDS should be used or not."}) max_seq_length: int = field( default=128, metadata={ @@ -170,7 +173,7 @@ def main(): ) # Get datasets - if not data_args.data_dir: + if data_args.use_tfds: if data_args.version_2_with_negative: logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. 
Switch to version 1 automatically") @@ -179,7 +182,7 @@ def main(): except ImportError: raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") - tfds_examples = tfds.load("squad") + tfds_examples = tfds.load("squad", data_dir=data_args.data_dir) train_examples = ( SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False) if training_args.do_train @@ -209,6 +212,8 @@ def main(): else None ) + train_dataset = train_dataset.apply(tf.data.experimental.assert_cardinality(len(train_examples))) + eval_dataset = ( squad_convert_examples_to_features( examples=eval_examples, @@ -223,6 +228,8 @@ def main(): else None ) + eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples))) + # Initialize our Trainer trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,) diff --git a/examples/seq2seq/test_seq2seq_examples.py b/examples/seq2seq/test_seq2seq_examples.py index 44e3d6c703dc2..c5027f4336101 100644 --- a/examples/seq2seq/test_seq2seq_examples.py +++ b/examples/seq2seq/test_seq2seq_examples.py @@ -144,7 +144,7 @@ def test_distill_checkpointing_with_teacher(self): evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp())) def test_loss_fn(self): - model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY) + model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True) input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"] target_ids = torch.tensor([[0, 4, 8, 2], [0, 8, 2, 1]], dtype=torch.long, device=model.device) decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line? diff --git a/examples/text-classification/run_tf_glue.py b/examples/text-classification/run_tf_glue.py index a1e4f7a90ae4f..5477447040d60 100644 --- a/examples/text-classification/run_tf_glue.py +++ b/examples/text-classification/run_tf_glue.py @@ -9,6 +9,7 @@ from typing import Dict, Optional import numpy as np +import tensorflow as tf import tensorflow_datasets as tfds from transformers import ( @@ -35,7 +36,11 @@ class Split(Enum): def get_tfds( - task_name: str, tokenizer: PreTrainedTokenizer, max_seq_length: Optional[int] = None, mode: Split = Split.train + task_name: str, + tokenizer: PreTrainedTokenizer, + max_seq_length: Optional[int] = None, + mode: Split = Split.train, + data_dir: str = None, ): if task_name == "mnli-mm" and mode == Split.dev: tfds_name = "mnli_mismatched" @@ -50,9 +55,11 @@ def get_tfds( else: tfds_name = task_name - ds = tfds.load("glue/" + tfds_name, split=mode.value) + ds, info = tfds.load("glue/" + tfds_name, split=mode.value, with_info=True, data_dir=data_dir) + ds = glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name) + ds = ds.apply(tf.data.experimental.assert_cardinality(info.splits[mode.value].num_examples)) - return glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name) + return ds logger = logging.getLogger(__name__) @@ -69,6 +76,7 @@ class GlueDataTrainingArguments: """ task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())}) + data_dir: Optional[str] = field(default=None, metadata={"help": "The input/output data dir for TFDS."}) max_seq_length: int = field( default=128, metadata={ @@ -171,13 +179,22 @@ def main(): # Get datasets train_dataset = ( - get_tfds(task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length) + get_tfds( + 
task_name=data_args.task_name, + tokenizer=tokenizer, + max_seq_length=data_args.max_seq_length, + data_dir=data_args.data_dir, + ) if training_args.do_train else None ) eval_dataset = ( get_tfds( - task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, mode=Split.dev + task_name=data_args.task_name, + tokenizer=tokenizer, + max_seq_length=data_args.max_seq_length, + mode=Split.dev, + data_dir=data_args.data_dir, ) if training_args.do_eval else None diff --git a/examples/token-classification/run_tf_ner.py b/examples/token-classification/run_tf_ner.py index 068f0617371cc..5f38d5f981afd 100644 --- a/examples/token-classification/run_tf_ner.py +++ b/examples/token-classification/run_tf_ner.py @@ -17,7 +17,6 @@ import logging import os -import warnings from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple @@ -185,11 +184,6 @@ def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[L for i in range(batch_size): for j in range(seq_len): - if label_ids[i, j] == -1: - label_ids[i, j] = -100 - warnings.warn( - "Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead." - ) if label_ids[i, j] != -100: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) diff --git a/examples/token-classification/utils_ner.py b/examples/token-classification/utils_ner.py index 42e07f642a8e8..af9680d26c9eb 100644 --- a/examples/token-classification/utils_ner.py +++ b/examples/token-classification/utils_ner.py @@ -146,7 +146,7 @@ class TFNerDataset: """ features: List[InputFeatures] - pad_token_label_id: int = -1 + pad_token_label_id: int = -100 # Use cross entropy ignore_index as padding label id so that only # real label ids contribute to the loss later. @@ -221,6 +221,8 @@ def gen(): ) def get_dataset(self): + self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features))) + return self.dataset def __len__(self): diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e17b97240da75..a0fc396e51149 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -278,6 +278,7 @@ XLMForTokenClassification, XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, + XLMForMultipleChoice, XLM_PRETRAINED_MODEL_ARCHIVE_LIST, ) from .modeling_bart import ( @@ -356,6 +357,8 @@ FlaubertForTokenClassification, FlaubertForQuestionAnswering, FlaubertForQuestionAnsweringSimple, + FlaubertForTokenClassification, + FlaubertForMultipleChoice, FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index c8dd3572aebaf..af31087697c46 100644 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -49,8 +49,9 @@ class PretrainedConfig(object): Whether or not the model should returns all attentions. use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should return the last key/values attentions (not used by all models). - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not the model should return tuples instead of :obj:`ModelOutput` objects. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether the model is used as an encoder/decoder or not. is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): @@ -133,7 +134,7 @@ class PretrainedConfig(object): def __init__(self, **kwargs): # Attributes with defaults - self.return_tuple = kwargs.pop("return_tuple", False) + self.return_dict = kwargs.pop("return_dict", False) self.output_hidden_states = kwargs.pop("output_hidden_states", False) self.output_attentions = kwargs.pop("output_attentions", False) self.use_cache = kwargs.pop("use_cache", True) # Not used by all models @@ -194,12 +195,12 @@ def __init__(self, **kwargs): raise err @property - def use_return_tuple(self) -> bool: + def use_return_dict(self) -> bool: """ - :obj:`bool`: Whether or not the model should return a tuple. + :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples. """ - # If torchscript is set, force return_tuple to avoid jit errors - return self.return_tuple or self.torchscript + # If torchscript is set, force `return_dict=False` to avoid jit errors + return self.return_dict and not self.torchscript @property def num_labels(self) -> int: diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 5a46fb0624679..5bdf1f792d955 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -13,14 +13,17 @@ import sys import tarfile import tempfile +from collections import OrderedDict from contextlib import contextmanager +from dataclasses import fields from functools import partial, wraps from hashlib import sha256 from pathlib import Path -from typing import Dict, Optional, Union +from typing import Any, Dict, Optional, Tuple, Union from urllib.parse import urlparse from zipfile import ZipFile, is_zipfile +import numpy as np import requests from filelock import FileLock from tqdm.auto import tqdm @@ -190,8 +193,8 @@ def docstring_decorator(fn): RETURN_INTRODUCTION = r""" Returns: :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: - A :class:`~{full_output_type}` or a tuple of :obj:`torch.FloatTensor` (if ``return_tuple=True`` is passed or - when ``config.return_tuple=True``) comprising various elements depending on the configuration + A :class:`~{full_output_type}` (if ``return_dict=True`` is passed or when ``config.return_dict=True``) or a + tuple of :obj:`torch.FloatTensor` comprising various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs. 
""" @@ -257,7 +260,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 @@ -274,7 +277,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> start_positions = torch.tensor([1]) @@ -293,7 +296,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 @@ -309,7 +312,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"] @@ -325,7 +328,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -340,7 +343,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> import torch >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." @@ -362,7 +365,7 @@ def _prepare_output_docstrings(output_type, config_class): >>> from transformers import {tokenizer_class}, {model_class} >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') - >>> model = {model_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}', return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs, labels=inputs["input_ids"]) @@ -900,30 +903,91 @@ def wrapper(*args, **kwargs): return wrapper -class ModelOutput: +def is_tensor(x): + """ Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`. """ + if is_torch_available(): + import torch + + if isinstance(x, torch.Tensor): + return True + if is_tf_available(): + import tensorflow as tf + + if isinstance(x, tf.Tensor): + return True + return isinstance(x, np.ndarray) + + +class ModelOutput(OrderedDict): """ Base class for all model outputs as dataclass. 
Has a ``__getitem__`` that allows indexing by integer or slice (like - a tuple) or strings (like a dictionnary) that will ignore the ``None`` attributes. + a tuple) or strings (like a dictionnary) that will ignore the ``None`` attributes. Otherwise behaves like a + regular python dictionary. + + .. warning:: + You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple` + method to convert it to a tuple before. """ - def to_tuple(self): - """ - Converts :obj:`self` to a tuple. + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + assert len(class_fields), f"{self.__class__.__name__} has no fields." + assert all( + field.default is None for field in class_fields[1:] + ), f"{self.__class__.__name__} should not have more than one required field." + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not is_tensor(first_field): + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v - Return: A tuple containing all non-:obj:`None` attributes of the :obj:`self`. - """ - return tuple(getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None) + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") - def to_dict(self): - """ - Converts :obj:`self` to a Python dictionary. + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") - Return: A dictionary containing all non-:obj:`None` attributes of the :obj:`self`. - """ - return {f: getattr(self, f) for f in self.__dataclass_fields__.keys() if getattr(self, f, None) is not None} + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") - def __getitem__(self, i): - return self.to_dict()[i] if isinstance(i, str) else self.to_tuple()[i] + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] - def __len__(self): - return len(self.to_tuple()) + def to_tuple(self) -> Tuple[Any]: + """ + Convert self to a tuple containing all the attributes/keys that are not ``None``. 
+ """ + return tuple(self[k] for k in self.keys()) diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 2f52d1f498544..ef96228b5ba1f 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -346,7 +346,7 @@ def forward( head_mask=None, output_attentions=False, output_hidden_states=False, - return_tuple=False, + return_dict=False, ): hidden_states = self.embedding_hidden_mapping_in(hidden_states) @@ -375,7 +375,7 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions @@ -430,9 +430,9 @@ class AlbertForPretrainingOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - prediction_logits: torch.FloatTensor - sop_logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + sop_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -488,8 +488,9 @@ class AlbertForPretrainingOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -561,13 +562,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -599,14 +600,14 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( @@ -653,7 +654,7 @@ def forward( sentence_order_label=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): r""" @@ -678,7 +679,7 @@ def forward( >>> import torch >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') - >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2') + >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2', return_dict=True) >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> outputs = model(input_ids) @@ -695,7 +696,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.albert( input_ids, @@ -706,7 +707,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output, pooled_output = outputs[:2] @@ -721,7 +722,7 @@ def forward( sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) total_loss = masked_lm_loss + sentence_order_loss - if return_tuple: + if not return_dict: output = (prediction_scores, sop_scores) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output @@ -808,7 +809,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -827,7 +828,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.albert( input_ids=input_ids, @@ -838,7 +839,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_outputs = outputs[0] @@ -849,7 +850,7 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -895,7 +896,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -904,7 +905,7 @@ def forward( If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.albert( input_ids=input_ids, @@ -915,7 +916,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -933,7 +934,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -976,14 +977,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.albert( input_ids, @@ -994,7 +995,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1014,7 +1015,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1057,7 +1058,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1069,7 +1070,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.albert( input_ids=input_ids, @@ -1080,7 +1081,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1107,7 +1108,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output @@ -1153,7 +1154,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1161,7 +1162,7 @@ def forward( Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1182,7 +1183,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1196,7 +1197,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py index b3dc19fc1c10d..5f6ad671edbea 100644 --- a/src/transformers/modeling_auto.py +++ b/src/transformers/modeling_auto.py @@ -98,6 +98,7 @@ ) from .modeling_encoder_decoder import EncoderDecoderModel from .modeling_flaubert import ( + FlaubertForMultipleChoice, FlaubertForQuestionAnsweringSimple, FlaubertForSequenceClassification, FlaubertForTokenClassification, @@ -142,6 +143,7 @@ from .modeling_t5 import T5ForConditionalGeneration, T5Model from .modeling_transfo_xl import TransfoXLLMHeadModel, TransfoXLModel from .modeling_xlm import ( + XLMForMultipleChoice, XLMForQuestionAnsweringSimple, XLMForSequenceClassification, XLMForTokenClassification, @@ -338,6 +340,7 @@ (XLNetConfig, XLNetForTokenClassification), (AlbertConfig, AlbertForTokenClassification), (ElectraConfig, ElectraForTokenClassification), + (FlaubertConfig, FlaubertForTokenClassification), ] ) @@ -353,6 +356,8 @@ (MobileBertConfig, MobileBertForMultipleChoice), (XLNetConfig, XLNetForMultipleChoice), (AlbertConfig, AlbertForMultipleChoice), + (XLMConfig, XLMForMultipleChoice), + (FlaubertConfig, FlaubertForMultipleChoice), ] ) diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py index 1104567a48c54..92a5bd43b0bbc 100644 --- a/src/transformers/modeling_bart.py +++ b/src/transformers/modeling_bart.py @@ -124,8 +124,9 @@ If set to ``True``, the attentions tensors of all attention layers are returned. 
See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -304,7 +305,7 @@ def __init__(self, config: BartConfig, embed_tokens): self.layer_norm = LayerNorm(config.d_model) if config.normalize_before else None def forward( - self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_tuple=False + self, input_ids, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=False ): """ Args: @@ -359,7 +360,7 @@ def forward( # T x B x C -> B x T x C x = x.transpose(0, 1) - if return_tuple: + if not return_dict: return tuple(v for v in [x, encoder_states, all_attentions] if v is not None) return BaseModelOutput(last_hidden_state=x, hidden_states=encoder_states, attentions=all_attentions) @@ -495,7 +496,7 @@ def forward( use_cache=False, output_attentions=False, output_hidden_states=False, - return_tuple=False, + return_dict=False, **unused, ): """ @@ -588,7 +589,7 @@ def forward( else: next_cache = None - if return_tuple: + if not return_dict: return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None) return BaseModelOutputWithPast( last_hidden_state=x, past_key_values=next_cache, hidden_states=all_hidden_states, attentions=all_self_attns @@ -850,7 +851,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): @@ -862,7 +863,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # make masks if user doesn't supply if not use_cache: @@ -884,10 +885,10 @@ def forward( attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) - # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_tuple=False - elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput): + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOuput when return_dict=False + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, @@ -905,10 +906,10 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) - if return_tuple: + if not return_dict: return decoder_outputs + encoder_outputs return Seq2SeqModelOutput( @@ -976,7 +977,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + 
return_dict=None, **unused, ): r""" @@ -1018,7 +1019,7 @@ def forward( FutureWarning, ) decoder_past_key_values = unused.pop("decoder_cached_states") - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False @@ -1033,7 +1034,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias) @@ -1043,7 +1044,7 @@ def forward( # TODO(SS): do we need to ignore pad tokens in labels? masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (lm_logits,) + outputs[1:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -1146,7 +1147,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1154,7 +1155,7 @@ def forward( Indices should be in :obj:`[0, ..., config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if labels is not None: use_cache = False @@ -1167,7 +1168,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) x = outputs[0] # last hidden state eos_mask = input_ids.eq(self.config.eos_token_id) @@ -1180,7 +1181,7 @@ def forward( if labels is not None: loss = F.cross_entropy(logits.view(-1, self.config.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1232,7 +1233,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1244,7 +1245,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if start_positions is not None and end_positions is not None: use_cache = False @@ -1257,7 +1258,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1284,7 +1285,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits,) + outputs[1:] return ((total_loss,) + output) if total_loss is not None else output diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 850cae298469c..11dd8f8b36d03 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -429,7 +429,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_tuple=False, + return_dict=False, ): all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -469,7 +469,7 @@ def custom_forward(*inputs): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions @@ -609,9 +609,9 @@ class BertForPretrainingOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - prediction_logits: torch.FloatTensor - seq_relationship_logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -674,8 +674,9 @@ class BertForPretrainingOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -743,13 +744,13 @@ def forward( encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -800,12 +801,12 @@ def forward( encoder_attention_mask=encoder_extended_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( @@ -847,7 +848,7 @@ def forward( next_sentence_label=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -872,7 +873,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased', return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) @@ -887,7 +888,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -898,7 +899,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output, pooled_output = outputs[:2] @@ -911,7 +912,7 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if return_tuple: + if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output @@ -955,7 +956,7 @@ def forward( encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -977,14 +978,14 @@ def forward( >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') >>> config = BertConfig.from_pretrained("bert-base-cased") >>> config.is_decoder = True - >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config, return_dict=True) >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) >>> prediction_logits = outputs.logits """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -997,7 +998,7 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1011,7 +1012,7 @@ def forward( loss_fct = CrossEntropyLoss() lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((lm_loss,) + output) if lm_loss is not None else output @@ -1065,7 +1066,7 @@ def forward( encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -1086,7 +1087,7 @@ def forward( assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task." assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
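# Illustrative sketch, not part of the patch: code written against tuple outputs keeps
# working on a ModelOutput through indexing and slicing, but direct unpacking does not,
# which is what the warning on ModelOutput above is about -- use to_tuple() instead.
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased", return_dict=True)
inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")

outputs = model(**inputs, labels=inputs["input_ids"])
loss, logits = outputs[:2]         # slicing still works and skips None fields
loss, logits = outputs.to_tuple()  # explicit conversion for tuple unpacking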
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1099,7 +1100,7 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1110,7 +1111,7 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -1161,7 +1162,7 @@ def forward( next_sentence_label=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1178,7 +1179,7 @@ def forward( >>> import torch >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased', return_dict=True) >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." @@ -1188,7 +1189,7 @@ def forward( >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1199,7 +1200,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1211,7 +1212,7 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), next_sentence_label.view(-1)) - if return_tuple: + if not return_dict: output = (seq_relationship_scores,) + outputs[2:] return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output @@ -1257,7 +1258,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1266,7 +1267,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1277,7 +1278,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1295,7 +1296,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1337,7 +1338,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1345,7 +1346,7 @@ def forward( Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1367,7 +1368,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1381,7 +1382,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1424,14 +1425,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1442,7 +1443,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1464,7 +1465,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1507,7 +1508,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1519,7 +1520,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.bert( input_ids, @@ -1530,7 +1531,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1557,7 +1558,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output diff --git a/src/transformers/modeling_camembert.py b/src/transformers/modeling_camembert.py index def89a214d45a..2e9a24d4d20cd 100644 --- a/src/transformers/modeling_camembert.py +++ b/src/transformers/modeling_camembert.py @@ -51,12 +51,6 @@ model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. """ diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 7638bcf014fa3..653aaa501618a 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -295,8 +295,9 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -355,7 +356,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): if "past" in kwargs: @@ -371,7 +372,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -472,7 +473,7 @@ def forward( attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:] all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) return BaseModelOutputWithPast( @@ -526,7 +527,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): r""" @@ -544,7 +545,7 @@ def forward( ) past_key_values = kwargs.pop("past") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -557,7 +558,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = transformer_outputs[0] @@ -573,7 +574,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - if return_tuple: + if not return_dict: output = (lm_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 9c3f4e03198e5..ca19495e7b296 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -279,7 +279,7 @@ def __init__(self, config): self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) def forward( - self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_tuple=None + self, x, attn_mask=None, head_mask=None, output_attentions=False, output_hidden_states=False, return_dict=None ): """ Parameters @@ -324,7 +324,7 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_state,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_state, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_state, hidden_states=all_hidden_states, attentions=all_attentions @@ -396,8 +396,9 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. 
- return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -444,13 +445,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -477,7 +478,7 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) @@ -516,7 +517,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -535,7 +536,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict dlbrt_output = self.distilbert( input_ids=input_ids, @@ -544,7 +545,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = dlbrt_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) @@ -556,7 +557,7 @@ def forward( if labels is not None: mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_logits,) + dlbrt_output[1:] return ((mlm_loss,) + output) if mlm_loss is not None else output @@ -601,7 +602,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -610,7 +611,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
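The same three-step boilerplate recurs in every task head this diff touches: resolve ``return_dict`` against ``config.use_return_dict``, thread it through the base model, and keep the old tuple as the ``not return_dict`` branch. A minimal sketch of that pattern for a hypothetical custom head built on ``BertModel`` (everything below is illustrative, not code from the patch):

    import torch
    from torch import nn
    from torch.nn import CrossEntropyLoss
    from transformers import BertConfig, BertModel
    from transformers.modeling_outputs import SequenceClassifierOutput

    class ToyClassificationHead(nn.Module):
        """Hypothetical head that follows the convention introduced by this patch."""

        def __init__(self, config, num_labels=2):
            super().__init__()
            self.config = config
            self.num_labels = num_labels
            self.bert = BertModel(config)
            self.classifier = nn.Linear(config.hidden_size, num_labels)

        def forward(self, input_ids, attention_mask=None, labels=None, return_dict=None):
            # Step 1: fall back to the config default when the caller does not decide.
            return_dict = return_dict if return_dict is not None else self.config.use_return_dict

            # Step 2: thread the flag through the base model.
            outputs = self.bert(input_ids, attention_mask=attention_mask, return_dict=return_dict)
            pooled_output = outputs[1]  # works for both the tuple and the ModelOutput
            logits = self.classifier(pooled_output)

            loss = None
            if labels is not None:
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

            # Step 3: keep the legacy tuple as the non-dict branch, loss first when present.
            if not return_dict:
                output = (logits,) + outputs[2:]
                return ((loss,) + output) if loss is not None else output

            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

    config = BertConfig()  # random weights are enough to exercise the control flow
    head = ToyClassificationHead(config)
    ids = torch.randint(0, config.vocab_size, (1, 8))
    print(type(head(ids, return_dict=True)))   # SequenceClassifierOutput
    print(type(head(ids, return_dict=False)))  # tuple

The ``outputs[1]`` indexing is what keeps the two branches symmetric: the ``ModelOutput`` dataclasses stay indexable like tuples, which keeps positional access such as ``outputs[2:]`` working on either return type.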
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, @@ -619,7 +620,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_state = distilbert_output[0] # (bs, seq_len, dim) pooled_output = hidden_state[:, 0] # (bs, dim) @@ -637,7 +638,7 @@ def forward( loss_fct = nn.CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + distilbert_output[1:] return ((loss,) + output) if loss is not None else output @@ -682,7 +683,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -694,7 +695,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict distilbert_output = self.distilbert( input_ids=input_ids, @@ -703,7 +704,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = distilbert_output[0] # (bs, max_query_len, dim) @@ -730,7 +731,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + distilbert_output[1:] return ((total_loss,) + output) if total_loss is not None else output @@ -775,14 +776,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.distilbert( input_ids, @@ -791,7 +792,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -813,7 +814,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output @@ -849,7 +850,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -865,7 +866,7 @@ def forward( >>> import torch >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') - >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased') + >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased', return_dict=True) >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> choice0 = "It is eaten with a fork and a knife." @@ -879,7 +880,7 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -897,7 +898,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_state = outputs[0] # (bs * num_choices, seq_len, dim) @@ -914,7 +915,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_dpr.py b/src/transformers/modeling_dpr.py index 7cffaabdc09df..fde9952461eab 100644 --- a/src/transformers/modeling_dpr.py +++ b/src/transformers/modeling_dpr.py @@ -134,8 +134,8 @@ class DPRReaderOutput(ModelOutput): """ start_logits: torch.FloatTensor - end_logits: torch.FloatTensor - relevance_logits: torch.FloatTensor + end_logits: torch.FloatTensor = None + relevance_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -161,7 +161,7 @@ def forward( inputs_embeds: Optional[Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, - return_tuple: bool = False, + return_dict: bool = False, ) -> Union[BaseModelOutputWithPooling, Tuple[Tensor, ...]]: outputs = self.bert_model( input_ids=input_ids, @@ -170,14 +170,14 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output, pooled_output = outputs[:2] pooled_output = sequence_output[:, 0, :] if self.projection_dim > 0: pooled_output = 
self.encode_proj(pooled_output) - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + outputs[2:] return BaseModelOutputWithPooling( @@ -217,7 +217,7 @@ def forward( inputs_embeds: Optional[Tensor] = None, output_attentions: bool = False, output_hidden_states: bool = False, - return_tuple: bool = False, + return_dict: bool = False, ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]: # notations: N - number of questions in a batch, M - number of passages per questions, L - sequence length n_passages, sequence_length = input_ids.size() if input_ids is not None else inputs_embeds.size()[:2] @@ -228,7 +228,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -244,7 +244,7 @@ def forward( end_logits = end_logits.view(n_passages, sequence_length) relevance_logits = relevance_logits.view(n_passages) - if return_tuple: + if not return_dict: return (start_logits, end_logits, relevance_logits) + outputs[2:] return DPRReaderOutput( @@ -361,6 +361,9 @@ def init_weights(self): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ DPR_READER_INPUTS_DOCSTRING = r""" @@ -388,6 +391,9 @@ def init_weights(self): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states tensors of all layers are returned. See ``hidden_states`` under returned tensors for more detail. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -412,7 +418,7 @@ def forward( inputs_embeds: Optional[Tensor] = None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ) -> Union[DPRContextEncoderOutput, Tuple[Tensor, ...]]: r""" Return: @@ -421,7 +427,7 @@ def forward( from transformers import DPRContextEncoder, DPRContextEncoderTokenizer tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') - model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') + model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', return_dict=True) input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] embeddings = model(input_ids).pooler_output """ @@ -430,7 +436,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -459,10 +465,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) - if return_tuple: + if not return_dict: return outputs[1:] return DPRContextEncoderOutput( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -490,7 +496,7 @@ def forward( inputs_embeds: Optional[Tensor] = None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ) -> Union[DPRQuestionEncoderOutput, Tuple[Tensor, ...]]: r""" Return: @@ -499,7 +505,7 @@ def forward( from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base') - model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base') + model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', return_dict=True) input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] embeddings = model(input_ids).pooler_output """ @@ -507,7 +513,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -536,10 +542,10 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) - if return_tuple: + if not return_dict: return outputs[1:] return DPRQuestionEncoderOutput( pooler_output=outputs.pooler_output, hidden_states=outputs.hidden_states, attentions=outputs.attentions @@ -565,7 +571,7 @@ def forward( inputs_embeds: Optional[Tensor] = None, output_attentions: bool = None, output_hidden_states: bool = None, - return_tuple=None, + return_dict=None, ) -> Union[DPRReaderOutput, Tuple[Tensor, ...]]: r""" Return: @@ -574,7 
+580,7 @@ def forward( from transformers import DPRReader, DPRReaderTokenizer tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base') - model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base') + model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', return_dict=True) encoded_inputs = tokenizer( questions=["What is love ?"], titles=["Haddaway"], @@ -591,7 +597,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -613,5 +619,5 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index 8f24343ccaa79..1f2cb118c0f08 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -208,8 +208,8 @@ class ElectraForPretrainingOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -272,8 +272,9 @@ class ElectraForPretrainingOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
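Continuing the ``DPRReader`` docstring example embedded in the hunk above, a hedged sketch of what reading the reader's three heads by name looks like once ``return_dict=True`` is in effect (the checkpoint and inputs are taken from that example, the rest is illustrative):

    from transformers import DPRReader, DPRReaderTokenizer

    tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
    model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", return_dict=True)

    encoded_inputs = tokenizer(
        questions=["What is love ?"],
        titles=["Haddaway"],
        texts=["'What Is Love' is a song recorded by the artist Haddaway"],
        return_tensors="pt",
    )
    outputs = model(**encoded_inputs)

    # DPRReaderOutput exposes the three heads by name instead of by tuple position.
    start_logits = outputs.start_logits          # (n_passages, sequence_length)
    end_logits = outputs.end_logits              # (n_passages, sequence_length)
    relevance_logits = outputs.relevance_logits  # (n_passages,)

With ``return_dict=False`` the same call still returns ``(start_logits, end_logits, relevance_logits)`` plus any optional tensors, exactly as before the patch.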
""" @@ -331,13 +332,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -371,7 +372,7 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) return hidden_states @@ -428,7 +429,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -437,7 +438,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict discriminator_hidden_states = self.electra( input_ids, @@ -448,7 +449,7 @@ def forward( inputs_embeds, output_attentions, output_hidden_states, - return_tuple, + return_dict, ) sequence_output = discriminator_hidden_states[0] @@ -464,7 +465,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + discriminator_hidden_states[1:] return ((loss,) + output) if loss is not None else output @@ -505,7 +506,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): @@ -527,7 +528,7 @@ def forward( >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> logits = model(input_ids).logits """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict discriminator_hidden_states = self.electra( input_ids, @@ -538,7 +539,7 @@ def forward( inputs_embeds, output_attentions, output_hidden_states, - return_tuple, + return_dict, ) discriminator_sequence_output = discriminator_hidden_states[0] @@ -555,7 +556,7 @@ def forward( else: loss = loss_fct(logits.view(-1, discriminator_sequence_output.shape[1]), labels.float()) - if return_tuple: + if not return_dict: output = (logits,) + discriminator_hidden_states[1:] return ((loss,) + output) if loss is not None else output @@ -606,7 +607,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -625,7 +626,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict generator_hidden_states = self.electra( input_ids, @@ -636,7 +637,7 @@ def forward( inputs_embeds, output_attentions, output_hidden_states, - return_tuple, + return_dict, ) generator_sequence_output = generator_hidden_states[0] @@ -649,7 +650,7 @@ def forward( loss_fct = nn.CrossEntropyLoss() # -100 index = padding token loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + generator_hidden_states[1:] return ((loss,) + output) if loss is not None else output @@ -695,14 +696,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict discriminator_hidden_states = self.electra( input_ids, @@ -713,7 +714,7 @@ def forward( inputs_embeds, output_attentions, output_hidden_states, - return_tuple, + return_dict, ) discriminator_sequence_output = discriminator_hidden_states[0] @@ -732,7 +733,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + discriminator_hidden_states[1:] return ((loss,) + output) if loss is not None else output @@ -782,7 +783,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -794,7 +795,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict discriminator_hidden_states = self.electra( input_ids, @@ -831,7 +832,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits,) + discriminator_hidden_states[1:] return ((total_loss,) + output) if total_loss is not None else output @@ -876,7 +877,7 @@ def forward( inputs_embeds=None, labels=None, output_attentions=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -884,7 +885,7 @@ def forward( Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. 
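To make the ELECTRA discriminator hunks above concrete, here is a hedged sketch of the pre-training head after the rename; the checkpoint name is an assumption (any discriminator checkpoint should behave the same) and the thresholding line is only one way to read the per-token logits.

    import torch
    from transformers import ElectraTokenizer, ElectraForPreTraining

    tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
    model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator", return_dict=True)

    inputs = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors="pt")
    outputs = model(**inputs)

    # One logit per token; a positive value means the discriminator suspects the
    # token was replaced by the generator.
    replaced = (outputs.logits > 0).long()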
(see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -905,7 +906,7 @@ def forward( head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = discriminator_hidden_states[0] @@ -919,7 +920,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + discriminator_hidden_states[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py index 52d7058d1b2be..3eb92ad8f905d 100644 --- a/src/transformers/modeling_encoder_decoder.py +++ b/src/transformers/modeling_encoder_decoder.py @@ -273,7 +273,6 @@ def forward( attention_mask=attention_mask, inputs_embeds=inputs_embeds, head_mask=head_mask, - return_tuple=True, **kwargs_encoder, ) @@ -288,7 +287,6 @@ def forward( encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, labels=labels, - return_tuple=True, **kwargs_decoder, ) diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py index aeda892f7ff63..61ef9d8fc7fe8 100644 --- a/src/transformers/modeling_flaubert.py +++ b/src/transformers/modeling_flaubert.py @@ -25,6 +25,7 @@ from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_outputs import BaseModelOutput from .modeling_xlm import ( + XLMForMultipleChoice, XLMForQuestionAnswering, XLMForQuestionAnsweringSimple, XLMForSequenceClassification, @@ -109,8 +110,9 @@ If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -147,13 +149,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # removed: src_enc=None, src_len=None if input_ids is not None: @@ -283,7 +285,7 @@ def forward( # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) - if return_tuple: + if not return_dict: return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) @@ -382,3 +384,22 @@ def __init__(self, config): super().__init__(config) self.transformer = FlaubertModel(config) self.init_weights() + + +@add_start_docstrings( + """Flaubert Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + FLAUBERT_START_DOCSTRING, +) +class FlaubertForMultipleChoice(XLMForMultipleChoice): + """ + This class overrides :class:`~transformers.XLMForMultipleChoice`. Please check the + superclass for the appropriate documentation alongside usage examples. + """ + + config_class = FlaubertConfig + + def __init__(self, config): + super().__init__(config) + self.transformer = FlaubertModel(config) + self.init_weights() diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 0514586a5f248..a2168726ccd21 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -323,10 +323,10 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): heads. """ - lm_loss: Optional[torch.FloatTensor] - mc_loss: Optional[torch.FloatTensor] - lm_logits: torch.FloatTensor - mc_logits: torch.FloatTensor + lm_loss: Optional[torch.FloatTensor] = None + mc_loss: Optional[torch.FloatTensor] = None + lm_logits: torch.FloatTensor = None + mc_logits: torch.FloatTensor = None past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -395,8 +395,9 @@ class GPT2DoubleHeadsModelOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -448,7 +449,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): if "past" in kwargs: @@ -464,7 +465,7 @@ def forward( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) use_cache = use_cache if use_cache is not None else self.config.use_cache - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -560,7 +561,7 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None) return BaseModelOutputWithPast( @@ -616,7 +617,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): r""" @@ -634,7 +635,7 @@ def forward( ) past_key_values = kwargs.pop("past") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -647,7 +648,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = transformer_outputs[0] @@ -662,7 +663,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - if return_tuple: + if not return_dict: output = (lm_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -713,7 +714,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): r""" @@ -741,7 +742,7 @@ def forward( >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') - >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2') + >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2, return_dict=True) >>> # Add a [CLS] to the vocabulary (we should train it also!) >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'}) @@ -773,7 +774,7 @@ def forward( ) past_key_values = kwargs.pop("past") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -786,7 +787,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = transformer_outputs[0] @@ -805,7 +806,7 @@ def forward( loss_fct = CrossEntropyLoss() lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - if return_tuple: + if not return_dict: output = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_loss is not None: output = (mc_loss,) + output diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index c440af07a059e..da422b95f8425 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -694,7 +694,7 @@ def forward( attention_mask=None, output_attentions=False, output_hidden_states=False, - return_tuple=False, + return_dict=False, ): all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -724,7 +724,7 @@ def custom_forward(*inputs): if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions @@ -811,8 +811,9 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -942,7 +943,7 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" @@ -953,7 +954,7 @@ def forward( >>> import torch >>> from transformers import LongformerModel, LongformerTokenizer - >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096') + >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096', return_dict=True) >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document @@ -965,14 +966,16 @@ def forward( ... # classification: the token ... # QA: question tokens ... 
# LM: potentially on the beginning of sentences and paragraphs - >>> sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask) + >>> outputs = model(input_ids, attention_mask=attention_mask) + >>> sequence_output = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -1016,7 +1019,7 @@ def forward( attention_mask=extended_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) @@ -1026,7 +1029,7 @@ def forward( # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) sequence_output = sequence_output[:, :-padding_len] - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( @@ -1063,7 +1066,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -1082,7 +1085,7 @@ def forward( >>> import torch >>> from transformers import LongformerForMaskedLM, LongformerTokenizer - >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096') + >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096', return_dict=True) >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096') >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000) # long input document @@ -1102,7 +1105,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.longformer( input_ids, @@ -1113,7 +1116,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) @@ -1123,7 +1126,7 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -1171,7 +1174,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1180,7 +1183,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if global_attention_mask is None: logger.info("Initializing global attention on CLS token...") @@ -1197,7 +1200,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) @@ -1212,7 +1215,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1272,7 +1275,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1291,7 +1294,7 @@ def forward( >>> import torch >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") - >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa") + >>> model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True) >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" >>> encoding = tokenizer(question, text, return_tensors="pt") @@ -1310,7 +1313,7 @@ def forward( >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # set global attention on question tokens if global_attention_mask is None: @@ -1327,7 +1330,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1354,7 +1357,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output @@ -1404,14 +1407,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.longformer( input_ids, @@ -1422,7 +1425,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1444,7 +1447,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1489,7 +1492,7 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1498,7 +1501,7 @@ def forward( of the input tensors. (see `input_ids` above) """ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # set global attention on question tokens if global_attention_mask is None: @@ -1536,7 +1539,7 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1549,7 +1552,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_mmbt.py b/src/transformers/modeling_mmbt.py index ec3138e2b40a6..18105269d00a5 100644 --- a/src/transformers/modeling_mmbt.py +++ b/src/transformers/modeling_mmbt.py @@ -23,7 +23,7 @@ from torch.nn import CrossEntropyLoss, MSELoss from .file_utils import add_start_docstrings, add_start_docstrings_to_callable, replace_return_docstrings -from .modeling_outputs import BaseModelOutputWithPooling +from .modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput from .modeling_utils import ModuleUtilsMixin @@ -148,8 +148,9 @@ def forward(self, input_modal, start_token=None, end_token=None, position_ids=No If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -182,7 +183,7 @@ def forward( encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" Returns: @@ -198,7 +199,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -257,13 +258,13 @@ def forward( encoder_attention_mask=encoder_extended_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = encoder_outputs[0] pooled_output = self.transformer.pooler(sequence_output) - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( @@ -339,7 +340,9 @@ def forward( head_mask=None, inputs_embeds=None, labels=None, + return_dict=None, ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mmbt( input_modal=input_modal, @@ -353,6 +356,7 @@ def forward( modal_position_ids=modal_position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -360,8 +364,7 @@ def forward( pooled_output = self.dropout(pooled_output) logits = self.classifier(pooled_output) - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - + loss = None if labels is not None: if self.num_labels == 1: # We are doing regression @@ -370,6 +373,11 @@ def forward( else: loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - outputs = (loss,) + outputs - return outputs # (loss), logits, (hidden_states), (attentions) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, + ) diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index 13c9ade0270c9..d3a4cd8e32baf 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -550,7 +550,7 @@ def forward( encoder_attention_mask=None, output_attentions=False, output_hidden_states=False, - return_tuple=False, + return_dict=False, ): all_hidden_states = () if output_hidden_states else None all_attentions = () if output_attentions else None @@ -575,7 +575,7 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions @@ -708,9 +708,9 @@ class MobileBertForPretrainingOutput(ModelOutput): heads. 
""" - loss: Optional[torch.FloatTensor] - prediction_logits: torch.FloatTensor - seq_relationship_logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + prediction_logits: torch.FloatTensor = None + seq_relationship_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -773,8 +773,9 @@ class MobileBertForPretrainingOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -831,13 +832,13 @@ def forward( encoder_attention_mask=None, output_hidden_states=None, output_attentions=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -890,12 +891,12 @@ def forward( encoder_attention_mask=encoder_extended_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( @@ -958,7 +959,7 @@ def forward( next_sentence_label=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): @@ -979,7 +980,7 @@ def forward( >>> import torch >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased") - >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased") + >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased", return_dict=True) >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> outputs = model(input_ids) @@ -988,7 +989,7 @@ def forward( >>> seq_relationship_logits = outputs.seq_relationship_logits """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mobilebert( input_ids, @@ -999,7 +1000,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + 
return_dict=return_dict, ) sequence_output, pooled_output = outputs[:2] prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) @@ -1011,7 +1012,7 @@ def forward( next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss - if return_tuple: + if not return_dict: output = (prediction_scores, seq_relationship_score) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output @@ -1079,7 +1080,7 @@ def forward( encoder_attention_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -1097,7 +1098,7 @@ def forward( FutureWarning, ) labels = kwargs.pop("masked_lm_labels") - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mobilebert( input_ids, @@ -1110,7 +1111,7 @@ def forward( encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1121,7 +1122,7 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -1169,7 +1170,7 @@ def forward( next_sentence_label=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1186,7 +1187,7 @@ def forward( >>> import torch >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased') - >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased') + >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased', return_dict=True) >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
@@ -1196,7 +1197,7 @@ def forward( >>> loss = outputs.loss >>> logits = outputs.logits """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mobilebert( input_ids, @@ -1207,7 +1208,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1218,7 +1219,7 @@ def forward( loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) - if return_tuple: + if not return_dict: output = (seq_relationship_score,) + outputs[2:] return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output @@ -1263,7 +1264,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1272,7 +1273,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mobilebert( input_ids, @@ -1283,7 +1284,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] pooled_output = self.dropout(pooled_output) @@ -1299,7 +1300,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1342,7 +1343,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1354,7 +1355,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mobilebert( input_ids, @@ -1365,7 +1366,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1392,7 +1393,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output @@ -1438,7 +1439,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1446,7 +1447,7 @@ def forward( Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -1468,7 +1469,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -1482,7 +1483,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -1525,14 +1526,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.mobilebert( input_ids, @@ -1543,7 +1544,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1565,7 +1566,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 3efa7d353f6e9..04cf8fb8a4f4e 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -315,10 +315,10 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput): heads. 
""" - lm_loss: Optional[torch.FloatTensor] - mc_loss: Optional[torch.FloatTensor] - lm_logits: torch.FloatTensor - mc_logits: torch.FloatTensor + lm_loss: Optional[torch.FloatTensor] = None + mc_loss: Optional[torch.FloatTensor] = None + lm_logits: torch.FloatTensor = None + mc_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -374,8 +374,9 @@ class OpenAIGPTDoubleHeadsModelOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -425,13 +426,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -496,7 +497,7 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) return BaseModelOutput( @@ -538,7 +539,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -548,7 +549,7 @@ def forward( All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -559,7 +560,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = transformer_outputs[0] lm_logits = self.lm_head(hidden_states) @@ -573,7 +574,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - if return_tuple: + if not return_dict: output = (lm_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -622,7 +623,7 @@ def forward( mc_labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ 
-650,7 +651,7 @@ def forward( import torch tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') - model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt') + model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', return_dict=True) tokenizer.add_special_tokens({'cls_token': '[CLS]'}) # Add a [CLS] to the vocabulary (we should train it also!) model.resize_token_embeddings(len(tokenizer)) @@ -662,7 +663,7 @@ def forward( lm_logits = outputs.lm_logits mc_logits = outputs.mc_logits """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if "lm_labels" in kwargs: warnings.warn( "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", @@ -680,7 +681,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = transformer_outputs[0] @@ -698,7 +699,7 @@ def forward( loss_fct = CrossEntropyLoss() mc_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - if return_tuple: + if not return_dict: output = (lm_logits, mc_logits) + transformer_outputs[1:] if mc_loss is not None: output = (mc_loss,) + output diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py index f9cf15c40b9e6..3a91d17904d5a 100644 --- a/src/transformers/modeling_outputs.py +++ b/src/transformers/modeling_outputs.py @@ -63,7 +63,7 @@ class BaseModelOutputWithPooling(ModelOutput): """ last_hidden_state: torch.FloatTensor - pooler_output: torch.FloatTensor + pooler_output: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -179,7 +179,7 @@ class CausalLMOutput(ModelOutput): """ loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -213,8 +213,8 @@ class CausalLMOutputWithPast(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None past_key_values: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -243,8 +243,8 @@ class MaskedLMOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -291,8 +291,8 @@ class Seq2SeqLMOutput(ModelOutput): self-attention heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None decoder_past_key_values: Optional[List[torch.FloatTensor]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -324,8 +324,8 @@ class NextSentencePredictorOutput(ModelOutput): heads. 
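The language-modeling heads above compute their loss on logits shifted one step against the labels, so position ``t`` predicts token ``t + 1``. A minimal, standalone sketch of that shift:

```python
import torch
from torch.nn import CrossEntropyLoss

vocab_size = 11
lm_logits = torch.randn(2, 6, vocab_size)      # (batch, seq_len, vocab)
labels = torch.randint(0, vocab_size, (2, 6))  # typically the input_ids themselves

# Drop the last logit and the first label so each position scores the next token.
shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()

loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
print(loss)
```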
""" - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -353,8 +353,8 @@ class SequenceClassifierOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -401,8 +401,8 @@ class Seq2SeqSequenceClassifierOutput(ModelOutput): self-attention heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None decoder_past_key_values: Optional[List[torch.FloatTensor]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -436,8 +436,8 @@ class MultipleChoiceModelOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -465,8 +465,8 @@ class TokenClassifierOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -496,9 +496,9 @@ class QuestionAnsweringModelOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - start_logits: torch.FloatTensor - end_logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + start_logits: torch.FloatTensor = None + end_logits: torch.FloatTensor = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -547,9 +547,9 @@ class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): self-attention heads. """ - loss: Optional[torch.FloatTensor] - start_logits: torch.FloatTensor - end_logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + start_logits: torch.FloatTensor = None + end_logits: torch.FloatTensor = None decoder_past_key_values: Optional[List[torch.FloatTensor]] = None decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py index 6beed9df7afb6..8109d6b98f919 100644 --- a/src/transformers/modeling_reformer.py +++ b/src/transformers/modeling_reformer.py @@ -39,13 +39,7 @@ add_start_docstrings, add_start_docstrings_to_callable, ) -from .modeling_outputs import ( - BaseModelOutput, - CausalLMOutput, - MaskedLMOutput, - QuestionAnsweringModelOutput, - SequenceClassifierOutput, -) +from .modeling_outputs import CausalLMOutput, MaskedLMOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput from .modeling_utils import PreTrainedModel, apply_chunking_to_forward @@ -1851,8 +1845,8 @@ class ReformerModelWithLMHeadOutput(ModelOutput): heads. 
""" - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None past_buckets_states: Optional[List[Tuple[torch.LongTensor, torch.FloatTensor]]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -1922,8 +1916,9 @@ class ReformerModelWithLMHeadOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -1962,7 +1957,7 @@ def _prune_heads(self, heads_to_prune): @add_code_sample_docstrings( tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/reformer-crime-and-punishment", - output_type=BaseModelOutput, + output_type=ReformerModelOutput, config_class=_CONFIG_FOR_DOC, ) def forward( @@ -1977,40 +1972,14 @@ def forward( use_cache=None, output_hidden_states=None, output_attentions=None, - return_tuple=None, + return_dict=None, ): - r""" - Return: - :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - past_buckets_states (:obj:`List[Tuple(torch.LongTensor, torch.FloatTensor)]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): - List of :obj:`tuple(torch.LongTensor, torch.FloatTensor` of length :obj:`config.n_layers`, with :obj:`tuple(0)` being the previous `buckets` of shape - :obj:`(batch_size, num_heads, num_hashes, sequence_length)`) - and :obj:`tuple(1)` being the previous `hidden_states` of shape - :obj:`(batch_size, sequence_length, hidden_size)`). - - Contains pre-computed buckets and hidden-states that can be used (see - ``past_buckets_states`` input) to speed up sequential decoding. - all_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. 
- """ - use_cache = use_cache if use_cache is not None else self.config.use_cache output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -2102,7 +2071,7 @@ def forward( hidden_states = encoder_outputs.all_hidden_states if output_hidden_states else None attentions = encoder_outputs.all_attentions if output_attentions else None - if return_tuple: + if not return_dict: return tuple(v for v in [sequence_output, past_buckets_states, hidden_states, attentions] if v is not None) return ReformerModelOutput( last_hidden_state=sequence_output, @@ -2208,7 +2177,7 @@ def forward( use_cache=None, output_hidden_states=None, output_attentions=None, - return_tuple=None, + return_dict=None, labels=None, ): r""" @@ -2218,7 +2187,7 @@ def forward( All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict reformer_outputs = self.reformer( input_ids, @@ -2231,7 +2200,7 @@ def forward( use_cache=use_cache, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = reformer_outputs[0] @@ -2246,7 +2215,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + reformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -2326,7 +2295,7 @@ def forward( labels=None, output_hidden_states=None, output_attentions=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -2334,7 +2303,7 @@ def forward( Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict reformer_outputs = self.reformer( input_ids, @@ -2346,7 +2315,7 @@ def forward( use_cache=False, # no causal mask output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = reformer_outputs[0] @@ -2357,7 +2326,7 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + reformer_outputs[1:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -2408,7 +2377,7 @@ def forward( labels=None, output_hidden_states=None, output_attentions=None, - return_tuple=None, + 
return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -2427,7 +2396,7 @@ def forward( num_hashes=num_hashes, output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -2443,7 +2412,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -2511,7 +2480,7 @@ def forward( end_positions=None, output_hidden_states=None, output_attentions=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -2523,7 +2492,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict reformer_outputs = self.reformer( input_ids, @@ -2535,7 +2504,7 @@ def forward( use_cache=False, # no causal mask output_hidden_states=output_hidden_states, output_attentions=output_attentions, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = reformer_outputs[0] @@ -2562,7 +2531,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + reformer_outputs[1:] return ((total_loss,) + output) if total_loss is not None else output diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 00a0ecc397c7e..7779e81eceef8 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -143,8 +143,9 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -208,7 +209,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs ): r""" @@ -227,7 +228,7 @@ def forward( ) labels = kwargs.pop("masked_lm_labels") assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
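All of the heads above now end with the same two-branch tail: build the legacy tuple when ``return_dict`` is off, otherwise return the typed output. A condensed, hedged sketch of that tail (hidden states and attentions omitted from the dataclass branch for brevity):

```python
from typing import Optional, Tuple, Union

import torch

from transformers.modeling_outputs import MaskedLMOutput


def assemble_output(
    loss: Optional[torch.Tensor],
    prediction_scores: torch.Tensor,
    extra: Tuple[torch.Tensor, ...],
    return_dict: bool,
) -> Union[Tuple, MaskedLMOutput]:
    # Tuple path: prepend the loss only when it was actually computed.
    if not return_dict:
        output = (prediction_scores,) + extra
        return ((loss,) + output) if loss is not None else output
    # Dataclass path: unfilled fields fall back to their new None defaults.
    return MaskedLMOutput(loss=loss, logits=prediction_scores)


scores = torch.randn(1, 5, 100)
print(assemble_output(None, scores, tuple(), return_dict=False)[0].shape)
print(assemble_output(None, scores, tuple(), return_dict=True).logits.shape)
```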
- return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, @@ -238,7 +239,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) @@ -248,7 +249,7 @@ def forward( loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -321,7 +322,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -330,7 +331,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, @@ -341,7 +342,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] logits = self.classifier(sequence_output) @@ -356,7 +357,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -401,7 +402,7 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -409,7 +410,7 @@ def forward( Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. 
(see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -431,7 +432,7 @@ def forward( inputs_embeds=flat_inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -444,7 +445,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -490,14 +491,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, @@ -508,7 +509,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -530,7 +531,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -595,7 +596,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -607,7 +608,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
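``RobertaForMultipleChoice`` above (and the XLM variants later in this diff) flatten ``(batch, num_choices, seq_len)`` inputs into the batch dimension before the encoder and fold the per-choice scores back afterwards. A shape-only sketch, with random tensors standing in for the encoder and pooler:

```python
import torch
from torch import nn
from torch.nn import CrossEntropyLoss

batch_size, num_choices, seq_len, hidden = 4, 3, 16, 32
input_ids = torch.randint(0, 1000, (batch_size, num_choices, seq_len))
labels = torch.randint(0, num_choices, (batch_size,))

# 1. Flatten choices into the batch dimension so the encoder sees ordinary 2D input.
flat_input_ids = input_ids.view(-1, input_ids.size(-1))   # (batch * num_choices, seq_len)

# 2. Stand-in for encoder + pooling, then one score per flattened sequence.
pooled = torch.randn(flat_input_ids.size(0), hidden)      # pretend pooled output
classifier = nn.Linear(hidden, 1)
logits = classifier(pooled)                               # (batch * num_choices, 1)

# 3. Fold the scores back to one row per example and score the choices jointly.
reshaped_logits = logits.view(-1, num_choices)            # (batch, num_choices)
loss = CrossEntropyLoss()(reshaped_logits, labels)
print(reshaped_logits.shape, loss.item())
```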
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.roberta( input_ids, @@ -618,7 +619,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -645,7 +646,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index 925ab53e36954..d7665ba2017d3 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -675,7 +675,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -683,7 +683,7 @@ def forward( output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -787,7 +787,7 @@ def forward( if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) - if return_tuple: + if not return_dict: return tuple( v for v in [hidden_states, present_key_value_states, all_hidden_states, all_attentions] @@ -868,8 +868,9 @@ def forward( If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -930,7 +931,7 @@ def forward( head_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): r""" @@ -957,7 +958,7 @@ def forward( assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." 
use_cache = use_cache if use_cache is not None else self.config.use_cache - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -968,9 +969,9 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) - elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput): + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, @@ -1005,11 +1006,11 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None - if return_tuple: + if not return_dict: if past is not None: decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:] return decoder_outputs + encoder_outputs @@ -1081,7 +1082,7 @@ def forward( head_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, **kwargs, ): r""" @@ -1100,13 +1101,14 @@ def forward( >>> from transformers import T5Tokenizer, T5ForConditionalGeneration >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') - >>> model = T5ForConditionalGeneration.from_pretrained('t5-small') + >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True) >>> input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 >>> outputs = model(input_ids=input_ids, labels=input_ids) - >>> loss, prediction_scores = outputs[:2] + >>> loss = outputs.loss + >>> logits = outputs.logits >>> tokenizer = T5Tokenizer.from_pretrained('t5-small') - >>> model = T5ForConditionalGeneration.from_pretrained('t5-small') + >>> model = T5ForConditionalGeneration.from_pretrained('t5-small', return_dict=True) >>> input_ids = tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt") # Batch size 1 >>> outputs = model.generate(input_ids) """ @@ -1126,7 +1128,7 @@ def forward( assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
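The updated T5 doctest above reads ``outputs.loss`` and ``outputs.logits`` by name instead of unpacking a tuple. The same calls as a plain script, assuming the ``t5-small`` checkpoint is reachable (generation itself is unaffected by the ``return_dict`` switch):

```python
from transformers import T5ForConditionalGeneration, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small", return_dict=True)

# Training-style call: passing labels triggers the LM loss, read results by name.
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
outputs = model(input_ids=input_ids, labels=input_ids)
print(outputs.loss, outputs.logits.shape)

# Generation is unchanged by the return_dict switch.
summary_ids = model.generate(tokenizer.encode("summarize: Hello, my dog is cute", return_tensors="pt"))
print(tokenizer.decode(summary_ids[0]))
```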
use_cache = use_cache if use_cache is not None else self.config.use_cache - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # Encode if needed (training, first prediction pass) if encoder_outputs is None: @@ -1138,9 +1140,9 @@ def forward( head_mask=head_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) - elif not return_tuple and not isinstance(encoder_outputs, BaseModelOutput): + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): encoder_outputs = BaseModelOutput( last_hidden_state=encoder_outputs[0], hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, @@ -1174,7 +1176,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = decoder_outputs[0] @@ -1190,7 +1192,7 @@ def forward( # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 past = (encoder_outputs, decoder_outputs[1]) if use_cache is True else None - if return_tuple: + if not return_dict: if past is not None: decoder_outputs = decoder_outputs[:1] + (past,) + decoder_outputs[2:] output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs diff --git a/src/transformers/modeling_tf_flaubert.py b/src/transformers/modeling_tf_flaubert.py index d10324de088e8..cf721be25ccd9 100644 --- a/src/transformers/modeling_tf_flaubert.py +++ b/src/transformers/modeling_tf_flaubert.py @@ -22,7 +22,7 @@ from .configuration_flaubert import FlaubertConfig from .file_utils import add_start_docstrings -from .modeling_tf_utils import keras_serializable, shape_list +from .modeling_tf_utils import cast_bool_to_primitive, keras_serializable, shape_list from .modeling_tf_xlm import ( TFXLMForMultipleChoice, TFXLMForQuestionAnsweringSimple, @@ -30,6 +30,7 @@ TFXLMForTokenClassification, TFXLMMainLayer, TFXLMModel, + TFXLMPredLayer, TFXLMWithLMHeadModel, get_masks, ) @@ -123,6 +124,8 @@ def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.layerdrop = getattr(config, "layerdrop", 0.0) self.pre_norm = getattr(config, "pre_norm", False) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states def call( self, @@ -135,9 +138,9 @@ def call( cache=None, head_mask=None, inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, training=False, - output_attentions=False, - output_hidden_states=False, ): # removed: src_enc=None, src_len=None if isinstance(inputs, (tuple, list)): @@ -150,7 +153,9 @@ def call( cache = inputs[6] if len(inputs) > 6 else cache head_mask = inputs[7] if len(inputs) > 7 else head_mask inputs_embeds = inputs[8] if len(inputs) > 8 else inputs_embeds - assert len(inputs) <= 9, "Too many inputs." + output_attentions = inputs[9] if len(inputs) > 9 else output_attentions + output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states + assert len(inputs) <= 11, "Too many inputs." 
elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -161,10 +166,15 @@ def call( cache = inputs.get("cache", cache) head_mask = inputs.get("head_mask", head_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) - assert len(inputs) <= 9, "Too many inputs." + output_attentions = inputs.get("output_attentions", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + assert len(inputs) <= 11, "Too many inputs." else: input_ids = inputs + output_attentions = output_attentions if output_attentions is not None else self.output_attentions + output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states + if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") elif input_ids is not None: @@ -257,9 +267,12 @@ def call( # self attention if not self.pre_norm: - attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training) + attn_outputs = self.attentions[i]( + [tensor, attn_mask, None, cache, head_mask[i], output_attentions], training=training + ) attn = attn_outputs[0] - attentions = attentions + (attn_outputs[1],) + if cast_bool_to_primitive(output_attentions, self.output_attentions) is True: + attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn tensor = self.layer_norm1[i](tensor) @@ -269,7 +282,7 @@ def call( [tensor_normalized, attn_mask, None, cache, head_mask[i]], training=training ) attn = attn_outputs[0] - if output_attentions: + if cast_bool_to_primitive(output_attentions, self.output_attentions) is True: attentions = attentions + (attn_outputs[1],) attn = self.dropout(attn, training=training) tensor = tensor + attn @@ -292,7 +305,7 @@ def call( tensor = tensor * mask[..., tf.newaxis] # Add last hidden state - if output_hidden_states: + if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True: hidden_states = hidden_states + (tensor,) # update cache length @@ -303,9 +316,9 @@ def call( # tensor = tensor.transpose(0, 1) outputs = (tensor,) - if output_hidden_states: + if cast_bool_to_primitive(output_hidden_states, self.output_hidden_states) is True: outputs = outputs + (hidden_states,) - if output_attentions: + if cast_bool_to_primitive(output_attentions, self.output_attentions) is True: outputs = outputs + (attentions,) return outputs # outputs, (hidden_states), (attentions) @@ -321,6 +334,7 @@ class TFFlaubertWithLMHeadModel(TFXLMWithLMHeadModel): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFFlaubertMainLayer(config, name="transformer") + self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name="pred_layer_._proj") @add_start_docstrings( diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 10b355f22a9d1..f241fc8dcad2b 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -17,7 +17,6 @@ import functools import logging import os -import warnings from typing import Dict, List, Optional, Union import h5py @@ -174,11 +173,7 @@ def compute_loss(self, labels, logits): ) # make sure only labels that are not equal to -100 # are taken into account as loss - if tf.math.reduce_any(labels == -1).numpy() is True: - 
warnings.warn("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.") - active_loss = tf.reshape(labels, (-1,)) != -1 - else: - active_loss = tf.reshape(labels, (-1,)) != -100 + active_loss = tf.reshape(labels, (-1,)) != -100 reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index e912891c212df..7a5f029e56cfb 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -19,6 +19,7 @@ import itertools import logging import math +import warnings import numpy as np import tensorflow as tf @@ -827,6 +828,9 @@ def __init__(self, config, *inputs, **kwargs): self.transformer = TFXLMMainLayer(config, name="transformer") self.sequence_summary = TFSequenceSummary(config, initializer_range=config.init_std, name="sequence_summary") + self.logits_proj = tf.keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="logits_proj" + ) @property def dummy_inputs(self): @@ -835,7 +839,10 @@ def dummy_inputs(self): Returns: tf.Tensor with dummy inputs """ - return {"input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS)} + return { + "input_ids": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS), + "langs": tf.constant(MULTIPLE_CHOICE_DUMMY_INPUTS), + } @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="xlm-mlm-en-2048") @@ -892,7 +899,7 @@ def call( output_attentions = inputs[9] if len(inputs) > 9 else output_attentions output_hidden_states = inputs[10] if len(inputs) > 10 else output_hidden_states labels = inputs[11] if len(inputs) > 11 else labels - assert len(inputs) <= 11, "Too many inputs." + assert len(inputs) <= 12, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") attention_mask = inputs.get("attention_mask", attention_mask) @@ -921,17 +928,31 @@ def call( flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + flat_langs = tf.reshape(langs, (-1, seq_length)) if langs is not None else None + flat_inputs_embeds = ( + tf.reshape(inputs_embeds, (-1, inputs_embeds.shape[-2], inputs_embeds.shape[-1])) + if inputs_embeds is not None + else None + ) + + if lengths is not None: + warnings.warn( + "The `lengths` parameter cannot be used with the XLM multiple choice models. 
Please use the " + "attention mask instead.", + FutureWarning, + ) + lengths = None flat_inputs = [ flat_input_ids, flat_attention_mask, - langs, + flat_langs, flat_token_type_ids, flat_position_ids, lengths, cache, head_mask, - inputs_embeds, + flat_inputs_embeds, output_attentions, output_hidden_states, ] @@ -939,6 +960,7 @@ def call( transformer_outputs = self.transformer(flat_inputs, training=training) output = transformer_outputs[0] logits = self.sequence_summary(output) + logits = self.logits_proj(logits) reshaped_logits = tf.reshape(logits, (-1, num_choices)) outputs = (reshaped_logits,) + transformer_outputs[1:] # add hidden states and attention if they are here diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index ca98fe5abc5d7..bdad2f406d930 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -618,7 +618,7 @@ class TransfoXLModelOutput(ModelOutput): """ last_hidden_state: torch.FloatTensor - mems: List[torch.FloatTensor] + mems: List[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -650,9 +650,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput): heads. """ - losses: Optional[torch.FloatTensor] - prediction_scores: torch.FloatTensor - mems: List[torch.FloatTensor] + losses: Optional[torch.FloatTensor] = None + prediction_scores: torch.FloatTensor = None + mems: List[torch.FloatTensor] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -695,8 +695,9 @@ class TransfoXLLMHeadModelOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
""" @@ -836,13 +837,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # so we transpose here from shape [bsz, len] to shape [len, bsz] @@ -941,7 +942,7 @@ def forward( # We transpose back here to shape [bsz, len, hidden_dim] core_out = core_out.transpose(0, 1).contiguous() - if return_tuple: + if not return_dict: return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None) return TransfoXLModelOutput( @@ -1013,7 +1014,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -1023,7 +1024,7 @@ def forward( All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: bsz, tgt_len = input_ids.size(0), input_ids.size(1) elif inputs_embeds is not None: @@ -1038,7 +1039,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) last_hidden = transformer_outputs[0] @@ -1048,7 +1049,7 @@ def forward( prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else () loss = softmax_output.view(bsz, tgt_len - 1) if labels is not None else None - if return_tuple: + if not return_dict: output = (prediction_scores,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 7296ba4ac4d77..d1d548ffbd785 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1167,7 +1167,7 @@ def forward( cls_index: Optional[torch.LongTensor] = None, is_impossible: Optional[torch.LongTensor] = None, p_mask: Optional[torch.FloatTensor] = None, - return_tuple: bool = False, + return_dict: bool = False, ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]: """ Args: @@ -1184,8 +1184,8 @@ def forward( p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`): Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token should be masked. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`): - Whether or not to return a plain tuple instead of a :class:`~transformers.file_utils.ModelOuput`. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOuput` instead of a plain tuple. 
Returns: """ @@ -1214,7 +1214,7 @@ def forward( # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss total_loss += cls_loss * 0.5 - return (total_loss,) if return_tuple else SquadHeadOutput(loss=total_loss) + return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,) else: # during inference, compute the end logits based on beam search @@ -1244,7 +1244,7 @@ def forward( start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) - if return_tuple: + if not return_dict: return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) else: return SquadHeadOutput( diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index e7396df689e71..27a8ed21aa860 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -19,6 +19,7 @@ import itertools import logging import math +import warnings from dataclasses import dataclass from typing import Optional, Tuple @@ -40,6 +41,7 @@ from .modeling_outputs import ( BaseModelOutput, MaskedLMOutput, + MultipleChoiceModelOutput, QuestionAnsweringModelOutput, SequenceClassifierOutput, TokenClassifierOutput, @@ -365,8 +367,9 @@ class XLMForQuestionAnsweringOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. 
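Every ``forward`` in this diff resolves ``return_dict`` against the configuration when the caller does not pass it, so the behaviour can be fixed once at load time or chosen per call. A usage sketch, assuming the ``xlm-mlm-en-2048`` checkpoint used by the doctests:

```python
import torch
from transformers import XLMModel, XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
input_ids = torch.tensor([tokenizer.encode("Hello, my dog is cute")])

# Decide once at load time: every call then returns a ModelOutput.
model = XLMModel.from_pretrained("xlm-mlm-en-2048", return_dict=True)
print(model(input_ids).last_hidden_state.shape)

# Or keep the tuple default and opt in per call.
model = XLMModel.from_pretrained("xlm-mlm-en-2048")
print(model(input_ids)[0].shape)                          # plain tuple, as before
print(model(input_ids, return_dict=True).last_hidden_state.shape)
```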
""" @@ -480,13 +483,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None: bs, slen = input_ids.size() @@ -593,7 +596,7 @@ def forward( # move back sequence length to dimension 0 # tensor = tensor.transpose(0, 1) - if return_tuple: + if not return_dict: return tuple(v for v in [tensor, hidden_states, attentions] if v is not None) return BaseModelOutput(last_hidden_state=tensor, hidden_states=hidden_states, attentions=attentions) @@ -691,7 +694,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -701,7 +704,7 @@ def forward( All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -715,13 +718,13 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) output = transformer_outputs[0] outputs = self.pred_layer(output, labels) # (loss, logits) or (logits,) depending on if labels are provided. - if return_tuple: + if not return_dict: return outputs + transformer_outputs[1:] return MaskedLMOutput( @@ -768,7 +771,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -777,7 +780,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -791,7 +794,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) output = transformer_outputs[0] @@ -807,7 +810,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -855,7 +858,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -867,7 +870,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -881,7 +884,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = transformer_outputs[0] @@ -908,7 +911,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + transformer_outputs[1:] return ((total_loss,) + output) if total_loss is not None else output @@ -955,7 +958,7 @@ def forward( p_mask=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -982,7 +985,7 @@ def forward( >>> import torch >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') - >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') + >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048', return_dict=True) >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> start_positions = torch.tensor([1]) @@ -991,7 +994,7 @@ def forward( >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) >>> loss = outputs.loss """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict transformer_outputs = self.transformer( input_ids, @@ -1005,7 +1008,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) output = transformer_outputs[0] @@ -1017,10 +1020,10 @@ def forward( cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask, - return_tuple=return_tuple, + return_dict=return_dict, ) - if return_tuple: + if not return_dict: return outputs + transformer_outputs[1:] return 
XLMForQuestionAnsweringOutput( @@ -1072,14 +1075,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.transformer( input_ids, @@ -1093,7 +1096,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1115,10 +1118,112 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output return TokenClassifierOutput( loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +@add_start_docstrings( + """XLM Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, + XLM_START_DOCSTRING, +) +class XLMForMultipleChoice(XLMPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.transformer = XLMModel(config) + self.sequence_summary = SequenceSummary(config) + self.logits_proj = nn.Linear(config.num_labels, 1) + + self.init_weights() + + @add_start_docstrings_to_callable(XLM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="xlm-mlm-en-2048", + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + langs=None, + token_type_ids=None, + position_ids=None, + lengths=None, + cache=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.Tensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + langs = langs.view(-1, langs.size(-1)) if langs is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + if lengths is not None: + warnings.warn( + "The `lengths` parameter cannot be used with the XLM multiple choice models. 
Please use the " + "attention mask instead.", + FutureWarning, + ) + lengths = None + + transformer_outputs = self.transformer( + input_ids=input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + output = transformer_outputs[0] + logits = self.sequence_summary(output) + logits = self.logits_proj(logits) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/src/transformers/modeling_xlm_roberta.py b/src/transformers/modeling_xlm_roberta.py index 775e3451c4283..b76d9744407bf 100644 --- a/src/transformers/modeling_xlm_roberta.py +++ b/src/transformers/modeling_xlm_roberta.py @@ -53,12 +53,6 @@ config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. - output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. - output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. """ diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 4448313817e05..e0892661ffbb2 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -627,8 +627,8 @@ class XLNetLMHeadModelOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None mems: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -661,8 +661,8 @@ class XLNetForSequenceClassificationOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None mems: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -695,8 +695,8 @@ class XLNetForTokenClassificationOutput(ModelOutput): heads. 
""" - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None mems: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -731,8 +731,8 @@ class XLNetForMultipleChoiceOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None mems: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -767,9 +767,9 @@ class XLNetForQuestionAnsweringSimpleOutput(ModelOutput): heads. """ - loss: Optional[torch.FloatTensor] - start_logits: torch.FloatTensor - end_logits: torch.FloatTensor + loss: Optional[torch.FloatTensor] = None + start_logits: torch.FloatTensor = None + end_logits: torch.FloatTensor = None mems: Optional[List[torch.FloatTensor]] = None hidden_states: Optional[Tuple[torch.FloatTensor]] = None attentions: Optional[Tuple[torch.FloatTensor]] = None @@ -891,8 +891,9 @@ class XLNetForQuestionAnsweringOutput(ModelOutput): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -1051,13 +1052,13 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end @@ -1239,7 +1240,7 @@ def forward( else: attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions) - if return_tuple: + if not return_dict: return tuple(v for v in [output, new_mems, hidden_states, attentions] if v is not None) return XLNetModelOutput( @@ -1325,7 +1326,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): @@ -1344,7 +1345,7 @@ def forward( import torch tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') - model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') + model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased', return_dict=True) # We show how to setup inputs to predict a next token using a bi-directional context. 
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=False)).unsqueeze(0) # We will predict the masked token @@ -1369,7 +1370,7 @@ def forward( loss = outputs.loss next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) transformer_outputs = self.transformer( @@ -1385,7 +1386,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) logits = self.lm_loss(transformer_outputs[0]) @@ -1396,7 +1397,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1447,7 +1448,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`) @@ -1456,7 +1457,7 @@ def forward( If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) transformer_outputs = self.transformer( @@ -1472,7 +1473,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) output = transformer_outputs[0] @@ -1489,7 +1490,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1539,7 +1540,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1547,7 +1548,7 @@ def forward( Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. 
(see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) outputs = self.transformer( @@ -1563,7 +1564,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1584,7 +1585,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1634,7 +1635,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1642,7 +1643,7 @@ def forward( Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] @@ -1669,7 +1670,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) output = transformer_outputs[0] @@ -1683,7 +1684,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels.view(-1)) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1734,7 +1735,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1746,7 +1747,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) outputs = self.transformer( @@ -1762,7 +1763,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -1789,7 +1790,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[1:] return ((total_loss,) + output) if total_loss is not None else output @@ -1842,7 +1843,7 @@ def forward( use_cache=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -1869,7 +1870,7 @@ def forward( >>> import torch >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') - >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased') + >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased', return_dict=True) >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> start_positions = torch.tensor([1]) @@ -1878,7 +1879,7 @@ def forward( >>> loss = outputs.loss """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) transformer_outputs = self.transformer( @@ -1894,7 +1895,7 @@ def forward( use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) hidden_states = transformer_outputs[0] start_logits = self.start_logits(hidden_states, p_mask=p_mask) @@ -1924,7 +1925,7 @@ def forward( # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss total_loss += cls_loss * 0.5 - if return_tuple: + if not return_dict: return (total_loss,) + transformer_outputs[1:] else: return XLNetForQuestionAnsweringOutput( @@ -1966,7 +1967,7 @@ def forward( hidden_states, start_states=start_states, cls_index=cls_index ) # Shape (batch size,): one single `cls_logits` for each sample - if return_tuple: + if not return_dict: outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) return outputs + transformer_outputs[1:] else: diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 2e6b1f4917a76..8eba3c8e9c918 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -2122,6 +2122,6 @@ def pipeline( "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " "Trying to load the model with Tensorflow." 
) - model = model_class.from_pretrained(model, config=config, return_tuple=True, **model_kwargs) + model = model_class.from_pretrained(model, config=config, **model_kwargs) return task_class(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, task=task, **kwargs) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 2df9113e1e27b..e2be9f5a7fcc1 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -661,9 +661,7 @@ def _prepare_inputs( if self.args.past_index >= 0 and self._past is not None: inputs["mems"] = self._past - # Our model outputs do not work with DataParallel, so forcing return tuple. - if isinstance(model, nn.DataParallel): - inputs["return_tuple"] = True + return inputs def training_step( diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py index 9a2c8181ebe40..c9c06edfbf579 100644 --- a/src/transformers/trainer_tf.py +++ b/src/transformers/trainer_tf.py @@ -1,12 +1,15 @@ """Tensorflow trainer class.""" +import datetime import logging import math import os +import sys from typing import Callable, Dict, Optional, Tuple import numpy as np import tensorflow as tf +from packaging.version import parse from .modeling_tf_utils import TFPreTrainedModel from .optimization_tf import GradientAccumulator, create_optimizer @@ -21,6 +24,15 @@ logger = logging.getLogger(__name__) +if parse(tf.__version__).release < (2, 2, 0): + logger.info( + "You need to run the TensorFlow trainer with at least the version 2.2.0, your version is {}".format( + tf.__version__ + ) + ) + sys.exit(1) + + class TFTrainer: """ TFTrainer is a simple but feature-complete training and eval loop for TensorFlow, @@ -57,7 +69,7 @@ class TFTrainer: compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None prediction_loss_only: bool tb_writer: Optional[tf.summary.SummaryWriter] = None - optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None + optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = (None, None) global_step: Optional[int] = None epoch_logging: Optional[float] = None @@ -70,7 +82,10 @@ def __init__( compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, prediction_loss_only=False, tb_writer: Optional[tf.summary.SummaryWriter] = None, - optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = None, + optimizers: Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule] = ( + None, + None, + ), ): self.model = model self.args = args @@ -78,7 +93,7 @@ def __init__( self.eval_dataset = eval_dataset self.compute_metrics = compute_metrics self.prediction_loss_only = prediction_loss_only - self.optimizers = optimizers + self.optimizer, self.lr_scheduler = optimizers self.gradient_accumulator = GradientAccumulator() self.global_step = 0 self.epoch_logging = 0 @@ -105,23 +120,19 @@ def get_train_tfdataset(self) -> tf.data.Dataset: if self.train_dataset is None: raise ValueError("Trainer: training requires a train_dataset.") - self.num_train_examples = self.train_dataset.reduce(tf.constant(0), lambda x, _: x + 1).numpy() + self.total_train_batch_size = self.args.train_batch_size * self.args.gradient_accumulation_steps + self.num_train_examples = tf.data.experimental.cardinality(self.train_dataset).numpy() - if self.args.max_steps > 0: - self.train_steps = self.args.max_steps - else: - self.train_steps: int = 
math.ceil(self.num_train_examples / self.args.train_batch_size) + if self.num_train_examples < 0: + raise ValueError("The training dataset must have an asserted cardinality") ds = ( - self.train_dataset.cache() - .shuffle(self.num_train_examples) - .batch(self.args.train_batch_size, drop_remainder=self.args.dataloader_drop_last) + self.train_dataset.repeat() + .shuffle(self.num_train_examples, seed=self.args.seed) + .batch(self.total_train_batch_size, drop_remainder=self.args.dataloader_drop_last) .prefetch(tf.data.experimental.AUTOTUNE) ) - if self.args.max_steps > 0: - self.train_dataset = self.train_dataset.repeat(-1) - return self.args.strategy.experimental_distribute_dataset(ds) def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset: @@ -136,13 +147,20 @@ def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> raise ValueError("Trainer: evaluation requires an eval_dataset.") eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset + num_examples = tf.data.experimental.cardinality(eval_dataset).numpy() + + if num_examples < 0: + raise ValueError("The training dataset must have an asserted cardinality") + + approx = math.floor if self.args.dataloader_drop_last else math.ceil + steps = approx(num_examples / self.args.eval_batch_size) ds = ( - eval_dataset.cache() + eval_dataset.repeat() .batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last) .prefetch(tf.data.experimental.AUTOTUNE) ) - return self.args.strategy.experimental_distribute_dataset(ds) + return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset: """ @@ -151,11 +169,23 @@ def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset: Args: test_dataset (:class:`~tf.data.Dataset`): The dataset to use. """ - ds = test_dataset.batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last) - return self.args.strategy.experimental_distribute_dataset(ds) + num_examples = tf.data.experimental.cardinality(test_dataset).numpy() - def get_optimizers( + if num_examples < 0: + raise ValueError("The training dataset must have an asserted cardinality") + + approx = math.floor if self.args.dataloader_drop_last else math.ceil + steps = approx(num_examples / self.args.eval_batch_size) + ds = ( + test_dataset.repeat() + .batch(self.args.eval_batch_size, drop_remainder=self.args.dataloader_drop_last) + .prefetch(tf.data.experimental.AUTOTUNE) + ) + + return self.args.strategy.experimental_distribute_dataset(ds), steps, num_examples + + def create_optimizer_and_scheduler( self, num_training_steps: int, ) -> Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]: """ @@ -164,20 +194,16 @@ def get_optimizers( We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the TFTrainer's init through :obj:`optimizers`, or override this method in a subclass. 
""" - if self.optimizers is not None: - return self.optimizers - - optimizer, scheduler = create_optimizer( - self.args.learning_rate, - num_training_steps, - self.args.warmup_steps, - adam_beta1=self.args.adam_beta1, - adam_beta2=self.args.adam_beta2, - adam_epsilon=self.args.adam_epsilon, - weight_decay_rate=self.args.weight_decay, - ) - - return optimizer, scheduler + if not self.optimizer and not self.lr_scheduler: + self.optimizer, self.lr_scheduler = create_optimizer( + self.args.learning_rate, + num_training_steps, + self.args.warmup_steps, + adam_beta1=self.args.adam_beta1, + adam_beta2=self.args.adam_beta2, + adam_epsilon=self.args.adam_epsilon, + weight_decay_rate=self.args.weight_decay, + ) def _setup_wandb(self): """ @@ -195,29 +221,13 @@ def _setup_wandb(self): logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"') wandb.init(project=os.getenv("WANDB_PROJECT", "huggingface"), config=vars(self.args)) - @tf.function - def _evaluate_steps(self, per_replica_features, per_replica_labels): - """ - One step evaluation across replica. - Args: - per_replica_features: the batched features. - per_replica_labels: the batched labels. - Returns: - The loss corresponding to the given batch. - """ - per_replica_loss, per_replica_logits = self.args.strategy.experimental_run_v2( - self._run_model, args=(per_replica_features, per_replica_labels, False) - ) - - try: - reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0) - except ValueError: - reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None) - - return reduced_loss, per_replica_logits - def _prediction_loop( - self, dataset: tf.data.Dataset, description: str, prediction_loss_only: Optional[bool] = None + self, + dataset: tf.data.Dataset, + steps: int, + num_examples: int, + description: str, + prediction_loss_only: Optional[bool] = None, ) -> PredictionOutput: """ Prediction/evaluation loop, shared by `evaluate()` and `predict()`. @@ -228,21 +238,20 @@ def _prediction_loop( prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only logger.info("***** Running %s *****", description) + logger.info(" Num examples = %d", num_examples) logger.info(" Batch size = %d", self.args.eval_batch_size) label_ids: np.ndarray = None preds: np.ndarray = None - - step: int = 1 + self.eval_loss = tf.keras.metrics.Sum() # Reset the past mems state at the beginning of the evaluation if necessary. 
if self.args.past_index >= 0: self._past = None - for features, labels in dataset: - step = tf.convert_to_tensor(step, dtype=tf.int64) - loss, logits = self._evaluate_steps(features, labels) - loss = tf.reduce_mean(loss) + for step, batch in enumerate(dataset): + logits = self.distributed_test_steps(batch) + _, labels = batch if not prediction_loss_only: if isinstance(logits, tuple): @@ -274,14 +283,15 @@ def _prediction_loop( else: label_ids = np.append(label_ids, labels.numpy(), axis=0) - step += 1 + if step == steps: + break if self.compute_metrics is not None and preds is not None and label_ids is not None: metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids)) else: metrics = {} - metrics["eval_loss"] = loss.numpy() + metrics["eval_loss"] = self.eval_loss.result().numpy() / (steps * self.args.eval_batch_size) for key in list(metrics.keys()): if not key.startswith("eval_"): @@ -322,9 +332,9 @@ def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, Returns: A dictionary containing the evaluation loss and the potential metrics computed from the predictions. """ - eval_ds = self.get_eval_tfdataset(eval_dataset) + eval_ds, steps, num_examples = self.get_eval_tfdataset(eval_dataset) - output = self._prediction_loop(eval_ds, description="Evaluation") + output = self._prediction_loop(eval_ds, steps, num_examples, description="Evaluation") logs = {**output.metrics} logs["epoch"] = self.epoch_logging @@ -333,6 +343,19 @@ def evaluate(self, eval_dataset: Optional[tf.data.Dataset] = None) -> Dict[str, return output.metrics + def test_step(self, features, labels): + per_example_loss, logits = self._run_model(features, labels, False) + + self.eval_loss.update_state(per_example_loss) + + return logits + + @tf.function + def distributed_test_steps(self, batch): + logits = self.args.strategy.run(self.test_step, batch) + + return logits + def train(self) -> None: """ Train method to train the model. 
@@ -346,24 +369,18 @@ def train(self) -> None: if self.args.max_steps > 0: t_total = self.args.max_steps - steps_per_epoch = self.args.max_steps + self.steps_per_epoch = self.args.max_steps else: - if self.args.dataloader_drop_last: - approx = math.floor - else: - approx = math.ceil - - steps_per_epoch = approx( - self.num_train_examples / (self.args.train_batch_size * self.args.gradient_accumulation_steps) - ) - t_total = steps_per_epoch * self.args.num_train_epochs + approx = math.floor if self.args.dataloader_drop_last else math.ceil + self.steps_per_epoch = approx(self.num_train_examples / self.total_train_batch_size) + t_total = self.steps_per_epoch * self.args.num_train_epochs with self.args.strategy.scope(): - optimizer, lr_scheduler = self.get_optimizers(num_training_steps=t_total) - iterations = optimizer.iterations + self.create_optimizer_and_scheduler(num_training_steps=t_total) + iterations = self.optimizer.iterations self.global_step = iterations.numpy() folder = os.path.join(self.args.output_dir, PREFIX_CHECKPOINT_DIR) - ckpt = tf.train.Checkpoint(optimizer=optimizer, model=self.model) + ckpt = tf.train.Checkpoint(optimizer=self.optimizer, model=self.model) self.model.ckpt_manager = tf.train.CheckpointManager(ckpt, folder, max_to_keep=self.args.save_total_limit) if self.model.ckpt_manager.latest_checkpoint: @@ -384,141 +401,138 @@ def train(self) -> None: else: epochs_trained = 1 - tf.summary.experimental.set_step(iterations) + tf.summary.experimental.set_step(iterations) - epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs + epochs = 1 if self.args.max_steps > 0 else self.args.num_train_epochs - if self.args.fp16: - policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16") - tf.keras.mixed_precision.experimental.set_policy(policy) + if self.args.fp16: + policy = tf.keras.mixed_precision.experimental.Policy("mixed_float16") + tf.keras.mixed_precision.experimental.set_policy(policy) - with self.tb_writer.as_default(): - tf.summary.text("args", self.args.to_json_string()) + with self.tb_writer.as_default(): + tf.summary.text("args", self.args.to_json_string()) - self.tb_writer.flush() + self.tb_writer.flush() - logger.info("***** Running training *****") - logger.info(" Num examples = %d", self.num_train_examples) - logger.info(" Num Epochs = %d", epochs) - logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) - logger.info( - " Total train batch size (w. parallel, distributed & accumulation) = %d", self.args.train_batch_size - ) - logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) - logger.info(" Total optimization steps = %d", t_total) - - for epoch_iter in range(epochs_trained, int(epochs + 1)): - # Reset the past mems state at the beginning of each epoch if necessary. 
- if self.args.past_index >= 0: - self._past = None - for step, training_loss in enumerate(self._training_steps(train_ds, optimizer)): - self.global_step = iterations.numpy() - self.epoch_logging = epoch_iter - 1 + (step + 1) / steps_per_epoch - - if self.args.debug: - logs = {} - logs["loss"] = training_loss.numpy() - logs["epoch"] = self.epoch_logging - - self._log(logs) - - if self.global_step == 1 and self.args.debug: - with self.tb_writer.as_default(): - tf.summary.trace_export( - name="training", step=self.global_step, profiler_outdir=self.args.logging_dir - ) - - if self.args.evaluate_during_training and self.global_step % self.args.eval_steps == 0: - self.evaluate() - - if ( - self.global_step % self.args.logging_steps == 0 - or self.global_step == 1 - and self.args.logging_first_step - ): - logs = {} - logs["loss"] = training_loss.numpy() - logs["learning_rate"] = lr_scheduler(self.global_step).numpy() - logs["epoch"] = self.epoch_logging - - self._log(logs) - - if self.global_step % self.args.save_steps == 0: - ckpt_save_path = self.model.ckpt_manager.save() - logger.info("Saving checkpoint for step {} at {}".format(self.global_step, ckpt_save_path)) - - if self.args.max_steps > 0 and self.global_step % self.args.max_steps == 0: - break + logger.info("***** Running training *****") + logger.info(" Num examples = %d", self.num_train_examples) + logger.info(" Num Epochs = %d", epochs) + logger.info(" Instantaneous batch size per device = %d", self.args.per_device_train_batch_size) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", self.total_train_batch_size + ) + logger.info(" Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps) + logger.info(" Steps per epoch = %d", self.steps_per_epoch) + logger.info(" Total optimization steps = %d", t_total) - if self.args.past_index and hasattr(self, "_past"): - # Clean the state at the end of training - delattr(self, "_past") + self.train_loss = tf.keras.metrics.Sum() + start_time = datetime.datetime.now() - def _training_steps(self, ds, optimizer): - """ - Returns a generator over training steps (i.e. parameters update). - """ - for i, loss in enumerate(self._accumulate_next_gradients(ds)): - if i % self.args.gradient_accumulation_steps == 0: - self._apply_gradients(optimizer) - yield loss + for epoch_iter in range(epochs_trained, int(epochs + 1)): + # Reset the past mems state at the beginning of each epoch if necessary. 
+ if self.args.past_index >= 0: + self._past = None - @tf.function - def _apply_gradients(self, optimizer): - """Applies the gradients (cross-replica).""" - self.args.strategy.experimental_run_v2(self._step, args=(optimizer,)) + for step, batch in enumerate(train_ds): + self.global_step = iterations.numpy() + self.epoch_logging = epoch_iter - 1 + (step + 1) / self.steps_per_epoch - def _step(self, optimizer): - """Applies gradients and resets accumulation.""" - gradient_scale = self.gradient_accumulator.step * self.args.strategy.num_replicas_in_sync - gradients = [ - gradient / tf.cast(gradient_scale, gradient.dtype) for gradient in self.gradient_accumulator.gradients - ] - gradients = [(tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients] + self.distributed_training_steps(batch) - optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) - self.gradient_accumulator.reset() + training_loss = self.train_loss.result() / ((step + 1) * self.total_train_batch_size) - def _accumulate_next_gradients(self, ds): - """Accumulates the gradients from the next element in dataset.""" - iterator = iter(ds) + if self.args.debug: + logs = {} + logs["loss"] = training_loss.numpy() + logs["epoch"] = self.epoch_logging - @tf.function - def _accumulate_next(): - per_replica_features, per_replica_labels = next(iterator) + self._log(logs) - return self._accumulate_gradients(per_replica_features, per_replica_labels) + if self.global_step == 1 and self.args.debug: + with self.tb_writer.as_default(): + tf.summary.trace_export( + name="training", step=self.global_step, profiler_outdir=self.args.logging_dir + ) - while True: - try: - yield _accumulate_next() - except tf.errors.OutOfRangeError: - break + if ( + self.global_step > 0 + and self.args.evaluate_during_training + and self.global_step % self.args.eval_steps == 0 + ): + self.evaluate() - def _accumulate_gradients(self, per_replica_features, per_replica_labels): - """Accumulates the gradients across all the replica.""" - per_replica_loss = self.args.strategy.experimental_run_v2( - self._forward, args=(per_replica_features, per_replica_labels) - ) + if (self.global_step > 0 and self.global_step % self.args.logging_steps == 0) or ( + self.global_step == 1 and self.args.logging_first_step + ): + logs = {} + logs["loss"] = training_loss.numpy() + logs["learning_rate"] = self.lr_scheduler(self.global_step).numpy() + logs["epoch"] = self.epoch_logging + + self._log(logs) + + if self.global_step > 0 and self.global_step % self.args.save_steps == 0: + ckpt_save_path = self.model.ckpt_manager.save() + + logger.info("Saving checkpoint for step {} at {}".format(self.global_step, ckpt_save_path)) + + if self.global_step > 0 and self.global_step % self.steps_per_epoch == 0: + break - try: - reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, axis=0) - except ValueError: - reduced_loss = self.args.strategy.reduce(tf.distribute.ReduceOp.MEAN, per_replica_loss, None) + self.train_loss.reset_states() - return reduced_loss + end_time = datetime.datetime.now() - def _forward(self, features, labels): - """Forwards a training example and accumulates the gradients.""" + logger.info("Training took: {}".format(str(end_time - start_time))) + + if self.args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + def training_step(self, features, labels): per_example_loss, _ = self._run_model(features, labels, True) - gradients = 
tf.gradients(per_example_loss, self.model.trainable_variables) + scaled_loss = per_example_loss / self.total_train_batch_size + gradients = tf.gradients(scaled_loss, self.model.trainable_variables) gradients = [ g if g is not None else tf.zeros_like(v) for g, v in zip(gradients, self.model.trainable_variables) ] - self.gradient_accumulator(gradients) + if self.args.gradient_accumulation_steps > 1: + self.gradient_accumulator(gradients) + + self.train_loss.update_state(per_example_loss) + + if self.args.gradient_accumulation_steps == 1: + return gradients + + def apply_gradients(self, features, labels): + if self.args.gradient_accumulation_steps == 1: + gradients = self.training_step(features, labels) + + self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) + else: + for _ in tf.range(self.args.gradient_accumulation_steps): + reduced_features = features[: self.args.train_batch_size / self.args.n_replicas] + reduced_labels = labels[: self.args.train_batch_size / self.args.n_replicas] + + self.training_step(reduced_features, reduced_labels) + + features = tf.concat( + [features[self.args.train_batch_size / self.args.n_replicas :], reduced_features], axis=0 + ) + + gradients = self.gradient_accumulator.gradients + gradients = [ + (tf.clip_by_value(grad, -self.args.max_grad_norm, self.args.max_grad_norm)) for grad in gradients + ] + + self.optimizer.apply_gradients(list(zip(gradients, self.model.trainable_variables))) + self.gradient_accumulator.reset() - return per_example_loss + @tf.function + def distributed_training_steps(self, batch): + with self.args.strategy.scope(): + self.args.strategy.run(self.apply_gradients, batch) def _run_model(self, features, labels, training): """ @@ -530,14 +544,16 @@ def _run_model(self, features, labels, training): """ if self.args.past_index >= 0 and getattr(self, "_past", None) is not None: features["mems"] = self._past + if isinstance(labels, (dict)): outputs = self.model(features, training=training, **labels)[:2] else: outputs = self.model(features, labels=labels, training=training)[:2] + loss, logits = outputs[:2] + if self.args.past_index >= 0: self._past = outputs[self.args.past_index] - loss += sum(self.model.losses) * (1.0 / self.args.n_replicas) return loss, logits @@ -560,9 +576,9 @@ def predict(self, test_dataset: tf.data.Dataset) -> PredictionOutput: metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset contained labels). """ - test_ds = self.get_test_tfdataset(test_dataset) + test_ds, steps, num_examples = self.get_test_tfdataset(test_dataset) - return self._prediction_loop(test_ds, description="Prediction") + return self._prediction_loop(test_ds, steps, num_examples, description="Prediction") def save_model(self, output_dir: Optional[str] = None): """ diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 1ec50caffc9f6..0adf344645073 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -162,7 +162,7 @@ def train_batch_size(self) -> int: "version. Using `--per_device_train_batch_size` is preferred." ) per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size - return per_device_batch_size * max(1, self.n_replicas) + return per_device_batch_size * self.n_replicas @property def eval_batch_size(self) -> int: @@ -175,7 +175,7 @@ def eval_batch_size(self) -> int: "version. Using `--per_device_eval_batch_size` is preferred." 
) per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size - return per_device_batch_size * max(1, self.n_replicas) + return per_device_batch_size * self.n_replicas @property @tf_required diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index f1e031bc3228c..73676ed249fee 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -260,8 +260,9 @@ def _init_weights(self, module): If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail. output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`None`): If set to ``True``, the hidden states of all layers are returned. See ``hidden_states`` under returned tensors for more detail. - return_tuple (:obj:`bool`, `optional`, defaults to :obj:`None`): - If set to ``True``, the output of the model will be a plain tuple instead of a ``dataclass``. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`None`): + If set to ``True``, the model will return a :class:`~transformers.file_utils.ModelOutput` instead of a + plain tuple. """ @@ -310,13 +311,13 @@ def forward( inputs_embeds=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict if input_ids is not None and inputs_embeds is not None: raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") @@ -351,7 +352,7 @@ def forward( sequence_output = encoder_outputs[0] pooled_output = self.pooler(sequence_output) - if return_tuple: + if not return_dict: return (sequence_output, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( @@ -393,7 +394,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): @@ -402,7 +403,7 @@ def forward( Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.transformer( input_ids, @@ -413,7 +414,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -424,7 +425,7 @@ def forward( loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) - if return_tuple: + if not return_dict: output = (prediction_scores,) + outputs[2:] return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output @@ -470,7 +471,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, 
): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -479,7 +480,7 @@ def forward( If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.transformer( input_ids, @@ -490,7 +491,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -508,7 +509,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -550,7 +551,7 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -558,7 +559,7 @@ def forward( Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None @@ -580,7 +581,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) pooled_output = outputs[1] @@ -594,7 +595,7 @@ def forward( loss_fct = CrossEntropyLoss() loss = loss_fct(reshaped_logits, labels) - if return_tuple: + if not return_dict: output = (reshaped_logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -637,14 +638,14 @@ def forward( labels=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - 1]``. 
""" - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.transformer( input_ids, @@ -655,7 +656,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -677,7 +678,7 @@ def forward( else: loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - if return_tuple: + if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output @@ -720,7 +721,7 @@ def forward( end_positions=None, output_attentions=None, output_hidden_states=None, - return_tuple=None, + return_dict=None, ): r""" start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): @@ -732,7 +733,7 @@ def forward( Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence are not taken into account for computing the loss. """ - return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple + return_dict = return_dict if return_dict is not None else self.config.use_return_dict outputs = self.transformer( input_ids, @@ -743,7 +744,7 @@ def forward( inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_tuple=return_tuple, + return_dict=return_dict, ) sequence_output = outputs[0] @@ -770,7 +771,7 @@ def forward( end_loss = loss_fct(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2 - if return_tuple: + if not return_dict: output = (start_logits, end_logits) + outputs[2:] return ((total_loss,) + output) if total_loss is not None else output diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 097c387543cc5..8207f70f3190a 100644 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -66,7 +66,7 @@ def _prepare_for_class(self, inputs_dict, model_class): if model_class in MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): return { k: v.unsqueeze(1).expand(-1, self.model_tester.num_choices, -1).contiguous() - if isinstance(v, torch.Tensor) and v.ndim != 0 + if isinstance(v, torch.Tensor) and v.ndim > 1 else v for k, v in inputs_dict.items() } @@ -74,6 +74,7 @@ def _prepare_for_class(self, inputs_dict, model_class): def test_save_load(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True for model_class in self.all_model_classes: model = model_class(config) @@ -803,8 +804,6 @@ def test_multigpu_data_parallel_forward(self): # Wrap model in nn.DataParallel model = torch.nn.DataParallel(model) - # Our model outputs do not work with DataParallel, so forcing return tuple. 
- inputs_dict["return_tuple"] = True with torch.no_grad(): _ = model(**self._prepare_for_class(inputs_dict, model_class)) diff --git a/tests/test_modeling_flaubert.py b/tests/test_modeling_flaubert.py index af2918cb947ed..d4342e21843f4 100644 --- a/tests/test_modeling_flaubert.py +++ b/tests/test_modeling_flaubert.py @@ -32,6 +32,7 @@ FlaubertForQuestionAnsweringSimple, FlaubertForSequenceClassification, FlaubertForTokenClassification, + FlaubertForMultipleChoice, ) from transformers.modeling_flaubert import FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST @@ -90,6 +91,7 @@ def prepare_config_and_inputs(self): sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) is_impossible_labels = ids_tensor([self.batch_size], 2).float() + choice_labels = ids_tensor([self.batch_size], self.num_choices) config = FlaubertConfig( vocab_size=self.vocab_size, @@ -118,6 +120,7 @@ def prepare_config_and_inputs(self): sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ) @@ -133,6 +136,7 @@ def create_and_check_flaubert_model( sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ): model = FlaubertModel(config=config) @@ -158,6 +162,7 @@ def create_and_check_flaubert_lm_head( sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ): model = FlaubertWithLMHeadModel(config) @@ -183,6 +188,7 @@ def create_and_check_flaubert_simple_qa( sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ): model = FlaubertForQuestionAnsweringSimple(config) @@ -212,6 +218,7 @@ def create_and_check_flaubert_qa( sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ): model = FlaubertForQuestionAnswering(config) @@ -278,6 +285,7 @@ def create_and_check_flaubert_sequence_classif( sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ): model = FlaubertForSequenceClassification(config) @@ -304,6 +312,7 @@ def create_and_check_flaubert_token_classif( sequence_labels, token_labels, is_impossible_labels, + choice_labels, input_mask, ): config.num_labels = self.num_labels @@ -319,6 +328,38 @@ def create_and_check_flaubert_token_classif( self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels]) self.check_loss_output(result) + def create_and_check_flaubert_multiple_choice( + self, + config, + input_ids, + token_type_ids, + input_lengths, + sequence_labels, + token_labels, + is_impossible_labels, + choice_labels, + input_mask, + ): + config.num_choices = self.num_choices + model = FlaubertForMultipleChoice(config=config) + model.to(torch_device) + model.eval() + multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous() + loss, logits = model( + multiple_choice_inputs_ids, + attention_mask=multiple_choice_input_mask, + token_type_ids=multiple_choice_token_type_ids, + labels=choice_labels, + ) + result = { + "loss": loss, + "logits": logits, + } + self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices]) + self.check_loss_output(result) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ 
-329,6 +370,7 @@ def prepare_config_and_inputs_for_common(self):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         ) = config_and_inputs
         inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
@@ -346,6 +388,7 @@ class FlaubertModelTest(ModelTesterMixin, unittest.TestCase):
             FlaubertForQuestionAnsweringSimple,
             FlaubertForSequenceClassification,
             FlaubertForTokenClassification,
+            FlaubertForMultipleChoice,
         )
         if is_torch_available()
         else ()
@@ -382,6 +425,10 @@ def test_flaubert_token_classif(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_flaubert_token_classif(*config_and_inputs)
 
+    def test_flaubert_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_multiple_choice(*config_and_inputs)
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/test_modeling_t5.py b/tests/test_modeling_t5.py
index 9c5cd9dbce828..09949f07b2489 100644
--- a/tests/test_modeling_t5.py
+++ b/tests/test_modeling_t5.py
@@ -329,7 +329,6 @@ def test_export_to_onnx(self):
         import tempfile
 
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        config_and_inputs[0].return_tuple = True
         model = T5Model(config_and_inputs[0]).to(torch_device)
         with tempfile.TemporaryDirectory() as tmpdirname:
             torch.onnx.export(
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index 839c064209d56..88bfaa63cdc10 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -80,8 +80,8 @@ class TFModelTesterMixin:
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         if model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values():
             inputs_dict = {
-                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices, 1))
-                if isinstance(v, tf.Tensor) and v.ndim != 0
+                k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1))
+                if isinstance(v, tf.Tensor) and v.ndim > 0
                 else v
                 for k, v in inputs_dict.items()
             }
diff --git a/tests/test_modeling_tf_flaubert.py b/tests/test_modeling_tf_flaubert.py
index 1b3e6d8823971..399c78ca53da5 100644
--- a/tests/test_modeling_tf_flaubert.py
+++ b/tests/test_modeling_tf_flaubert.py
@@ -18,11 +18,340 @@
 from transformers import is_tf_available
 from transformers.testing_utils import require_tf, slow
 
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
 
 if is_tf_available():
     import tensorflow as tf
     import numpy as np
-    from transformers import TFFlaubertModel
+
+    from transformers import (
+        FlaubertConfig,
+        TFFlaubertModel,
+        TFFlaubertWithLMHeadModel,
+        TFFlaubertForSequenceClassification,
+        TFFlaubertForQuestionAnsweringSimple,
+        TFFlaubertForTokenClassification,
+        TFFlaubertForMultipleChoice,
+        TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
+    )
+
+
+class TFFlaubertModelTester:
+    def __init__(
+        self, parent,
+    ):
+        self.parent = parent
+        self.batch_size = 13
+        self.seq_length = 7
+        self.is_training = True
+        self.use_input_lengths = True
+        self.use_token_type_ids = True
+        self.use_labels = True
+        self.gelu_activation = True
+        self.sinusoidal_embeddings = False
+        self.causal = False
+        self.asm = False
+        self.n_langs = 2
+        self.vocab_size = 99
+        self.n_special = 0
+        self.hidden_size = 32
+        self.num_hidden_layers = 5
+        self.num_attention_heads = 4
+        self.hidden_dropout_prob = 0.1
+        self.attention_probs_dropout_prob = 0.1
+        self.max_position_embeddings = 512
+        self.type_vocab_size = 16
+        self.type_sequence_label_size = 2
+        self.initializer_range = 0.02
+        self.num_labels = 3
+        self.num_choices = 4
+        self.summary_type = "last"
+        self.use_proj = True
+        self.scope = None
+        self.bos_token_id = 0
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32)
+
+        input_lengths = None
+        if self.use_input_lengths:
+            input_lengths = (
+                ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2
+            )  # small variation of seq_length
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs)
+
+        sequence_labels = None
+        token_labels = None
+        is_impossible_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = FlaubertConfig(
+            vocab_size=self.vocab_size,
+            n_special=self.n_special,
+            emb_dim=self.hidden_size,
+            n_layers=self.num_hidden_layers,
+            n_heads=self.num_attention_heads,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            gelu_activation=self.gelu_activation,
+            sinusoidal_embeddings=self.sinusoidal_embeddings,
+            asm=self.asm,
+            causal=self.causal,
+            n_langs=self.n_langs,
+            max_position_embeddings=self.max_position_embeddings,
+            initializer_range=self.initializer_range,
+            summary_type=self.summary_type,
+            use_proj=self.use_proj,
+            bos_token_id=self.bos_token_id,
+        )
+
+        return (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            choice_labels,
+            input_mask,
+        )
+
+    def create_and_check_flaubert_model(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertModel(config=config)
+        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
+        outputs = model(inputs)
+
+        inputs = [input_ids, input_mask]
+        outputs = model(inputs)
+        sequence_output = outputs[0]
+        result = {
+            "sequence_output": sequence_output.numpy(),
+        }
+        self.parent.assertListEqual(
+            list(result["sequence_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]
+        )
+
+    def create_and_check_flaubert_lm_head(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertWithLMHeadModel(config)
+
+        inputs = {"input_ids": input_ids, "lengths": input_lengths, "langs": token_type_ids}
+        outputs = model(inputs)
+
+        logits = outputs[0]
+
+        result = {
+            "logits": logits.numpy(),
+        }
+
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.vocab_size])
+
+    def create_and_check_flaubert_qa(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertForQuestionAnsweringSimple(config)
+
+        inputs = {"input_ids": input_ids, "lengths": input_lengths}
+
+        start_logits, end_logits = model(inputs)
+
+        result = {
+            "start_logits": start_logits.numpy(),
+            "end_logits": end_logits.numpy(),
+        }
+
+        self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length])
+        self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length])
+
+    def create_and_check_flaubert_sequence_classif(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        model = TFFlaubertForSequenceClassification(config)
+
+        inputs = {"input_ids": input_ids, "lengths": input_lengths}
+
+        (logits,) = model(inputs)
+
+        result = {
+            "logits": logits.numpy(),
+        }
+
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.type_sequence_label_size])
+
+    def create_and_check_flaubert_for_token_classification(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_labels = self.num_labels
+        model = TFFlaubertForTokenClassification(config=config)
+        inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids}
+        (logits,) = model(inputs)
+        result = {
+            "logits": logits.numpy(),
+        }
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
+
+    def create_and_check_flaubert_for_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = TFFlaubertForMultipleChoice(config=config)
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        (logits,) = model(inputs)
+        result = {"logits": logits.numpy()}
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_lengths,
+            sequence_labels,
+            token_labels,
+            is_impossible_labels,
+            choice_labels,
+            input_mask,
+        ) = config_and_inputs
+        inputs_dict = {
+            "input_ids": input_ids,
+            "token_type_ids": token_type_ids,
+            "langs": token_type_ids,
+            "lengths": input_lengths,
+        }
+        return config, inputs_dict
+
+
+@require_tf
+class TFFlaubertModelTest(TFModelTesterMixin, unittest.TestCase):
+
+    all_model_classes = (
+        (
+            TFFlaubertModel,
+            TFFlaubertWithLMHeadModel,
+            TFFlaubertForSequenceClassification,
+            TFFlaubertForQuestionAnsweringSimple,
+            TFFlaubertForTokenClassification,
+            TFFlaubertForMultipleChoice,
+        )
+        if is_tf_available()
+        else ()
+    )
+    all_generative_model_classes = (
+        (TFFlaubertWithLMHeadModel,) if is_tf_available() else ()
+    )  # TODO (PVP): Check other models whether language generation is also applicable
+
+    def setUp(self):
+        self.model_tester = TFFlaubertModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=FlaubertConfig, emb_dim=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_flaubert_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_model(*config_and_inputs)
+
+    def test_flaubert_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_lm_head(*config_and_inputs)
+
+    def test_flaubert_qa(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_qa(*config_and_inputs)
+
+    def test_flaubert_sequence_classif(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_sequence_classif(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_for_token_classification(*config_and_inputs)
+
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_flaubert_for_multiple_choice(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in TF_FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
+            model = TFFlaubertModel.from_pretrained(model_name)
+            self.assertIsNotNone(model)
 
 
 @require_tf
diff --git a/tests/test_modeling_tf_xlm.py b/tests/test_modeling_tf_xlm.py
index 26cdb0a39c6f8..1903f4a8dfb4a 100644
--- a/tests/test_modeling_tf_xlm.py
+++ b/tests/test_modeling_tf_xlm.py
@@ -32,6 +32,7 @@
         TFXLMForSequenceClassification,
         TFXLMForQuestionAnsweringSimple,
         TFXLMForTokenClassification,
+        TFXLMForMultipleChoice,
         TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
     )
 
@@ -91,6 +92,7 @@ def prepare_config_and_inputs(self):
             sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
             token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
             is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
         config = XLMConfig(
             vocab_size=self.vocab_size,
@@ -120,6 +122,7 @@ def prepare_config_and_inputs(self):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         )
 
@@ -132,6 +135,7 @@ def create_and_check_xlm_model(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMModel(config=config)
@@ -157,6 +161,7 @@ def create_and_check_xlm_lm_head(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMWithLMHeadModel(config)
@@ -181,6 +186,7 @@ def create_and_check_xlm_qa(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMForQuestionAnsweringSimple(config)
@@ -206,6 +212,7 @@ def create_and_check_xlm_sequence_classif(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = TFXLMForSequenceClassification(config)
@@ -229,6 +236,7 @@ def create_and_check_xlm_for_token_classification(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         config.num_labels = self.num_labels
@@ -240,6 +248,32 @@ def create_and_check_xlm_for_token_classification(
         }
         self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.seq_length, self.num_labels])
 
+    def create_and_check_xlm_for_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = TFXLMForMultipleChoice(config=config)
+        multiple_choice_inputs_ids = tf.tile(tf.expand_dims(input_ids, 1), (1, self.num_choices, 1))
+        multiple_choice_input_mask = tf.tile(tf.expand_dims(input_mask, 1), (1, self.num_choices, 1))
+        multiple_choice_token_type_ids = tf.tile(tf.expand_dims(token_type_ids, 1), (1, self.num_choices, 1))
+        inputs = {
+            "input_ids": multiple_choice_inputs_ids,
+            "attention_mask": multiple_choice_input_mask,
+            "token_type_ids": multiple_choice_token_type_ids,
+        }
+        (logits,) = model(inputs)
+        result = {"logits": logits.numpy()}
+        self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices])
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -250,6 +284,7 @@ def prepare_config_and_inputs_for_common(self):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         ) = config_and_inputs
         inputs_dict = {
@@ -265,13 +300,13 @@ def prepare_config_and_inputs_for_common(self):
 class TFXLMModelTest(TFModelTesterMixin, unittest.TestCase):
 
     all_model_classes = (
-        # TODO The multiple choice model is missing and should be added.
         (
             TFXLMModel,
             TFXLMWithLMHeadModel,
             TFXLMForSequenceClassification,
             TFXLMForQuestionAnsweringSimple,
             TFXLMForTokenClassification,
+            TFXLMForMultipleChoice,
         )
         if is_tf_available()
         else ()
@@ -307,6 +342,10 @@ def test_for_token_classification(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_for_token_classification(*config_and_inputs)
 
+    def test_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in TF_XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py
index 2a5cd4096ae2c..efa9346cee51f 100644
--- a/tests/test_modeling_xlm.py
+++ b/tests/test_modeling_xlm.py
@@ -33,6 +33,7 @@
         XLMForQuestionAnswering,
         XLMForSequenceClassification,
         XLMForQuestionAnsweringSimple,
+        XLMForMultipleChoice,
     )
     from transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_LIST
 
@@ -63,7 +64,7 @@ def __init__(
         self.max_position_embeddings = 512
         self.type_sequence_label_size = 2
         self.initializer_range = 0.02
-        self.num_labels = 3
+        self.num_labels = 2
         self.num_choices = 4
         self.summary_type = "last"
         self.use_proj = True
@@ -91,6 +92,7 @@ def prepare_config_and_inputs(self):
            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
            is_impossible_labels = ids_tensor([self.batch_size], 2).float()
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
         config = XLMConfig(
             vocab_size=self.vocab_size,
@@ -109,6 +111,7 @@ def prepare_config_and_inputs(self):
             initializer_range=self.initializer_range,
             summary_type=self.summary_type,
             use_proj=self.use_proj,
+            num_labels=self.num_labels,
             bos_token_id=self.bos_token_id,
         )
 
@@ -120,6 +123,7 @@ def prepare_config_and_inputs(self):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         )
 
@@ -135,6 +139,7 @@ def create_and_check_xlm_model(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMModel(config=config)
@@ -160,6 +165,7 @@ def create_and_check_xlm_lm_head(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMWithLMHeadModel(config)
@@ -185,6 +191,7 @@ def create_and_check_xlm_simple_qa(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMForQuestionAnsweringSimple(config)
@@ -214,6 +221,7 @@ def create_and_check_xlm_qa(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMForQuestionAnswering(config)
@@ -280,6 +288,7 @@ def create_and_check_xlm_sequence_classif(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         model = XLMForSequenceClassification(config)
@@ -306,6 +315,7 @@ def create_and_check_xlm_token_classif(
         sequence_labels,
         token_labels,
         is_impossible_labels,
+        choice_labels,
         input_mask,
     ):
         config.num_labels = self.num_labels
@@ -321,6 +331,38 @@ def create_and_check_xlm_token_classif(
         self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.seq_length, self.num_labels])
         self.check_loss_output(result)
 
+    def create_and_check_xlm_for_multiple_choice(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_lengths,
+        sequence_labels,
+        token_labels,
+        is_impossible_labels,
+        choice_labels,
+        input_mask,
+    ):
+        config.num_choices = self.num_choices
+        model = XLMForMultipleChoice(config=config)
+        model.to(torch_device)
+        model.eval()
+        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+        loss, logits = model(
+            multiple_choice_inputs_ids,
+            attention_mask=multiple_choice_input_mask,
+            token_type_ids=multiple_choice_token_type_ids,
+            labels=choice_labels,
+        )
+        result = {
+            "loss": loss,
+            "logits": logits,
+        }
+        self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_choices])
+        self.check_loss_output(result)
+
     def prepare_config_and_inputs_for_common(self):
         config_and_inputs = self.prepare_config_and_inputs()
         (
@@ -331,6 +373,7 @@ def prepare_config_and_inputs_for_common(self):
             sequence_labels,
             token_labels,
             is_impossible_labels,
+            choice_labels,
             input_mask,
         ) = config_and_inputs
         inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "lengths": input_lengths}
@@ -348,6 +391,7 @@ class XLMModelTest(ModelTesterMixin, unittest.TestCase):
             XLMForSequenceClassification,
             XLMForQuestionAnsweringSimple,
             XLMForTokenClassification,
+            XLMForMultipleChoice,
         )
         if is_torch_available()
         else ()
@@ -387,6 +431,10 @@ def test_xlm_token_classif(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_xlm_token_classif(*config_and_inputs)
 
+    def test_xlm_for_multiple_choice(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs)
+
     @slow
     def test_model_from_pretrained(self):
         for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: