Switch from return_tuple to return_dict #6138

Merged · 9 commits · Jul 30, 2020
17 changes: 7 additions & 10 deletions docs/source/quicktour.rst
@@ -230,19 +230,16 @@ final activations of the model.
>>> ## PYTORCH CODE
>>> print(pt_outputs)
SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833, 4.3364],
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
(tensor([[-4.0833, 4.3364],
[ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
>>> ## TENSORFLOW CODE
>>> print(tf_outputs)
(<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[-4.0832963 , 4.336414 ],
[ 0.08181786, -0.04179301]], dtype=float32)>,)
The model can return more than just the final activations, which is why the PyTorch output is a special class and the
TensorFlow output is a tuple. Here we only asked for the final activations, so we get a tuple with one element on the
TensorFlow side and a :class:`~transformers.modeling_outputs.SequenceClassifierOutput` with just the ``logits`` field
filled on the PyTorch side.

The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
the final activations, so we get a tuple with one element.
.. note::

All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
@@ -254,7 +251,7 @@ Let's apply the SoftMax activation to get predictions.
>>> ## PYTORCH CODE
>>> import torch.nn.functional as F
>>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1)
>>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
>>> ## TENSORFLOW CODE
>>> import tensorflow as tf
>>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
@@ -341,8 +338,8 @@ code is easy to access and tweak if you need to.

In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
using the :doc:`DistilBERT </model_doc/distilbert>` architecture. As
:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
if you are using TensorFlow)` was used, the model automatically created is then a
:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
if you are using TensorFlow) was used, the model automatically created is then a
:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
without the auto magic:
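For readers skimming this diff, here is a minimal sketch of the two behaviors the updated quicktour describes: plain tuples by default, and a `SequenceClassifierOutput` when `return_dict=True` is passed. The checkpoint is the one the quicktour already uses; the input sentence is just illustrative.

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pt_batch = tokenizer(["We are very happy to show you the 🤗 Transformers library."], return_tensors="pt")

# Default after this PR: a plain tuple, accessed by position.
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
logits = pt_model(**pt_batch)[0]

# Opt-in named outputs: a SequenceClassifierOutput with a `logits` field.
pt_model = AutoModelForSequenceClassification.from_pretrained(model_name, return_dict=True)
outputs = pt_model(**pt_batch)
logits = outputs.logits
```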
2 changes: 1 addition & 1 deletion docs/source/training.rst
@@ -49,7 +49,7 @@ put it in train mode.
.. code-block:: python
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
model.train()
This is useful because it allows us to make use of the pre-trained BERT
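For context on why `return_dict=True` is added here: it lets the rest of the training loop read outputs by name instead of by index. A rough sketch under that assumption (the batch and labels below are illustrative, not taken from training.rst):

```python
import torch
from transformers import BertForSequenceClassification, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
model.train()

batch = tokenizer(["a positive example", "a negative example"], padding=True, return_tensors="pt")
labels = torch.tensor([1, 0])

outputs = model(**batch, labels=labels)
outputs.loss.backward()  # named field access instead of outputs[0]
```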
2 changes: 1 addition & 1 deletion examples/README.md
@@ -1,7 +1,7 @@
# Examples

Version 2.9 of 🤗 Transformers introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.1+.
Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.2+.

Here is the list of all our examples:
- **grouped by task** (all official examples work for multiple models)
2 changes: 2 additions & 0 deletions examples/multiple-choice/utils_multiple_choice.py
@@ -204,6 +204,8 @@ def gen():
)

def get_dataset(self):
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))

return self.dataset

def __len__(self):
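The `assert_cardinality` call added to `get_dataset` here (and mirrored in the other TF example datasets touched by this PR) exists because `tf.data.Dataset.from_generator` produces a dataset with unknown cardinality, so `len()` fails on it. A small sketch of the effect, assuming a TensorFlow release that ships `tf.data.experimental.assert_cardinality`:

```python
import tensorflow as tf


def gen():
    for i in range(10):
        yield i


ds = tf.data.Dataset.from_generator(gen, output_types=tf.int32)
print(tf.data.experimental.cardinality(ds).numpy())  # -2, i.e. UNKNOWN_CARDINALITY

ds = ds.apply(tf.data.experimental.assert_cardinality(10))
print(tf.data.experimental.cardinality(ds).numpy())  # 10
print(len(ds))  # 10: len() now works because the cardinality is known
```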
5 changes: 0 additions & 5 deletions examples/question-answering/run_squad.py
@@ -199,9 +199,6 @@ def train(args, train_dataset, model, tokenizer):
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
)

if isinstance(model, torch.nn.DataParallel):
inputs["return_tuple"] = True

outputs = model(**inputs)
# model outputs are always tuple in transformers (see doc)
loss = outputs[0]
@@ -316,8 +313,6 @@ def evaluate(args, model, tokenizer, prefix=""):
inputs.update(
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
)
if isinstance(model, torch.nn.DataParallel):
inputs["return_tuple"] = True
outputs = model(**inputs)

for i, feature_index in enumerate(feature_indices):
11 changes: 9 additions & 2 deletions examples/question-answering/run_tf_squad.py
@@ -21,6 +21,8 @@
from dataclasses import dataclass, field
from typing import Optional

import tensorflow as tf

from transformers import (
AutoConfig,
AutoTokenizer,
@@ -68,6 +70,7 @@ class DataTrainingArguments:
data_dir: Optional[str] = field(
default=None, metadata={"help": "The input data dir. Should contain the .json files for the SQuAD task."}
)
use_tfds: Optional[bool] = field(default=True, metadata={"help": "If TFDS should be used or not."})
max_seq_length: int = field(
default=128,
metadata={
@@ -170,7 +173,7 @@ def main():
)

# Get datasets
if not data_args.data_dir:
if data_args.use_tfds:
if data_args.version_2_with_negative:
logger.warn("tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically")

except ImportError:
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")

tfds_examples = tfds.load("squad")
tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
train_examples = (
SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=False)
if training_args.do_train
@@ -209,6 +212,8 @@ def main():
else None
)

train_dataset = train_dataset.apply(tf.data.experimental.assert_cardinality(len(train_examples)))

eval_dataset = (
squad_convert_examples_to_features(
examples=eval_examples,
else None
)

eval_dataset = eval_dataset.apply(tf.data.experimental.assert_cardinality(len(eval_examples)))

# Initialize our Trainer
trainer = TFTrainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset,)

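The new `use_tfds` and `data_dir` fields plug into the dataclass-plus-`HfArgumentParser` pattern these TF example scripts already rely on. A rough, self-contained sketch of that pattern (the parser invocation below is the generic pattern, not copied from run_tf_squad.py):

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class DataTrainingArguments:
    data_dir: Optional[str] = field(default=None, metadata={"help": "The input data dir."})
    use_tfds: Optional[bool] = field(default=True, metadata={"help": "If TFDS should be used or not."})


parser = HfArgumentParser(DataTrainingArguments)
(data_args,) = parser.parse_args_into_dataclasses(args=["--data_dir", "/tmp/squad"])
print(data_args.use_tfds, data_args.data_dir)  # True /tmp/squad
```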
2 changes: 1 addition & 1 deletion examples/seq2seq/test_seq2seq_examples.py
@@ -144,7 +144,7 @@ def test_distill_checkpointing_with_teacher(self):
evaluate_checkpoint(ckpts[0], dest_dir=Path(tempfile.mkdtemp()))

def test_loss_fn(self):
model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY)
model = AutoModelForSeq2SeqLM.from_pretrained(BART_TINY, return_dict=True)
input_ids, mask = model.dummy_inputs["input_ids"], model.dummy_inputs["attention_mask"]
target_ids = torch.tensor([[0, 4, 8, 2], [0, 8, 2, 1]], dtype=torch.long, device=model.device)
decoder_input_ids = target_ids[:, :-1].contiguous() # Why this line?
27 changes: 22 additions & 5 deletions examples/text-classification/run_tf_glue.py
@@ -9,6 +9,7 @@
from typing import Dict, Optional

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

from transformers import (
@@ -35,7 +36,11 @@ class Split(Enum):


def get_tfds(
task_name: str, tokenizer: PreTrainedTokenizer, max_seq_length: Optional[int] = None, mode: Split = Split.train
task_name: str,
tokenizer: PreTrainedTokenizer,
max_seq_length: Optional[int] = None,
mode: Split = Split.train,
data_dir: str = None,
):
if task_name == "mnli-mm" and mode == Split.dev:
tfds_name = "mnli_mismatched"
@@ -50,9 +55,11 @@
else:
tfds_name = task_name

ds = tfds.load("glue/" + tfds_name, split=mode.value)
ds, info = tfds.load("glue/" + tfds_name, split=mode.value, with_info=True, data_dir=data_dir)
ds = glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
ds = ds.apply(tf.data.experimental.assert_cardinality(info.splits[mode.value].num_examples))

return glue_convert_examples_to_features(ds, tokenizer, max_seq_length, task_name)
return ds


logger = logging.getLogger(__name__)
Expand All @@ -69,6 +76,7 @@ class GlueDataTrainingArguments:
"""

task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(glue_processors.keys())})
data_dir: Optional[str] = field(default=None, metadata={"help": "The input/output data dir for TFDS."})
max_seq_length: int = field(
default=128,
metadata={
@@ -171,13 +179,22 @@ def main():

# Get datasets
train_dataset = (
get_tfds(task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length)
get_tfds(
task_name=data_args.task_name,
tokenizer=tokenizer,
max_seq_length=data_args.max_seq_length,
data_dir=data_args.data_dir,
)
if training_args.do_train
else None
)
eval_dataset = (
get_tfds(
task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, mode=Split.dev
task_name=data_args.task_name,
tokenizer=tokenizer,
max_seq_length=data_args.max_seq_length,
mode=Split.dev,
data_dir=data_args.data_dir,
)
if training_args.do_eval
else None
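Taken together, the `get_tfds` changes above do three things: load with `with_info=True` so the split sizes are known, honor a `data_dir` for the TFDS cache, and re-assert the cardinality after feature conversion (which goes through a generator and loses the length). A rough standalone sketch, with `glue/sst2` and the cache path as illustrative placeholders:

```python
import tensorflow as tf
import tensorflow_datasets as tfds

# with_info=True also returns the DatasetInfo, which records the split sizes.
ds, info = tfds.load("glue/sst2", split="train", with_info=True, data_dir="/tmp/tfds_cache")
num_examples = info.splits["train"].num_examples

# In run_tf_glue.py this is applied after glue_convert_examples_to_features;
# asserting the known size makes len(ds) usable again for the TFTrainer.
ds = ds.apply(tf.data.experimental.assert_cardinality(num_examples))
print(len(ds) == num_examples)  # True
```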
6 changes: 0 additions & 6 deletions examples/token-classification/run_tf_ner.py
@@ -17,7 +17,6 @@

import logging
import os
import warnings
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

@@ -185,11 +184,6 @@ def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[L

for i in range(batch_size):
for j in range(seq_len):
if label_ids[i, j] == -1:
label_ids[i, j] = -100
warnings.warn(
"Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead."
)
if label_ids[i, j] != -100:
out_label_list[i].append(label_map[label_ids[i][j]])
preds_list[i].append(label_map[preds[i][j]])
4 changes: 3 additions & 1 deletion examples/token-classification/utils_ner.py
@@ -146,7 +146,7 @@ class TFNerDataset:
"""

features: List[InputFeatures]
pad_token_label_id: int = -1
pad_token_label_id: int = -100
# Use cross entropy ignore_index as padding label id so that only
# real label ids contribute to the loss later.

@@ -221,6 +221,8 @@ def gen():
)

def get_dataset(self):
self.dataset = self.dataset.apply(tf.data.experimental.assert_cardinality(len(self.features)))

return self.dataset

def __len__(self):
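The change of `pad_token_label_id` from `-1` to `-100` (and dropping the `-1`-to-`-100` rewrite in `align_predictions`) aligns the TF example with the cross-entropy `ignore_index` convention, so padded tokens never contribute to the loss or the metrics. A small PyTorch illustration of that convention (shapes and label values are made up):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 9)                 # 4 tokens, 9 NER labels
labels = torch.tensor([3, 5, -100, -100])  # the last two tokens are padding

# F.cross_entropy ignores positions labeled -100 by default (ignore_index=-100),
# so only the two real tokens contribute to the loss.
loss = F.cross_entropy(logits, labels)
print(loss)
```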
3 changes: 3 additions & 0 deletions src/transformers/__init__.py
@@ -278,6 +278,7 @@
XLMForTokenClassification,
XLMForQuestionAnswering,
XLMForQuestionAnsweringSimple,
XLMForMultipleChoice,
XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
)
from .modeling_bart import (
@@ -356,6 +357,8 @@
FlaubertForTokenClassification,
FlaubertForQuestionAnswering,
FlaubertForQuestionAnsweringSimple,
FlaubertForTokenClassification,
FlaubertForMultipleChoice,
FLAUBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
)

15 changes: 8 additions & 7 deletions src/transformers/configuration_utils.py
@@ -49,8 +49,9 @@ class PretrainedConfig(object):
Whether or not the model should returns all attentions.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
return_tuple (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return tuples instead of :obj:`ModelOutput` objects.
return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a
plain tuple.
is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
Whether the model is used as an encoder/decoder or not.
is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
@@ -133,7 +134,7 @@ class PretrainedConfig(object):

def __init__(self, **kwargs):
# Attributes with defaults
self.return_tuple = kwargs.pop("return_tuple", False)
self.return_dict = kwargs.pop("return_dict", False)
self.output_hidden_states = kwargs.pop("output_hidden_states", False)
self.output_attentions = kwargs.pop("output_attentions", False)
self.use_cache = kwargs.pop("use_cache", True) # Not used by all models
@@ -194,12 +195,12 @@ def __init__(self, **kwargs):
raise err

@property
def use_return_tuple(self) -> bool:
def use_return_dict(self) -> bool:
"""
:obj:`bool`: Whether or not the model should return a tuple.
:obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples.
"""
# If torchscript is set, force return_tuple to avoid jit errors
return self.return_tuple or self.torchscript
# If torchscript is set, force `return_dict=False` to avoid jit errors
return self.return_dict and not self.torchscript

@property
def num_labels(self) -> int:
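A quick sketch of how the renamed flag and the new `use_return_dict` property behave, using `BertConfig` as a stand-in for any `PretrainedConfig` subclass:

```python
from transformers import BertConfig

config = BertConfig(return_dict=True)
print(config.use_return_dict)  # True

# TorchScript cannot trace ModelOutput objects, so torchscript=True forces
# tuple outputs no matter what return_dict says.
config = BertConfig(return_dict=True, torchscript=True)
print(config.use_return_dict)  # False
```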