diff --git a/examples/token-classification/run_tf_ner.py b/examples/token-classification/run_tf_ner.py index 056a24c74fdaec..068f0617371cce 100644 --- a/examples/token-classification/run_tf_ner.py +++ b/examples/token-classification/run_tf_ner.py @@ -17,6 +17,7 @@ import logging import os +import warnings from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple @@ -184,7 +185,12 @@ def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[L for i in range(batch_size): for j in range(seq_len): - if label_ids[i, j] != -1: + if label_ids[i, j] == -1: + label_ids[i, j] = -100 + warnings.warn( + "Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead." + ) + if label_ids[i, j] != -100: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e7aaed67189235..8f68fedff049bb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -424,6 +424,9 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TFAutoModel, TFAutoModelForMultipleChoice, TFAutoModelForPreTraining, @@ -431,6 +434,9 @@ TFAutoModelForSequenceClassification, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, + TFAutoModelForCausalLM, + TFAutoModelForMaskedLM, + TFAutoModelForSeq2SeqLM, ) from .modeling_tf_albert import ( @@ -449,6 +455,7 @@ from .modeling_tf_bert import ( TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFBertEmbeddings, + TFBertLMHeadModel, TFBertForMaskedLM, TFBertForMultipleChoice, TFBertForNextSentencePrediction, diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py index a6c0688f734b71..15e55cb866da16 100644 --- a/src/transformers/modeling_auto.py +++ b/src/transformers/modeling_auto.py @@ -73,6 +73,7 @@ from .modeling_camembert import ( CamembertForMaskedLM, CamembertForMultipleChoice, + CamembertForQuestionAnswering, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertModel, @@ -306,6 +307,7 @@ [ (DistilBertConfig, DistilBertForQuestionAnswering), (AlbertConfig, AlbertForQuestionAnswering), + (CamembertConfig, CamembertForQuestionAnswering), (BartConfig, BartForQuestionAnswering), (LongformerConfig, LongformerForQuestionAnswering), (XLMRobertaConfig, XLMRobertaForQuestionAnswering), @@ -336,7 +338,6 @@ ] ) - MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ (CamembertConfig, CamembertForMultipleChoice), diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 3c2a7bbf98f89f..0d9b699ddda24b 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -29,6 +29,7 @@ ) from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -822,7 +823,7 @@ def call(self, pooled_output, training: bool): @add_start_docstrings("""Albert Model with a `language modeling` head on top. 
""", ALBERT_START_DOCSTRING) -class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): +class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -834,8 +835,26 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj::obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` @@ -852,14 +871,35 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - outputs = self.albert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.albert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) sequence_output = outputs[0] - prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False)) + prediction_scores = self.predictions(sequence_output, training=training) # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs + return outputs # prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py index 7e9c2cd111fe27..bc954a7f4a655c 100644 --- a/src/transformers/modeling_tf_auto.py +++ b/src/transformers/modeling_tf_auto.py @@ -16,6 +16,7 @@ import logging +import warnings from collections import OrderedDict from .configuration_auto import ( @@ -54,6 +55,7 @@ TFBertForQuestionAnswering, TFBertForSequenceClassification, TFBertForTokenClassification, + TFBertLMHeadModel, TFBertModel, ) from .modeling_tf_camembert import ( @@ -140,126 +142,158 @@ TF_MODEL_MAPPING = OrderedDict( [ + (T5Config, TFT5Model), + (DistilBertConfig, TFDistilBertModel), (AlbertConfig, TFAlbertModel), (CamembertConfig, TFCamembertModel), - (CTRLConfig, TFCTRLModel), - (DistilBertConfig, TFDistilBertModel), - (ElectraConfig, TFElectraModel), - (FlaubertConfig, TFFlaubertModel), - (GPT2Config, TFGPT2Model), 
- (MobileBertConfig, TFMobileBertModel), - (OpenAIGPTConfig, TFOpenAIGPTModel), + (XLMRobertaConfig, TFXLMRobertaModel), (RobertaConfig, TFRobertaModel), (BertConfig, TFBertModel), - (T5Config, TFT5Model), + (OpenAIGPTConfig, TFOpenAIGPTModel), + (GPT2Config, TFGPT2Model), + (MobileBertConfig, TFMobileBertModel), (TransfoXLConfig, TFTransfoXLModel), - (XLMConfig, TFXLMModel), - (XLMRobertaConfig, TFXLMRobertaModel), (XLNetConfig, TFXLNetModel), + (FlaubertConfig, TFFlaubertModel), + (XLMConfig, TFXLMModel), + (CTRLConfig, TFCTRLModel), + (ElectraConfig, TFElectraModel), ] ) TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ + (T5Config, TFT5ForConditionalGeneration), + (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForPreTraining), (CamembertConfig, TFCamembertForMaskedLM), - (CTRLConfig, TFCTRLLMHeadModel), - (DistilBertConfig, TFDistilBertForMaskedLM), - (ElectraConfig, TFElectraForPreTraining), - (FlaubertConfig, TFFlaubertWithLMHeadModel), - (GPT2Config, TFGPT2LMHeadModel), - (MobileBertConfig, TFMobileBertForPreTraining), - (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForPreTraining), - (T5Config, TFT5ForConditionalGeneration), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), + (MobileBertConfig, TFMobileBertForPreTraining), (TransfoXLConfig, TFTransfoXLLMHeadModel), - (XLMConfig, TFXLMWithLMHeadModel), - (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (XLNetConfig, TFXLNetLMHeadModel), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (CTRLConfig, TFCTRLLMHeadModel), + (ElectraConfig, TFElectraForPreTraining), ] ) TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ + (T5Config, TFT5ForConditionalGeneration), + (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), (CamembertConfig, TFCamembertForMaskedLM), - (CTRLConfig, TFCTRLLMHeadModel), - (DistilBertConfig, TFDistilBertForMaskedLM), - (ElectraConfig, TFElectraForMaskedLM), - (FlaubertConfig, TFFlaubertWithLMHeadModel), - (GPT2Config, TFGPT2LMHeadModel), - (MobileBertConfig, TFMobileBertForMaskedLM), - (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForMaskedLM), - (T5Config, TFT5ForConditionalGeneration), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), + (MobileBertConfig, TFMobileBertForMaskedLM), (TransfoXLConfig, TFTransfoXLLMHeadModel), - (XLMConfig, TFXLMWithLMHeadModel), - (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (XLNetConfig, TFXLNetLMHeadModel), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (CTRLConfig, TFCTRLLMHeadModel), + (ElectraConfig, TFElectraForMaskedLM), ] ) -TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( +TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict( [ - (AlbertConfig, TFAlbertForMultipleChoice), - (CamembertConfig, TFCamembertForMultipleChoice), - (DistilBertConfig, TFDistilBertForMultipleChoice), - (FlaubertConfig, TFFlaubertForMultipleChoice), - (MobileBertConfig, TFMobileBertForMultipleChoice), - (RobertaConfig, TFRobertaForMultipleChoice), - (BertConfig, TFBertForMultipleChoice), - (XLMConfig, TFXLMForMultipleChoice), - (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), - (XLNetConfig, TFXLNetForMultipleChoice), + (BertConfig, TFBertLMHeadModel), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), 
+ (TransfoXLConfig, TFTransfoXLLMHeadModel), + (XLNetConfig, TFXLNetLMHeadModel), + ( + XLMConfig, + TFXLMWithLMHeadModel, + ), # XLM can be MLM and CLM => model should be split similar to BERT; leave here for now + (CTRLConfig, TFCTRLLMHeadModel), ] ) -TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( +TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ - (AlbertConfig, TFAlbertForQuestionAnswering), - (CamembertConfig, TFCamembertForQuestionAnswering), - (DistilBertConfig, TFDistilBertForQuestionAnswering), - (ElectraConfig, TFElectraForQuestionAnswering), - (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple), - (MobileBertConfig, TFMobileBertForQuestionAnswering), - (RobertaConfig, TFRobertaForQuestionAnswering), - (BertConfig, TFBertForQuestionAnswering), - (XLMConfig, TFXLMForQuestionAnsweringSimple), - (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering), - (XLNetConfig, TFXLNetForQuestionAnsweringSimple), + (DistilBertConfig, TFDistilBertForMaskedLM), + (AlbertConfig, TFAlbertForMaskedLM), + (CamembertConfig, TFCamembertForMaskedLM), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), + (RobertaConfig, TFRobertaForMaskedLM), + (BertConfig, TFBertForMaskedLM), + (MobileBertConfig, TFMobileBertForMaskedLM), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (ElectraConfig, TFElectraForMaskedLM), ] ) +TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict([(T5Config, TFT5ForConditionalGeneration)]) + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ + (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), (CamembertConfig, TFCamembertForSequenceClassification), - (DistilBertConfig, TFDistilBertForSequenceClassification), - (FlaubertConfig, TFFlaubertForSequenceClassification), - (MobileBertConfig, TFMobileBertForSequenceClassification), + (XLMRobertaConfig, TFXLMRobertaForSequenceClassification), (RobertaConfig, TFRobertaForSequenceClassification), (BertConfig, TFBertForSequenceClassification), - (XLMConfig, TFXLMForSequenceClassification), - (XLMRobertaConfig, TFXLMRobertaForSequenceClassification), (XLNetConfig, TFXLNetForSequenceClassification), + (MobileBertConfig, TFMobileBertForSequenceClassification), + (FlaubertConfig, TFFlaubertForSequenceClassification), + (XLMConfig, TFXLMForSequenceClassification), + ] +) + +TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( + [ + (DistilBertConfig, TFDistilBertForQuestionAnswering), + (AlbertConfig, TFAlbertForQuestionAnswering), + (CamembertConfig, TFCamembertForQuestionAnswering), + (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering), + (RobertaConfig, TFRobertaForQuestionAnswering), + (BertConfig, TFBertForQuestionAnswering), + (XLNetConfig, TFXLNetForQuestionAnsweringSimple), + (MobileBertConfig, TFMobileBertForQuestionAnswering), + (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple), + (XLMConfig, TFXLMForQuestionAnsweringSimple), + (ElectraConfig, TFElectraForQuestionAnswering), ] ) TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ + (DistilBertConfig, TFDistilBertForTokenClassification), (AlbertConfig, TFAlbertForTokenClassification), (CamembertConfig, TFCamembertForTokenClassification), - (DistilBertConfig, TFDistilBertForTokenClassification), - (ElectraConfig, TFElectraForTokenClassification), (FlaubertConfig, TFFlaubertForTokenClassification), - (MobileBertConfig, TFMobileBertForTokenClassification), - (RobertaConfig, TFRobertaForTokenClassification), - (BertConfig, TFBertForTokenClassification), (XLMConfig, 
TFXLMForTokenClassification), (XLMRobertaConfig, TFXLMRobertaForTokenClassification), + (RobertaConfig, TFRobertaForTokenClassification), + (BertConfig, TFBertForTokenClassification), + (MobileBertConfig, TFMobileBertForTokenClassification), (XLNetConfig, TFXLNetForTokenClassification), + (ElectraConfig, TFElectraForTokenClassification), + ] +) + +TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( + [ + (CamembertConfig, TFCamembertForMultipleChoice), + (XLMConfig, TFXLMForMultipleChoice), + (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), + (RobertaConfig, TFRobertaForMultipleChoice), + (BertConfig, TFBertForMultipleChoice), + (DistilBertConfig, TFDistilBertForMultipleChoice), + (MobileBertConfig, TFMobileBertForMultipleChoice), + (XLNetConfig, TFXLNetForMultipleChoice), + (FlaubertConfig, TFFlaubertForMultipleChoice), + (AlbertConfig, TFAlbertForMultipleChoice), ] ) @@ -303,11 +337,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) @@ -359,7 +393,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -368,17 +402,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -401,7 +435,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -452,11 +486,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. 
Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config (:class:`~transformers.PretrainedConfig`): + config (:class:`~transformers.TFPretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertModelForMaskedLM` (DistilBERT model) @@ -478,7 +512,7 @@ def from_config(cls, config): if isinstance(config, config_class): return model_class(config) raise ValueError( - "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) @@ -513,21 +547,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. 
cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. @@ -549,7 +583,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class - initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of + initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. @@ -573,7 +607,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( - "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) @@ -619,28 +653,32 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: TFBertModel (Bert model) - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) - - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) - - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) + - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) - - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - - isInstance of `xlm` configuration class: XLMModel (XLM model) + - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model) + - isInstance of `xlm` configuration class: TFXLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelWithLMHead.from_config(config) # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` """ + warnings.warn( + "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class(config) @@ -676,7 +714,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -685,17 +723,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -718,7 +756,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -731,6 +769,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ + warnings.warn( + "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): @@ -778,19 +820,19 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - - isInstance of `albert` configuration class: AlbertModel (Albert model) - - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `albert` configuration class: TFAlbertModel (Albert model) + - isInstance of `bert` configuration class: TFBertModel (Bert model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - model = AutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): @@ -824,7 +866,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -833,17 +875,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -866,7 +908,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. 
Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -896,6 +938,406 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) +class TFAutoModelForCausalLM: + r""" + :class:`~transformers.TFAutoModelForCausalLM` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` + class method. + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForCausalLM is designed to be instantiated " + "using the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForCausalLM.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load + the model weights + + Args: + config (:class:`~transformers.TFPretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + + - isInstance of `bert` configuration class: :class:`~transformers.TFBertLMHeadModel` (Bert model) + - isInstance of `openai-gpt` configuration class: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) + - isInstance of `gpt2` configuration class: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model) + - isInstance of `transfo-xl` configuration class: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model) + - isInstance of `xlnet` configuration class: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model) + + Examples:: + + config = GPT2Config.from_pretrained('gpt2') # Download configuration from S3 and cache. + model = TFAutoModelForCausalLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. 
+ + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string: + + - `bert`: :class:`~transformers.TFBertLMHeadModel` (Bert model) + - `openai-gpt`: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) + - `gpt2`: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model) + - `transfo-xl`: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model) + - `xlnet`: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model) + - `ctrl`: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path: + Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + resume_download: (`optional`) boolean, default False: + Do not delete incompletely received file. Attempt to resume the download if such a file exists. 
+ proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + kwargs: (`optional`) Remaining dictionary of keyword arguments: + These arguments will be passed to the configuration and the model. + + Examples:: + + model = TFAutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + model = TFAutoModelForCausalLM.from_pretrained('./test/gpt2_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/gpt2_tf_model_config.json') + model = TFAutoModelForCausalLM.from_pretrained('./tf_model/gpt2_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) + ) + ) + + +class TFAutoModelForMaskedLM: + r""" + :class:`~transformers.TFAutoModelForMaskedLM` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)` + class method. + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForMaskedLM is designed to be instantiated " + "using the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForMaskedLM.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. + It only affects the model's configuration. 
Use :func:`~transformers.TFAutoModel.from_pretrained` to load + the model weights + + Args: + config (:class:`~transformers.TFPretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model) + - isInstance of `roberta` configuration class: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model) + - isInstance of `bert` configuration class: :class:`~transformers.TFBertForMaskedLM` (Bert model) + - isInstance of `flaubert` configuration class: :class:`~transformers.TFFlaubertWithLMHeadModel` (Flaubert model) + - isInstance of `xlm` configuration class: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model) + - isInstance of `xlm-roberta` configuration class: :class:`~transformers.TFXLMRobertaForMaskedLM` (XLM-Roberta model) + - isInstance of `electra` configuration class: :class:`~transformers.TFElectraForMaskedLM` (Electra model) + - isInstance of `camembert` configuration class: :class:`~transformers.TFCamembertForMaskedLM` (Camembert model) + - isInstance of `albert` configuration class: :class:`~transformers.TFAlbertForMaskedLM` (Albert model) + + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = TFAutoModelForMaskedLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MASKED_LM_MAPPING.keys()) + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string: + + - `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model) + - `albert`: :class:`~transformers.TFAlbertForMaskedLM` (ALBERT model) + - `camembert`: :class:`~transformers.TFCamembertForMaskedLM` (CamemBERT model) + - `xlm-roberta`: :class:`~transformers.TFXLMRobertaForMaskedLM` (XLM-RoBERTa model) + - `longformer`: :class:`~transformers.TFLongformerForMaskedLM` (Longformer model) + - `roberta`: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model) + - `xlm`: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model) + - `flaubert`: :class:`~transformers.TFFlaubertWithLMHeadModel` (Flaubert model) + - `electra`: :class:`~transformers.TFElectraForMaskedLM` (Electra model) + - `bert`: :class:`~transformers.TFBertLMHeadModel` (Bert model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path: + Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
+ - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + resume_download: (`optional`) boolean, default False: + Do not delete incompletely received file. Attempt to resume the download if such a file exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + kwargs: (`optional`) Remaining dictionary of keyword arguments: + These arguments will be passed to the configuration and the model. + + Examples:: + + model = TFAutoModelForMaskedLM.from_pretrained('bert') # Download model and configuration from S3 and cache. + model = TFAutoModelForMaskedLM.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModelForMaskedLM.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MASKED_LM_MAPPING.keys()) + ) + ) + + +class TFAutoModelForSeq2SeqLM: + r""" + :class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)` + class method. + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForSeq2SeqLM is designed to be instantiated " + "using the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForSeq2SeqLM.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load + the model weights + + Args: + config (:class:`~transformers.TFPretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + + - isInstance of `t5` configuration class: :class:`~transformers.TFT5ForConditionalGeneration` (T5 model) + + Examples:: + + config = T5Config.from_pretrained('t5') + model = TFAutoModelForSeq2SeqLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys()), + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. 
+ + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string: + + - `t5`: :class:`~transformers.TFT5ForConditionalGeneration` (T5 model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path: + Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + resume_download: (`optional`) boolean, default False: + Do not delete incompletely received file. Attempt to resume the download if such a file exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. 
+ kwargs: (`optional`) Remaining dictionary of keyword arguments: + These arguments will be passed to the configuration and the model. + + Examples:: + + model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-base') # Download model and configuration from S3 and cache. + model = TFAutoModelForSeq2SeqLM.from_pretrained('./test/t5_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/t5_tf_model_config.json') + model = TFAutoModelForSeq2SeqLM.from_pretrained('./tf_model/t5_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys()), + ) + ) + + class TFAutoModelForSequenceClassification(object): r""" :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class @@ -930,11 +1372,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) @@ -946,7 +1388,7 @@ def from_config(cls, config): Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): @@ -983,7 +1425,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 
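A short usage sketch for the two auto classes added above; the checkpoint names are the usual public ones and only illustrative, and scoring the input against itself is just a toy target to show where the loss lands in the output tuple:

from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForMaskedLM.from_pretrained("bert-base-uncased")

enc = tokenizer("The capital of France is [MASK].", return_tensors="tf")
# Positions labelled -100 would be ignored by the loss; here every position is scored.
outputs = model(enc["input_ids"], attention_mask=enc["attention_mask"], labels=enc["input_ids"])
loss, prediction_scores = outputs[0], outputs[1]

# The seq2seq auto class currently covers T5 checkpoints.
t5_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")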
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -992,17 +1434,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -1025,7 +1467,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. 
Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -1090,11 +1532,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) @@ -1145,7 +1587,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -1154,17 +1596,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. 
+ In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -1187,7 +1629,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -1222,7 +1664,7 @@ def __init__(self): raise EnvironmentError( "TFAutoModelForTokenClassification is designed to be instantiated " "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForTokenClassification.from_config(config)` methods." + "`TFAutoModelForTokenClassification.from_config(config)` methods." ) @classmethod @@ -1232,11 +1674,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `bert` configuration class: BertModel (Bert model) @@ -1282,23 +1724,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). 
In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -1318,7 +1760,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 
Examples:: diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 0bab7699cef44f..b829b3b8ae6911 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -29,6 +29,8 @@ add_start_docstrings_to_callable, ) from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -803,9 +805,12 @@ def call(self, inputs, **kwargs): @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) -class TFBertForMaskedLM(TFBertPreTrainedModel): +class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) + assert ( + not config.is_decoder + ), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention." self.bert = TFBertMainLayer(config, name="bert") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") @@ -815,8 +820,26 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -833,13 +856,113 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
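The -100 convention in the labels docstring above is the whole trick: positions labelled -100 are dropped before the sparse cross-entropy is evaluated, so only the (typically masked) tokens contribute to the loss. A rough, illustrative equivalent of what the shared masked-LM loss computes (not the library's exact code):

import tensorflow as tf

def masked_lm_loss(labels, logits):
    # labels: (batch, seq_len) ints, with -100 at positions that must not be scored
    # logits: (batch, seq_len, vocab_size) unnormalized prediction scores
    active = tf.not_equal(labels, -100)                  # boolean mask of real label positions
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return loss_fn(
        tf.boolean_mask(labels, active),                 # (num_active,)
        tf.boolean_mask(logits, active),                 # (num_active, vocab_size)
    )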
""" - outputs = self.bert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.bert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + prediction_scores = self.mlm(sequence_output, training=training) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs + + return outputs # (loss), prediction_scores, (hidden_states), (attentions) + + +class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + assert config.is_decoder, "If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`" + + self.bert = TFBertMainLayer(config, name="bert") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + + def get_output_embeddings(self): + return self.bert.embeddings + + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.bert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + + sequence_output = outputs[0] + logits = self.mlm(sequence_output, training=training) + + outputs = (logits,) + outputs[2:] # Add hidden states and attention if they are here + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + return outputs # prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index 96c2d0e2ad9ca5..dc20cf74ba349a 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -24,6 +24,7 @@ from .configuration_ctrl import CTRLConfig from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFPreTrainedModel, TFSharedEmbeddings, cast_bool_to_primitive, @@ -542,7 +543,7 @@ def call(self, hidden_states): (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, ) -class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): +class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") @@ -561,8 +562,26 @@ def prepare_inputs_for_generation(self, inputs, past, **kwargs): @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl") - def call(self, inputs, **kwargs): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -583,11 +602,37 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
""" - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[10] if len(inputs) > 10 else labels + if len(inputs) > 10: + inputs = inputs[:10] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + hidden_states = transformer_outputs[0] - lm_logits = self.lm_head(hidden_states) + logits = self.lm_head(hidden_states) - outputs = (lm_logits,) + transformer_outputs[1:] + outputs = (logits,) + transformer_outputs[1:] + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs return outputs # lm_logits, presents, (all hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index e9fe573bde6996..a22328d743b0fc 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -30,6 +30,7 @@ add_start_docstrings_to_callable, ) from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -116,7 +117,7 @@ def build(self, input_shape): def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): """Get token embeddings of inputs. Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with @@ -528,9 +529,9 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel): - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: - :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + :obj:`model([input_ids, attention_mask])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: - :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + :obj:`model({'input_ids': input_ids})` Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. @@ -626,7 +627,7 @@ def call(self, hidden_states): @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) -class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): +class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.vocab_size = config.vocab_size @@ -644,8 +645,23 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: @@ -663,7 +679,22 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - distilbert_output = self.distilbert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[6] if len(inputs) > 6 else labels + if len(inputs) > 6: + inputs = inputs[:6] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + distilbert_output = self.distilbert( + inputs, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) hidden_states = distilbert_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) @@ -672,6 +703,11 @@ def call(self, inputs, **kwargs): prediction_logits = self.vocab_projector(prediction_logits) outputs = (prediction_logits,) + distilbert_output[1:] + + if labels is not None: + loss = self.compute_loss(labels, prediction_logits) + outputs = (loss,) + outputs + return outputs # logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py index b77c04e4d25ee6..595482ee275e88 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -7,6 +7,7 @@ from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFQuestionAnsweringLoss, TFTokenClassificationLoss, get_initializer, @@ -506,7 +507,7 @@ def call(self, hidden_states, training=False): the only model of the two to have been trained for the masked language modeling task.""", ELECTRA_START_DOCSTRING, ) -class TFElectraForMaskedLM(TFElectraPreTrainedModel): +class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) @@ -534,9 +535,16 @@ def call( inputs_embeds=None, output_attentions=None, 
output_hidden_states=None, + labels=None, training=False, ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -553,6 +561,12 @@ def call( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ + if isinstance(input_ids, (tuple, list)): + labels = input_ids[8] if len(input_ids) > 8 else labels + if len(input_ids) > 8: + input_ids = input_ids[:8] + elif isinstance(input_ids, (dict, BatchEncoding)): + labels = input_ids.pop("labels", labels) generator_hidden_states = self.electra( input_ids, @@ -571,6 +585,10 @@ def call( output = (prediction_scores,) output += generator_hidden_states[1:] + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + output = (loss,) + output + return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index 5c4bbd27c60256..de7dc4c3577fd6 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -24,6 +24,7 @@ from .configuration_gpt2 import GPT2Config from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFConv1D, TFPreTrainedModel, TFSequenceSummary, @@ -272,8 +273,8 @@ def call( head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds use_cache = inputs[7] if len(inputs) > 7 else use_cache - output_attentions = inputs[8] if len(inputs) > 7 else output_attentions - output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states + output_attentions = inputs[8] if len(inputs) > 8 else output_attentions + output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") @@ -524,7 +525,7 @@ def call(self, inputs, **kwargs): (linear layer with weights tied to the input embeddings). 
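All of the call() overrides in this patch unpack optional labels the same three ways, so a single concrete example may save some reading; DistilBERT is used here because it has the shortest input list (the checkpoint name is the usual public one, and labels equal to the inputs is only a toy target):

from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
enc = tokenizer("hello world", return_tensors="tf")

# 1) keyword argument
out_kw = model(enc["input_ids"], attention_mask=enc["attention_mask"], labels=enc["input_ids"])

# 2) dict / BatchEncoding: `labels` is popped out before the base model is called
out_dict = model({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": enc["input_ids"]})

# 3) a positional list in docstring order is also accepted; labels then sits in the last
#    slot (index 6 for DistilBERT, index 8 for the BERT-like models above)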
""", GPT2_START_DOCSTRING, ) -class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): +class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") @@ -541,8 +542,26 @@ def prepare_inputs_for_generation(self, inputs, past, **kwargs): @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2") - def call(self, inputs, **kwargs): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -563,12 +582,38 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[10] if len(inputs) > 10 else labels + if len(inputs) > 10: + inputs = inputs[:10] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + hidden_states = transformer_outputs[0] - lm_logits = self.transformer.wte(hidden_states, mode="linear") + logits = self.transformer.wte(hidden_states, mode="linear") - outputs = (lm_logits,) + transformer_outputs[1:] + outputs = (logits,) + transformer_outputs[1:] + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs return outputs # lm_logits, presents, (all hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py index 5e68853a1fbb28..98fd9c080cde9b 100644 --- a/src/transformers/modeling_tf_mobilebert.py +++ b/src/transformers/modeling_tf_mobilebert.py @@ -29,6 +29,7 @@ ) from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -929,7 +930,7 @@ def call(self, inputs, **kwargs): @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. 
""", MOBILEBERT_START_DOCSTRING) -class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel): +class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -941,8 +942,25 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -959,14 +977,34 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - outputs = self.mobilebert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.mobilebert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + prediction_scores = self.mlm(sequence_output, training=training) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs - return outputs # prediction_scores, (hidden_states), (attentions) + return outputs # (loss), prediction_scores, (hidden_states), (attentions) class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index 477e63ee590e79..ef6805abcc9a9a 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -24,6 +24,7 @@ from .configuration_openai import OpenAIGPTConfig from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFConv1D, TFPreTrainedModel, TFSequenceSummary, @@ -479,7 +480,7 @@ def call(self, inputs, **kwargs): (linear layer with weights tied to the input embeddings). 
""", OPENAI_GPT_START_DOCSTRING, ) -class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): +class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @@ -489,8 +490,24 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt") - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -507,12 +524,35 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) hidden_states = transformer_outputs[0] - lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") + logits = self.transformer.tokens_embed(hidden_states, mode="linear") + outputs = (logits,) + transformer_outputs[1:] - outputs = (lm_logits,) + transformer_outputs[1:] + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs return outputs # lm_logits, (all hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 4148a69065aa9c..99ef96bc477e50 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -29,6 +29,7 @@ ) from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -264,7 +265,7 @@ def call(self, features): @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. 
""", ROBERTA_START_DOCSTRING) -class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): +class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -276,8 +277,26 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -294,14 +313,37 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - outputs = self.roberta(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.roberta( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + + sequence_output = outputs[0] sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - return outputs # prediction_scores, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs + + return outputs # (loss), prediction_scores, (hidden_states), (attentions) class TFRobertaClassificationHead(tf.keras.layers.Layer): diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index 1898397b4ce7b5..202842e9302117 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -20,12 +20,14 @@ import itertools import logging import math +import warnings import tensorflow as tf from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFPreTrainedModel, TFSharedEmbeddings, cast_bool_to_primitive, @@ -111,6 +113,7 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.layer_id = next(TFT5Attention.NEW_ID) self.is_decoder = config.is_decoder + self.use_cache = config.use_cache self.has_relative_attention_bias = 
has_relative_attention_bias self.relative_attention_num_buckets = config.relative_attention_num_buckets @@ -258,9 +261,7 @@ def unshape(x): k, v = past_key_value_state # to cope with keras serialization - use_cache = cast_bool_to_primitive(use_cache) - - if self.is_decoder and use_cache is True: + if self.is_decoder and cast_bool_to_primitive(use_cache, self.use_cache) is True: present_key_value_state = ((k, v),) else: present_key_value_state = (None,) @@ -295,7 +296,7 @@ def unshape(x): outputs = (context,) + present_key_value_state - if cast_bool_to_primitive(output_attentions) is True: + if cast_bool_to_primitive(output_attentions, True) is True: outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) @@ -572,18 +573,22 @@ def call( inputs_embeds = inputs[4] if len(inputs) > 4 else inputs_embeds head_mask = inputs[5] if len(inputs) > 5 else head_mask past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states - output_attentions = inputs[7] if len(inputs) > 7 else output_attentions - assert len(inputs) <= 8, "Too many inputs." + use_cache = inputs[7] if len(inputs) > 7 else use_cache + output_attentions = inputs[8] if len(inputs) > 7 else output_attentions + output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): - input_ids = inputs.get("decoder_input_ids") - attention_mask = inputs.get("decoder_attention_mask", attention_mask) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) encoder_hidden_states = inputs.get("encoder_hidden_states", encoder_hidden_states) encoder_attention_mask = inputs.get("encoder_attention_mask", encoder_attention_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) head_mask = inputs.get("head_mask", head_mask) past_key_value_states = inputs.get("past_key_value_states", past_key_value_states) + use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) - assert len(inputs) <= 8, "Too many inputs." + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + assert len(inputs) <= 10, "Too many inputs." else: input_ids = inputs @@ -741,8 +746,8 @@ def call( all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) - if use_cache is True: - assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) + # need to check if is decoder here as well for special cases when using keras compile + if cast_bool_to_primitive(use_cache, self.use_cache) is True and self.is_decoder: outputs = outputs + (present_key_value_states,) if cast_bool_to_primitive(output_hidden_states) is True: outputs = outputs + (all_hidden_states,) @@ -771,12 +776,38 @@ def dummy_inputs(self): inputs = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) dummy_inputs = { - "inputs": inputs, + "input_ids": inputs, "decoder_input_ids": inputs, "decoder_attention_mask": input_mask, } return dummy_inputs + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert ( + decoder_start_token_id is not None + ), "self.model.config.decoder_start_token_id has to be defined. In TF T5 it is usually set to the pad_token_id. 
See T5 docs for more information" + + # shift inputs to the right + shifted_input_ids = tf.zeros_like(input_ids, dtype=tf.int32) + shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1) + start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), decoder_start_token_id) + shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1) + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + ) + + assert tf.math.reduce_any( + shifted_input_ids >= 0 + ).numpy(), "Verify that `labels` has only positive values and -100" + + return shifted_input_ids + T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer @@ -908,7 +939,22 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + head_mask=None, + decoder_past_key_value_states=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + training=False, + ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: @@ -942,37 +988,58 @@ def call(self, inputs, **kwargs): >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - - if isinstance(inputs, dict): - kwargs.update(inputs) + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + encoder_outputs = inputs[2] if len(inputs) > 2 else encoder_outputs + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + head_mask = inputs[4] if len(inputs) > 4 else head_mask + decoder_past_key_value_states = inputs[5] if len(inputs) > 5 else decoder_past_key_value_states + decoder_input_ids = inputs[6] if len(inputs) > 6 else decoder_input_ids + decoder_attention_mask = inputs[7] if len(inputs) > 7 else decoder_attention_mask + decoder_inputs_embeds = inputs[8] if len(inputs) > 8 else decoder_inputs_embeds + use_cache = inputs[9] if len(inputs) > 9 else use_cache + output_attentions = inputs[10] if len(inputs) > 10 else output_attentions + output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states + assert len(inputs) <= 12, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + if "inputs" in inputs: + warnings.warn("Using `inputs` as a keyword argument is deprecated. 
Please use `input_ids` instead.") + input_ids = inputs.get("inputs") + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + encoder_outputs = inputs.get("encoder_outputs", encoder_outputs) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + head_mask = inputs.get("head_mask", head_mask) + decoder_past_key_value_states = inputs.get("past_key_value_states", decoder_past_key_value_states) + decoder_input_ids = inputs.get("decoder_input_ids", decoder_input_ids) + decoder_attention_mask = inputs.get("decoder_attention_mask", decoder_attention_mask) + decoder_inputs_embeds = inputs.get("decoder_inputs_embeds", decoder_inputs_embeds) + use_cache = inputs.get("use_cache", use_cache) + output_attentions = inputs.get("output_attentions", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + assert len(inputs) <= 12, "Too many inputs." else: - kwargs["inputs"] = inputs - - # retrieve arguments - inputs = kwargs.get("inputs", None) - inputs_embeds = kwargs.get("inputs_embeds", None) - attention_mask = kwargs.get("attention_mask", None) - encoder_outputs = kwargs.get("encoder_outputs", None) - decoder_input_ids = kwargs.get("decoder_input_ids", None) - decoder_attention_mask = kwargs.get("decoder_attention_mask", None) - decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) - decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) - use_cache = kwargs.get("use_cache", None) - head_mask = kwargs.get("head_mask", None) - output_attentions = kwargs.get("output_attentions", None) - output_hidden_states = kwargs.get("output_hidden_states", None) + input_ids = inputs use_cache = use_cache if use_cache is not None else self.config.use_cache # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( - inputs, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + input_ids, + attention_mask, + None, + None, + inputs_embeds, + head_mask, + None, + False, + output_attentions, + output_hidden_states, + ], + training=training, ) hidden_states = encoder_outputs[0] @@ -987,19 +1054,22 @@ def call(self, inputs, **kwargs): # Decode decoder_outputs = self.decoder( - decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_value_states=decoder_past_key_value_states, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + decoder_input_ids, + decoder_attention_mask, + hidden_states, + attention_mask, + decoder_inputs_embeds, + head_mask, + decoder_past_key_value_states, + use_cache, + output_attentions, + output_hidden_states, + ], + training=training, ) - if use_cache is True: + if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] @@ -1007,7 +1077,7 @@ def call(self, inputs, **kwargs): @add_start_docstrings("""T5 Model with a `language modeling` head on top. 
""", T5_START_DOCSTRING) -class TFT5ForConditionalGeneration(TFT5PreTrainedModel): +class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.model_dim = config.d_model @@ -1050,8 +1120,28 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + head_mask=None, + decoder_past_key_value_states=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): @@ -1090,25 +1180,41 @@ def call(self, inputs, **kwargs): >>> result = model.generate(inputs) """ - - if isinstance(inputs, dict): - kwargs.update(inputs) + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + encoder_outputs = inputs[2] if len(inputs) > 2 else encoder_outputs + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + head_mask = inputs[4] if len(inputs) > 4 else head_mask + decoder_past_key_value_states = inputs[5] if len(inputs) > 5 else decoder_past_key_value_states + decoder_input_ids = inputs[6] if len(inputs) > 6 else decoder_input_ids + decoder_attention_mask = inputs[7] if len(inputs) > 7 else decoder_attention_mask + decoder_inputs_embeds = inputs[8] if len(inputs) > 8 else decoder_inputs_embeds + use_cache = inputs[9] if len(inputs) > 9 else use_cache + output_attentions = inputs[10] if len(inputs) > 10 else output_attentions + output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states + labels = inputs[12] if len(inputs) > 12 else labels + assert len(inputs) <= 13, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + if "inputs" in inputs: + warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") + input_ids = inputs.get("inputs") + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + encoder_outputs = inputs.get("encoder_outputs", encoder_outputs) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + head_mask = inputs.get("head_mask", head_mask) + decoder_past_key_value_states = inputs.get("past_key_value_states", decoder_past_key_value_states) + decoder_input_ids = inputs.get("decoder_input_ids", decoder_input_ids) + decoder_attention_mask = inputs.get("decoder_attention_mask", decoder_attention_mask) + decoder_inputs_embeds = inputs.get("decoder_inputs_embeds", decoder_inputs_embeds) + use_cache = inputs.get("use_cache", use_cache) + output_attentions = inputs.get("output_attentions", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + labels = inputs.get("labels", labels) + assert len(inputs) <= 13, "Too many inputs." 
else: - kwargs["inputs"] = inputs - - # retrieve arguments - inputs = kwargs.get("inputs", None) - decoder_input_ids = kwargs.get("decoder_input_ids", None) - attention_mask = kwargs.get("attention_mask", None) - encoder_outputs = kwargs.get("encoder_outputs", None) - decoder_attention_mask = kwargs.get("decoder_attention_mask", None) - decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) - use_cache = kwargs.get("use_cache", None) - inputs_embeds = kwargs.get("inputs_embeds", None) - decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) - head_mask = kwargs.get("head_mask", None) - output_attentions = kwargs.get("output_attentions", None) - output_hidden_states = kwargs.get("output_hidden_states", None) + input_ids = inputs use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1116,16 +1222,27 @@ def call(self, inputs, **kwargs): if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( - inputs, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + input_ids, + attention_mask, + None, + None, + inputs_embeds, + head_mask, + None, + False, + output_attentions, + output_hidden_states, + ], + training=training, ) hidden_states = encoder_outputs[0] + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: @@ -1136,28 +1253,35 @@ def call(self, inputs, **kwargs): # Decode decoder_outputs = self.decoder( - decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_value_states=decoder_past_key_value_states, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + decoder_input_ids, + decoder_attention_mask, + hidden_states, + attention_mask, + decoder_inputs_embeds, + head_mask, + decoder_past_key_value_states, + use_cache, + output_attentions, + output_hidden_states, + ], + training=training, ) # insert decoder past at right place # to speed up decoding - if use_cache is True: + if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) embed_tokens = self.get_output_embeddings() - lm_logits = embed_tokens(sequence_output, mode="linear") - decoder_outputs = (lm_logits,) + decoder_outputs[1:] + logits = embed_tokens(sequence_output, mode="linear") + decoder_outputs = (logits,) + decoder_outputs[1:] + + if labels is not None: + loss = self.compute_loss(labels, logits) + decoder_outputs = (loss,) + decoder_outputs return decoder_outputs + encoder_outputs diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 741d013b6242bd..5452ecd3dc05c6 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -17,6 +17,7 @@ import functools import logging import os +import warnings import h5py import numpy as np @@ -107,6 
+108,19 @@ def get_config(self): return cls +class TFCausalLanguageModelingLoss: + def compute_loss(self, labels, logits): + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + ) + # make sure only labels that are not equal to -100 + # are taken into account as loss + active_loss = tf.reshape(labels, (-1,)) != -100 + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) + return loss_fn(labels, reduced_logits) + + class TFQuestionAnsweringLoss: def compute_loss(self, labels, logits): loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( @@ -123,7 +137,13 @@ def compute_loss(self, labels, logits): loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=tf.keras.losses.Reduction.NONE ) - active_loss = tf.reshape(labels, (-1,)) != -1 + # make sure only labels that are not equal to -100 + # are taken into account as loss + if tf.math.reduce_any(labels == -1).numpy() is True: + warnings.warn("Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead.") + active_loss = tf.reshape(labels, (-1,)) != -1 + else: + active_loss = tf.reshape(labels, (-1,)) != -100 reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) @@ -143,6 +163,7 @@ def compute_loss(self, labels, logits): TFMultipleChoiceLoss = TFSequenceClassificationLoss +TFMaskedLanguageModelingLoss = TFCausalLanguageModelingLoss class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 3ec96593235f09..ac8d9fe66401d6 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -30,6 +30,7 @@ add_start_docstrings_to_callable, ) from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -871,7 +872,7 @@ def call(self, inputs, **kwargs): (linear layer with weights tied to the input embeddings). """, XLNET_START_DOCSTRING, ) -class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): +class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -912,8 +913,28 @@ def prepare_inputs_for_generation(self, inputs, past, **kwargs): return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -957,12 +978,40 @@ def call(self, inputs, **kwargs): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[12] if len(inputs) > 12 else labels + if len(inputs) > 12: + inputs = inputs[:12] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + training=training, + ) hidden_state = transformer_outputs[0] - logits = self.lm_loss(hidden_state) + logits = self.lm_loss(hidden_state, training=training) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + return outputs # return logits, (mems), (hidden states), (attentions) diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 0b67bafc3b7808..a950b33de18371 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -1041,9 +1041,9 @@ def forward( head_mask=None, inputs_embeds=None, use_cache=True, - labels=None, output_attentions=None, output_hidden_states=None, + labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index 0c602851ef1a11..871d4d9da3c50c 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -32,6 +32,7 @@ DistilBertForTokenClassification, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) class DistilBertModelTester(object): @@ -276,8 +277,8 @@ def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) - # @slow - # def test_model_from_pretrained(self): - # for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - # model = DistilBertModel.from_pretrained(model_name) - # self.assertIsNotNone(model) + @slow + def test_model_from_pretrained(self): + for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py index 3687513dbfc725..2702bfb5205d5f 100644 --- 
a/tests/test_modeling_tf_auto.py +++ b/tests/test_modeling_tf_auto.py @@ -24,6 +24,8 @@ from transformers import ( AutoConfig, BertConfig, + GPT2Config, + T5Config, TFAutoModel, TFBertModel, TFAutoModelForPreTraining, @@ -35,6 +37,25 @@ TFBertForSequenceClassification, TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering, + TFAutoModelForCausalLM, + TFGPT2LMHeadModel, + TFAutoModelForMaskedLM, + TFAutoModelForSeq2SeqLM, + TFT5ForConditionalGeneration, + ) + from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.modeling_tf_auto import ( + TF_MODEL_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, ) @@ -72,10 +93,21 @@ def test_model_for_pretraining_from_pretrained(self): self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForPreTraining) + @slow + def test_model_for_causal_lm(self): + for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, GPT2Config) + + model = TFAutoModelForCausalLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFGPT2LMHeadModel) + @slow def test_lmhead_model_from_pretrained(self): - # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - for model_name in ["bert-base-uncased"]: + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -84,6 +116,30 @@ def test_lmhead_model_from_pretrained(self): self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForMaskedLM) + @slow + def test_model_for_masked_lm(self): + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForMaskedLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + @slow + def test_model_for_encoder_decoder_lm(self): + for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, T5Config) + + model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFT5ForConditionalGeneration) + @slow def test_sequence_classification_model_from_pretrained(self): # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -119,3 +175,28 @@ def test_from_identifier_from_model_type(self): self.assertIsInstance(model, TFRobertaForMaskedLM) self.assertEqual(model.num_parameters(), 14830) self.assertEqual(model.num_parameters(only_trainable=True), 14830) 
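The new TF auto classes exercised in this test file resolve a config to the matching head class (GPT-2 → `TFGPT2LMHeadModel`, BERT → `TFBertForMaskedLM`, T5 → `TFT5ForConditionalGeneration`). A short usage sketch, assuming the standard public checkpoint names rather than anything pinned by this patch:

```python
from transformers import (
    TFAutoModelForCausalLM,
    TFAutoModelForMaskedLM,
    TFAutoModelForSeq2SeqLM,
)

causal_lm = TFAutoModelForCausalLM.from_pretrained("gpt2")               # resolves to TFGPT2LMHeadModel
masked_lm = TFAutoModelForMaskedLM.from_pretrained("bert-base-uncased")  # resolves to TFBertForMaskedLM
seq2seq_lm = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")         # resolves to TFT5ForConditionalGeneration
```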
+ + def test_parents_and_children_in_mappings(self): + # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered + # by the parents and will return the wrong configuration type when using auto models + mappings = ( + TF_MODEL_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + ) + + for mapping in mappings: + mapping = tuple(mapping.items()) + for index, (child_config, child_model) in enumerate(mapping[1:]): + for parent_config, parent_model in mapping[: index + 1]: + with self.subTest( + msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__) + ): + self.assertFalse(issubclass(child_config, parent_config)) + self.assertFalse(issubclass(child_model, parent_model)) diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py index 042f3b40067e06..7e1884bafca410 100644 --- a/tests/test_modeling_tf_bert.py +++ b/tests/test_modeling_tf_bert.py @@ -27,6 +27,7 @@ import tensorflow as tf from transformers.modeling_tf_bert import ( TFBertModel, + TFBertLMHeadModel, TFBertForMaskedLM, TFBertForNextSentencePrediction, TFBertForPreTraining, @@ -142,11 +143,30 @@ def create_and_check_bert_model( ) self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + def create_and_check_bert_lm_head( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.is_decoder = True + model = TFBertLMHeadModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + (prediction_scores,) = model(inputs) + self.parent.assertListEqual( + list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + def create_and_check_bert_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFBertForMaskedLM(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), @@ -186,11 +206,14 @@ def create_and_check_bert_for_sequence_classification( ): config.num_labels = self.num_labels model = TFBertForSequenceClassification(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, } + + (logits,) = model(inputs) + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_bert_for_multiple_choice( @@ -207,9 +230,7 @@ def create_and_check_bert_for_multiple_choice( "token_type_ids": multiple_choice_token_type_ids, } (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def 
create_and_check_bert_for_token_classification( @@ -217,7 +238,11 @@ def create_and_check_bert_for_token_classification( ): config.num_labels = self.num_labels model = TFBertForTokenClassification(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } (logits,) = model(inputs) result = { "logits": logits.numpy(), @@ -228,12 +253,14 @@ def create_and_check_bert_for_question_answering( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFBertForQuestionAnswering(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, } + + start_logits, end_logits = model(inputs) + result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()} self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -285,6 +312,10 @@ def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs) + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_lm_head(*config_and_inputs) + def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 56fc8c9cf76fb1..0d90a82ecb6ce1 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -38,6 +38,9 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, ) if _tf_gpu_memory_limit is not None: @@ -93,6 +96,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size) elif model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(): inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) + elif model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(): + inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) + elif model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.values(): + inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) + elif model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(): + inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) return inputs_dict def test_initialization(self): @@ -291,7 +300,7 @@ def test_compile_tf_model(self): "decoder_input_ids": tf.keras.Input( batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32" ), - "inputs": tf.keras.Input(batch_shape=(2, 2000), name="inputs", dtype="int32"), + "input_ids": 
tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), } elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): input_ids = tf.keras.Input(batch_shape=(4, 2, 2000), name="input_ids", dtype="int32") @@ -325,7 +334,7 @@ def test_keyword_and_dict_args(self): outputs_dict = model(self._prepare_for_class(inputs_dict, model_class)) inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,) + input_ids = inputs_keywords.pop("input_ids", None) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() output_keywords = outputs_keywords[0].numpy() @@ -479,9 +488,9 @@ def test_inputs_embeds(self): input_ids = inputs["input_ids"] del inputs["input_ids"] else: - encoder_input_ids = inputs["inputs"] + encoder_input_ids = inputs["input_ids"] decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["inputs"] + del inputs["input_ids"] inputs.pop("decoder_input_ids", None) wte = model.get_input_embeddings() @@ -596,9 +605,15 @@ def test_loss_computation(self): added_label = prepared_for_class[list(prepared_for_class.keys() - inputs_dict.keys())[0]] loss_size = tf.size(added_label) + if model.__class__ in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(): + # if loss is causal lm loss, labels are shift, so that one label per batch + # is cut + loss_size = loss_size - self.model_tester.batch_size + # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) input_ids = prepared_for_class.pop("input_ids") + loss = model(input_ids, **prepared_for_class)[0] self.assertEqual(loss.shape, [loss_size]) diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py index 7d7b63fb0e9c7f..e3c83a47a7c3f4 100644 --- a/tests/test_modeling_tf_distilbert.py +++ b/tests/test_modeling_tf_distilbert.py @@ -17,7 +17,7 @@ import unittest from transformers import DistilBertConfig, is_tf_available -from transformers.testing_utils import require_tf +from transformers.testing_utils import require_tf, slow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor @@ -32,6 +32,7 @@ TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertForMultipleChoice, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) @@ -118,9 +119,7 @@ def create_and_check_distilbert_for_masked_lm( model = TFDistilBertForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } + result = {"prediction_scores": prediction_scores.numpy()} self.parent.assertListEqual( list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) @@ -129,12 +128,12 @@ def create_and_check_distilbert_for_question_answering( self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFDistilBertForQuestionAnswering(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, } + start_logits, end_logits = model(inputs) + result = {"start_logits": 
start_logits.numpy(), "end_logits": end_logits.numpy()} self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -145,9 +144,7 @@ def create_and_check_distilbert_for_sequence_classification( model = TFDistilBertForSequenceClassification(config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_distilbert_for_multiple_choice( @@ -162,9 +159,7 @@ def create_and_check_distilbert_for_multiple_choice( "attention_mask": multiple_choice_input_mask, } (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def create_and_check_distilbert_for_token_classification( @@ -236,8 +231,8 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) - # @slow - # def test_model_from_pretrained(self): - # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - # model = DistilBertModesss.from_pretrained(model_name) - # self.assertIsNotNone(model) + @slow + def test_model_from_pretrained(self): + for model_name in list(TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]): + model = TFDistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index be4637f465af4a..6147279d2e271c 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -76,6 +76,7 @@ def prepare_config_and_inputs(self): eos_token_id=self.eos_token_id, bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, ) return (config, input_ids, input_mask, token_labels) @@ -83,7 +84,7 @@ def prepare_config_and_inputs(self): def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): model = TFT5Model(config=config) inputs = { - "inputs": input_ids, + "input_ids": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } @@ -114,7 +115,7 @@ def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels) def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = TFT5ForConditionalGeneration(config=config) inputs_dict = { - "inputs": input_ids, + "input_ids": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } @@ -208,7 +209,7 @@ def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, token_labels) = config_and_inputs inputs_dict = { - "inputs": input_ids, + "input_ids": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, "use_cache": tf.convert_to_tensor([False]),
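For reference, the `-100` label masking introduced by `TFCausalLanguageModelingLoss` (and reused via the `TFMaskedLanguageModelingLoss` alias) can be illustrated standalone; this is a sketch with toy shapes, not library code:

```python
import tensorflow as tf

def masked_sparse_ce(labels, logits):
    # per-token sparse cross entropy; positions labelled -100 are dropped from the loss
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    flat_labels = tf.reshape(labels, (-1,))
    flat_logits = tf.reshape(logits, (-1, logits.shape[-1]))
    active = flat_labels != -100
    return loss_fn(tf.boolean_mask(flat_labels, active), tf.boolean_mask(flat_logits, active))

labels = tf.constant([[5, 2, -100], [1, -100, -100]])  # 3 of 6 positions masked
logits = tf.random.uniform((2, 3, 10))                 # (batch, seq_len, vocab)
per_token_loss = masked_sparse_ce(labels, logits)      # shape (3,), one value per unmasked token
```

Keeping the reduction as `NONE` mirrors the loss classes in the patch, which return one loss value per unmasked token rather than a scalar.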