diff --git a/examples/token-classification/run_tf_ner.py b/examples/token-classification/run_tf_ner.py index 056a24c74fdaec..068f0617371cce 100644 --- a/examples/token-classification/run_tf_ner.py +++ b/examples/token-classification/run_tf_ner.py @@ -17,6 +17,7 @@ import logging import os +import warnings from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple @@ -184,7 +185,12 @@ def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[L for i in range(batch_size): for j in range(seq_len): - if label_ids[i, j] != -1: + if label_ids[i, j] == -1: + label_ids[i, j] = -100 + warnings.warn( + "Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead." + ) + if label_ids[i, j] != -100: out_label_list[i].append(label_map[label_ids[i][j]]) preds_list[i].append(label_map[preds[i][j]]) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index e7aaed67189235..8f68fedff049bb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -424,6 +424,9 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TFAutoModel, TFAutoModelForMultipleChoice, TFAutoModelForPreTraining, @@ -431,6 +434,9 @@ TFAutoModelForSequenceClassification, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, + TFAutoModelForCausalLM, + TFAutoModelForMaskedLM, + TFAutoModelForSeq2SeqLM, ) from .modeling_tf_albert import ( @@ -449,6 +455,7 @@ from .modeling_tf_bert import ( TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFBertEmbeddings, + TFBertLMHeadModel, TFBertForMaskedLM, TFBertForMultipleChoice, TFBertForNextSentencePrediction, diff --git a/src/transformers/modeling_auto.py b/src/transformers/modeling_auto.py index a6c0688f734b71..15e55cb866da16 100644 --- a/src/transformers/modeling_auto.py +++ b/src/transformers/modeling_auto.py @@ -73,6 +73,7 @@ from .modeling_camembert import ( CamembertForMaskedLM, CamembertForMultipleChoice, + CamembertForQuestionAnswering, CamembertForSequenceClassification, CamembertForTokenClassification, CamembertModel, @@ -306,6 +307,7 @@ [ (DistilBertConfig, DistilBertForQuestionAnswering), (AlbertConfig, AlbertForQuestionAnswering), + (CamembertConfig, CamembertForQuestionAnswering), (BartConfig, BartForQuestionAnswering), (LongformerConfig, LongformerForQuestionAnswering), (XLMRobertaConfig, XLMRobertaForQuestionAnswering), @@ -336,7 +338,6 @@ ] ) - MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( [ (CamembertConfig, CamembertForMultipleChoice), diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 3c2a7bbf98f89f..0d9b699ddda24b 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -29,6 +29,7 @@ ) from .modeling_tf_bert import ACT2FN, TFBertSelfAttention from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -822,7 +823,7 @@ def call(self, pooled_output, training: bool): @add_start_docstrings("""Albert Model with a `language modeling` head on top. 
""", ALBERT_START_DOCSTRING) -class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): +class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -834,8 +835,26 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="albert-base-v2") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj::obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` @@ -852,14 +871,35 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - outputs = self.albert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.albert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) sequence_output = outputs[0] - prediction_scores = self.predictions(sequence_output, training=kwargs.get("training", False)) + prediction_scores = self.predictions(sequence_output, training=training) # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs + return outputs # prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py index 7e9c2cd111fe27..bc954a7f4a655c 100644 --- a/src/transformers/modeling_tf_auto.py +++ b/src/transformers/modeling_tf_auto.py @@ -16,6 +16,7 @@ import logging +import warnings from collections import OrderedDict from .configuration_auto import ( @@ -54,6 +55,7 @@ TFBertForQuestionAnswering, TFBertForSequenceClassification, TFBertForTokenClassification, + TFBertLMHeadModel, TFBertModel, ) from .modeling_tf_camembert import ( @@ -140,126 +142,158 @@ TF_MODEL_MAPPING = OrderedDict( [ + (T5Config, TFT5Model), + (DistilBertConfig, TFDistilBertModel), (AlbertConfig, TFAlbertModel), (CamembertConfig, TFCamembertModel), - (CTRLConfig, TFCTRLModel), - (DistilBertConfig, TFDistilBertModel), - (ElectraConfig, TFElectraModel), - (FlaubertConfig, TFFlaubertModel), - (GPT2Config, TFGPT2Model), 
- (MobileBertConfig, TFMobileBertModel), - (OpenAIGPTConfig, TFOpenAIGPTModel), + (XLMRobertaConfig, TFXLMRobertaModel), (RobertaConfig, TFRobertaModel), (BertConfig, TFBertModel), - (T5Config, TFT5Model), + (OpenAIGPTConfig, TFOpenAIGPTModel), + (GPT2Config, TFGPT2Model), + (MobileBertConfig, TFMobileBertModel), (TransfoXLConfig, TFTransfoXLModel), - (XLMConfig, TFXLMModel), - (XLMRobertaConfig, TFXLMRobertaModel), (XLNetConfig, TFXLNetModel), + (FlaubertConfig, TFFlaubertModel), + (XLMConfig, TFXLMModel), + (CTRLConfig, TFCTRLModel), + (ElectraConfig, TFElectraModel), ] ) TF_MODEL_FOR_PRETRAINING_MAPPING = OrderedDict( [ + (T5Config, TFT5ForConditionalGeneration), + (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForPreTraining), (CamembertConfig, TFCamembertForMaskedLM), - (CTRLConfig, TFCTRLLMHeadModel), - (DistilBertConfig, TFDistilBertForMaskedLM), - (ElectraConfig, TFElectraForPreTraining), - (FlaubertConfig, TFFlaubertWithLMHeadModel), - (GPT2Config, TFGPT2LMHeadModel), - (MobileBertConfig, TFMobileBertForPreTraining), - (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForPreTraining), - (T5Config, TFT5ForConditionalGeneration), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), + (MobileBertConfig, TFMobileBertForPreTraining), (TransfoXLConfig, TFTransfoXLLMHeadModel), - (XLMConfig, TFXLMWithLMHeadModel), - (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (XLNetConfig, TFXLNetLMHeadModel), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (CTRLConfig, TFCTRLLMHeadModel), + (ElectraConfig, TFElectraForPreTraining), ] ) TF_MODEL_WITH_LM_HEAD_MAPPING = OrderedDict( [ + (T5Config, TFT5ForConditionalGeneration), + (DistilBertConfig, TFDistilBertForMaskedLM), (AlbertConfig, TFAlbertForMaskedLM), (CamembertConfig, TFCamembertForMaskedLM), - (CTRLConfig, TFCTRLLMHeadModel), - (DistilBertConfig, TFDistilBertForMaskedLM), - (ElectraConfig, TFElectraForMaskedLM), - (FlaubertConfig, TFFlaubertWithLMHeadModel), - (GPT2Config, TFGPT2LMHeadModel), - (MobileBertConfig, TFMobileBertForMaskedLM), - (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (RobertaConfig, TFRobertaForMaskedLM), (BertConfig, TFBertForMaskedLM), - (T5Config, TFT5ForConditionalGeneration), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), + (MobileBertConfig, TFMobileBertForMaskedLM), (TransfoXLConfig, TFTransfoXLLMHeadModel), - (XLMConfig, TFXLMWithLMHeadModel), - (XLMRobertaConfig, TFXLMRobertaForMaskedLM), (XLNetConfig, TFXLNetLMHeadModel), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (CTRLConfig, TFCTRLLMHeadModel), + (ElectraConfig, TFElectraForMaskedLM), ] ) -TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( +TF_MODEL_FOR_CAUSAL_LM_MAPPING = OrderedDict( [ - (AlbertConfig, TFAlbertForMultipleChoice), - (CamembertConfig, TFCamembertForMultipleChoice), - (DistilBertConfig, TFDistilBertForMultipleChoice), - (FlaubertConfig, TFFlaubertForMultipleChoice), - (MobileBertConfig, TFMobileBertForMultipleChoice), - (RobertaConfig, TFRobertaForMultipleChoice), - (BertConfig, TFBertForMultipleChoice), - (XLMConfig, TFXLMForMultipleChoice), - (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), - (XLNetConfig, TFXLNetForMultipleChoice), + (BertConfig, TFBertLMHeadModel), + (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel), + (GPT2Config, TFGPT2LMHeadModel), 
+ (TransfoXLConfig, TFTransfoXLLMHeadModel), + (XLNetConfig, TFXLNetLMHeadModel), + ( + XLMConfig, + TFXLMWithLMHeadModel, + ), # XLM can be MLM and CLM => model should be split similar to BERT; leave here for now + (CTRLConfig, TFCTRLLMHeadModel), ] ) -TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( +TF_MODEL_FOR_MASKED_LM_MAPPING = OrderedDict( [ - (AlbertConfig, TFAlbertForQuestionAnswering), - (CamembertConfig, TFCamembertForQuestionAnswering), - (DistilBertConfig, TFDistilBertForQuestionAnswering), - (ElectraConfig, TFElectraForQuestionAnswering), - (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple), - (MobileBertConfig, TFMobileBertForQuestionAnswering), - (RobertaConfig, TFRobertaForQuestionAnswering), - (BertConfig, TFBertForQuestionAnswering), - (XLMConfig, TFXLMForQuestionAnsweringSimple), - (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering), - (XLNetConfig, TFXLNetForQuestionAnsweringSimple), + (DistilBertConfig, TFDistilBertForMaskedLM), + (AlbertConfig, TFAlbertForMaskedLM), + (CamembertConfig, TFCamembertForMaskedLM), + (XLMRobertaConfig, TFXLMRobertaForMaskedLM), + (RobertaConfig, TFRobertaForMaskedLM), + (BertConfig, TFBertForMaskedLM), + (MobileBertConfig, TFMobileBertForMaskedLM), + (FlaubertConfig, TFFlaubertWithLMHeadModel), + (XLMConfig, TFXLMWithLMHeadModel), + (ElectraConfig, TFElectraForMaskedLM), ] ) +TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict([(T5Config, TFT5ForConditionalGeneration)]) + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict( [ + (DistilBertConfig, TFDistilBertForSequenceClassification), (AlbertConfig, TFAlbertForSequenceClassification), (CamembertConfig, TFCamembertForSequenceClassification), - (DistilBertConfig, TFDistilBertForSequenceClassification), - (FlaubertConfig, TFFlaubertForSequenceClassification), - (MobileBertConfig, TFMobileBertForSequenceClassification), + (XLMRobertaConfig, TFXLMRobertaForSequenceClassification), (RobertaConfig, TFRobertaForSequenceClassification), (BertConfig, TFBertForSequenceClassification), - (XLMConfig, TFXLMForSequenceClassification), - (XLMRobertaConfig, TFXLMRobertaForSequenceClassification), (XLNetConfig, TFXLNetForSequenceClassification), + (MobileBertConfig, TFMobileBertForSequenceClassification), + (FlaubertConfig, TFFlaubertForSequenceClassification), + (XLMConfig, TFXLMForSequenceClassification), + ] +) + +TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict( + [ + (DistilBertConfig, TFDistilBertForQuestionAnswering), + (AlbertConfig, TFAlbertForQuestionAnswering), + (CamembertConfig, TFCamembertForQuestionAnswering), + (XLMRobertaConfig, TFXLMRobertaForQuestionAnswering), + (RobertaConfig, TFRobertaForQuestionAnswering), + (BertConfig, TFBertForQuestionAnswering), + (XLNetConfig, TFXLNetForQuestionAnsweringSimple), + (MobileBertConfig, TFMobileBertForQuestionAnswering), + (FlaubertConfig, TFFlaubertForQuestionAnsweringSimple), + (XLMConfig, TFXLMForQuestionAnsweringSimple), + (ElectraConfig, TFElectraForQuestionAnswering), ] ) TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = OrderedDict( [ + (DistilBertConfig, TFDistilBertForTokenClassification), (AlbertConfig, TFAlbertForTokenClassification), (CamembertConfig, TFCamembertForTokenClassification), - (DistilBertConfig, TFDistilBertForTokenClassification), - (ElectraConfig, TFElectraForTokenClassification), (FlaubertConfig, TFFlaubertForTokenClassification), - (MobileBertConfig, TFMobileBertForTokenClassification), - (RobertaConfig, TFRobertaForTokenClassification), - (BertConfig, TFBertForTokenClassification), (XLMConfig, 
TFXLMForTokenClassification), (XLMRobertaConfig, TFXLMRobertaForTokenClassification), + (RobertaConfig, TFRobertaForTokenClassification), + (BertConfig, TFBertForTokenClassification), + (MobileBertConfig, TFMobileBertForTokenClassification), (XLNetConfig, TFXLNetForTokenClassification), + (ElectraConfig, TFElectraForTokenClassification), + ] +) + +TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = OrderedDict( + [ + (CamembertConfig, TFCamembertForMultipleChoice), + (XLMConfig, TFXLMForMultipleChoice), + (XLMRobertaConfig, TFXLMRobertaForMultipleChoice), + (RobertaConfig, TFRobertaForMultipleChoice), + (BertConfig, TFBertForMultipleChoice), + (DistilBertConfig, TFDistilBertForMultipleChoice), + (MobileBertConfig, TFMobileBertForMultipleChoice), + (XLNetConfig, TFXLNetForMultipleChoice), + (FlaubertConfig, TFFlaubertForMultipleChoice), + (AlbertConfig, TFAlbertForMultipleChoice), ] ) @@ -303,11 +337,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) @@ -359,7 +393,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -368,17 +402,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. 
- the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -401,7 +435,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -452,11 +486,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. 
Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config (:class:`~transformers.PretrainedConfig`): + config (:class:`~transformers.TFPretrainedConfig`): The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertModelForMaskedLM` (DistilBERT model) @@ -478,7 +512,7 @@ def from_config(cls, config): if isinstance(config, config_class): return model_class(config) raise ValueError( - "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) @@ -513,21 +547,21 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. 
cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model configuration should be cached if the standard cache should not be used. @@ -549,7 +583,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class - initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of + initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. @@ -573,7 +607,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if isinstance(config, config_class): return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) raise ValueError( - "Unrecognized configuration class {} for this kind of AutoModel: {}.\n" + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" "Model type should be one of {}.".format( config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_PRETRAINING_MAPPING.keys()) ) @@ -619,28 +653,32 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: TFBertModel (Bert model) - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) - - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) - - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) + - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL model) - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) - - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - - isInstance of `xlm` configuration class: XLMModel (XLM model) + - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model) + - isInstance of `xlm` configuration class: TFXLMModel (XLM model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = TFAutoModelWithLMHead.from_config(config) # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` """ + warnings.warn( + "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) for config_class, model_class in TF_MODEL_WITH_LM_HEAD_MAPPING.items(): if isinstance(config, config_class): return model_class(config) @@ -676,7 +714,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -685,17 +723,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -718,7 +756,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). 
Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -731,6 +769,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ + warnings.warn( + "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) config = kwargs.pop("config", None) if not isinstance(config, PretrainedConfig): @@ -778,19 +820,19 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - - isInstance of `albert` configuration class: AlbertModel (Albert model) - - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `albert` configuration class: TFAlbertModel (Albert model) + - isInstance of `bert` configuration class: TFBertModel (Bert model) Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - model = AutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelForMulitpleChoice.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.items(): if isinstance(config, config_class): @@ -824,7 +866,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
- a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -833,17 +875,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -866,7 +908,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. 
Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -896,6 +938,406 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): ) +class TFAutoModelForCausalLM: + r""" + :class:`~transformers.TFAutoModelForCausalLM` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` + class method. + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForCausalLM is designed to be instantiated " + "using the `TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForCausalLM.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load + the model weights + + Args: + config (:class:`~transformers.TFPretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + + - isInstance of `bert` configuration class: :class:`~transformers.TFBertLMHeadModel` (Bert model) + - isInstance of `openai-gpt` configuration class: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) + - isInstance of `gpt2` configuration class: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model) + - isInstance of `transfo-xl` configuration class: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model) + - isInstance of `xlnet` configuration class: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model) + + Examples:: + + config = GPT2Config.from_pretrained('gpt2') # Download configuration from S3 and cache. + model = TFAutoModelForCausalLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. 
+ + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string: + + - `bert`: :class:`~transformers.TFBertLMHeadModel` (Bert model) + - `openai-gpt`: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model) + - `gpt2`: :class:`~transformers.TFGPT2LMHeadModel` (OpenAI GPT-2 model) + - `transfo-xl`: :class:`~transformers.TFTransfoXLLMHeadModel` (Transformer-XL model) + - `xlnet`: :class:`~transformers.TFXLNetLMHeadModel` (XLNet model) + - `ctrl`: :class:`~transformers.TFCTRLLMHeadModel` (Salesforce CTRL model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path: + Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + resume_download: (`optional`) boolean, default False: + Do not delete incompletely received file. Attempt to resume the download if such a file exists. 
+ proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + kwargs: (`optional`) Remaining dictionary of keyword arguments: + These arguments will be passed to the configuration and the model. + + Examples:: + + model = TFAutoModelForCausalLM.from_pretrained('gpt2') # Download model and configuration from S3 and cache. + model = TFAutoModelForCausalLM.from_pretrained('./test/gpt2_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/gpt2_tf_model_config.json') + model = TFAutoModelForCausalLM.from_pretrained('./tf_model/gpt2_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys()) + ) + ) + + +class TFAutoModelForMaskedLM: + r""" + :class:`~transformers.TFAutoModelForMaskedLM` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)` + class method. + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForMaskedLM is designed to be instantiated " + "using the `TFAutoModelForMaskedLM.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForMaskedLM.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. + It only affects the model's configuration. 
Use :func:`~transformers.TFAutoModel.from_pretrained` to load + the model weights + + Args: + config (:class:`~transformers.TFPretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model) + - isInstance of `roberta` configuration class: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model) + - isInstance of `bert` configuration class: :class:`~transformers.TFBertForMaskedLM` (Bert model) + - isInstance of `flaubert` configuration class: :class:`~transformers.TFFlaubertWithLMHeadModel` (Flaubert model) + - isInstance of `xlm` configuration class: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model) + - isInstance of `xlm-roberta` configuration class: :class:`~transformers.TFXLMRobertaForMaskedLM` (XLM-Roberta model) + - isInstance of `electra` configuration class: :class:`~transformers.TFElectraForMaskedLM` (Electra model) + - isInstance of `camembert` configuration class: :class:`~transformers.TFCamembertForMaskedLM` (Camembert model) + - isInstance of `albert` configuration class: :class:`~transformers.TFAlbertForMaskedLM` (Albert model) + + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = TFAutoModelForMaskedLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MASKED_LM_MAPPING.keys()) + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string: + + - `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model) + - `albert`: :class:`~transformers.TFAlbertForMaskedLM` (ALBERT model) + - `camembert`: :class:`~transformers.TFCamembertForMaskedLM` (CamemBERT model) + - `xlm-roberta`: :class:`~transformers.TFXLMRobertaForMaskedLM` (XLM-RoBERTa model) + - `longformer`: :class:`~transformers.TFLongformerForMaskedLM` (Longformer model) + - `roberta`: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model) + - `xlm`: :class:`~transformers.TFXLMWithLMHeadModel` (XLM model) + - `flaubert`: :class:`~transformers.TFFlaubertWithLMHeadModel` (Flaubert model) + - `electra`: :class:`~transformers.TFElectraForMaskedLM` (Electra model) + - `bert`: :class:`~transformers.TFBertLMHeadModel` (Bert model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path: + Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
+ - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + resume_download: (`optional`) boolean, default False: + Do not delete incompletely received file. Attempt to resume the download if such a file exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. + kwargs: (`optional`) Remaining dictionary of keyword arguments: + These arguments will be passed to the configuration and the model. + + Examples:: + + model = TFAutoModelForMaskedLM.from_pretrained('bert') # Download model and configuration from S3 and cache. + model = TFAutoModelForMaskedLM.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModelForMaskedLM.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, cls.__name__, ", ".join(c.__name__ for c in TF_MODEL_FOR_MASKED_LM_MAPPING.keys()) + ) + ) + + +class TFAutoModelForSeq2SeqLM: + r""" + :class:`~transformers.TFAutoModelForSeq2SeqLM` is a generic model class + that will be instantiated as one of the language modeling model classes of the library + when created with the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)` + class method. + + This class cannot be instantiated using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "TFAutoModelForSeq2SeqLM is designed to be instantiated " + "using the `TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForSeq2SeqLM.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load + the model weights + + Args: + config (:class:`~transformers.TFPretrainedConfig`): + The model class to instantiate is selected based on the configuration class: + + - isInstance of `t5` configuration class: :class:`~transformers.TFT5ForConditionalGeneration` (T5 model) + + Examples:: + + config = T5Config.from_pretrained('t5') + model = TFAutoModelForSeq2SeqLM.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + for config_class, model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class(config) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys()), + ) + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the language modeling model classes of the library + from a pre-trained model configuration. 
+ + The `from_pretrained()` method takes care of returning the correct model class instance + based on the `model_type` property of the config object, or when it's missing, + falling back to using pattern matching on the `pretrained_model_name_or_path` string: + + - `t5`: :class:`~transformers.TFT5ForConditionalGeneration` (T5 model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path: + Either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + resume_download: (`optional`) boolean, default False: + Do not delete incompletely received file. Attempt to resume the download if such a file exists. + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages. 
+ kwargs: (`optional`) Remaining dictionary of keyword arguments: + These arguments will be passed to the configuration and the model. + + Examples:: + + model = TFAutoModelForSeq2SeqLM.from_pretrained('t5-base') # Download model and configuration from S3 and cache. + model = TFAutoModelForSeq2SeqLM.from_pretrained('./test/t5_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/t5_tf_model_config.json') + model = TFAutoModelForSeq2SeqLM.from_pretrained('./tf_model/t5_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop("config", None) + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + + for config_class, model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.items(): + if isinstance(config, config_class): + return model_class.from_pretrained(pretrained_model_name_or_path, *model_args, config=config, **kwargs) + raise ValueError( + "Unrecognized configuration class {} for this kind of TFAutoModel: {}.\n" + "Model type should be one of {}.".format( + config.__class__, + cls.__name__, + ", ".join(c.__name__ for c in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.keys()), + ) + ) + + class TFAutoModelForSequenceClassification(object): r""" :class:`~transformers.TFAutoModelForSequenceClassification` is a generic model class @@ -930,11 +1372,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) @@ -946,7 +1388,7 @@ def from_config(cls, config): Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. - model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` """ for config_class, model_class in TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING.items(): if isinstance(config, config_class): @@ -983,7 +1425,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. 
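A short usage sketch for the two auto classes added above; the checkpoint names are the usual public ones and only illustrative, and scoring the input against itself is just a toy target to show where the loss lands in the output tuple:

from transformers import AutoTokenizer, TFAutoModelForMaskedLM, TFAutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = TFAutoModelForMaskedLM.from_pretrained("bert-base-uncased")

enc = tokenizer("The capital of France is [MASK].", return_tensors="tf")
# Positions labelled -100 would be ignored by the loss; here every position is scored.
outputs = model(enc["input_ids"], attention_mask=enc["attention_mask"], labels=enc["input_ids"])
loss, prediction_scores = outputs[0], outputs[1]

# The seq2seq auto class currently covers T5 checkpoints.
t5_model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")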
- a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -992,17 +1434,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -1025,7 +1467,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. 
Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -1090,11 +1532,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) @@ -1145,7 +1587,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. from_pt: (`Optional`) Boolean @@ -1154,17 +1596,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. 
+ In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -1187,7 +1629,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. Examples:: @@ -1222,7 +1664,7 @@ def __init__(self): raise EnvironmentError( "TFAutoModelForTokenClassification is designed to be instantiated " "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " - "`AutoModelForTokenClassification.from_config(config)` methods." + "`TFAutoModelForTokenClassification.from_config(config)` methods." ) @classmethod @@ -1232,11 +1674,11 @@ def from_config(cls, config): Note: Loading a model from its configuration file does **not** load the model weights. - It only affects the model's configuration. Use :func:`~transformers.AutoModel.from_pretrained` to load + It only affects the model's configuration. Use :func:`~transformers.TFAutoModel.from_pretrained` to load the model weights Args: - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: The model class to instantiate is selected based on the configuration class: - isInstance of `bert` configuration class: BertModel (Bert model) @@ -1282,23 +1724,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path to a `directory` containing model weights saved using :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). 
In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) instance of a class derived from :class:`~transformers.TFPretrainedConfig`: Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. state_dict: (`optional`) dict: an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + In this case though, you should check if using :func:`~transformers.TFPreTrainedModel.save_pretrained` and :func:`~transformers.TFPreTrainedModel.from_pretrained` is not a simpler option. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -1318,7 +1760,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.TFPretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. 
Examples:: diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index 0bab7699cef44f..b829b3b8ae6911 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -29,6 +29,8 @@ add_start_docstrings_to_callable, ) from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -803,9 +805,12 @@ def call(self, inputs, **kwargs): @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) -class TFBertForMaskedLM(TFBertPreTrainedModel): +class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) + assert ( + not config.is_decoder + ), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention." self.bert = TFBertMainLayer(config, name="bert") self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") @@ -815,8 +820,26 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -833,13 +856,113 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
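The -100 convention in the labels docstring above is the whole trick: positions labelled -100 are dropped before the sparse cross-entropy is evaluated, so only the (typically masked) tokens contribute to the loss. A rough, illustrative equivalent of what the shared masked-LM loss computes (not the library's exact code):

import tensorflow as tf

def masked_lm_loss(labels, logits):
    # labels: (batch, seq_len) ints, with -100 at positions that must not be scored
    # logits: (batch, seq_len, vocab_size) unnormalized prediction scores
    active = tf.not_equal(labels, -100)                  # boolean mask of real label positions
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    return loss_fn(
        tf.boolean_mask(labels, active),                 # (num_active,)
        tf.boolean_mask(logits, active),                 # (num_active, vocab_size)
    )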
""" - outputs = self.bert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.bert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + prediction_scores = self.mlm(sequence_output, training=training) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs + + return outputs # (loss), prediction_scores, (hidden_states), (attentions) + + +class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + assert config.is_decoder, "If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`" + + self.bert = TFBertMainLayer(config, name="bert") + self.mlm = TFBertMLMHead(config, self.bert.embeddings, name="mlm___cls") + + def get_output_embeddings(self): + return self.bert.embeddings + + @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-cased") + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): + r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + + Return: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: + prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.bert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + + sequence_output = outputs[0] + logits = self.mlm(sequence_output, training=training) + + outputs = (logits,) + outputs[2:] # Add hidden states and attention if they are here + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + return outputs # prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index 96c2d0e2ad9ca5..dc20cf74ba349a 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -24,6 +24,7 @@ from .configuration_ctrl import CTRLConfig from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFPreTrainedModel, TFSharedEmbeddings, cast_bool_to_primitive, @@ -542,7 +543,7 @@ def call(self, hidden_states): (linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, ) -class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): +class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFCTRLMainLayer(config, name="transformer") @@ -561,8 +562,26 @@ def prepare_inputs_for_generation(self, inputs, past, **kwargs): @add_start_docstrings_to_callable(CTRL_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="ctrl") - def call(self, inputs, **kwargs): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.CTRLConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -583,11 +602,37 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
""" - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[10] if len(inputs) > 10 else labels + if len(inputs) > 10: + inputs = inputs[:10] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + hidden_states = transformer_outputs[0] - lm_logits = self.lm_head(hidden_states) + logits = self.lm_head(hidden_states) - outputs = (lm_logits,) + transformer_outputs[1:] + outputs = (logits,) + transformer_outputs[1:] + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs return outputs # lm_logits, presents, (all hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index e9fe573bde6996..a22328d743b0fc 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -30,6 +30,7 @@ add_start_docstrings_to_callable, ) from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -116,7 +117,7 @@ def build(self, input_shape): def call(self, inputs, inputs_embeds=None, mode="embedding", training=False): """Get token embeddings of inputs. Args: - inputs: list of three int64 tensors with shape [batch_size, length]: (input_ids, position_ids, token_type_ids) + inputs: list of two int64 tensors with shape [batch_size, length]: (input_ids, position_ids) mode: string, a valid value is one of "embedding" and "linear". Returns: outputs: (1) If mode == "embedding", output embedding tensor, float32 with @@ -528,9 +529,9 @@ class TFDistilBertPreTrainedModel(TFPreTrainedModel): - a single Tensor with input_ids only and nothing else: :obj:`model(inputs_ids)` - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: - :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + :obj:`model([input_ids, attention_mask])` - a dictionary with one or several input Tensors associated to the input names given in the docstring: - :obj:`model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + :obj:`model({'input_ids': input_ids})` Parameters: config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. @@ -626,7 +627,7 @@ def call(self, hidden_states): @add_start_docstrings( """DistilBert Model with a `masked language modeling` head on top. 
""", DISTILBERT_START_DOCSTRING, ) -class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): +class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.vocab_size = config.vocab_size @@ -644,8 +645,23 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="distilbert-base-uncased") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: @@ -663,7 +679,22 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - distilbert_output = self.distilbert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[6] if len(inputs) > 6 else labels + if len(inputs) > 6: + inputs = inputs[:6] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + distilbert_output = self.distilbert( + inputs, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) hidden_states = distilbert_output[0] # (bs, seq_length, dim) prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) @@ -672,6 +703,11 @@ def call(self, inputs, **kwargs): prediction_logits = self.vocab_projector(prediction_logits) outputs = (prediction_logits,) + distilbert_output[1:] + + if labels is not None: + loss = self.compute_loss(labels, prediction_logits) + outputs = (loss,) + outputs + return outputs # logits, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py index b77c04e4d25ee6..595482ee275e88 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -7,6 +7,7 @@ from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_bert import ACT2FN, TFBertEncoder, TFBertPreTrainedModel from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFQuestionAnsweringLoss, TFTokenClassificationLoss, get_initializer, @@ -506,7 +507,7 @@ def call(self, hidden_states, training=False): the only model of the two to have been trained for the masked language modeling task.""", ELECTRA_START_DOCSTRING, ) -class TFElectraForMaskedLM(TFElectraPreTrainedModel): +class TFElectraForMaskedLM(TFElectraPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, **kwargs): super().__init__(config, **kwargs) @@ -534,9 +535,16 @@ def call( inputs_embeds=None, output_attentions=None, 
output_hidden_states=None, + labels=None, training=False, ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -553,6 +561,12 @@ def call( Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ + if isinstance(input_ids, (tuple, list)): + labels = input_ids[8] if len(input_ids) > 8 else labels + if len(input_ids) > 8: + input_ids = input_ids[:8] + elif isinstance(input_ids, (dict, BatchEncoding)): + labels = input_ids.pop("labels", labels) generator_hidden_states = self.electra( input_ids, @@ -571,6 +585,10 @@ def call( output = (prediction_scores,) output += generator_hidden_states[1:] + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + output = (loss,) + output + return output # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index 5c4bbd27c60256..de7dc4c3577fd6 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -24,6 +24,7 @@ from .configuration_gpt2 import GPT2Config from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFConv1D, TFPreTrainedModel, TFSequenceSummary, @@ -272,8 +273,8 @@ def call( head_mask = inputs[5] if len(inputs) > 5 else head_mask inputs_embeds = inputs[6] if len(inputs) > 6 else inputs_embeds use_cache = inputs[7] if len(inputs) > 7 else use_cache - output_attentions = inputs[8] if len(inputs) > 7 else output_attentions - output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states + output_attentions = inputs[8] if len(inputs) > 8 else output_attentions + output_hidden_states = inputs[9] if len(inputs) > 9 else output_hidden_states assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): input_ids = inputs.get("input_ids") @@ -524,7 +525,7 @@ def call(self, inputs, **kwargs): (linear layer with weights tied to the input embeddings). 
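All of the call() overrides in this patch unpack optional labels the same three ways, so a single concrete example may save some reading; DistilBERT is used here because it has the shortest input list (the checkpoint name is the usual public one, and labels equal to the inputs is only a toy target):

from transformers import DistilBertTokenizer, TFDistilBertForMaskedLM

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = TFDistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
enc = tokenizer("hello world", return_tensors="tf")

# 1) keyword argument
out_kw = model(enc["input_ids"], attention_mask=enc["attention_mask"], labels=enc["input_ids"])

# 2) dict / BatchEncoding: `labels` is popped out before the base model is called
out_dict = model({"input_ids": enc["input_ids"], "attention_mask": enc["attention_mask"], "labels": enc["input_ids"]})

# 3) a positional list in docstring order is also accepted; labels then sits in the last
#    slot (index 6 for DistilBERT, index 8 for the BERT-like models above)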
""", GPT2_START_DOCSTRING, ) -class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): +class TFGPT2LMHeadModel(TFGPT2PreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFGPT2MainLayer(config, name="transformer") @@ -541,8 +542,26 @@ def prepare_inputs_for_generation(self, inputs, past, **kwargs): @add_start_docstrings_to_callable(GPT2_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="gpt2") - def call(self, inputs, **kwargs): + def call( + self, + inputs, + past=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -563,12 +582,38 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[10] if len(inputs) > 10 else labels + if len(inputs) > 10: + inputs = inputs[:10] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + hidden_states = transformer_outputs[0] - lm_logits = self.transformer.wte(hidden_states, mode="linear") + logits = self.transformer.wte(hidden_states, mode="linear") - outputs = (lm_logits,) + transformer_outputs[1:] + outputs = (logits,) + transformer_outputs[1:] + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs return outputs # lm_logits, presents, (all hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py index 5e68853a1fbb28..98fd9c080cde9b 100644 --- a/src/transformers/modeling_tf_mobilebert.py +++ b/src/transformers/modeling_tf_mobilebert.py @@ -29,6 +29,7 @@ ) from .modeling_tf_bert import TFBertIntermediate, gelu, gelu_new, swish from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -929,7 +930,7 @@ def call(self, inputs, **kwargs): @add_start_docstrings("""MobileBert Model with a `language modeling` head on top. 
""", MOBILEBERT_START_DOCSTRING) -class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel): +class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -941,8 +942,25 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(MOBILEBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="google/mobilebert-uncased") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -959,14 +977,34 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - outputs = self.mobilebert(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.mobilebert( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get("training", False)) + prediction_scores = self.mlm(sequence_output, training=training) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs - return outputs # prediction_scores, (hidden_states), (attentions) + return outputs # (loss), prediction_scores, (hidden_states), (attentions) class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer): diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index 477e63ee590e79..ef6805abcc9a9a 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -24,6 +24,7 @@ from .configuration_openai import OpenAIGPTConfig from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFConv1D, TFPreTrainedModel, TFSequenceSummary, @@ -479,7 +480,7 @@ def call(self, inputs, **kwargs): (linear layer with weights tied to the input embeddings). 
""", OPENAI_GPT_START_DOCSTRING, ) -class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): +class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFOpenAIGPTMainLayer(config, name="transformer") @@ -489,8 +490,24 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(OPENAI_GPT_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="openai-gpt") - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -507,12 +524,35 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) hidden_states = transformer_outputs[0] - lm_logits = self.transformer.tokens_embed(hidden_states, mode="linear") + logits = self.transformer.tokens_embed(hidden_states, mode="linear") + outputs = (logits,) + transformer_outputs[1:] - outputs = (lm_logits,) + transformer_outputs[1:] + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs return outputs # lm_logits, (all hidden_states), (attentions) diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index 4148a69065aa9c..99ef96bc477e50 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -29,6 +29,7 @@ ) from .modeling_tf_bert import TFBertEmbeddings, TFBertMainLayer, gelu from .modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -264,7 +265,7 @@ def call(self, features): @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. 
""", ROBERTA_START_DOCSTRING) -class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): +class TFRobertaForMaskedLM(TFRobertaPreTrainedModel, TFMaskedLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) @@ -276,8 +277,26 @@ def get_output_embeddings(self): @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="roberta-base") - def call(self, inputs, **kwargs): + def call( + self, + inputs=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -294,14 +313,37 @@ def call(self, inputs, **kwargs): Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. """ - outputs = self.roberta(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[8] if len(inputs) > 8 else labels + if len(inputs) > 8: + inputs = inputs[:8] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + outputs = self.roberta( + inputs, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + + sequence_output = outputs[0] sequence_output = outputs[0] prediction_scores = self.lm_head(sequence_output) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - return outputs # prediction_scores, (hidden_states), (attentions) + if labels is not None: + loss = self.compute_loss(labels, prediction_scores) + outputs = (loss,) + outputs + + return outputs # (loss), prediction_scores, (hidden_states), (attentions) class TFRobertaClassificationHead(tf.keras.layers.Layer): diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index 1898397b4ce7b5..202842e9302117 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -20,12 +20,14 @@ import itertools import logging import math +import warnings import tensorflow as tf from .configuration_t5 import T5Config from .file_utils import DUMMY_INPUTS, DUMMY_MASK, add_start_docstrings, add_start_docstrings_to_callable from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFPreTrainedModel, TFSharedEmbeddings, cast_bool_to_primitive, @@ -111,6 +113,7 @@ def __init__(self, config, has_relative_attention_bias=False, **kwargs): super().__init__(**kwargs) self.layer_id = next(TFT5Attention.NEW_ID) self.is_decoder = config.is_decoder + self.use_cache = config.use_cache self.has_relative_attention_bias = 
has_relative_attention_bias self.relative_attention_num_buckets = config.relative_attention_num_buckets @@ -258,9 +261,7 @@ def unshape(x): k, v = past_key_value_state # to cope with keras serialization - use_cache = cast_bool_to_primitive(use_cache) - - if self.is_decoder and use_cache is True: + if self.is_decoder and cast_bool_to_primitive(use_cache, self.use_cache) is True: present_key_value_state = ((k, v),) else: present_key_value_state = (None,) @@ -295,7 +296,7 @@ def unshape(x): outputs = (context,) + present_key_value_state - if cast_bool_to_primitive(output_attentions) is True: + if cast_bool_to_primitive(output_attentions, True) is True: outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) @@ -572,18 +573,22 @@ def call( inputs_embeds = inputs[4] if len(inputs) > 4 else inputs_embeds head_mask = inputs[5] if len(inputs) > 5 else head_mask past_key_value_states = inputs[6] if len(inputs) > 6 else past_key_value_states - output_attentions = inputs[7] if len(inputs) > 7 else output_attentions - assert len(inputs) <= 8, "Too many inputs." + use_cache = inputs[7] if len(inputs) > 7 else use_cache + output_attentions = inputs[8] if len(inputs) > 7 else output_attentions + output_hidden_states = inputs[9] if len(inputs) > 8 else output_hidden_states + assert len(inputs) <= 10, "Too many inputs." elif isinstance(inputs, (dict, BatchEncoding)): - input_ids = inputs.get("decoder_input_ids") - attention_mask = inputs.get("decoder_attention_mask", attention_mask) + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) encoder_hidden_states = inputs.get("encoder_hidden_states", encoder_hidden_states) encoder_attention_mask = inputs.get("encoder_attention_mask", encoder_attention_mask) inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) head_mask = inputs.get("head_mask", head_mask) past_key_value_states = inputs.get("past_key_value_states", past_key_value_states) + use_cache = inputs.get("use_cache", use_cache) output_attentions = inputs.get("output_attentions", output_attentions) - assert len(inputs) <= 8, "Too many inputs." + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + assert len(inputs) <= 10, "Too many inputs." else: input_ids = inputs @@ -741,8 +746,8 @@ def call( all_hidden_states = all_hidden_states + (hidden_states,) outputs = (hidden_states,) - if use_cache is True: - assert self.is_decoder, "`use_cache` can only be set to `True` if {} is used as a decoder".format(self) + # need to check if is decoder here as well for special cases when using keras compile + if cast_bool_to_primitive(use_cache, self.use_cache) is True and self.is_decoder: outputs = outputs + (present_key_value_states,) if cast_bool_to_primitive(output_hidden_states) is True: outputs = outputs + (all_hidden_states,) @@ -771,12 +776,38 @@ def dummy_inputs(self): inputs = tf.constant(DUMMY_INPUTS) input_mask = tf.constant(DUMMY_MASK) dummy_inputs = { - "inputs": inputs, + "input_ids": inputs, "decoder_input_ids": inputs, "decoder_attention_mask": input_mask, } return dummy_inputs + def _shift_right(self, input_ids): + decoder_start_token_id = self.config.decoder_start_token_id + pad_token_id = self.config.pad_token_id + + assert ( + decoder_start_token_id is not None + ), "self.model.config.decoder_start_token_id has to be defined. In TF T5 it is usually set to the pad_token_id. 
See T5 docs for more information" + + # shift inputs to the right + shifted_input_ids = tf.zeros_like(input_ids, dtype=tf.int32) + shifted_input_ids = tf.roll(shifted_input_ids, 1, axis=-1) + start_tokens = tf.fill((shape_list(shifted_input_ids)[0], 1), decoder_start_token_id) + shifted_input_ids = tf.concat([start_tokens, shifted_input_ids[:, 1:]], -1) + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids + ) + + assert tf.math.reduce_any( + shifted_input_ids >= 0 + ).numpy(), "Verify that `labels` has only positive values and -100" + + return shifted_input_ids + T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer @@ -908,7 +939,22 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + head_mask=None, + decoder_past_key_value_states=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + training=False, + ): r""" Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: @@ -942,37 +988,58 @@ def call(self, inputs, **kwargs): >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - - if isinstance(inputs, dict): - kwargs.update(inputs) + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + encoder_outputs = inputs[2] if len(inputs) > 2 else encoder_outputs + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + head_mask = inputs[4] if len(inputs) > 4 else head_mask + decoder_past_key_value_states = inputs[5] if len(inputs) > 5 else decoder_past_key_value_states + decoder_input_ids = inputs[6] if len(inputs) > 6 else decoder_input_ids + decoder_attention_mask = inputs[7] if len(inputs) > 7 else decoder_attention_mask + decoder_inputs_embeds = inputs[8] if len(inputs) > 8 else decoder_inputs_embeds + use_cache = inputs[9] if len(inputs) > 9 else use_cache + output_attentions = inputs[10] if len(inputs) > 10 else output_attentions + output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states + assert len(inputs) <= 12, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + if "inputs" in inputs: + warnings.warn("Using `inputs` as a keyword argument is deprecated. 
Please use `input_ids` instead.") + input_ids = inputs.get("inputs") + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + encoder_outputs = inputs.get("encoder_outputs", encoder_outputs) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + head_mask = inputs.get("head_mask", head_mask) + decoder_past_key_value_states = inputs.get("past_key_value_states", decoder_past_key_value_states) + decoder_input_ids = inputs.get("decoder_input_ids", decoder_input_ids) + decoder_attention_mask = inputs.get("decoder_attention_mask", decoder_attention_mask) + decoder_inputs_embeds = inputs.get("decoder_inputs_embeds", decoder_inputs_embeds) + use_cache = inputs.get("use_cache", use_cache) + output_attentions = inputs.get("output_attentions", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + assert len(inputs) <= 12, "Too many inputs." else: - kwargs["inputs"] = inputs - - # retrieve arguments - inputs = kwargs.get("inputs", None) - inputs_embeds = kwargs.get("inputs_embeds", None) - attention_mask = kwargs.get("attention_mask", None) - encoder_outputs = kwargs.get("encoder_outputs", None) - decoder_input_ids = kwargs.get("decoder_input_ids", None) - decoder_attention_mask = kwargs.get("decoder_attention_mask", None) - decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) - decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) - use_cache = kwargs.get("use_cache", None) - head_mask = kwargs.get("head_mask", None) - output_attentions = kwargs.get("output_attentions", None) - output_hidden_states = kwargs.get("output_hidden_states", None) + input_ids = inputs use_cache = use_cache if use_cache is not None else self.config.use_cache # Encode if needed (training, first prediction pass) if encoder_outputs is None: encoder_outputs = self.encoder( - inputs, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + input_ids, + attention_mask, + None, + None, + inputs_embeds, + head_mask, + None, + False, + output_attentions, + output_hidden_states, + ], + training=training, ) hidden_states = encoder_outputs[0] @@ -987,19 +1054,22 @@ def call(self, inputs, **kwargs): # Decode decoder_outputs = self.decoder( - decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_value_states=decoder_past_key_value_states, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + decoder_input_ids, + decoder_attention_mask, + hidden_states, + attention_mask, + decoder_inputs_embeds, + head_mask, + decoder_past_key_value_states, + use_cache, + output_attentions, + output_hidden_states, + ], + training=training, ) - if use_cache is True: + if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] @@ -1007,7 +1077,7 @@ def call(self, inputs, **kwargs): @add_start_docstrings("""T5 Model with a `language modeling` head on top. 
""", T5_START_DOCSTRING) -class TFT5ForConditionalGeneration(TFT5PreTrainedModel): +class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.model_dim = config.d_model @@ -1050,8 +1120,28 @@ def get_decoder(self): return self.decoder @add_start_docstrings_to_callable(T5_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + encoder_outputs=None, + inputs_embeds=None, + head_mask=None, + decoder_past_key_value_states=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. + Returns: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs: loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): @@ -1090,25 +1180,41 @@ def call(self, inputs, **kwargs): >>> result = model.generate(inputs) """ - - if isinstance(inputs, dict): - kwargs.update(inputs) + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + encoder_outputs = inputs[2] if len(inputs) > 2 else encoder_outputs + inputs_embeds = inputs[3] if len(inputs) > 3 else inputs_embeds + head_mask = inputs[4] if len(inputs) > 4 else head_mask + decoder_past_key_value_states = inputs[5] if len(inputs) > 5 else decoder_past_key_value_states + decoder_input_ids = inputs[6] if len(inputs) > 6 else decoder_input_ids + decoder_attention_mask = inputs[7] if len(inputs) > 7 else decoder_attention_mask + decoder_inputs_embeds = inputs[8] if len(inputs) > 8 else decoder_inputs_embeds + use_cache = inputs[9] if len(inputs) > 9 else use_cache + output_attentions = inputs[10] if len(inputs) > 10 else output_attentions + output_hidden_states = inputs[11] if len(inputs) > 11 else output_hidden_states + labels = inputs[12] if len(inputs) > 12 else labels + assert len(inputs) <= 13, "Too many inputs." + elif isinstance(inputs, (dict, BatchEncoding)): + if "inputs" in inputs: + warnings.warn("Using `inputs` as a keyword argument is deprecated. Please use `input_ids` instead.") + input_ids = inputs.get("inputs") + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask", attention_mask) + encoder_outputs = inputs.get("encoder_outputs", encoder_outputs) + inputs_embeds = inputs.get("inputs_embeds", inputs_embeds) + head_mask = inputs.get("head_mask", head_mask) + decoder_past_key_value_states = inputs.get("past_key_value_states", decoder_past_key_value_states) + decoder_input_ids = inputs.get("decoder_input_ids", decoder_input_ids) + decoder_attention_mask = inputs.get("decoder_attention_mask", decoder_attention_mask) + decoder_inputs_embeds = inputs.get("decoder_inputs_embeds", decoder_inputs_embeds) + use_cache = inputs.get("use_cache", use_cache) + output_attentions = inputs.get("output_attentions", output_attentions) + output_hidden_states = inputs.get("output_hidden_states", output_hidden_states) + labels = inputs.get("labels", labels) + assert len(inputs) <= 13, "Too many inputs." 
else: - kwargs["inputs"] = inputs - - # retrieve arguments - inputs = kwargs.get("inputs", None) - decoder_input_ids = kwargs.get("decoder_input_ids", None) - attention_mask = kwargs.get("attention_mask", None) - encoder_outputs = kwargs.get("encoder_outputs", None) - decoder_attention_mask = kwargs.get("decoder_attention_mask", None) - decoder_past_key_value_states = kwargs.get("decoder_past_key_value_states", None) - use_cache = kwargs.get("use_cache", None) - inputs_embeds = kwargs.get("inputs_embeds", None) - decoder_inputs_embeds = kwargs.get("decoder_inputs_embeds", None) - head_mask = kwargs.get("head_mask", None) - output_attentions = kwargs.get("output_attentions", None) - output_hidden_states = kwargs.get("output_hidden_states", None) + input_ids = inputs use_cache = use_cache if use_cache is not None else self.config.use_cache @@ -1116,16 +1222,27 @@ def call(self, inputs, **kwargs): if encoder_outputs is None: # Convert encoder inputs in embeddings if needed encoder_outputs = self.encoder( - inputs, - attention_mask=attention_mask, - inputs_embeds=inputs_embeds, - head_mask=head_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + input_ids, + attention_mask, + None, + None, + inputs_embeds, + head_mask, + None, + False, + output_attentions, + output_hidden_states, + ], + training=training, ) hidden_states = encoder_outputs[0] + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + # get decoder inputs from shifting lm labels to the right + decoder_input_ids = self._shift_right(labels) + # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: @@ -1136,28 +1253,35 @@ def call(self, inputs, **kwargs): # Decode decoder_outputs = self.decoder( - decoder_input_ids, - attention_mask=decoder_attention_mask, - inputs_embeds=decoder_inputs_embeds, - past_key_value_states=decoder_past_key_value_states, - encoder_hidden_states=hidden_states, - encoder_attention_mask=attention_mask, - head_mask=head_mask, - use_cache=use_cache, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, + [ + decoder_input_ids, + decoder_attention_mask, + hidden_states, + attention_mask, + decoder_inputs_embeds, + head_mask, + decoder_past_key_value_states, + use_cache, + output_attentions, + output_hidden_states, + ], + training=training, ) # insert decoder past at right place # to speed up decoding - if use_cache is True: + if cast_bool_to_primitive(use_cache, self.config.use_cache) is True: past = ((encoder_outputs, decoder_outputs[1]),) decoder_outputs = decoder_outputs[:1] + past + decoder_outputs[2:] sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) embed_tokens = self.get_output_embeddings() - lm_logits = embed_tokens(sequence_output, mode="linear") - decoder_outputs = (lm_logits,) + decoder_outputs[1:] + logits = embed_tokens(sequence_output, mode="linear") + decoder_outputs = (logits,) + decoder_outputs[1:] + + if labels is not None: + loss = self.compute_loss(labels, logits) + decoder_outputs = (loss,) + decoder_outputs return decoder_outputs + encoder_outputs diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py index 741d013b6242bd..5452ecd3dc05c6 100644 --- a/src/transformers/modeling_tf_utils.py +++ b/src/transformers/modeling_tf_utils.py @@ -17,6 +17,7 @@ import functools import logging import os +import warnings import h5py import numpy as np @@ -107,6 
+108,19 @@ def get_config(self): return cls +class TFCausalLanguageModelingLoss: + def compute_loss(self, labels, logits): + loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=tf.keras.losses.Reduction.NONE + ) + # make sure only labels that are not equal to -100 + # are taken into account as loss + active_loss = tf.reshape(labels, (-1,)) != -100 + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) + return loss_fn(labels, reduced_logits) + + class TFQuestionAnsweringLoss: def compute_loss(self, labels, logits): loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( @@ -123,7 +137,13 @@ def compute_loss(self, labels, logits): loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True, reduction=tf.keras.losses.Reduction.NONE ) - active_loss = tf.reshape(labels, (-1,)) != -1 + # make sure only labels that are not equal to -100 + # are taken into account as loss + if tf.math.reduce_any(labels == -1).numpy() is True: + warnings.warn("Using `-1` to mask the loss for the token is depreciated. Please use `-100` instead.") + active_loss = tf.reshape(labels, (-1,)) != -1 + else: + active_loss = tf.reshape(labels, (-1,)) != -100 reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) @@ -143,6 +163,7 @@ def compute_loss(self, labels, logits): TFMultipleChoiceLoss = TFSequenceClassificationLoss +TFMaskedLanguageModelingLoss = TFCausalLanguageModelingLoss class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin): diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 3ec96593235f09..ac8d9fe66401d6 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -30,6 +30,7 @@ add_start_docstrings_to_callable, ) from .modeling_tf_utils import ( + TFCausalLanguageModelingLoss, TFMultipleChoiceLoss, TFPreTrainedModel, TFQuestionAnsweringLoss, @@ -871,7 +872,7 @@ def call(self, inputs, **kwargs): (linear layer with weights tied to the input embeddings). """, XLNET_START_DOCSTRING, ) -class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): +class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss): def __init__(self, config, *inputs, **kwargs): super().__init__(config, *inputs, **kwargs) self.transformer = TFXLNetMainLayer(config, name="transformer") @@ -912,8 +913,28 @@ def prepare_inputs_for_generation(self, inputs, past, **kwargs): return inputs @add_start_docstrings_to_callable(XLNET_INPUTS_DOCSTRING) - def call(self, inputs, **kwargs): + def call( + self, + inputs, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + labels=None, + training=False, + ): r""" + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the cross entropy classification loss. + Indices should be in ``[0, ..., config.vocab_size - 1]``. 
+ Return: :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.XLNetConfig`) and inputs: prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): @@ -957,12 +978,40 @@ def call(self, inputs, **kwargs): next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ - transformer_outputs = self.transformer(inputs, **kwargs) + if isinstance(inputs, (tuple, list)): + labels = inputs[12] if len(inputs) > 12 else labels + if len(inputs) > 12: + inputs = inputs[:12] + elif isinstance(inputs, (dict, BatchEncoding)): + labels = inputs.pop("labels", labels) + + transformer_outputs = self.transformer( + inputs, + attention_mask=None, + mems=None, + perm_mask=None, + target_mapping=None, + token_type_ids=None, + input_mask=None, + head_mask=None, + inputs_embeds=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + training=training, + ) hidden_state = transformer_outputs[0] - logits = self.lm_loss(hidden_state) + logits = self.lm_loss(hidden_state, training=training) outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + if labels is not None: + # shift labels to the left and cut last logit token + logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.compute_loss(labels, logits) + outputs = (loss,) + outputs + return outputs # return logits, (mems), (hidden states), (attentions) diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 0b67bafc3b7808..a950b33de18371 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -1041,9 +1041,9 @@ def forward( head_mask=None, inputs_embeds=None, use_cache=True, - labels=None, output_attentions=None, output_hidden_states=None, + labels=None, ): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_predict)`, `optional`, defaults to :obj:`None`): diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index 0c602851ef1a11..871d4d9da3c50c 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -17,7 +17,7 @@ import unittest from transformers import is_torch_available -from transformers.testing_utils import require_torch, torch_device +from transformers.testing_utils import require_torch, slow, torch_device from .test_configuration_common import ConfigTester from .test_modeling_common import ModelTesterMixin, ids_tensor @@ -32,6 +32,7 @@ DistilBertForTokenClassification, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) class DistilBertModelTester(object): @@ -276,8 +277,8 @@ def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_distilbert_for_multiple_choice(*config_and_inputs) - # @slow - # def test_model_from_pretrained(self): - # for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - # model = DistilBertModel.from_pretrained(model_name) - # self.assertIsNotNone(model) + @slow + def test_model_from_pretrained(self): + for model_name in DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + model = DistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py index 3687513dbfc725..2702bfb5205d5f 100644 --- 
a/tests/test_modeling_tf_auto.py +++ b/tests/test_modeling_tf_auto.py @@ -24,6 +24,8 @@ from transformers import ( AutoConfig, BertConfig, + GPT2Config, + T5Config, TFAutoModel, TFBertModel, TFAutoModelForPreTraining, @@ -35,6 +37,25 @@ TFBertForSequenceClassification, TFAutoModelForQuestionAnswering, TFBertForQuestionAnswering, + TFAutoModelForCausalLM, + TFGPT2LMHeadModel, + TFAutoModelForMaskedLM, + TFAutoModelForSeq2SeqLM, + TFT5ForConditionalGeneration, + ) + from transformers.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.modeling_tf_auto import ( + TF_MODEL_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, ) @@ -72,10 +93,21 @@ def test_model_for_pretraining_from_pretrained(self): self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForPreTraining) + @slow + def test_model_for_causal_lm(self): + for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, GPT2Config) + + model = TFAutoModelForCausalLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFGPT2LMHeadModel) + @slow def test_lmhead_model_from_pretrained(self): - # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - for model_name in ["bert-base-uncased"]: + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -84,6 +116,30 @@ def test_lmhead_model_from_pretrained(self): self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForMaskedLM) + @slow + def test_model_for_masked_lm(self): + for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, BertConfig) + + model = TFAutoModelForMaskedLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForMaskedLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFBertForMaskedLM) + + @slow + def test_model_for_encoder_decoder_lm(self): + for model_name in TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, T5Config) + + model = TFAutoModelForSeq2SeqLM.from_pretrained(model_name) + model, loading_info = TFAutoModelForSeq2SeqLM.from_pretrained(model_name, output_loading_info=True) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFT5ForConditionalGeneration) + @slow def test_sequence_classification_model_from_pretrained(self): # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: @@ -119,3 +175,28 @@ def test_from_identifier_from_model_type(self): self.assertIsInstance(model, TFRobertaForMaskedLM) self.assertEqual(model.num_parameters(), 14830) self.assertEqual(model.num_parameters(only_trainable=True), 14830) 
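The new TF auto classes exercised in this test file resolve a config to the matching head class (GPT-2 → `TFGPT2LMHeadModel`, BERT → `TFBertForMaskedLM`, T5 → `TFT5ForConditionalGeneration`). A short usage sketch, assuming the standard public checkpoint names rather than anything pinned by this patch:

```python
from transformers import (
    TFAutoModelForCausalLM,
    TFAutoModelForMaskedLM,
    TFAutoModelForSeq2SeqLM,
)

causal_lm = TFAutoModelForCausalLM.from_pretrained("gpt2")               # resolves to TFGPT2LMHeadModel
masked_lm = TFAutoModelForMaskedLM.from_pretrained("bert-base-uncased")  # resolves to TFBertForMaskedLM
seq2seq_lm = TFAutoModelForSeq2SeqLM.from_pretrained("t5-small")         # resolves to TFT5ForConditionalGeneration
```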
+ + def test_parents_and_children_in_mappings(self): + # Test that the children are placed before the parents in the mappings, as the `instanceof` will be triggered + # by the parents and will return the wrong configuration type when using auto models + mappings = ( + TF_MODEL_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_WITH_LM_HEAD_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + ) + + for mapping in mappings: + mapping = tuple(mapping.items()) + for index, (child_config, child_model) in enumerate(mapping[1:]): + for parent_config, parent_model in mapping[: index + 1]: + with self.subTest( + msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__) + ): + self.assertFalse(issubclass(child_config, parent_config)) + self.assertFalse(issubclass(child_model, parent_model)) diff --git a/tests/test_modeling_tf_bert.py b/tests/test_modeling_tf_bert.py index 042f3b40067e06..7e1884bafca410 100644 --- a/tests/test_modeling_tf_bert.py +++ b/tests/test_modeling_tf_bert.py @@ -27,6 +27,7 @@ import tensorflow as tf from transformers.modeling_tf_bert import ( TFBertModel, + TFBertLMHeadModel, TFBertForMaskedLM, TFBertForNextSentencePrediction, TFBertForPreTraining, @@ -142,11 +143,30 @@ def create_and_check_bert_model( ) self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + def create_and_check_bert_lm_head( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.is_decoder = True + model = TFBertLMHeadModel(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + (prediction_scores,) = model(inputs) + self.parent.assertListEqual( + list(prediction_scores.numpy().shape), [self.batch_size, self.seq_length, self.vocab_size] + ) + def create_and_check_bert_for_masked_lm( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFBertForMaskedLM(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } (prediction_scores,) = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), @@ -186,11 +206,14 @@ def create_and_check_bert_for_sequence_classification( ): config.num_labels = self.num_labels model = TFBertForSequenceClassification(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - (logits,) = model(inputs) - result = { - "logits": logits.numpy(), + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, } + + (logits,) = model(inputs) + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_bert_for_multiple_choice( @@ -207,9 +230,7 @@ def create_and_check_bert_for_multiple_choice( "token_type_ids": multiple_choice_token_type_ids, } (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def 
create_and_check_bert_for_token_classification( @@ -217,7 +238,11 @@ def create_and_check_bert_for_token_classification( ): config.num_labels = self.num_labels model = TFBertForTokenClassification(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } (logits,) = model(inputs) result = { "logits": logits.numpy(), @@ -228,12 +253,14 @@ def create_and_check_bert_for_question_answering( self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFBertForQuestionAnswering(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, } + + start_logits, end_logits = model(inputs) + result = {"start_logits": start_logits.numpy(), "end_logits": end_logits.numpy()} self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -285,6 +312,10 @@ def test_for_masked_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs) + def test_for_causal_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_bert_lm_head(*config_and_inputs) + def test_for_multiple_choice(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs) diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py index 56fc8c9cf76fb1..0d90a82ecb6ce1 100644 --- a/tests/test_modeling_tf_common.py +++ b/tests/test_modeling_tf_common.py @@ -38,6 +38,9 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, ) if _tf_gpu_memory_limit is not None: @@ -93,6 +96,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size) elif model_class in TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING.values(): inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) + elif model_class in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(): + inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) + elif model_class in TF_MODEL_FOR_MASKED_LM_MAPPING.values(): + inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) + elif model_class in TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING.values(): + inputs_dict["labels"] = tf.zeros((self.model_tester.batch_size, self.model_tester.seq_length)) return inputs_dict def test_initialization(self): @@ -291,7 +300,7 @@ def test_compile_tf_model(self): "decoder_input_ids": tf.keras.Input( batch_shape=(2, 2000), name="decoder_input_ids", dtype="int32" ), - "inputs": tf.keras.Input(batch_shape=(2, 2000), name="inputs", dtype="int32"), + "input_ids": 
tf.keras.Input(batch_shape=(2, 2000), name="input_ids", dtype="int32"), } elif model_class in TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING.values(): input_ids = tf.keras.Input(batch_shape=(4, 2, 2000), name="input_ids", dtype="int32") @@ -325,7 +334,7 @@ def test_keyword_and_dict_args(self): outputs_dict = model(self._prepare_for_class(inputs_dict, model_class)) inputs_keywords = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - input_ids = inputs_keywords.pop("input_ids" if not self.is_encoder_decoder else "inputs", None,) + input_ids = inputs_keywords.pop("input_ids", None) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() output_keywords = outputs_keywords[0].numpy() @@ -479,9 +488,9 @@ def test_inputs_embeds(self): input_ids = inputs["input_ids"] del inputs["input_ids"] else: - encoder_input_ids = inputs["inputs"] + encoder_input_ids = inputs["input_ids"] decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - del inputs["inputs"] + del inputs["input_ids"] inputs.pop("decoder_input_ids", None) wte = model.get_input_embeddings() @@ -596,9 +605,15 @@ def test_loss_computation(self): added_label = prepared_for_class[list(prepared_for_class.keys() - inputs_dict.keys())[0]] loss_size = tf.size(added_label) + if model.__class__ in TF_MODEL_FOR_CAUSAL_LM_MAPPING.values(): + # if loss is causal lm loss, labels are shift, so that one label per batch + # is cut + loss_size = loss_size - self.model_tester.batch_size + # Test that model correctly compute the loss with kwargs prepared_for_class = self._prepare_for_class(inputs_dict.copy(), model_class, return_labels=True) input_ids = prepared_for_class.pop("input_ids") + loss = model(input_ids, **prepared_for_class)[0] self.assertEqual(loss.shape, [loss_size]) diff --git a/tests/test_modeling_tf_distilbert.py b/tests/test_modeling_tf_distilbert.py index 7d7b63fb0e9c7f..e3c83a47a7c3f4 100644 --- a/tests/test_modeling_tf_distilbert.py +++ b/tests/test_modeling_tf_distilbert.py @@ -17,7 +17,7 @@ import unittest from transformers import DistilBertConfig, is_tf_available -from transformers.testing_utils import require_tf +from transformers.testing_utils import require_tf, slow from .test_configuration_common import ConfigTester from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor @@ -32,6 +32,7 @@ TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification, TFDistilBertForMultipleChoice, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, ) @@ -118,9 +119,7 @@ def create_and_check_distilbert_for_masked_lm( model = TFDistilBertForMaskedLM(config=config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} (prediction_scores,) = model(inputs) - result = { - "prediction_scores": prediction_scores.numpy(), - } + result = {"prediction_scores": prediction_scores.numpy()} self.parent.assertListEqual( list(result["prediction_scores"].shape), [self.batch_size, self.seq_length, self.vocab_size] ) @@ -129,12 +128,12 @@ def create_and_check_distilbert_for_question_answering( self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels ): model = TFDistilBertForQuestionAnswering(config=config) - inputs = {"input_ids": input_ids, "attention_mask": input_mask} - start_logits, end_logits = model(inputs) - result = { - "start_logits": start_logits.numpy(), - "end_logits": end_logits.numpy(), + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, } + start_logits, end_logits = model(inputs) + result = {"start_logits": 
start_logits.numpy(), "end_logits": end_logits.numpy()} self.parent.assertListEqual(list(result["start_logits"].shape), [self.batch_size, self.seq_length]) self.parent.assertListEqual(list(result["end_logits"].shape), [self.batch_size, self.seq_length]) @@ -145,9 +144,7 @@ def create_and_check_distilbert_for_sequence_classification( model = TFDistilBertForSequenceClassification(config) inputs = {"input_ids": input_ids, "attention_mask": input_mask} (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_labels]) def create_and_check_distilbert_for_multiple_choice( @@ -162,9 +159,7 @@ def create_and_check_distilbert_for_multiple_choice( "attention_mask": multiple_choice_input_mask, } (logits,) = model(inputs) - result = { - "logits": logits.numpy(), - } + result = {"logits": logits.numpy()} self.parent.assertListEqual(list(result["logits"].shape), [self.batch_size, self.num_choices]) def create_and_check_distilbert_for_token_classification( @@ -236,8 +231,8 @@ def test_for_token_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_distilbert_for_token_classification(*config_and_inputs) - # @slow - # def test_model_from_pretrained(self): - # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: - # model = DistilBertModesss.from_pretrained(model_name) - # self.assertIsNotNone(model) + @slow + def test_model_from_pretrained(self): + for model_name in list(TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]): + model = TFDistilBertModel.from_pretrained(model_name) + self.assertIsNotNone(model) diff --git a/tests/test_modeling_tf_t5.py b/tests/test_modeling_tf_t5.py index be4637f465af4a..6147279d2e271c 100644 --- a/tests/test_modeling_tf_t5.py +++ b/tests/test_modeling_tf_t5.py @@ -76,6 +76,7 @@ def prepare_config_and_inputs(self): eos_token_id=self.eos_token_id, bos_token_id=self.pad_token_id, pad_token_id=self.pad_token_id, + decoder_start_token_id=self.pad_token_id, ) return (config, input_ids, input_mask, token_labels) @@ -83,7 +84,7 @@ def prepare_config_and_inputs(self): def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): model = TFT5Model(config=config) inputs = { - "inputs": input_ids, + "input_ids": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } @@ -114,7 +115,7 @@ def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels) def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = TFT5ForConditionalGeneration(config=config) inputs_dict = { - "inputs": input_ids, + "input_ids": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, } @@ -208,7 +209,7 @@ def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() (config, input_ids, input_mask, token_labels) = config_and_inputs inputs_dict = { - "inputs": input_ids, + "input_ids": input_ids, "decoder_input_ids": input_ids, "decoder_attention_mask": input_mask, "use_cache": tf.convert_to_tensor([False]),
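For reference, the `-100` label masking introduced by `TFCausalLanguageModelingLoss` (and reused via the `TFMaskedLanguageModelingLoss` alias) can be illustrated standalone; this is a sketch with toy shapes, not library code:

```python
import tensorflow as tf

def masked_sparse_ce(labels, logits):
    # per-token sparse cross entropy; positions labelled -100 are dropped from the loss
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction=tf.keras.losses.Reduction.NONE
    )
    flat_labels = tf.reshape(labels, (-1,))
    flat_logits = tf.reshape(logits, (-1, logits.shape[-1]))
    active = flat_labels != -100
    return loss_fn(tf.boolean_mask(flat_labels, active), tf.boolean_mask(flat_logits, active))

labels = tf.constant([[5, 2, -100], [1, -100, -100]])  # 3 of 6 positions masked
logits = tf.random.uniform((2, 3, 10))                 # (batch, seq_len, vocab)
per_token_loss = masked_sparse_ce(labels, logits)      # shape (3,), one value per unmasked token
```

Keeping the reduction as `NONE` mirrors the loss classes in the patch, which return one loss value per unmasked token rather than a scalar.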