diff --git a/docs/source/index.rst b/docs/source/index.rst index 8bf77a679f6f5..3777650d9ed0e 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -343,7 +343,7 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | DPR | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| DeBERTa | ✅ | ✅ | ✅ | ❌ | ❌ | +| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | DeBERTa-v2 | ✅ | ❌ | ✅ | ❌ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ diff --git a/docs/source/model_doc/deberta.rst b/docs/source/model_doc/deberta.rst index 848948be4da44..5b721bb35e880 100644 --- a/docs/source/model_doc/deberta.rst +++ b/docs/source/model_doc/deberta.rst @@ -38,7 +38,8 @@ the training data performs consistently better on a wide range of NLP tasks, ach pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.* -This model was contributed by `DeBERTa `__. The original code can be found `here +This model was contributed by `DeBERTa `__. This model TF 2.0 implementation was +contributed by `kamalkraj `__ . The original code can be found `here `__. @@ -103,3 +104,45 @@ DebertaForQuestionAnswering .. autoclass:: transformers.DebertaForQuestionAnswering :members: forward + + +TFDebertaModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFDebertaModel + :members: call + + +TFDebertaPreTrainedModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFDebertaPreTrainedModel + :members: call + + +TFDebertaForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFDebertaForMaskedLM + :members: call + + +TFDebertaForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFDebertaForSequenceClassification + :members: call + + +TFDebertaForTokenClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFDebertaForTokenClassification + :members: call + + +TFDebertaForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: transformers.TFDebertaForQuestionAnswering + :members: call diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 2d547e8cccf6a..3320a788aae4a 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1297,6 +1297,17 @@ "TFCTRLPreTrainedModel", ] ) + _import_structure["models.deberta"].extend( + [ + "TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFDebertaForMaskedLM", + "TFDebertaForQuestionAnswering", + "TFDebertaForSequenceClassification", + "TFDebertaForTokenClassification", + "TFDebertaModel", + "TFDebertaPreTrainedModel", + ] + ) _import_structure["models.distilbert"].extend( [ "TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -2820,6 +2831,15 @@ TFCTRLModel, TFCTRLPreTrainedModel, ) + from .models.deberta import ( + TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, + TFDebertaForMaskedLM, + TFDebertaForQuestionAnswering, + TFDebertaForSequenceClassification, + TFDebertaForTokenClassification, + TFDebertaModel, + TFDebertaPreTrainedModel, + ) from .models.distilbert import ( TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST, TFDistilBertForMaskedLM, diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 8e9745763fbbd..cb66b80b2a794 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -29,6 +29,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict( [ # Base model mapping + ("deberta", "TFDebertaModel"), ("rembert", "TFRemBertModel"), ("roformer", "TFRoFormerModel"), ("convbert", "TFConvBertModel"), @@ -144,6 +145,7 @@ TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping + ("deberta", "TFDebertaForMaskedLM"), ("rembert", "TFRemBertForMaskedLM"), ("roformer", "TFRoFormerForMaskedLM"), ("convbert", "TFConvBertForMaskedLM"), @@ -183,6 +185,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Sequence Classification mapping + ("deberta", "TFDebertaForSequenceClassification"), ("rembert", "TFRemBertForSequenceClassification"), ("roformer", "TFRoFormerForSequenceClassification"), ("convbert", "TFConvBertForSequenceClassification"), @@ -211,6 +214,7 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( [ # Model for Question Answering mapping + ("deberta", "TFDebertaForQuestionAnswering"), ("rembert", "TFRemBertForQuestionAnswering"), ("roformer", "TFRoFormerForQuestionAnswering"), ("convbert", "TFConvBertForQuestionAnswering"), @@ -234,6 +238,7 @@ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping + ("deberta", "TFDebertaForTokenClassification"), ("rembert", "TFRemBertForTokenClassification"), ("roformer", "TFRoFormerForTokenClassification"), ("convbert", "TFConvBertForTokenClassification"), diff --git a/src/transformers/models/deberta/__init__.py b/src/transformers/models/deberta/__init__.py index 50dd4144ba93d..731fde4eae5dc 100644 --- a/src/transformers/models/deberta/__init__.py +++ b/src/transformers/models/deberta/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING -from ...file_utils import _LazyModule, is_tokenizers_available, is_torch_available +from ...file_utils import _LazyModule, is_tf_available, is_tokenizers_available, is_torch_available _import_structure = { @@ -40,6 +40,17 @@ "DebertaPreTrainedModel", ] +if is_tf_available(): + _import_structure["modeling_tf_deberta"] = [ + "TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFDebertaForMaskedLM", + "TFDebertaForQuestionAnswering", 
+ "TFDebertaForSequenceClassification", + "TFDebertaForTokenClassification", + "TFDebertaModel", + "TFDebertaPreTrainedModel", + ] + if TYPE_CHECKING: from .configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig @@ -59,6 +70,18 @@ DebertaPreTrainedModel, ) + if is_tf_available(): + from .modeling_tf_deberta import ( + TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, + TFDebertaForMaskedLM, + TFDebertaForQuestionAnswering, + TFDebertaForSequenceClassification, + TFDebertaForTokenClassification, + TFDebertaModel, + TFDebertaPreTrainedModel, + ) + + else: import sys diff --git a/src/transformers/models/deberta/modeling_tf_deberta.py b/src/transformers/models/deberta/modeling_tf_deberta.py new file mode 100644 index 0000000000000..d0cd8f612906f --- /dev/null +++ b/src/transformers/models/deberta/modeling_tf_deberta.py @@ -0,0 +1,1555 @@ +# coding=utf-8 +# Copyright 2021 Microsoft and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 DeBERTa model. """ + + +import math +from typing import Dict, Optional, Sequence, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFMaskedLMOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + input_processing, + shape_list, +) +from ...utils import logging +from .configuration_deberta import DebertaConfig + + +logger = logging.get_logger(__name__) + + +_CONFIG_FOR_DOC = "DebertaConfig" +_TOKENIZER_FOR_DOC = "DebertaTokenizer" +_CHECKPOINT_FOR_DOC = "kamalkraj/deberta-base" + +TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "kamalkraj/deberta-base", + # See all DeBERTa models at https://huggingface.co/models?filter=DeBERTa +] + + +class TFDebertaContextPooler(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(config.pooler_hidden_size, name="dense") + self.dropout = TFDebertaStableDropout(config.pooler_dropout, name="dropout") + self.config = config + + def call(self, hidden_states, training: bool = False): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ context_token = hidden_states[:, 0] + context_token = self.dropout(context_token, training=training) + pooled_output = self.dense(context_token) + pooled_output = get_tf_activation(self.config.pooler_hidden_act)(pooled_output) + return pooled_output + + @property + def output_dim(self) -> int: + return self.config.hidden_size + + +class TFDebertaXSoftmax(tf.keras.layers.Layer): + """ + Masked Softmax which is optimized for saving memory + + Args: + input (:obj:`tf.Tensor`): The input tensor that will apply softmax. + mask (:obj:`tf.Tensor`): The mask matrix where 0 indicate that element will be ignored in the softmax calculation. + dim (int): The dimension that will apply softmax + """ + + def __init__(self, axis=-1, **kwargs): + super().__init__(**kwargs) + self.axis = axis + + def call(self, inputs: tf.Tensor, mask: tf.Tensor): + + rmask = tf.logical_not(tf.cast(mask, tf.bool)) + output = tf.where(rmask, float("-inf"), inputs) + output = tf.nn.softmax(output, self.axis) + output = tf.where(rmask, 0.0, output) + return output + + +def get_mask(input, dropout): + mask = tf.cast( + 1 - tf.compat.v1.distributions.Bernoulli(probs=1 - dropout).sample(sample_shape=shape_list(input)), tf.bool + ) + return mask, dropout + + +@tf.custom_gradient +def TFDebertaXDropout(input, local_ctx): + mask, dropout = get_mask(input, local_ctx) + scale = tf.convert_to_tensor(1.0 / (1 - dropout), dtype=tf.float32) + input = tf.cond(dropout > 0, lambda: tf.where(mask, 0.0, input) * scale, lambda: input) + + def custom_grad(upstream_grad): + return tf.cond( + scale > 1, lambda: (tf.where(mask, 0.0, upstream_grad) * scale, None), lambda: (upstream_grad, None) + ) + + return input, custom_grad + + +class TFDebertaStableDropout(tf.keras.layers.Layer): + """ + Optimized dropout module for stabilizing the training + + Args: + drop_prob (float): the dropout probabilities + """ + + def __init__(self, drop_prob, **kwargs): + super().__init__(**kwargs) + self.drop_prob = tf.convert_to_tensor(drop_prob, dtype=tf.float32) + + def call(self, inputs: tf.Tensor, training: tf.Tensor = False): + if training and self.drop_prob > 0: + return TFDebertaXDropout(inputs, self.drop_prob) + return inputs + + +class TFDebertaLayerNorm(tf.keras.layers.Layer): + """LayerNorm module in the TF style (epsilon inside the square root).""" + + def __init__(self, size, eps=1e-12, **kwargs): + super().__init__(**kwargs) + self.size = size + self.eps = eps + + def build(self, input_shape): + self.gamma = self.add_weight(shape=[self.size], initializer=tf.ones_initializer(), name="weight") + self.beta = self.add_weight(shape=[self.size], initializer=tf.zeros_initializer(), name="bias") + return super().build(input_shape) + + def call(self, x: tf.Tensor) -> tf.Tensor: + mean = tf.reduce_mean(x, axis=[-1], keepdims=True) + variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True) + std = tf.math.sqrt(variance + self.eps) + return self.gamma * (x - mean) / std + self.beta + + +class TFDebertaSelfOutput(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + self.dense = tf.keras.layers.Dense(config.hidden_size, name="dense") + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + + def call(self, hidden_states, input_tensor, training: bool = False): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, 
training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class TFDebertaAttention(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + self.self = TFDebertaDisentangledSelfAttention(config, name="self") + self.dense_output = TFDebertaSelfOutput(config, name="output") + self.config = config + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor = None, + relative_pos: tf.Tensor = None, + rel_embeddings: tf.Tensor = None, + output_attentions: bool = False, + training: bool = False, + ) -> Tuple[tf.Tensor]: + self_outputs = self.self( + hidden_states=input_tensor, + attention_mask=attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + training=training, + ) + if query_states is None: + query_states = input_tensor + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=query_states, training=training + ) + + output = (attention_output,) + self_outputs[1:] + + return output + + +class TFDebertaIntermediate(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class TFDebertaOutput(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + + return hidden_states + + +class TFDebertaLayer(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFDebertaAttention(config, name="attention") + self.intermediate = TFDebertaIntermediate(config, name="intermediate") + self.bert_output = TFDebertaOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor = None, + relative_pos: tf.Tensor = None, + rel_embeddings: tf.Tensor = None, + output_attentions: bool = False, + training: bool = False, + ) -> Tuple[tf.Tensor]: + attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + training=training, + ) + attention_output = attention_outputs[0] + intermediate_output = 
self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + + return outputs + + +class TFDebertaEncoder(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.layer = [TFDebertaLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + self.relative_attention = getattr(config, "relative_attention", False) + self.config = config + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + + def build(self, input_shape): + if self.relative_attention: + self.rel_embeddings = self.add_weight( + name="rel_embeddings.weight", + shape=[self.max_relative_positions * 2, self.config.hidden_size], + initializer=get_initializer(self.config.initializer_range), + ) + return super().build(input_shape) + + def get_rel_embedding(self): + rel_embeddings = self.rel_embeddings if self.relative_attention else None + return rel_embeddings + + def get_attention_mask(self, attention_mask): + if len(shape_list(attention_mask)) <= 2: + extended_attention_mask = tf.expand_dims(tf.expand_dims(attention_mask, 1), 2) + attention_mask = extended_attention_mask * tf.expand_dims(tf.squeeze(extended_attention_mask, -2), -1) + attention_mask = tf.cast(attention_mask, tf.uint8) + elif len(shape_list(attention_mask)) == 3: + attention_mask = tf.expand_dims(attention_mask, 1) + + return attention_mask + + def get_rel_pos(self, hidden_states, query_states=None, relative_pos=None): + if self.relative_attention and relative_pos is None: + q = shape_list(query_states)[-2] if query_states is not None else shape_list(hidden_states)[-2] + relative_pos = build_relative_position(q, shape_list(hidden_states)[-2]) + return relative_pos + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor = None, + relative_pos: tf.Tensor = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + attention_mask = self.get_attention_mask(attention_mask) + relative_pos = self.get_rel_pos(hidden_states, query_states, relative_pos) + + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[0] + else: + next_kv = hidden_states + + rel_embeddings = self.get_rel_embedding() + + for i, layer_module in enumerate(self.layer): + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module( + hidden_states=next_kv, + attention_mask=attention_mask, + query_states=query_states, + relative_pos=relative_pos, + rel_embeddings=rel_embeddings, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if query_states is not None: + query_states = hidden_states + if isinstance(hidden_states, Sequence): + next_kv = hidden_states[i + 1] if i + 1 < len(self.layer) else None + else: + next_kv = hidden_states + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if output_hidden_states: + 
all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +def build_relative_position(query_size, key_size): + """ + Build relative position according to the query and key + + We assume the absolute position of query :math:`P_q` is range from (0, query_size) and the absolute position of key + :math:`P_k` is range from (0, key_size), The relative positions from query to key is :math:`R_{q \\rightarrow k} = + P_q - P_k` + + Args: + query_size (int): the length of query + key_size (int): the length of key + + Return: + :obj:`tf.Tensor`: A tensor with shape [1, query_size, key_size] + + """ + q_ids = tf.range(query_size, dtype=tf.int32) + k_ids = tf.range(key_size, dtype=tf.int32) + rel_pos_ids = q_ids[:, None] - tf.tile(tf.reshape(k_ids, [1, -1]), [query_size, 1]) + rel_pos_ids = rel_pos_ids[:query_size, :] + rel_pos_ids = tf.expand_dims(rel_pos_ids, axis=0) + return tf.cast(rel_pos_ids, tf.int64) + + +def c2p_dynamic_expand(c2p_pos, query_layer, relative_pos): + shapes = [ + shape_list(query_layer)[0], + shape_list(query_layer)[1], + shape_list(query_layer)[2], + shape_list(relative_pos)[-1], + ] + return tf.broadcast_to(c2p_pos, shapes) + + +def p2c_dynamic_expand(c2p_pos, query_layer, key_layer): + shapes = [ + shape_list(query_layer)[0], + shape_list(query_layer)[1], + shape_list(key_layer)[-2], + shape_list(key_layer)[-2], + ] + return tf.broadcast_to(c2p_pos, shapes) + + +def pos_dynamic_expand(pos_index, p2c_att, key_layer): + shapes = shape_list(p2c_att)[:2] + [shape_list(pos_index)[-2], shape_list(key_layer)[-2]] + return tf.broadcast_to(pos_index, shapes) + + +def torch_gather(x, indices, gather_axis): + if gather_axis < 0: + gather_axis = tf.rank(x) + gather_axis + + if gather_axis != tf.rank(x) - 1: + pre_roll = tf.rank(x) - 1 - gather_axis + permutation = tf.roll(tf.range(tf.rank(x)), pre_roll, axis=0) + x = tf.transpose(x, perm=permutation) + indices = tf.transpose(indices, perm=permutation) + else: + pre_roll = 0 + + flat_x = tf.reshape(x, (-1, tf.shape(x)[-1])) + flat_indices = tf.reshape(indices, (-1, tf.shape(indices)[-1])) + gathered = tf.gather(flat_x, flat_indices, batch_dims=1) + gathered = tf.reshape(gathered, tf.shape(indices)) + + if pre_roll != 0: + permutation = tf.roll(tf.range(tf.rank(x)), -pre_roll, axis=0) + gathered = tf.transpose(gathered, perm=permutation) + + return gathered + + +class TFDebertaDisentangledSelfAttention(tf.keras.layers.Layer): + """ + Disentangled self-attention module + + Parameters: + config (:obj:`str`): + A model config class instance with the configuration to build a new model. 
The schema is similar to + `BertConfig`, for more details, please refer :class:`~transformers.DebertaConfig` + + """ + + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.in_proj = tf.keras.layers.Dense( + self.all_head_size * 3, + kernel_initializer=get_initializer(config.initializer_range), + name="in_proj", + use_bias=False, + ) + self.pos_att_type = config.pos_att_type if config.pos_att_type is not None else [] + + self.relative_attention = getattr(config, "relative_attention", False) + self.talking_head = getattr(config, "talking_head", False) + + if self.talking_head: + self.head_logits_proj = tf.keras.layers.Dense( + self.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + name="head_logits_proj", + use_bias=False, + ) + self.head_weights_proj = tf.keras.layers.Dense( + self.num_attention_heads, + kernel_initializer=get_initializer(config.initializer_range), + name="head_weights_proj", + use_bias=False, + ) + + self.softmax = TFDebertaXSoftmax(axis=-1) + + if self.relative_attention: + self.max_relative_positions = getattr(config, "max_relative_positions", -1) + if self.max_relative_positions < 1: + self.max_relative_positions = config.max_position_embeddings + self.pos_dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="pos_dropout") + if "c2p" in self.pos_att_type or "p2p" in self.pos_att_type: + self.pos_proj = tf.keras.layers.Dense( + self.all_head_size, + kernel_initializer=get_initializer(config.initializer_range), + name="pos_proj", + use_bias=False, + ) + if "p2c" in self.pos_att_type or "p2p" in self.pos_att_type: + self.pos_q_proj = tf.keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="pos_q_proj" + ) + + self.dropout = TFDebertaStableDropout(config.attention_probs_dropout_prob, name="dropout") + + def build(self, input_shape): + self.q_bias = self.add_weight( + name="q_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros() + ) + self.v_bias = self.add_weight( + name="v_bias", shape=(self.all_head_size), initializer=tf.keras.initializers.Zeros() + ) + return super().build(input_shape) + + def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor: + shape = shape_list(tensor)[:-1] + [self.num_attention_heads, -1] + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=shape) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + query_states: tf.Tensor = None, + relative_pos: tf.Tensor = None, + rel_embeddings: tf.Tensor = None, + output_attentions: bool = False, + training: bool = False, + ) -> Tuple[tf.Tensor]: + """ + Call the module + + Args: + hidden_states (:obj:`tf.Tensor`): + Input states to the module usually the output 
from previous layer, it will be the Q,K and V in + `Attention(Q,K,V)` + + attention_mask (:obj:`tf.Tensor`): + An attention mask matrix of shape [`B`, `N`, `N`] where `B` is the batch size, `N` is the maximum + sequence length in which element [i,j] = `1` means the `i` th token in the input can attend to the `j` + th token. + + return_att (:obj:`bool`, optional): + Whether return the attention matrix. + + query_states (:obj:`tf.Tensor`, optional): + The `Q` state in `Attention(Q,K,V)`. + + relative_pos (:obj:`tf.Tensor`): + The relative position encoding between the tokens in the sequence. It's of shape [`B`, `N`, `N`] with + values ranging in [`-max_relative_positions`, `max_relative_positions`]. + + rel_embeddings (:obj:`tf.Tensor`): + The embedding of relative distances. It's a tensor of shape [:math:`2 \\times + \\text{max_relative_positions}`, `hidden_size`]. + + + """ + if query_states is None: + qp = self.in_proj(hidden_states) # .split(self.all_head_size, dim=-1) + query_layer, key_layer, value_layer = tf.split( + self.transpose_for_scores(qp), num_or_size_splits=3, axis=-1 + ) + else: + + def linear(w, b, x): + return tf.cond( + b is not None, + lambda: tf.matmul(x, w, transpose_b=True) + tf.transpose(b), + lambda: tf.matmul(x, w, transpose_b=True), + ) + + ws = tf.split( + tf.transpose(self.in_proj.weight[0]), num_or_size_splits=self.num_attention_heads * 3, axis=0 + ) + qkvw = tf.TensorArray(dtype=tf.float32, size=3) + for k in tf.range(3): + qkvw_inside = tf.TensorArray(dtype=tf.float32, size=self.num_attention_heads) + for i in tf.range(self.num_attention_heads): + qkvw_inside = qkvw_inside.write(i, ws[i * 3 + k]) + qkvw = qkvw.write(k, qkvw_inside.concat()) + qkvb = [None] * 3 + + q = linear(qkvw[0], qkvb[0], query_states) + k = linear(qkvw[1], qkvb[1], hidden_states) + v = linear(qkvw[2], qkvb[2], hidden_states) + query_layer = self.transpose_for_scores(q) + key_layer = self.transpose_for_scores(k) + value_layer = self.transpose_for_scores(v) + + query_layer = query_layer + self.transpose_for_scores(self.q_bias[None, None, :]) + value_layer = value_layer + self.transpose_for_scores(self.v_bias[None, None, :]) + + rel_att = None + # Take the dot product between "query" and "key" to get the raw attention scores. 
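For reference, the scaling applied in the next few lines divides the query by sqrt(head_dim * scale_factor), where scale_factor counts the content-to-content term plus each enabled relative-position term. A standalone sketch with toy shapes (the pos_att_type value here is illustrative, mirroring the "c2p"/"p2c" setting typically used by the base checkpoints):

    import math
    import tensorflow as tf

    # toy tensors shaped [batch_size, num_heads, seq_len, head_dim]
    query_layer = tf.random.normal((1, 2, 5, 8))
    key_layer = tf.random.normal((1, 2, 5, 8))

    pos_att_type = ["c2p", "p2c"]
    scale_factor = 1 + len(pos_att_type)          # 1 content term + 2 positional terms
    scale = math.sqrt(query_layer.shape[-1] * scale_factor)

    attention_scores = tf.matmul(query_layer / scale, key_layer, transpose_b=True)
    print(attention_scores.shape)                 # (1, 2, 5, 5)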
+ scale_factor = 1 + len(self.pos_att_type) + scale = math.sqrt(shape_list(query_layer)[-1] * scale_factor) + query_layer = query_layer / scale + + attention_scores = tf.matmul(query_layer, tf.transpose(key_layer, [0, 1, 3, 2])) + if self.relative_attention: + rel_embeddings = self.pos_dropout(rel_embeddings, training=training) + rel_att = self.disentangled_att_bias(query_layer, key_layer, relative_pos, rel_embeddings, scale_factor) + + if rel_att is not None: + attention_scores = attention_scores + rel_att + + if self.talking_head: + attention_scores = tf.transpose( + self.head_logits_proj(tf.transpose(attention_scores, [0, 2, 3, 1])), [0, 3, 1, 2] + ) + + attention_probs = self.softmax(attention_scores, attention_mask) + attention_probs = self.dropout(attention_probs, training=training) + if self.talking_head: + attention_probs = tf.transpose( + self.head_weights_proj(tf.transpose(attention_probs, [0, 2, 3, 1])), [0, 3, 1, 2] + ) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + new_context_layer_shape = shape_list(context_layer)[:-2] + [-1] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + def disentangled_att_bias(self, query_layer, key_layer, relative_pos, rel_embeddings, scale_factor): + + if relative_pos is None: + q = shape_list(query_layer)[-2] + relative_pos = build_relative_position(q, shape_list(key_layer)[-2]) + shape_list_pos = shape_list(relative_pos) + if len(shape_list_pos) == 2: + relative_pos = tf.expand_dims(tf.expand_dims(relative_pos, 0), 0) + elif len(shape_list_pos) == 3: + relative_pos = tf.expand_dims(relative_pos, 0) + # bxhxqxk + elif len(shape_list_pos) != 4: + raise ValueError(f"Relative position ids must be of dim 2 or 3 or 4. 
{len(shape_list_pos)}") + + att_span = tf.cast( + tf.minimum( + tf.maximum(shape_list(query_layer)[-2], shape_list(key_layer)[-2]), self.max_relative_positions + ), + tf.int64, + ) + rel_embeddings = tf.expand_dims( + rel_embeddings[self.max_relative_positions - att_span : self.max_relative_positions + att_span, :], 0 + ) + if "c2p" in self.pos_att_type or "p2p" in self.pos_att_type: + pos_key_layer = self.pos_proj(rel_embeddings) + pos_key_layer = self.transpose_for_scores(pos_key_layer) + + if "p2c" in self.pos_att_type or "p2p" in self.pos_att_type: + pos_query_layer = self.pos_q_proj(rel_embeddings) + pos_query_layer = self.transpose_for_scores(pos_query_layer) + + score = 0 + # content->position + if "c2p" in self.pos_att_type: + c2p_att = tf.matmul(query_layer, tf.transpose(pos_key_layer, [0, 1, 3, 2])) + c2p_pos = tf.clip_by_value(relative_pos + att_span, 0, att_span * 2 - 1) + c2p_att = torch_gather(c2p_att, c2p_dynamic_expand(c2p_pos, query_layer, relative_pos), -1) + score += c2p_att + + # position->content + if "p2c" in self.pos_att_type or "p2p" in self.pos_att_type: + pos_query_layer /= tf.math.sqrt(tf.cast(shape_list(pos_query_layer)[-1] * scale_factor, dtype=tf.float32)) + if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]: + r_pos = build_relative_position(shape_list(key_layer)[-2], shape_list(key_layer)[-2]) + else: + r_pos = relative_pos + p2c_pos = tf.clip_by_value(-r_pos + att_span, 0, att_span * 2 - 1) + + if "p2c" in self.pos_att_type: + p2c_att = tf.matmul(key_layer, tf.transpose(pos_query_layer, [0, 1, 3, 2])) + p2c_att = tf.transpose( + torch_gather(p2c_att, p2c_dynamic_expand(p2c_pos, query_layer, key_layer), -1), [0, 1, 3, 2] + ) + if shape_list(query_layer)[-2] != shape_list(key_layer)[-2]: + pos_index = tf.expand_dims(relative_pos[:, :, :, 0], -1) + p2c_att = torch_gather(p2c_att, pos_dynamic_expand(pos_index, p2c_att, key_layer), -2) + score += p2c_att + + return score + + +class TFDebertaEmbeddings(tf.keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.type_vocab_size = config.type_vocab_size + self.embedding_size = getattr(config, "embedding_size", config.hidden_size) + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.position_biased_input = getattr(config, "position_biased_input", True) + self.initializer_range = config.initializer_range + self.embeddings_sum = tf.keras.layers.Add() + if self.embedding_size != config.hidden_size: + self.embed_proj = tf.keras.layers.Dense(config.hidden_size, bias=False) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = TFDebertaStableDropout(config.hidden_dropout_prob, name="dropout") + + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + if self.type_vocab_size > 0: + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.type_vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + else: + self.token_type_embeddings = None + + with tf.name_scope("position_embeddings"): + if self.position_biased_input: + 
self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + else: + self.position_embeddings = None + + super().build(input_shape) + + def call( + self, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, + mask: tf.Tensor = None, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + final_embeddings = inputs_embeds + if self.position_biased_input: + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + final_embeddings += position_embeds + if self.type_vocab_size > 0: + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings += token_type_embeds + + if self.embedding_size != self.hidden_size: + final_embeddings = self.embed_proj(final_embeddings) + + final_embeddings = self.LayerNorm(final_embeddings) + + if mask is not None: + if len(shape_list(mask)) != len(shape_list(final_embeddings)): + if len(shape_list(mask)) == 4: + mask = tf.squeeze(tf.squeeze(mask, axis=1), axis=1) + mask = tf.cast(tf.expand_dims(mask, axis=2), tf.float32) + + final_embeddings = final_embeddings * mask + + final_embeddings = self.dropout(final_embeddings, training=training) + + return final_embeddings + + +class TFDebertaPredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + return hidden_states + + +class TFDebertaLMPredictionHead(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + + self.transform = TFDebertaPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
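The weight tying mentioned in this comment reuses the word-embedding matrix as the output projection, exactly as call() below does with its reshape-matmul-reshape; a standalone sketch with dummy sizes (illustrative only, not part of the patch):

    import tensorflow as tf

    vocab_size, hidden_size = 12, 4
    embedding_matrix = tf.random.normal((vocab_size, hidden_size))   # shared with the input embeddings
    bias = tf.zeros((vocab_size,))                                   # the output-only bias per token

    hidden_states = tf.random.normal((2, 3, hidden_size))            # [batch_size, seq_len, hidden_size]
    flat = tf.reshape(hidden_states, (-1, hidden_size))
    logits = tf.matmul(flat, embedding_matrix, transpose_b=True) + bias
    logits = tf.reshape(logits, (2, 3, vocab_size))                  # scores over the vocabulary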
+ self.input_embeddings = input_embeddings + + def build(self, input_shape: tf.TensorShape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self) -> tf.keras.layers.Layer: + return self.input_embeddings + + def set_output_embeddings(self, value: tf.Variable): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self) -> Dict[str, tf.Variable]: + return {"bias": self.bias} + + def set_bias(self, value: tf.Variable): + self.bias = value["bias"] + self.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +class TFDebertaOnlyMLMHead(tf.keras.layers.Layer): + def __init__(self, config: DebertaConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + self.predictions = TFDebertaLMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) + + return prediction_scores + + +# @keras_serializable +class TFDebertaMainLayer(tf.keras.layers.Layer): + config_class = DebertaConfig + + def __init__(self, config: DebertaConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + + self.embeddings = TFDebertaEmbeddings(config, name="embeddings") + self.encoder = TFDebertaEncoder(config, name="encoder") + + def get_input_embeddings(self) -> tf.keras.layers.Layer: + return self.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) + + if inputs["token_type_ids"] is None: + inputs["token_type_ids"] = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], + mask=inputs["attention_mask"], + training=inputs["training"], + ) + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=inputs["attention_mask"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + sequence_output = encoder_outputs[0] + + if not inputs["return_dict"]: + return (sequence_output,) + encoder_outputs[1:] + + return TFBaseModelOutput( + last_hidden_state=sequence_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TFDebertaPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = DebertaConfig + base_model_prefix = "deberta" + + +DEBERTA_START_DOCSTRING = r""" + The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention + `_ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It's build on top of + BERT/RoBERTa with two improvements, i.e. disentangled attention and enhanced mask decoder. With those two + improvements, it out perform BERT/RoBERTa on a majority of tasks with 80GB pretraining data. + + This model is also a `tf.keras.Model `__ subclass. Use + it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage + and behavior. + + .. 
note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all + the tensors in the first argument of the model call function: :obj:`model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in + the first positional argument : + + - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + + + Parameters: + config (:class:`~transformers.DebertaConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +DEBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`) + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.DebertaTokenizer`. See + :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. 
+ output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare DeBERTa Model transformer outputting raw hidden-states without any specific head on top.", + DEBERTA_START_DOCSTRING, +) +class TFDebertaModel(TFDebertaPreTrainedModel): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.deberta = TFDebertaMainLayer(config, name="deberta") + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + outputs = self.deberta( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return outputs + + def serving_output(self, output: TFBaseModelOutput) -> TFBaseModelOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutput(last_hidden_state=output.last_hidden_state, hidden_states=hs, attentions=attns) + + +@add_start_docstrings("""DeBERTa Model with a `language modeling` head on top. """, DEBERTA_START_DOCSTRING) +class TFDebertaForMaskedLM(TFDebertaPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if config.is_decoder: + logger.warning( + "If you want to use `TFDebertaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.mlm = TFDebertaOnlyMLMHead(config, input_embeddings=self.deberta.embeddings, name="cls") + + def get_lm_head(self) -> tf.keras.layers.Layer: + return self.mlm.predictions + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: + r""" + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.deberta( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output=sequence_output, training=inputs["training"]) + loss = ( + None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores) + ) + + if not inputs["return_dict"]: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFMaskedLMOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +class TFDebertaForSequenceClassification(TFDebertaPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.pooler = TFDebertaContextPooler(config, name="pooler") + + drop_out = getattr(config, "cls_dropout", None) + drop_out = self.config.hidden_dropout_prob if drop_out is None else drop_out + self.dropout = TFDebertaStableDropout(drop_out, name="cls_dropout") + self.classifier = tf.keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.deberta( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + pooled_output = self.pooler(sequence_output, training=inputs["training"]) + pooled_output = self.dropout(pooled_output, training=inputs["training"]) + logits = self.classifier(pooled_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[1:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFSequenceClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + DeBERTa Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + DEBERTA_START_DOCSTRING, +) +class TFDebertaForTokenClassification(TFDebertaPreTrainedModel, TFTokenClassificationLoss): + def __init__(self, config: DebertaConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.deberta = TFDebertaMainLayer(config, name="deberta") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense( + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.deberta( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + sequence_output = self.dropout(sequence_output, training=inputs["training"]) + logits = self.classifier(inputs=sequence_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFTokenClassifierOutput) -> TFTokenClassifierOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFTokenClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +@add_start_docstrings( + """ + DeBERTa Model with a span classification head on top for extractive question-answering tasks like SQuAD (a 
linear
+    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    DEBERTA_START_DOCSTRING,
+)
+class TFDebertaForQuestionAnswering(TFDebertaPreTrainedModel, TFQuestionAnsweringLoss):
+    def __init__(self, config: DebertaConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+
+        self.deberta = TFDebertaMainLayer(config, name="deberta")
+        self.qa_outputs = tf.keras.layers.Dense(
+            units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs"
+        )
+
+    @add_start_docstrings_to_model_forward(DEBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        tokenizer_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFQuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        start_positions: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        end_positions: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        training: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]:
+        r"""
+        start_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+        end_positions (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Positions outside of the
+            sequence are not taken into account for computing the loss.
+ """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + start_positions=start_positions, + end_positions=end_positions, + training=training, + kwargs_call=kwargs, + ) + outputs = self.deberta( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) + loss = None + + if inputs["start_positions"] is not None and inputs["end_positions"] is not None: + labels = {"start_position": inputs["start_positions"]} + labels["end_position"] = inputs["end_positions"] + loss = self.compute_loss(labels=labels, logits=(start_logits, end_logits)) + + if not inputs["return_dict"]: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFQuestionAnsweringModelOutput) -> TFQuestionAnsweringModelOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFQuestionAnsweringModelOutput( + start_logits=output.start_logits, end_logits=output.end_logits, hidden_states=hs, attentions=attns + ) diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 96c6868e1b470..146ec785e8f3a 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -634,6 +634,63 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["tf"]) +TF_DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFDebertaForMaskedLM: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + +class TFDebertaForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + +class TFDebertaForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + +class TFDebertaForTokenClassification: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + +class TFDebertaModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, 
*args, **kwargs): + requires_backends(cls, ["tf"]) + + +class TFDebertaPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/test_modeling_tf_deberta.py b/tests/test_modeling_tf_deberta.py new file mode 100644 index 0000000000000..8d490569ee95c --- /dev/null +++ b/tests/test_modeling_tf_deberta.py @@ -0,0 +1,282 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import unittest + +from transformers import DebertaConfig, is_tf_available +from transformers.testing_utils import require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFDebertaForMaskedLM, + TFDebertaForQuestionAnswering, + TFDebertaForSequenceClassification, + TFDebertaForTokenClassification, + TFDebertaModel, + ) + + +class TFDebertaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = 13 + self.seq_length = 7 + self.is_training = True + self.use_input_mask = True + self.use_token_type_ids = True + self.use_labels = True + self.vocab_size = 99 + self.hidden_size = 32 + self.num_hidden_layers = 5 + self.num_attention_heads = 4 + self.intermediate_size = 37 + self.hidden_act = "gelu" + self.hidden_dropout_prob = 0.1 + self.attention_probs_dropout_prob = 0.1 + self.max_position_embeddings = 512 + self.type_vocab_size = 16 + self.relative_attention = False + self.max_relative_positions = -1 + self.position_biased_input = True + self.type_sequence_label_size = 2 + self.initializer_range = 0.02 + self.num_labels = 3 + self.num_choices = 4 + self.scope = None + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], 
self.num_labels) + + config = DebertaConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + relative_attention=self.relative_attention, + max_relative_positions=self.max_relative_positions, + position_biased_input=self.position_biased_input, + initializer_range=self.initializer_range, + return_dict=True, + ) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaModel(config=config) + inputs = {"input_ids": input_ids, "attention_mask": input_mask, "token_type_ids": token_type_ids} + + inputs = [input_ids, input_mask] + result = model(inputs) + + result = model(input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDebertaForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_token_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = TFDebertaForTokenClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.num_labels)) + + def create_and_check_for_question_answering( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = TFDebertaForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + 
return config, inputs_dict + + +@require_tf +class TFDebertaModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFDebertaModel, + TFDebertaForMaskedLM, + TFDebertaForQuestionAnswering, + TFDebertaForSequenceClassification, + TFDebertaForTokenClassification, + ) + if is_tf_available() + else () + ) + + test_head_masking = False + test_onnx = False + + def setUp(self): + self.model_tester = TFDebertaModelTester(self) + self.config_tester = ConfigTester(self, config_class=DebertaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + def test_for_token_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_token_classification(*config_and_inputs) + + @slow + def test_model_from_pretrained(self): + model = TFDebertaModel.from_pretrained("kamalkraj/deberta-base") + self.assertIsNotNone(model) + + +@require_tf +class TFDeBERTaModelIntegrationTest(unittest.TestCase): + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + @slow + def test_inference_no_head(self): + model = TFDebertaModel.from_pretrained("kamalkraj/deberta-base") + input_ids = tf.constant([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]]) + attention_mask = tf.constant([[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) + output = model(input_ids, attention_mask=attention_mask)[0] + + expected_slice = tf.constant( + [ + [ + [-0.59855896, -0.80552566, -0.8462135], + [1.4484025, -0.93483794, -0.80593085], + [0.3122741, 0.00316059, -1.4131377], + ] + ] + ) + tf.debugging.assert_near(output[:, 1:4, 1:4], expected_slice, atol=1e-4)
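
For reference, a minimal sketch of how the new TF heads are used once this patch is applied; the patch itself only exercises them through the test suite. The TF checkpoint `kamalkraj/deberta-base` is the one loaded by the slow tests above, while the tokenizer checkpoint `microsoft/deberta-base` and the `num_labels=2` setting are assumptions for illustration only.

```python
import tensorflow as tf

from transformers import DebertaTokenizer, TFDebertaForSequenceClassification

# Tokenizer checkpoint is an assumption (any DeBERTa-v1 tokenizer works); the TF weights
# are the ones loaded by the slow tests above. The classification head is freshly
# initialized here, so the loss/logits only demonstrate the API shape.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = TFDebertaForSequenceClassification.from_pretrained("kamalkraj/deberta-base", num_labels=2)

inputs = tokenizer("DeBERTa now runs on TensorFlow 2.", return_tensors="tf")

# `labels` is optional; when it is passed, the returned output also carries a loss.
outputs = model(**inputs, labels=tf.constant([1]))

print(outputs.loss)
print(tf.nn.softmax(outputs.logits, axis=-1))
```

`TFDebertaForTokenClassification` and `TFDebertaForMaskedLM` follow the same calling convention, differing only in the shape of `labels`.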
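The question-answering head follows the usual span-extraction pattern: take the argmax over `start_logits` and `end_logits` and decode that slice of the input. The sketch below reuses the same assumed checkpoints, so the QA head is randomly initialized and the predicted span is meaningless; it only illustrates the output fields added in this patch.

```python
import tensorflow as tf

from transformers import DebertaTokenizer, TFDebertaForQuestionAnswering

# Checkpoints as above are assumptions; no fine-tuned TF DeBERTa QA weights are referenced in this patch.
tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
model = TFDebertaForQuestionAnswering.from_pretrained("kamalkraj/deberta-base")

question = "Who contributed the TF 2.0 port?"
context = "The TF 2.0 implementation of DeBERTa was contributed by kamalkraj."
inputs = tokenizer(question, context, return_tensors="tf")

outputs = model(inputs)
start = int(tf.argmax(outputs.start_logits, axis=-1)[0])
end = int(tf.argmax(outputs.end_logits, axis=-1)[0])

# The end index is inclusive, hence the +1; with an untrained head the span may well be empty.
answer = tokenizer.decode(inputs["input_ids"][0, start : end + 1])
print(answer)
```

The new test file can be exercised directly with `python -m pytest tests/test_modeling_tf_deberta.py`; the checks marked `@slow` (including the expected-slice comparison above) additionally require `RUN_SLOW=1` in the environment.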