From 4e82e98a22d88c153d1dfc736a69edff7528f7db Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 10 Mar 2022 12:25:20 +0100 Subject: [PATCH 1/2] Support for torch 1.11 --- src/transformers/models/deberta/modeling_deberta.py | 5 ++++- src/transformers/models/deberta_v2/modeling_deberta_v2.py | 5 ++++- src/transformers/models/sew_d/modeling_sew_d.py | 5 +++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 701f212d9fef..65780555664e 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -18,6 +18,7 @@ from collections.abc import Sequence import torch +from packaging import version from torch import _softmax_backward_data, nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss @@ -37,6 +38,8 @@ logger = logging.get_logger(__name__) +convert_to_dtype = not version.parse(torch.__version__) < version.parse("1.11") + _CONFIG_FOR_DOC = "DebertaConfig" _TOKENIZER_FOR_DOC = "DebertaTokenizer" _CHECKPOINT_FOR_DOC = "microsoft/deberta-base" @@ -115,7 +118,7 @@ def forward(self, input, mask, dim): @staticmethod def backward(self, grad_output): (output,) = self.saved_tensors - inputGrad = _softmax_backward_data(grad_output, output, self.dim, output) + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype if convert_to_dtype else output) return inputGrad, None, None @staticmethod diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 903b153111f3..3723dd312b38 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -19,6 +19,7 @@ import numpy as np import torch +from packaging import version from torch import _softmax_backward_data, nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss @@ -49,6 +50,8 @@ "microsoft/deberta-v2-xxlarge-mnli", ] +convert_to_dtype = not version.parse(torch.__version__) < version.parse("1.11") + # Copied from transformers.models.deberta.modeling_deberta.ContextPooler class ContextPooler(nn.Module): @@ -116,7 +119,7 @@ def forward(self, input, mask, dim): @staticmethod def backward(self, grad_output): (output,) = self.saved_tensors - inputGrad = _softmax_backward_data(grad_output, output, self.dim, output) + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype if convert_to_dtype else output) return inputGrad, None, None @staticmethod diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index af7dcba4b9a5..2ecc2b9626a2 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -22,6 +22,7 @@ import numpy as np import torch import torch.utils.checkpoint +from packaging import version from torch import _softmax_backward_data, nn from torch.nn import CrossEntropyLoss, LayerNorm @@ -37,7 +38,7 @@ logger = logging.get_logger(__name__) - +convert_to_dtype = not version.parse(torch.__version__) < version.parse("1.11") _HIDDEN_STATES_START_POSITION = 1 @@ -545,7 +546,7 @@ def forward(self, input, mask, dim): @staticmethod def backward(self, grad_output): (output,) = self.saved_tensors - inputGrad = _softmax_backward_data(grad_output, output, self.dim, output) + inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype if convert_to_dtype else output) return inputGrad, None, None @staticmethod From 5fd7210ed1a7978f284043fd8248b877ef9241c8 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 10 Mar 2022 14:18:14 +0100 Subject: [PATCH 2/2] Address Sylvain's comment --- .../models/deberta/modeling_deberta.py | 9 +++------ .../models/deberta_v2/modeling_deberta_v2.py | 8 +++----- .../models/sew_d/modeling_sew_d.py | 8 +++----- src/transformers/pytorch_utils.py | 18 +++++++++++++++++- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 65780555664e..e75e4c9719ad 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -18,8 +18,7 @@ from collections.abc import Sequence import torch -from packaging import version -from torch import _softmax_backward_data, nn +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN @@ -32,14 +31,12 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import softmax_backward_data from ...utils import logging from .configuration_deberta import DebertaConfig logger = logging.get_logger(__name__) - -convert_to_dtype = not version.parse(torch.__version__) < version.parse("1.11") - _CONFIG_FOR_DOC = "DebertaConfig" _TOKENIZER_FOR_DOC = "DebertaTokenizer" _CHECKPOINT_FOR_DOC = "microsoft/deberta-base" @@ -118,7 +115,7 @@ def forward(self, input, mask, dim): @staticmethod def backward(self, grad_output): (output,) = self.saved_tensors - inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype if convert_to_dtype else output) + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output) return inputGrad, None, None @staticmethod diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 3723dd312b38..108f08e4704a 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -19,8 +19,7 @@ import numpy as np import torch -from packaging import version -from torch import _softmax_backward_data, nn +from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss from ...activations import ACT2FN @@ -33,6 +32,7 @@ TokenClassifierOutput, ) from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import softmax_backward_data from ...utils import logging from .configuration_deberta_v2 import DebertaV2Config @@ -50,8 +50,6 @@ "microsoft/deberta-v2-xxlarge-mnli", ] -convert_to_dtype = not version.parse(torch.__version__) < version.parse("1.11") - # Copied from transformers.models.deberta.modeling_deberta.ContextPooler class ContextPooler(nn.Module): @@ -119,7 +117,7 @@ def forward(self, input, mask, dim): @staticmethod def backward(self, grad_output): (output,) = self.saved_tensors - inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype if convert_to_dtype else output) + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output) return inputGrad, None, None @staticmethod diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 2ecc2b9626a2..7443a67bcc8c 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -22,8 +22,7 @@ import numpy as np import torch import torch.utils.checkpoint -from packaging import version -from torch import _softmax_backward_data, nn +from torch import nn from torch.nn import CrossEntropyLoss, LayerNorm from transformers.deepspeed import is_deepspeed_zero3_enabled @@ -32,13 +31,12 @@ from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from ...modeling_outputs import BaseModelOutput, CausalLMOutput, SequenceClassifierOutput from ...modeling_utils import PreTrainedModel -from ...pytorch_utils import torch_int_div +from ...pytorch_utils import softmax_backward_data, torch_int_div from ...utils import logging from .configuration_sew_d import SEWDConfig logger = logging.get_logger(__name__) -convert_to_dtype = not version.parse(torch.__version__) < version.parse("1.11") _HIDDEN_STATES_START_POSITION = 1 @@ -546,7 +544,7 @@ def forward(self, input, mask, dim): @staticmethod def backward(self, grad_output): (output,) = self.saved_tensors - inputGrad = _softmax_backward_data(grad_output, output, self.dim, output.dtype if convert_to_dtype else output) + inputGrad = softmax_backward_data(self, grad_output, output, self.dim, output) return inputGrad, None, None @staticmethod diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index b41f438d9c3a..ee0c94bd9c70 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -14,18 +14,34 @@ import torch from packaging import version +from torch import _softmax_backward_data from .utils import logging logger = logging.get_logger(__name__) +is_torch_less_than_1_8 = version.parse(torch.__version__) < version.parse("1.8.0") +is_torch_less_than_1_11 = version.parse(torch.__version__) < version.parse("1.11") + def torch_int_div(tensor1, tensor2): """ A function that performs integer division across different versions of PyTorch. """ - if version.parse(torch.__version__) < version.parse("1.8.0"): + if is_torch_less_than_1_8: return tensor1 // tensor2 else: return torch.div(tensor1, tensor2, rounding_mode="floor") + + +def softmax_backward_data(parent, grad_output, output, dim, self): + """ + A function that calls the internal `_softmax_backward_data` PyTorch method and that adjusts the arguments according + to the torch version detected. + """ + + if is_torch_less_than_1_11: + return _softmax_backward_data(grad_output, output, parent.dim, self) + else: + return _softmax_backward_data(grad_output, output, parent.dim, self.dtype)