huggingface · patrickvonplaten · Mar 3, 2021 · Mar 3, 2021 · Mar 3, 2021 · Mar 3, 2021
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
@@ -643,7 +643,7 @@ def forward(
         attention_outputs = self_attention_outputs[2:]  # Keep self-attention outputs and relative position weights
 
         # clamp inf values to enable fp16 training
-        if torch.isinf(hidden_states).any():
+        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
             clamp_value = torch.finfo(hidden_states.dtype).max - 1000
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
 
@@ -668,7 +668,9 @@ def forward(
                 output_attentions=output_attentions,
             )
             hidden_states = cross_attention_outputs[0]
-            if torch.isinf(hidden_states).any():
+
+            # clamp inf values to enable fp16 training
+            if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
                 clamp_value = torch.finfo(hidden_states.dtype).max - 1000
                 hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
 
@@ -681,9 +683,12 @@ def forward(
 
         # Apply Feed Forward layer
         hidden_states = self.layer[-1](hidden_states)
-        if torch.isinf(hidden_states).any():
+
+        # clamp inf values to enable fp16 training
+        if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
             clamp_value = torch.finfo(hidden_states.dtype).max - 1000
             hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
         outputs = (hidden_states,)
 
         outputs = outputs + (present_key_value_state,) + attention_outputs