src/transformers/models/clip/modeling_clip.py (9 changes: 5 additions & 4 deletions)

@@ -319,9 +319,10 @@ def forward(
         values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
         # CLIP text model uses both `causal_attention_mask` and `attention_mask`
         # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
-        if self.config._attn_implementation == "flash_attention_2":
-            self.is_causal = causal_attention_mask is not None
+        if "flash" in self.config._attn_implementation:
+            is_causal = causal_attention_mask is not None
+        else:
+            is_causal = self.is_causal
         if attention_mask is not None and causal_attention_mask is not None:
             attention_mask = attention_mask + causal_attention_mask
         elif causal_attention_mask is not None:

@@ -337,7 +338,7 @@ def forward(
             keys,
             values,
             attention_mask,
-            is_causal=self.is_causal,
+            is_causal=is_causal,

Comment on lines -340 to +341

Member:
We can do `is_causal=causal_attention_mask is not None or self.is_text_attention` for all attention cases. Right now it probably isn't differentiating correctly between vision and text attention, because with the new masking utility the presence of a causal mask is not a reliable marker.

Contributor:
Would be careful with inline conditionals, they have caused trouble with torch compile in the past 😓

             scaling=self.scale,
             dropout=0.0 if not self.training else self.dropout,
             output_attentions=output_attentions,
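
A non-authoritative sketch of the Member's suggestion above: derive `is_causal` from either the presence of the causal mask or an explicit text/vision flag, so the decision no longer depends on the masking utility materializing a causal mask. The `is_text_attention` attribute is assumed to be set when the attention module is constructed; it is not part of the code shown in this diff.

# Hedged sketch of the reviewer's proposal, not the merged implementation.
# Assumption: the attention module records at construction time whether it
# belongs to the text tower (is_text_attention=True) or the vision tower.
class CLIPAttentionSketch:
    def __init__(self, is_text_attention: bool):
        self.is_text_attention = is_text_attention

    def resolve_is_causal(self, causal_attention_mask):
        # Causal if an explicit causal mask was passed in, or if this is the
        # text attention (CLIP's text tower is always causal), regardless of
        # whether the masking utility materialized the mask.
        return causal_attention_mask is not None or self.is_text_attention

If the Contributor's torch.compile concern applies, the expression can be assigned to a local variable before the attention call rather than written inline in the call itself.
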

@@ -611,7 +612,7 @@ def forward(
         )

         # expand attention_mask
-        if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
+        if attention_mask is not None and "flash" not in self.config._attn_implementation:
             # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
             attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
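
The widened check in the second hunk replaces an exact string comparison with a substring match, so any flash-attention backend skips the 4-D mask expansion rather than only the literal "flash_attention_2". A minimal illustration; the list of implementation names here is an assumption for the example, not an exhaustive set:

# Illustrative only: hypothetical implementation names.
for impl in ["eager", "sdpa", "flash_attention_2", "flash_attention_3"]:
    needs_4d_mask = "flash" not in impl
    print(f"{impl}: expand to 4-D mask -> {needs_4d_mask}")
# eager and sdpa still receive the expanded 4-D mask; both flash variants
# skip it, since flash kernels work from the unexpanded padding mask
# (or from is_causal) rather than an additive 4-D mask.
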
src/transformers/models/metaclip_2/modeling_metaclip_2.py (7 changes: 4 additions & 3 deletions)

@@ -219,9 +219,10 @@ def forward(
         values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
         # METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask`
         # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
-        if self.config._attn_implementation == "flash_attention_2":
-            self.is_causal = causal_attention_mask is not None
+        if "flash" in self.config._attn_implementation:
+            is_causal = causal_attention_mask is not None
+        else:
+            is_causal = self.is_causal
         if attention_mask is not None and causal_attention_mask is not None:
             attention_mask = attention_mask + causal_attention_mask
         elif causal_attention_mask is not None:

@@ -237,7 +238,7 @@ def forward(
             keys,
             values,
             attention_mask,
-            is_causal=self.is_causal,
+            is_causal=is_causal,
             scaling=self.scale,
             dropout=0.0 if not self.training else self.dropout,
             output_attentions=output_attentions,
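
METACLIP_2 mirrors the CLIP attention change one-for-one. As a self-contained sketch of the new control flow, assuming only the names visible in the diffs above, the mask and causality resolution can be read as a standalone helper:

# Minimal sketch of the merged branch structure, factored out for readability.
# `default_is_causal` stands in for the module's static `self.is_causal` flag.
def resolve_masks(attn_implementation, attention_mask, causal_attention_mask, default_is_causal):
    # Any flash backend infers causality from the causal mask alone; other
    # backends keep the module's default flag (previously self.is_causal).
    if "flash" in attn_implementation:
        is_causal = causal_attention_mask is not None
    else:
        is_causal = default_is_causal

    # Non-flash backends fold the causal mask into the additive attention mask.
    # The elif body is elided in the diff view; this follows the usual CLIP pattern.
    if attention_mask is not None and causal_attention_mask is not None:
        attention_mask = attention_mask + causal_attention_mask
    elif causal_attention_mask is not None:
        attention_mask = causal_attention_mask

    return attention_mask, is_causal

In the actual modules this logic stays inline in forward; the helper only makes the branch structure explicit, with the local is_causal passed to the attention interface in place of the old in-place write to self.is_causal.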