tests/models/siglip/test_modeling_siglip.py (8 changes: 3 additions & 5 deletions)

@@ -25,7 +25,6 @@
 
 from transformers import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
 from transformers.testing_utils import (
-    is_flaky,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
@@ -97,13 +96,13 @@ def __init__(
         self,
         parent,
         batch_size=12,
-        image_size=30,
+        image_size=4,
         patch_size=2,
         num_channels=3,
         is_training=True,
         hidden_size=64,
         num_hidden_layers=2,
-        num_attention_heads=4,
+        num_attention_heads=2,
         intermediate_size=37,
         dropout=0.1,
         attention_dropout=0.1,
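With `patch_size=2`, shrinking `image_size` from 30 to 4 cuts the number of vision patches the SiglipVision tester model processes from 225 to 4, presumably to keep the test forward pass tiny and fast. A quick check of that arithmetic (illustrative sketch only; `num_patches` is a hypothetical helper, not code from the PR):

# ViT-style models split an image into (image_size // patch_size) ** 2 patches.
def num_patches(image_size: int, patch_size: int) -> int:
    return (image_size // patch_size) ** 2

print(num_patches(30, 2))  # 225 patches with the old tester config
print(num_patches(4, 2))   # 4 patches with the new tester config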
@@ -255,7 +254,6 @@ def test_model_from_pretrained(self):
         self.assertIsNotNone(model)
 
     @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION)
-    @is_flaky()
Review comment from the Collaborator (PR author) on the removed `@is_flaky()` line: even with this (retrying 5 times), we still had some failures from time to time. Now we don't need it anymore.
     def test_eager_matches_sdpa_inference(self, *args):
         # adding only flaky decorator here and call the parent test method
         return getattr(ModelTesterMixin, self._testMethodName)(self)
@@ -273,7 +271,7 @@ def __init__(
         vocab_size=99,
         hidden_size=64,
         num_hidden_layers=2,
-        num_attention_heads=4,
+        num_attention_heads=2,
         intermediate_size=37,
         dropout=0.1,
         attention_dropout=0.1,
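The `is_flaky` decorator removed above is the retry helper from `transformers.testing_utils`: per the author's comment, it reruns a failing test (five attempts here), and even so the eager-vs-SDPA comparison still failed occasionally. A minimal sketch of such a retry decorator, shown only to illustrate the mechanism (`retry_on_failure` is a stand-in name; the real helper's signature and options may differ):

import functools


def retry_on_failure(max_attempts: int = 5):
    # Illustrative `is_flaky`-style decorator, not the transformers implementation.
    def decorator(test_func):
        @functools.wraps(test_func)
        def wrapper(*args, **kwargs):
            last_error = None
            for _ in range(max_attempts):
                try:
                    return test_func(*args, **kwargs)  # pass as soon as one attempt succeeds
                except AssertionError as err:
                    last_error = err
            raise last_error  # every attempt failed: surface the last failure

        return wrapper

    return decorator

Retrying only papers over a borderline tolerance, which is why the PR drops the decorator and instead makes the tolerance magnitude-aware in tests/test_modeling_common.py below.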
tests/test_modeling_common.py (15 changes: 15 additions & 0 deletions)

@@ -469,10 +469,25 @@ def _test_eager_matches_sdpa_inference(
 logits_sdpa = _logits_sdpa
 logits_eager = _logits_eager
+
+# Avoid test flakiness with bf16!
+# bf16 is not good at precision when the magnitude is larger. We have some models like `SiglipVision` with
+# this test passing all the time for fp32/fp16 but flaky with bf16. Furthermore, `llama` and `clip` have
+# this test passing all the time for bf16: it turns out their outputs are of smaller magnitude (0.1 and 1.0)
+# while `siglip` has outputs with maximal values around 3.0/4.0.
+outputs_magnitude = float(
+    (torch.max(logits_sdpa.abs().amax(), logits_eager.abs().amax())).detach().to("cpu")
+)
+# The choice of `3e-2` in `outputs_magnitude * 3e-2` might not work if a model has even larger outputs.
+# (We can try to analyze the `rtol` more closely element-wise in the future and adjust the `rtol` instead of `atol`.)
+computed_atol = outputs_magnitude * 3e-2
Review thread on lines +480 to +482 (the `computed_atol` comment and assignment):

Member: yeah, I think with rtol we can find one value that works for siglip and llama.

ydshieh (Collaborator, PR author), Sep 1, 2025: running this (parameterized) test 1000 times (for each set of parameters) for siglip, no failure. 😢
(but yeah, the comment here is more for future weird models :-) )

+if dtype == torch.bfloat16:
+    atol = max(atol, computed_atol)
+
 results = [
     torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol)
     for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager)
 ]
 
 # If 80% batch elements have matched results, it's fine
 if np.mean(results) < 0.8:
     mean_relative_diff = ((logits_sdpa - logits_eager).abs() / (logits_eager.abs() + 1e-12)).mean()
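Why a magnitude-scaled `atol` helps: assuming the standard `torch.allclose` check `|a - b| <= atol + rtol * |b|`, bf16 carries only about 8 bits of mantissa precision, so its representable spacing grows with magnitude. A fixed absolute tolerance that works for outputs around 0.1-1.0 (llama, clip) becomes too tight for outputs around 3.0-4.0 (siglip). Scaling `atol` by the peak output magnitude acts like a relative tolerance anchored at that peak, which is in the spirit of the reviewer's `rtol` suggestion while staying lenient for near-zero elements. An illustrative snippet (standalone sketch, not the test code itself):

import torch

# bf16 spacing grows with magnitude: eps is the spacing just above 1.0, and values
# just above 4.0 are spaced about 4 * eps apart, so a fixed small atol gets too tight there.
eps = torch.finfo(torch.bfloat16).eps
print(eps, 4.0 * eps)  # 0.0078125, 0.03125

# The adaptive tolerance from the diff, in isolation: with outputs peaking near 4.0,
# computed_atol = 4.0 * 3e-2 = 0.12, enough to absorb bf16-sized rounding at that scale.
logits_eager = torch.tensor([3.9, -0.5, 2.1])
logits_sdpa = logits_eager + 0.02  # pretend SDPA differs by a bf16-sized rounding error
outputs_magnitude = float(torch.max(logits_sdpa.abs().amax(), logits_eager.abs().amax()))
computed_atol = outputs_magnitude * 3e-2
print(torch.allclose(logits_sdpa, logits_eager, atol=computed_atol, rtol=0.0))  # True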