diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index e207914671b4..f4bf732b5322 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -299,6 +299,10 @@
         title: AceStepTransformer1DModel
       - local: api/models/allegro_transformer3d
         title: AllegroTransformer3DModel
+      - local: api/models/anyflow_far_transformer3d
+        title: AnyFlowFARTransformer3DModel
+      - local: api/models/anyflow_transformer3d
+        title: AnyFlowTransformer3DModel
       - local: api/models/aura_flow_transformer2d
         title: AuraFlowTransformer2DModel
       - local: api/models/transformer_bria_fibo
@@ -631,6 +635,8 @@
     - sections:
       - local: api/pipelines/allegro
         title: Allegro
+      - local: api/pipelines/anyflow
+        title: AnyFlow
       - local: api/pipelines/chronoedit
         title: ChronoEdit
       - local: api/pipelines/cogvideox
@@ -706,6 +712,8 @@
       title: EulerAncestralDiscreteScheduler
     - local: api/schedulers/euler
       title: EulerDiscreteScheduler
+    - local: api/schedulers/flow_map_euler_discrete
+      title: FlowMapEulerDiscreteScheduler
     - local: api/schedulers/flow_match_euler_discrete
       title: FlowMatchEulerDiscreteScheduler
     - local: api/schedulers/flow_match_heun_discrete
diff --git a/docs/source/en/api/models/anyflow_far_transformer3d.md b/docs/source/en/api/models/anyflow_far_transformer3d.md
new file mode 100644
index 000000000000..3a9909b4887a
--- /dev/null
+++ b/docs/source/en/api/models/anyflow_far_transformer3d.md
@@ -0,0 +1,45 @@
+<!-- Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AnyFlowFARTransformer3DModel
+
+The causal (FAR) 3D Transformer used by [`AnyFlowFARPipeline`](../pipelines/anyflow#anyflowfarpipeline) —
+the FAR variant of [AnyFlow](https://huggingface.co/papers/2605.13724) (Yuchao Gu, Guian Fang et al., NUS
+ShowLab × NVIDIA). It extends the v0.35.1 Wan2.1 backbone with three additions:
+
+1. **FAR causal block-mask** via `torch.nn.attention.flex_attention`, supporting frame-level autoregressive
+   generation as introduced in [FAR (Gu et al., 2025)](https://arxiv.org/abs/2503.19325).
+2. **Compressed-frame patch embedding** (`far_patch_embedding`) for context (already-generated) frames,
+   warm-started from the full-resolution `patch_embedding` at construction time via trilinear interpolation.
+3. **Dual-timestep flow-map embedding** (same as
+   [`AnyFlowTransformer3DModel`](anyflow_transformer3d)) — every forward call conditions on both the source
+   timestep ``t`` and the target timestep ``r``.
+
+The chunk schedule (`chunk_partition`) is **not** baked into the model config. It is a per-call argument to
+`forward`, so the same checkpoint handles different `num_frames` configurations without retraining.
+
+```python
+from diffusers import AnyFlowFARTransformer3DModel
+
+# Causal AnyFlow checkpoint (FAR):
+transformer = AnyFlowFARTransformer3DModel.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", subfolder="transformer"
+)
+```
+
+## AnyFlowFARTransformer3DModel
+
+[[autodoc]] AnyFlowFARTransformer3DModel
+
+## AnyFlowFARTransformerOutput
+
+[[autodoc]] models.transformers.transformer_anyflow_far.AnyFlowFARTransformerOutput
diff --git a/docs/source/en/api/models/anyflow_transformer3d.md b/docs/source/en/api/models/anyflow_transformer3d.md
new file mode 100644
index 000000000000..95888080c0ce
--- /dev/null
+++ b/docs/source/en/api/models/anyflow_transformer3d.md
@@ -0,0 +1,36 @@
+<!-- Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# AnyFlowTransformer3DModel
+
+The bidirectional 3D Transformer used by [`AnyFlowPipeline`](../pipelines/anyflow#anyflowpipeline). It is the
+v0.35.1 Wan2.1 backbone with one structural change: the timestep embedder is replaced by
+``AnyFlowDualTimestepTextImageEmbedding``, so every forward call conditions on both the source timestep
+``t`` and the target timestep ``r``. This is the embedding required to learn the flow map
+:math:`\Phi_{r\leftarrow t}` introduced in
+[AnyFlow](https://huggingface.co/papers/2605.13724) (Yuchao Gu, Guian Fang et al., NUS ShowLab × NVIDIA).
+
+For frame-level autoregressive (FAR causal) generation, use
+[`AnyFlowFARTransformer3DModel`](anyflow_far_transformer3d) instead.
+
+```python
+from diffusers import AnyFlowTransformer3DModel
+
+# Bidirectional AnyFlow checkpoint (T2V):
+transformer = AnyFlowTransformer3DModel.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-1.3B-Diffusers", subfolder="transformer"
+)
+```
+
+## AnyFlowTransformer3DModel
+
+[[autodoc]] AnyFlowTransformer3DModel
diff --git a/docs/source/en/api/pipelines/anyflow.md b/docs/source/en/api/pipelines/anyflow.md
new file mode 100644
index 000000000000..9358b8d454fc
--- /dev/null
+++ b/docs/source/en/api/pipelines/anyflow.md
@@ -0,0 +1,218 @@
+<!-- Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <a href="https://github.com/huggingface/diffusers/blob/main/src/diffusers/loaders/lora_pipeline.py">
+            <img alt="LoRA" src="https://img.shields.io/badge/LoRA-supported-green">
+        </a>
+    </div>
+</div>
+
+# AnyFlow
+
+[AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation](https://huggingface.co/papers/2605.13724) by Yuchao Gu, Guian Fang and collaborators at [NUS ShowLab](https://sites.google.com/view/showlab) in collaboration with NVIDIA.
+
+*Few-step video generation has been significantly advanced by consistency models. However, their performance often degrades in any-step video diffusion models due to the fixed-point formulation. To address this limitation, we present AnyFlow, the first any-step video diffusion distillation framework built on flow maps. Instead of learning only the mapping z_t → z_0, AnyFlow learns transitions z_t → z_r over arbitrary time intervals, enabling a single model to adapt to different inference budgets. We design an improved forward flow map training recipe that fine-tunes pretrained video diffusion models into flow map models, and introduce Flow Map Backward Simulation to enable on-policy distillation for flow map models. Extensive experiments across both bidirectional and causal architectures, at scales ranging from 1.3B to 14B, on text-to-video and image-to-video tasks demonstrate that AnyFlow outperforms consistency-based baselines while preserving high fidelity and flexible sampling under varying step budgets.*
+
+The original training code is at [`NVlabs/AnyFlow`](https://github.com/NVlabs/AnyFlow). The project page is at [nvlabs.github.io/AnyFlow](https://nvlabs.github.io/AnyFlow).
+
+The following AnyFlow checkpoints are supported:
+
+| Checkpoint | Backbone | Description |
+|------------|----------|-------------|
+| [`nvidia/AnyFlow-Wan2.1-T2V-1.3B-Diffusers`](https://huggingface.co/nvidia/AnyFlow-Wan2.1-T2V-1.3B-Diffusers) | Wan2.1 1.3B | Bidirectional T2V, lightweight |
+| [`nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers`](https://huggingface.co/nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers) | Wan2.1 14B | Bidirectional T2V, full quality |
+| [`nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers`](https://huggingface.co/nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers) | FAR + Wan2.1 1.3B | Causal T2V / I2V / V2V |
+| [`nvidia/AnyFlow-FAR-Wan2.1-14B-Diffusers`](https://huggingface.co/nvidia/AnyFlow-FAR-Wan2.1-14B-Diffusers) | FAR + Wan2.1 14B | Causal T2V / I2V / V2V |
+
+All four are grouped under the [`nvidia/anyflow`](https://huggingface.co/collections/nvidia/anyflow) Hugging Face collection.
+
+> [!TIP]
+> Choose `AnyFlowPipeline` for traditional bidirectional text-to-video generation. Choose `AnyFlowFARPipeline` for streaming I2V, video continuation (V2V), or any setup that benefits from frame-by-frame autoregressive sampling.
+
+> [!TIP]
+> AnyFlow supports any-step sampling: a single distilled checkpoint can be evaluated at 1, 2, 4, 8, 16... NFE without retraining. Quality scales monotonically with steps in our benchmarks.
+
+### Optimizing Memory and Inference Speed
+
+<hfoptions id="optimization">
+<hfoption id="memory">
+
+```py
+import torch
+from diffusers import AnyFlowPipeline
+from diffusers.hooks import apply_group_offloading
+
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16
+)
+apply_group_offloading(pipe.transformer, onload_device="cuda", offload_type="leaf_level")
+pipe.vae.enable_slicing()
+pipe.vae.enable_tiling()
+```
+
+</hfoption>
+<hfoption id="inference speed">
+
+```py
+import torch
+from diffusers import AnyFlowPipeline
+
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")
+```
+
+</hfoption>
+</hfoptions>
+
+### Generation with AnyFlow (Bidirectional T2V)
+
+<hfoptions id="anyflow-bidi">
+<hfoption id="usage">
+
+```py
+import torch
+from diffusers import AnyFlowPipeline
+from diffusers.utils import export_to_video
+
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+prompt = "A red panda eating bamboo in a forest, cinematic lighting"
+video = pipe(prompt, num_inference_steps=4, num_frames=33).frames[0]
+export_to_video(video, "out.mp4", fps=16)
+```
+
+</hfoption>
+</hfoptions>
+
+### Generation with AnyFlow (FAR Causal)
+
+The causal pipeline selects between T2V / I2V / V2V via the ``video`` (or ``video_latents``) argument:
+omit both for plain text-to-video, or pass ``video=<tensor>`` of shape ``(B, T, C, H, W)`` in ``[0, 1]``
+with ``T = 4n + 1`` to condition on existing frames. Use a single conditioning frame for I2V and a longer
+clip for V2V continuation. If you already have pre-encoded latents in the model layout, pass them via
+``video_latents=<tensor>`` to skip VAE encoding. ``video`` and ``video_latents`` are mutually exclusive.
+
+> [!IMPORTANT]
+> `AnyFlowFARPipeline.default_chunk_partition = [1, 3, 3, 3, 3, 3, 3, 2]` (sum 21) is matched to the
+> released checkpoints' canonical 81 raw frames (21 latent frames at the VAE temporal stride of 4). When
+> you change `num_frames`, you must also pass a matching `chunk_partition` summing to
+> `(num_frames - 1) // 4 + 1`, otherwise the pipeline raises an `AssertionError`.
+
+<hfoptions id="anyflow-far">
+<hfoption id="t2v">
+
+```py
+import torch
+from diffusers import AnyFlowFARPipeline
+from diffusers.utils import export_to_video
+
+pipe = AnyFlowFARPipeline.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+video = pipe(
+    prompt="A cat surfing a wave, sunset",
+    num_inference_steps=4,
+    num_frames=81,
+).frames[0]
+export_to_video(video, "out.mp4", fps=16)
+```
+
+</hfoption>
+<hfoption id="i2v">
+
+```py
+import numpy as np
+import torch
+from diffusers import AnyFlowFARPipeline
+from diffusers.utils import export_to_video, load_image
+
+pipe = AnyFlowFARPipeline.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Wrap the conditioning image as a one-frame video tensor: (1, 1, 3, H, W) in [0, 1].
+first_frame = load_image("path/to/first_frame.png").resize((832, 480))
+arr = np.asarray(first_frame).astype("float32") / 255.0  # (480, 832, 3)
+context_tensor = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).unsqueeze(1).to("cuda")
+
+video = pipe(
+    prompt="a cat walks across a sunlit lawn",
+    video=context_tensor,
+    num_inference_steps=4,
+    num_frames=81,
+).frames[0]
+export_to_video(video, "out.mp4", fps=16)
+```
+
+</hfoption>
+<hfoption id="v2v">
+
+```py
+import numpy as np
+import torch
+from diffusers import AnyFlowFARPipeline
+from diffusers.utils import export_to_video, load_video
+
+pipe = AnyFlowFARPipeline.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Context clip — 9 raw frames map to 3 latent frames (9 = 4·2 + 1, 3 = 2 + 1).
+context_frames = load_video("path/to/context.mp4")[:9]
+arr = np.stack([np.asarray(f.resize((832, 480))) for f in context_frames]).astype("float32") / 255.0
+# np.stack gives (T, H, W, C) = (9, 480, 832, 3) → permute to (T, C, H, W) then add batch.
+context_tensor = torch.from_numpy(arr).permute(0, 3, 1, 2).unsqueeze(0).to("cuda")  # (1, 9, 3, 480, 832)
+
+video = pipe(
+    prompt="continue the story",
+    video=context_tensor,
+    num_inference_steps=4,
+    num_frames=81,
+    # Override chunk_partition so the first chunk covers exactly the 3 latent context frames.
+    chunk_partition=[3, 3, 3, 3, 3, 3, 3],
+).frames[0]
+export_to_video(video, "out.mp4", fps=16)
+```
+
+</hfoption>
+</hfoptions>
+
+## Notes
+
+- Classifier-free guidance is fused into the released checkpoints, so inference does not run a second guided forward pass. Keep the default `guidance_scale=1.0` unless your own checkpoint requires otherwise.
+- `FlowMapEulerDiscreteScheduler` is general-purpose. You can attach it to any flow-map-distilled checkpoint via `from_pretrained(..., scheduler=FlowMapEulerDiscreteScheduler.from_config(...))`.
+- `AnyFlowPipeline` uses [`AnyFlowTransformer3DModel`](../models/anyflow_transformer3d) (bidirectional). `AnyFlowFARPipeline` uses [`AnyFlowFARTransformer3DModel`](../models/anyflow_far_transformer3d), which adds a compressed-frame patch embedding and the FAR causal block-mask.
+- LoRA loading is supported via `WanLoraLoaderMixin`, the same mixin used by the upstream Wan pipelines.
+- For training recipes (forward flow-map training and on-policy distillation), refer to the original AnyFlow training framework at [`NVlabs/AnyFlow`](https://github.com/NVlabs/AnyFlow); training is out of scope for diffusers.
+
+## AnyFlowPipeline
+
+[[autodoc]] AnyFlowPipeline
+  - all
+  - __call__
+
+## AnyFlowFARPipeline
+
+[[autodoc]] AnyFlowFARPipeline
+  - all
+  - __call__
+
+## AnyFlowPipelineOutput
+
+[[autodoc]] pipelines.anyflow.pipeline_output.AnyFlowPipelineOutput
diff --git a/docs/source/en/api/schedulers/flow_map_euler_discrete.md b/docs/source/en/api/schedulers/flow_map_euler_discrete.md
new file mode 100644
index 000000000000..27a0c8612d70
--- /dev/null
+++ b/docs/source/en/api/schedulers/flow_map_euler_discrete.md
@@ -0,0 +1,28 @@
+<!-- Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# FlowMapEulerDiscreteScheduler
+
+`FlowMapEulerDiscreteScheduler` is an Euler-style sampler designed for flow-map-distilled diffusion
+models. Flow-map models learn arbitrary-interval transitions $\mathbf{z}_t \to \mathbf{z}_r$ rather than
+the fixed $\mathbf{z}_t \to \mathbf{z}_0$ mapping of consistency models. Both endpoints of the step are
+caller-provided, which is what enables any-step sampling: a single distilled checkpoint can be evaluated at
+1, 2, 4, 8, 16... NFE without retraining.
+
+The scheduler was introduced in
+[AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation](https://huggingface.co/papers/2605.13724)
+and ships with the `AnyFlowPipeline` and `AnyFlowFARPipeline` integrations, but it is not
+AnyFlow-specific — any flow-map-distilled checkpoint can use it.
+
+## FlowMapEulerDiscreteScheduler
+
+[[autodoc]] FlowMapEulerDiscreteScheduler
diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml
index af51506746b2..b49820dd76e7 100644
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -130,6 +130,8 @@
 - title: Specific pipeline examples
   isExpanded: false
   sections:
+  - local: using-diffusers/anyflow
+    title: AnyFlow
   - local: using-diffusers/consisid
     title: ConsisID
   - local: using-diffusers/helios
diff --git a/docs/source/zh/using-diffusers/anyflow.md b/docs/source/zh/using-diffusers/anyflow.md
new file mode 100644
index 000000000000..575cdb1c1cb8
--- /dev/null
+++ b/docs/source/zh/using-diffusers/anyflow.md
@@ -0,0 +1,253 @@
+<!-- Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is
+distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See
+the License for the specific language governing permissions and limitations under the License.
+-->
+
+# AnyFlow
+
+[AnyFlow](https://huggingface.co/papers/2605.13724) 是一个视频扩散**蒸馏**框架，把预训练的 Wan2.1 教师
+模型蒸馏成在标准 Euler 采样下支持*任意步数 (any-step)* 的学生模型。同一个蒸馏出来的 checkpoint 可以
+在 1、2、4、8、16... NFE 下推理，**质量随步数单调提升** —— 这一点和 consistency models 不同，后者
+NFE 增加反而经常掉点。
+
+核心思路是学习 **flow map** $\Phi_{r\leftarrow t}: \mathbf{z}_t \to \mathbf{z}_r$（任意 $1 \ge t \ge r \ge 0$），
+而不是 consistency models 学的固定端点映射 $\mathbf{z}_t \to \mathbf{z}_0$。Flow map 的可组合性消除了
+采样步之间的 re-noising；on-policy 蒸馏阶段额外用 **DMD 反向散度监督** + **Flow-Map backward simulation**
+（3 段 shortcut）补上 consistency 蒸馏遗留的 exposure-bias 缺口。
+
+AnyFlow 由 Yuchao Gu、Guian Fang 等人在 [NUS ShowLab](https://sites.google.com/view/showlab) 与 NVIDIA 合作完成。原始训练代码在 [`NVlabs/AnyFlow`](https://github.com/NVlabs/AnyFlow)，项目主页是 [nvlabs.github.io/AnyFlow](https://nvlabs.github.io/AnyFlow)。4 个发布 checkpoint 归在 [`nvidia/anyflow`](https://huggingface.co/collections/nvidia/anyflow) Hugging Face collection 里。
+
+本文档梳理实战要点：怎么选 pipeline、怎么用 any-step 采样、怎么把 AnyFlow 嵌进 T2V / I2V / V2V 工作流。
+
+## Bidirectional 还是 Causal —— 怎么选 pipeline
+
+AnyFlow 提供两个 pipeline 形态，scheduler 和蒸馏方法相同，区别在于**怎么对帧采样**：
+
+- [`AnyFlowPipeline`](../api/pipelines/anyflow#anyflowpipeline) —— **bidirectional** T2V。一次性对整个
+  视频张量去噪，全局自注意力。**纯 prompt 输入、不要流式输出**时选这个。
+- [`AnyFlowFARPipeline`](../api/pipelines/anyflow#anyflowfarpipeline) —— **causal (FAR)**。
+  按 chunk 分段去噪，块稀疏因果注意力 + 跨 chunk 复用 KV cache。**图生视频 (I2V)**、**视频续写 (V2V)**、
+  或任何受益于逐帧自回归采样的场景选这个。同一个模型通过 `video`（像素空间）或 `video_latents`
+  （已编码 latent）这两个互斥 kwarg 来切换三种任务模式。
+
+简化对照表：
+
+| 场景 | Pipeline | 调用方式 |
+|------|----------|----------|
+| 纯文生视频，固定 NFE 求最大质量 | `AnyFlowPipeline` | `pipe(prompt, ...)` |
+| 图生视频（首帧给定） | `AnyFlowFARPipeline` | `pipe(prompt, video=<单帧 tensor>, ...)` |
+| 视频续写 / V2V | `AnyFlowFARPipeline` | `pipe(prompt, video=<多帧 tensor>, ...)` |
+| 流式 / 渐进式生成 | `AnyFlowFARPipeline` | — |
+
+高分辨率下 bidirectional 单 token 更快；causal 牺牲一点单步速度，换来在所有 latent 帧分配前就能开始
+采样的能力，对超长序列尤其有用。
+
+## 加载 checkpoint
+
+NVIDIA 发布了 4 个 AnyFlow checkpoint，pipeline × 规模各一份：
+
+```py
+import torch
+from diffusers import AnyFlowPipeline, AnyFlowFARPipeline
+
+# Bidirectional, 轻量
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Bidirectional, 满血
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Causal (FAR), 1.3B
+pipe = AnyFlowFARPipeline.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+# Causal (FAR), 14B
+pipe = AnyFlowFARPipeline.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-14B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+```
+
+四个 checkpoint 共用同一份 [`FlowMapEulerDiscreteScheduler`](../api/schedulers/flow_map_euler_discrete)，
+默认 `shift=5.0`。
+
+## Any-step 采样
+
+AnyFlow 最关键的特性是同一个 checkpoint **不需重新调度**，NFE 越大质量越高。固定 prompt、扫一下步数
+就能看出模型怎么在延迟和保真度之间权衡：
+
+```py
+import torch
+from diffusers import AnyFlowPipeline
+from diffusers.utils import export_to_video
+
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+prompt = "森林里一只小熊猫在啃竹子，电影感光照"
+
+for nfe in [1, 2, 4, 8, 16, 32]:
+    # 每轮重建 generator —— 这样跨步数对比时唯一变量是 NFE。
+    generator = torch.Generator("cuda").manual_seed(0)
+    video = pipe(prompt, num_inference_steps=nfe, num_frames=33, generator=generator).frames[0]
+    export_to_video(video, f"out_nfe{nfe}.mp4", fps=16)
+```
+
+paper 的 Tab 3 / Fig 1 表明：每个 AnyFlow checkpoint 在 4 → 32 NFE 范围 VBench Quality 都单调上升，而
+consistency 类基线（rCM、Self-Forcing）在同区间反而掉点。
+
+> [!TIP]
+> Classifier-free guidance (CFG) 已经在训练阶段融进权重。pipeline 推理
+> 时**不会**再跑一次 unconditional 前向 —— guidance 直接由蒸馏后的权重带出。release 出来的 checkpoint
+> 都用默认的 `guidance_scale=1.0` 即可。
+
+## 图生视频 与 视频续写
+
+Causal pipeline 用同一个蒸馏模型支持三种任务模式，**通过 `video` / `video_latents` 二选一来选**：
+
+- `video` —— 像素空间张量，形状 `(B, T, C, H, W)` ∈ `[0, 1]`，pipeline 内部会过一遍 `VideoProcessor`
+  + VAE 编码；
+- `video_latents` —— 已经在模型布局下的 latent，跳过 VAE 编码；
+- 两者都不传 —— 纯文生视频；
+- 两者同时传 —— 抛 `ValueError`（互斥）。
+
+Context tensor 的帧数必须满足 `T = 4n + 1`，跟 VAE 时间步长对齐。
+
+> [!IMPORTANT]
+> FAR pipeline 是分块 (chunk) rollout，`num_frames` 必须配合 chunk 调度。默认
+> `chunk_partition=[1, 3, 3, 3, 3, 3, 3, 2]`（求和 21）对应发布 checkpoint 的标准 `num_frames=81`
+> （21 = (81 − 1) // 4 + 1）。改 `num_frames` 时**必须**显式传匹配的 `chunk_partition`，使其求和等于
+> `(num_frames - 1) // 4 + 1`，否则 pipeline 会抛 `AssertionError`。比如 `num_frames=33` 对应 9 个 latent
+> 帧，可用 `chunk_partition=[1, 4, 4]`。
+
+```py
+import numpy as np
+import torch
+from diffusers import AnyFlowFARPipeline
+from diffusers.utils import export_to_video, load_image, load_video
+
+pipe = AnyFlowFARPipeline.from_pretrained(
+    "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", torch_dtype=torch.bfloat16
+).to("cuda")
+
+
+def to_video_tensor(images, height=480, width=832):
+    """把 PIL 列表转成 FAR pipeline 需要的 (B, T, C, H, W) [0, 1] 张量。"""
+    frames = np.stack([np.asarray(img.resize((width, height))) for img in images]).astype("float32") / 255.0
+    # frames: (T, H, W, C) → (T, C, H, W) → 加 batch 维 → (1, T, C, H, W)
+    return torch.from_numpy(frames).permute(0, 3, 1, 2).unsqueeze(0)
+
+
+# 1) 文生视频（无 context）。81 帧匹配默认 chunk_partition。
+video = pipe(prompt="一只猫在夕阳下冲浪", num_inference_steps=4, num_frames=81).frames[0]
+export_to_video(video, "t2v.mp4", fps=16)
+
+# 2) 图生视频 —— 单帧 context 经过 VAE 是 1 个 latent，正好对上默认 chunk_partition 的第一项 (`[1, ...]`)。
+first_frame = load_image("path/to/first_frame.png")
+context_tensor = to_video_tensor([first_frame]).to("cuda")  # (1, 1, 3, 480, 832), [0, 1]
+video = pipe(
+    prompt="一只猫走过阳光下的草坪",
+    video=context_tensor,
+    num_inference_steps=4,
+    num_frames=81,
+).frames[0]
+export_to_video(video, "i2v.mp4", fps=16)
+
+# 3) 视频续写。9 帧 raw context → 3 个 latent context；显式覆盖 chunk_partition，让第一块正好覆盖 context。
+context_frames = load_video("path/to/context.mp4")[:9]  # 9 = 4·2 + 1
+context_tensor = to_video_tensor(context_frames).to("cuda")  # (1, 9, 3, 480, 832)
+video = pipe(
+    prompt="继续这个故事",
+    video=context_tensor,
+    num_inference_steps=4,
+    num_frames=81,
+    chunk_partition=[3, 3, 3, 3, 3, 3, 3],  # 7 个 chunk × 3 = 21 latent；首块就是 context
+).frames[0]
+export_to_video(video, "v2v.mp4", fps=16)
+```
+
+底层 patchify chunk 调度根据 `video` / `video_latents` 是否给定自动调整：纯文生用 kernel 2 (full) 和
+4 (compressed)；有 context 时第一个 chunk 改成 kernel 1，让条件帧保留全分辨率。
+
+如果你已经有 VAE 编码过的 latent，可以直接传 `video_latents=<tensor>` 跳过 `vae_encode` 步骤
+（和 `video` 互斥）。
+
+## 显存与推理速度
+
+14B 的 AnyFlow 模型用 group offload + VAE slicing 单卡 40 GB 能跑：
+
+```py
+import torch
+from diffusers import AnyFlowPipeline
+from diffusers.hooks import apply_group_offloading
+
+pipe = AnyFlowPipeline.from_pretrained(
+    "nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16
+)
+apply_group_offloading(pipe.transformer, onload_device="cuda", offload_type="leaf_level")
+pipe.vae.enable_slicing()
+pipe.vae.enable_tiling()
+```
+
+延迟方面，`torch.compile` 对 transformer（最重的模块）效果很好：
+
+```py
+pipe = pipe.to("cuda")
+pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")
+```
+
+编译开销跑几步就摊销掉；配合 AnyFlow 的低 NFE（4-8 步），`torch.compile` 在 14B 上相比 eager
+模式有明显加速。
+
+## LoRA 微调
+
+两个 pipeline 都复用 [`WanLoraLoaderMixin`](../api/loaders/lora)，因此为对应 Wan2.1 backbone 训练的
+LoRA adapter 直接加载即可：
+
+```py
+pipe.load_lora_weights("path/or/repo/with/wan_lora")
+```
+
+如果要做**继续 on-policy 蒸馏微调**（用论文里相同的 DMD 反向散度监督配方训新 LoRA），请参考原始
+AnyFlow 训练框架 [`NVlabs/AnyFlow`](https://github.com/NVlabs/AnyFlow)，这套训练流程不在
+diffusers 范围内。
+
+## 常见坑
+
+- **永远 `guidance_scale=1.0`。** 蒸馏后的 checkpoint 已经把 CFG 融进权重。设 `> 1` 会多跑一遍
+  unconditional 前向、延迟翻倍、质量微降。
+- **Bidirectional pipeline 不支持流式。** 所有 `num_frames` 一起去噪。需要边采边播请用 causal pipeline。
+- **Causal pipeline KV cache 假设 chunk 调度跨调用一致。** 中途重建 cache 不被 release 模型支持。
+- **`num_frames` 必须满足 VAE 时间步长。** release checkpoint 用 `(N - 1) % 4 == 0` 的值（如 9、17、33、81）。
+
+## 引用
+
+```bibtex
+@misc{gu2026anyflowanystepvideodiffusion,
+  title={AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map Distillation},
+  author={Yuchao Gu and Guian Fang and Yuxin Jiang and Weijia Mao and Song Han and Han Cai and Mike Zheng Shou},
+  year={2026},
+  eprint={2605.13724},
+  archivePrefix={arXiv},
+  primaryClass={cs.CV},
+  url={https://arxiv.org/abs/2605.13724},
+}
+
+@article{gu2025long,
+  title={Long-Context Autoregressive Video Modeling with Next-Frame Prediction},
+  author={Gu, Yuchao and Mao, Weijia and Shou, Mike Zheng},
+  journal={arXiv preprint arXiv:2503.19325},
+  year={2025}
+}
+```
diff --git a/scripts/convert_anyflow_to_diffusers.py b/scripts/convert_anyflow_to_diffusers.py
new file mode 100644
index 000000000000..60574ca23a1e
--- /dev/null
+++ b/scripts/convert_anyflow_to_diffusers.py
@@ -0,0 +1,152 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Convert AnyFlow training checkpoints to the diffusers ``save_pretrained`` layout.
+
+The AnyFlow training pipeline emits ``.pt`` files containing an ``ema`` key whose value is a flat state
+dict for the transformer. This script:
+
+1. Loads the matching base Wan2.1 pipeline from the Hub (provides VAE, tokenizer, and text encoder).
+2. Constructs an ``AnyFlowTransformer3DModel`` with the right config flags for the chosen variant.
+3. Loads the ``ema`` weights into the transformer.
+4. Wraps everything in an ``AnyFlowPipeline`` (bidirectional) or ``AnyFlowFARPipeline`` (FAR causal).
+5. Calls ``pipeline.save_pretrained(output_dir)``.
+
+Example:
+
+```bash
+python scripts/convert_anyflow_to_diffusers.py \\
+    --variant AnyFlow-FAR-Wan2.1-1.3B-Diffusers \\
+    --ckpt /path/to/anyflow-checkpoint.pt \\
+    --output-dir /path/to/output/AnyFlow-FAR-Wan2.1-1.3B-Diffusers
+```
+"""
+
+import argparse
+import logging
+import os
+
+import torch
+
+from diffusers import (
+    AnyFlowFARPipeline,
+    AnyFlowFARTransformer3DModel,
+    AnyFlowPipeline,
+    AnyFlowTransformer3DModel,
+    FlowMapEulerDiscreteScheduler,
+)
+
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+
+
+# Per-variant configuration. ``base_model`` is fetched from the Hub to source the matching VAE / text encoder.
+VARIANTS = {
+    "AnyFlow-FAR-Wan2.1-1.3B-Diffusers": {
+        "base_model": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+        "transformer_cls": AnyFlowFARTransformer3DModel,
+        "transformer_kwargs": {"full_chunk_limit": 3, "compressed_patch_size": [1, 4, 4]},
+        "pipeline_cls": AnyFlowFARPipeline,
+    },
+    "AnyFlow-FAR-Wan2.1-14B-Diffusers": {
+        "base_model": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+        "transformer_cls": AnyFlowFARTransformer3DModel,
+        "transformer_kwargs": {"full_chunk_limit": 3, "compressed_patch_size": [1, 4, 4]},
+        "pipeline_cls": AnyFlowFARPipeline,
+    },
+    "AnyFlow-Wan2.1-T2V-1.3B-Diffusers": {
+        "base_model": "Wan-AI/Wan2.1-T2V-1.3B-Diffusers",
+        "transformer_cls": AnyFlowTransformer3DModel,
+        "transformer_kwargs": {},
+        "pipeline_cls": AnyFlowPipeline,
+    },
+    "AnyFlow-Wan2.1-T2V-14B-Diffusers": {
+        "base_model": "Wan-AI/Wan2.1-T2V-14B-Diffusers",
+        "transformer_cls": AnyFlowTransformer3DModel,
+        "transformer_kwargs": {},
+        "pipeline_cls": AnyFlowPipeline,
+    },
+}
+
+
+def build_pipeline(variant: str, ckpt_path: str):
+    if variant not in VARIANTS:
+        raise ValueError(f"Unknown variant {variant!r}. Choices: {list(VARIANTS)}.")
+    spec = VARIANTS[variant]
+
+    transformer = spec["transformer_cls"].from_pretrained(
+        spec["base_model"],
+        subfolder="transformer",
+        gate_value=0.25,
+        deltatime_type="r",
+        **spec["transformer_kwargs"],
+    )
+    # NVlabs/AnyFlow training checkpoints are wrapped Python objects (the `ema` key carries metadata
+    # alongside tensors), so the unpickle is required. Only run this script on checkpoints you trust.
+    state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=False)["ema"]
+    missing, unexpected = transformer.load_state_dict(state_dict, strict=False)
+    if unexpected:
+        logger.warning(
+            "Unexpected keys in state dict (ignored): %s%s",
+            unexpected[:5],
+            "..." if len(unexpected) > 5 else "",
+        )
+    if missing:
+        logger.warning(
+            "Missing keys not loaded from state dict: %s%s",
+            missing[:5],
+            "..." if len(missing) > 5 else "",
+        )
+
+    scheduler = FlowMapEulerDiscreteScheduler(num_train_timesteps=1000, shift=5.0)
+
+    pipeline = spec["pipeline_cls"].from_pretrained(
+        spec["base_model"],
+        transformer=transformer,
+        scheduler=scheduler,
+    )
+    return pipeline
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert an AnyFlow training checkpoint into a diffusers pipeline directory."
+    )
+    parser.add_argument(
+        "--variant",
+        required=True,
+        choices=list(VARIANTS),
+        help="Which AnyFlow variant the checkpoint corresponds to.",
+    )
+    parser.add_argument(
+        "--ckpt",
+        required=True,
+        help="Path to the AnyFlow training checkpoint (a .pt file containing an 'ema' key).",
+    )
+    parser.add_argument(
+        "--output-dir",
+        required=True,
+        help="Destination directory for pipeline.save_pretrained.",
+    )
+    args = parser.parse_args()
+
+    os.makedirs(args.output_dir, exist_ok=True)
+    pipeline = build_pipeline(args.variant, args.ckpt)
+    pipeline.save_pretrained(args.output_dir)
+    logger.info("Saved %s pipeline to %s", args.variant, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py
index d120d0a22818..3a8332dc0c3a 100644
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -191,6 +191,8 @@
         [
             "AceStepTransformer1DModel",
             "AllegroTransformer3DModel",
+            "AnyFlowFARTransformer3DModel",
+            "AnyFlowTransformer3DModel",
             "AsymmetricAutoencoderKL",
             "AttentionBackendName",
             "AuraFlowTransformer2DModel",
@@ -380,6 +382,7 @@
             "EDMEulerScheduler",
             "EulerAncestralDiscreteScheduler",
             "EulerDiscreteScheduler",
+            "FlowMapEulerDiscreteScheduler",
             "FlowMatchEulerDiscreteScheduler",
             "FlowMatchHeunDiscreteScheduler",
             "FlowMatchLCMScheduler",
@@ -511,6 +514,8 @@
             "AnimateDiffSparseControlNetPipeline",
             "AnimateDiffVideoToVideoControlNetPipeline",
             "AnimateDiffVideoToVideoPipeline",
+            "AnyFlowFARPipeline",
+            "AnyFlowPipeline",
             "AudioLDM2Pipeline",
             "AudioLDM2ProjectionModel",
             "AudioLDM2UNet2DConditionModel",
@@ -1019,6 +1024,8 @@
         from .models import (
             AceStepTransformer1DModel,
             AllegroTransformer3DModel,
+            AnyFlowFARTransformer3DModel,
+            AnyFlowTransformer3DModel,
             AsymmetricAutoencoderKL,
             AttentionBackendName,
             AuraFlowTransformer2DModel,
@@ -1204,6 +1211,7 @@
             EDMEulerScheduler,
             EulerAncestralDiscreteScheduler,
             EulerDiscreteScheduler,
+            FlowMapEulerDiscreteScheduler,
             FlowMatchEulerDiscreteScheduler,
             FlowMatchHeunDiscreteScheduler,
             FlowMatchLCMScheduler,
@@ -1316,6 +1324,8 @@
             AnimateDiffSparseControlNetPipeline,
             AnimateDiffVideoToVideoControlNetPipeline,
             AnimateDiffVideoToVideoPipeline,
+            AnyFlowFARPipeline,
+            AnyFlowPipeline,
             AudioLDM2Pipeline,
             AudioLDM2ProjectionModel,
             AudioLDM2UNet2DConditionModel,
diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py
index ff8e16aad447..a4aea6361ece 100755
--- a/src/diffusers/models/__init__.py
+++ b/src/diffusers/models/__init__.py
@@ -95,6 +95,8 @@
     _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
     _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
     _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"]
+    _import_structure["transformers.transformer_anyflow"] = ["AnyFlowTransformer3DModel"]
+    _import_structure["transformers.transformer_anyflow_far"] = ["AnyFlowFARTransformer3DModel"]
     _import_structure["transformers.transformer_bria"] = ["BriaTransformer2DModel"]
     _import_structure["transformers.transformer_bria_fibo"] = ["BriaFiboTransformer2DModel"]
     _import_structure["transformers.transformer_chroma"] = ["ChromaTransformer2DModel"]
@@ -214,6 +216,8 @@
         from .transformers import (
             AceStepTransformer1DModel,
             AllegroTransformer3DModel,
+            AnyFlowFARTransformer3DModel,
+            AnyFlowTransformer3DModel,
             AuraFlowTransformer2DModel,
             BriaFiboTransformer2DModel,
             BriaTransformer2DModel,
diff --git a/src/diffusers/models/transformers/__init__.py b/src/diffusers/models/transformers/__init__.py
index 156b54e7f07d..bb10b101c1b9 100755
--- a/src/diffusers/models/transformers/__init__.py
+++ b/src/diffusers/models/transformers/__init__.py
@@ -18,6 +18,8 @@
     from .t5_film_transformer import T5FilmDecoder
     from .transformer_2d import Transformer2DModel
     from .transformer_allegro import AllegroTransformer3DModel
+    from .transformer_anyflow import AnyFlowTransformer3DModel
+    from .transformer_anyflow_far import AnyFlowFARTransformer3DModel
     from .transformer_bria import BriaTransformer2DModel
     from .transformer_bria_fibo import BriaFiboTransformer2DModel
     from .transformer_chroma import ChromaTransformer2DModel
diff --git a/src/diffusers/models/transformers/transformer_anyflow.py b/src/diffusers/models/transformers/transformer_anyflow.py
new file mode 100644
index 000000000000..4ac1af4c0d0b
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_anyflow.py
@@ -0,0 +1,726 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file derives from the FAR architecture (Gu et al., 2025, arXiv:2503.19325) and adds the
+# AnyFlow dual-timestep flow-map embedding (AnyFlowDualTimestepTextImageEmbedding) introduced by
+# Yuchao Gu, Guian Fang et al. (arXiv:2605.13724). The base 3D DiT structure is adapted from the
+# v0.35.1 Wan2.1 transformer (transformer_wan.py); upstream Wan has since been refactored, so
+# this file is intentionally self-contained rather than annotated with `# Copied from`.
+
+import math
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import apply_lora_scale, logging
+from ..attention import AttentionModuleMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import FP32LayerNorm, RMSNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+def apply_rotary_emb(hidden_states: torch.Tensor, freqs: torch.Tensor):
+    # MPS / NPU backends do not support complex128 / float64; fall back to float32 on those devices.
+    is_mps = hidden_states.device.type == "mps"
+    is_npu = hidden_states.device.type == "npu"
+    rotary_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+    x_rotated = torch.view_as_complex(hidden_states.to(rotary_dtype).unflatten(3, (-1, 2)))
+    x_out = torch.view_as_real(x_rotated * freqs).flatten(3, 4)
+    return x_out.type_as(hidden_states)
+
+
+class AnyFlowAttnProcessor:
+    """
+    Bidirectional self-attention processor for AnyFlow. Routes through
+    :func:`~diffusers.models.attention_dispatch.dispatch_attention_fn` so any SDPA-compatible backend is supported
+    (SDPA, flash-attn, xformers, flex, …). FAR causal generation lives in
+    :class:`~diffusers.models.transformers.transformer_anyflow_far.AnyFlowCausalAttnProcessor`.
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AnyFlowAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: "AnyFlowAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[Any] = None,
+        rotary_emb: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Layout (B, H, L, D) for rotary application; transposed to (B, L, H, D) before dispatch.
+        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+        if rotary_emb is not None:
+            query = apply_rotary_emb(query, rotary_emb["query"])
+            key = apply_rotary_emb(key, rotary_emb["key"])
+
+        hidden_states = dispatch_attention_fn(
+            query.transpose(1, 2),
+            key.transpose(1, 2),
+            value.transpose(1, 2),
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+class AnyFlowCrossAttnProcessor:
+    """
+    Cross-attention processor for AnyFlow. Always uses the dispatched SDPA-compatible backend; no rotary embedding or
+    KV cache is applied to the text→video cross-attention path.
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AnyFlowCrossAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: "AnyFlowAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # (B, L, H, D) layout for dispatch_attention_fn.
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+class AnyFlowAttention(torch.nn.Module, AttentionModuleMixin):
+    """
+    Attention module used by :class:`AnyFlowTransformerBlock`. Layout matches the legacy
+    :class:`~diffusers.models.attention_processor.Attention` so existing AnyFlow checkpoints load bit-exactly into this
+    class.
+    """
+
+    _default_processor_cls = AnyFlowAttnProcessor
+    _available_processors = [AnyFlowAttnProcessor, AnyFlowCrossAttnProcessor]
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int,
+        dim_head: int,
+        eps: float = 1e-6,
+        processor: Optional[Any] = None,
+    ):
+        super().__init__()
+        self.heads = heads
+        self.inner_dim = heads * dim_head
+
+        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_k = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_v = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_out = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(self.inner_dim, dim, bias=True),
+                torch.nn.Dropout(0.0),
+            ]
+        )
+        # ``rms_norm_across_heads`` per-axis: normalize Q and K across the entire ``heads * dim_head``
+        # channel axis. We use diffusers' RMSNorm (rather than ``torch.nn.RMSNorm``) so the numerics
+        # match the legacy Attention class that produced the released checkpoints.
+        self.norm_q = RMSNorm(self.inner_dim, eps=eps)
+        self.norm_k = RMSNorm(self.inner_dim, eps=eps)
+
+        self.set_processor(processor if processor is not None else self._default_processor_cls())
+
+    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
+        return self.processor(self, hidden_states, **kwargs)
+
+
+class AnyFlowImageEmbedding(torch.nn.Module):
+    def __init__(self, in_features: int, out_features: int):
+        super().__init__()
+
+        self.norm1 = FP32LayerNorm(in_features)
+        self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
+        self.norm2 = FP32LayerNorm(out_features)
+
+    def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.norm1(encoder_hidden_states_image)
+        hidden_states = self.ff(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        return hidden_states
+
+
+class AnyFlowDualTimestepTextImageEmbedding(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        gate_value: float,
+        deltatime_type: str,
+        time_freq_dim: int,
+        time_proj_dim: int,
+        text_embed_dim: int,
+        image_embed_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+        self.delta_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+        self.act_fn = nn.SiLU()
+        self.time_proj = nn.Linear(dim, time_proj_dim)
+        self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
+
+        self.image_embedder = None
+        if image_embed_dim is not None:
+            self.image_embedder = AnyFlowImageEmbedding(image_embed_dim, dim)
+
+        self.register_buffer("delta_emb_gate", torch.tensor([gate_value], dtype=torch.float32), persistent=False)
+        self.deltatime_type = deltatime_type
+
+    def forward_timestep(
+        self, timestep: torch.Tensor, delta_timestep: torch.Tensor, encoder_hidden_states, token_per_frame
+    ):
+        batch_size, num_frames = timestep.shape
+        timestep = timestep.reshape(-1)
+        delta_timestep = delta_timestep.reshape(-1)
+
+        timestep = self.timesteps_proj(timestep)
+
+        time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
+        if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
+            timestep = timestep.to(time_embedder_dtype)
+        temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
+
+        delta_timestep = self.timesteps_proj(delta_timestep)
+
+        delta_embedder_dtype = next(iter(self.delta_embedder.parameters())).dtype
+        if delta_timestep.dtype != delta_embedder_dtype and delta_embedder_dtype != torch.int8:
+            delta_timestep = delta_timestep.to(delta_embedder_dtype)
+        delta_emb = self.delta_embedder(delta_timestep).type_as(encoder_hidden_states)
+
+        gate = self.delta_emb_gate.to(delta_embedder_dtype)
+
+        rt_emb = (1 - gate) * temb + gate * delta_emb
+        timestep_proj = self.time_proj(self.act_fn(rt_emb))
+
+        rt_emb = rt_emb.unflatten(0, (batch_size, num_frames)).repeat_interleave(token_per_frame, dim=1)
+        timestep_proj = timestep_proj.unflatten(0, (batch_size, num_frames)).repeat_interleave(token_per_frame, dim=1)
+
+        return rt_emb, timestep_proj
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        r_timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        layout_cfg=None,
+    ):
+        if self.deltatime_type == "r":
+            delta_timestep = r_timestep
+        elif self.deltatime_type == "t-r":
+            delta_timestep = timestep - r_timestep
+        else:
+            raise NotImplementedError
+
+        timestep, timestep_proj = self.forward_timestep(
+            timestep, delta_timestep, encoder_hidden_states, layout_cfg["full_token_per_frame"]
+        )
+
+        encoder_hidden_states = self.text_embedder(encoder_hidden_states)
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
+
+        return timestep, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
+
+
+class AnyFlowRotaryPosEmbed(nn.Module):
+    """Rotary positional embedding for the bidirectional AnyFlow transformer.
+
+    The FAR causal variant lives in :mod:`~diffusers.models.transformers.transformer_anyflow_far` and additionally
+    handles compressed-frame chunks; this bidi class produces frequencies for the single full-resolution token grid
+    only.
+    """
+
+    def __init__(
+        self,
+        attention_head_dim: int,
+        patch_size: Tuple[int, int, int],
+        max_seq_len: int,
+        theta: float = 10000.0,
+    ):
+        super().__init__()
+
+        self.attention_head_dim = attention_head_dim
+        self.patch_size = patch_size
+        self.max_seq_len = max_seq_len
+        self.theta = theta
+
+        # Frequency table is lazily built per-device in ``_build_freqs``: MPS / NPU don't support
+        # complex128, so we downcast to complex64 there.
+        self._freqs_cache: Optional[Tuple[Any, torch.Tensor]] = None
+
+    def _build_freqs(self, device: torch.device) -> torch.Tensor:
+        cache_key = (device.type, str(device))
+        if self._freqs_cache is not None and self._freqs_cache[0] == cache_key:
+            return self._freqs_cache[1]
+
+        is_mps = device.type == "mps"
+        is_npu = device.type == "npu"
+        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+
+        h_dim = w_dim = 2 * (self.attention_head_dim // 6)
+        t_dim = self.attention_head_dim - h_dim - w_dim
+
+        freqs_list = []
+        for dim in (t_dim, h_dim, w_dim):
+            f = get_1d_rotary_pos_embed(
+                dim,
+                self.max_seq_len,
+                self.theta,
+                use_real=False,
+                repeat_interleave_real=False,
+                freqs_dtype=freqs_dtype,
+            )
+            freqs_list.append(f.to(device))
+        freqs = torch.cat(freqs_list, dim=1)
+        self._freqs_cache = (cache_key, freqs)
+        return freqs
+
+    def _forward_full_frame(self, num_frames, height, width, device) -> torch.Tensor:
+        ppf, pph, ppw = num_frames, height, width
+
+        freqs_full = self._build_freqs(device)
+        if min(ppf, pph, ppw) <= 0:
+            freq_channels = self.attention_head_dim // 2
+            return torch.empty((ppf, pph, ppw, freq_channels), dtype=freqs_full.dtype, device=device)
+
+        freqs = freqs_full.split_with_sizes(
+            [
+                self.attention_head_dim // 2 - 2 * (self.attention_head_dim // 6),
+                self.attention_head_dim // 6,
+                self.attention_head_dim // 6,
+            ],
+            dim=1,
+        )
+
+        freqs_f = freqs[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_h = freqs[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_w = freqs[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+        freqs = torch.cat([freqs_f, freqs_h, freqs_w], dim=-1)
+        return freqs
+
+    def forward(self, layout_cfg, device):
+        freqs = self._forward_full_frame(
+            num_frames=layout_cfg["total_frames"],
+            height=layout_cfg["full_frame_shape"][0],
+            width=layout_cfg["full_frame_shape"][1],
+            device=device,
+        )
+        freqs = freqs.flatten(start_dim=0, end_dim=2)
+        freqs = freqs[None, None, ...]
+        return {"query": freqs, "key": freqs}
+
+
+class AnyFlowTransformerBlock(nn.Module):
+    """AnyFlow transformer block.
+
+    The self-attention processor is chosen at construction by ``is_causal``: the bidirectional transformer passes
+    ``is_causal=False`` (the default), the FAR causal transformer passes ``is_causal=True``. The forward pass is
+    identical in both modes — only the processor differs, so all causal-specific machinery (BlockMask, KV cache) lives
+    inside the processor.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        ffn_dim: int,
+        num_heads: int,
+        cross_attn_norm: bool = False,
+        eps: float = 1e-6,
+        is_causal: bool = False,
+    ):
+        super().__init__()
+
+        self.is_causal = is_causal
+
+        # 1. Self-attention. The causal processor lives in the FAR sibling module; lazy-import to
+        # avoid a circular import at module load time.
+        if is_causal:
+            from .transformer_anyflow_far import AnyFlowCausalAttnProcessor
+
+            self_attn_processor = AnyFlowCausalAttnProcessor()
+        else:
+            self_attn_processor = AnyFlowAttnProcessor()
+
+        self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+        self.attn1 = AnyFlowAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            processor=self_attn_processor,
+        )
+
+        # 2. Cross-attention
+        self.attn2 = AnyFlowAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            processor=AnyFlowCrossAttnProcessor(),
+        )
+        self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+        # 3. Feed-forward
+        self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+        self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        rotary_emb: torch.Tensor,
+        attention_mask: torch.Tensor,
+        kv_cache=None,
+        kv_cache_flag=None,
+    ) -> torch.Tensor:
+        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+            self.scale_shift_table + temb.float()
+        ).chunk(6, dim=2)
+        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+            shift_msa.squeeze(2),
+            scale_msa.squeeze(2),
+            gate_msa.squeeze(2),
+            c_shift_msa.squeeze(2),
+            c_scale_msa.squeeze(2),
+            c_gate_msa.squeeze(2),
+        )  # noqa: E501
+
+        # 1. Self-attention
+        norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+        attn1_kwargs = {
+            "hidden_states": norm_hidden_states,
+            "rotary_emb": rotary_emb,
+            "attention_mask": attention_mask,
+        }
+        # KV cache kwargs are only consumed by the FAR causal processor; the bidi processor
+        # doesn't accept them, so we forward them only when they're actually populated.
+        if kv_cache is not None:
+            attn1_kwargs["kv_cache"] = kv_cache
+            attn1_kwargs["kv_cache_flag"] = kv_cache_flag
+        attn_output = self.attn1(**attn1_kwargs)
+        hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+
+        # 2. Cross-attention
+        norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+        attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+        hidden_states = hidden_states + attn_output
+
+        # 3. Feed-forward
+        norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
+            hidden_states
+        )
+        ff_output = self.ffn(norm_hidden_states)
+        hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+
+        return hidden_states
+
+
+class AnyFlowTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+    r"""
+    Bidirectional 3D Transformer for AnyFlow flow-map sampling.
+
+    The architecture is the v0.35.1 Wan2.1 3D DiT backbone with one structural change: the timestep embedder is
+    replaced by ``AnyFlowDualTimestepTextImageEmbedding`` so that every forward call conditions on both the source
+    timestep ``t`` and the target timestep ``r``. This is the embedding required to learn the flow map
+    :math:`\Phi_{r\leftarrow t}` introduced in [AnyFlow](https://huggingface.co/papers/2605.13724) by Yuchao Gu, Guian
+    Fang et al.
+
+    For frame-level autoregressive (FAR causal) generation, use ``AnyFlowFARTransformer3DModel`` instead; that variant
+    adds the FAR causal block-mask and a compressed-frame patch embedding on top of the same backbone.
+
+    Args:
+        patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+            3D patch dimensions for video embedding (t_patch, h_patch, w_patch).
+        num_attention_heads (`int`, defaults to `40`):
+            Number of attention heads.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input latent.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output latent.
+        text_dim (`int`, defaults to `4096`):
+            Input dimension for text embeddings (UMT5).
+        freq_dim (`int`, defaults to `256`):
+            Dimension for sinusoidal time embeddings.
+        ffn_dim (`int`, defaults to `13824`):
+            Intermediate dimension in feed-forward network.
+        num_layers (`int`, defaults to `40`):
+            Number of transformer blocks.
+        cross_attn_norm (`bool`, defaults to `True`):
+            Enable cross-attention normalization.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon for normalization layers.
+        image_dim (`Optional[int]`, *optional*, defaults to `None`):
+            Image embedding dimension for I2V conditioning (`1280` for the original Wan2.1-I2V model).
+        rope_max_seq_len (`int`, defaults to `1024`):
+            Maximum sequence length used to precompute rotary position frequencies.
+        gate_value (`float`, defaults to `0.25`):
+            Mixing gate between source-timestep and delta-timestep embeddings (the AnyFlow paper's :math:`g` parameter,
+            fixed at 0.25 in stage-1 distillation).
+        deltatime_type (`str`, defaults to `'r'`):
+            Either ``"r"`` (delta is the target timestep) or ``"t-r"`` (delta is the absolute interval).
+    """
+
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["patch_embedding", "condition_embedder", "norm"]
+    _no_split_modules = ["AnyFlowTransformerBlock"]
+    _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+    _repeated_blocks = ["AnyFlowTransformerBlock"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: Tuple[int] = (1, 2, 2),
+        num_attention_heads: int = 40,
+        attention_head_dim: int = 128,
+        in_channels: int = 16,
+        out_channels: int = 16,
+        text_dim: int = 4096,
+        freq_dim: int = 256,
+        ffn_dim: int = 13824,
+        num_layers: int = 40,
+        cross_attn_norm: bool = True,
+        eps: float = 1e-6,
+        image_dim: Optional[int] = None,
+        rope_max_seq_len: int = 1024,
+        gate_value: float = 0.25,
+        deltatime_type: str = "r",
+    ) -> None:
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+        out_channels = out_channels or in_channels
+
+        # 1. Patch & position embedding (full-frame only).
+        self.rope = AnyFlowRotaryPosEmbed(attention_head_dim, patch_size, rope_max_seq_len)
+        self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+        # 2. Condition embedding (always dual-timestep for AnyFlow distilled checkpoints).
+        self.condition_embedder = AnyFlowDualTimestepTextImageEmbedding(
+            dim=inner_dim,
+            gate_value=gate_value,
+            deltatime_type=deltatime_type,
+            time_freq_dim=freq_dim,
+            time_proj_dim=inner_dim * 6,
+            text_embed_dim=text_dim,
+            image_embed_dim=image_dim,
+        )
+
+        # 3. Transformer blocks
+        self.blocks = nn.ModuleList(
+            [
+                AnyFlowTransformerBlock(inner_dim, ffn_dim, num_attention_heads, cross_attn_norm, eps)
+                for _ in range(num_layers)
+            ]
+        )
+
+        # 4. Output norm & projection
+        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
+        self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
+
+        self.gradient_checkpointing = False
+
+    def _unpack_latent_sequence(self, latents, num_frames, height, width, patch_size):
+        batch_size, num_patches, channels = latents.shape
+        height, width = height // patch_size, width // patch_size
+
+        latents = latents.view(
+            batch_size * num_frames, height, width, patch_size, patch_size, channels // (patch_size * patch_size)
+        )
+        latents = latents.permute(0, 5, 1, 3, 2, 4)
+        latents = latents.reshape(
+            batch_size, num_frames, channels // (patch_size * patch_size), height * patch_size, width * patch_size
+        )
+        return latents
+
+    @apply_lora_scale("attention_kwargs")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        r_timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[Transformer2DModelOutput, Tuple]:
+        """
+        Bidirectional flow-map forward pass. ``hidden_states`` is laid out as ``(B, F, C, H, W)`` (per-frame latents).
+        The input is patchified with the standard ``patch_embedding`` (kernel = stride = ``patch_size``) and denoised
+        with global bidirectional self-attention over the resulting flat token sequence.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+                Input video latents.
+            timestep (`torch.Tensor`):
+                Source (noisier) flow-map timestep `t`.
+            r_timestep (`torch.Tensor`):
+                Target (cleaner) flow-map timestep `r`; defines the destination of the flow-map step.
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_len, embed_dims)`):
+                Text-conditioning embeddings.
+            encoder_hidden_states_image (`torch.Tensor`, *optional*):
+                Image-conditioning embeddings; concatenated before the text tokens when provided.
+            attention_kwargs (`dict`, *optional*):
+                Kwargs forwarded to the `AttentionProcessor` as defined under `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.transformer_2d.Transformer2DModelOutput`] if `return_dict` is True, otherwise a `tuple` whose
+            first element is the predicted velocity tensor.
+        """
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+
+        full_token_per_frame = (height * width) // (self.config.patch_size[1] * self.config.patch_size[2])
+
+        layout_cfg = {
+            "total_frames": num_frames,
+            "full_frame_shape": (height // self.config.patch_size[1], width // self.config.patch_size[2]),
+            "full_token_per_frame": full_token_per_frame,
+        }
+
+        rotary_emb = self.rope(layout_cfg=layout_cfg, device=hidden_states.device)
+
+        hidden_states = self.patch_embedding(hidden_states)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep,
+            r_timestep,
+            encoder_hidden_states,
+            encoder_hidden_states_image,
+            layout_cfg=layout_cfg,
+        )
+        timestep_proj = timestep_proj.unflatten(2, (6, -1))
+
+        attention_mask = None
+
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        if torch.is_grad_enabled() and self.gradient_checkpointing:
+            for block in self.blocks:
+                hidden_states = self._gradient_checkpointing_func(
+                    block, hidden_states, encoder_hidden_states, timestep_proj, rotary_emb, attention_mask
+                )
+        else:
+            for block in self.blocks:
+                hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb, attention_mask)
+
+        # Output norm, projection & unpatchify. `temb` is always 3D from `condition_embedder.forward()`
+        # (broadcast over total tokens), so no ndim==2 branch is needed.
+        shift, scale = (self.scale_shift_table.unsqueeze(0) + temb.unsqueeze(2)).chunk(2, dim=2)
+        shift = shift.squeeze(2)
+        scale = scale.squeeze(2)
+
+        # Move shift/scale to hidden_states' device for multi-GPU accelerate inference.
+        shift = shift.to(hidden_states.device)
+        scale = scale.to(hidden_states.device)
+
+        hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
+        hidden_states = self.proj_out(hidden_states)
+
+        output = self._unpack_latent_sequence(
+            hidden_states,
+            num_frames=layout_cfg["total_frames"],
+            height=height,
+            width=width,
+            patch_size=self.config.patch_size[1],
+        )
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/models/transformers/transformer_anyflow_far.py b/src/diffusers/models/transformers/transformer_anyflow_far.py
new file mode 100644
index 000000000000..4d2291a634af
--- /dev/null
+++ b/src/diffusers/models/transformers/transformer_anyflow_far.py
@@ -0,0 +1,1510 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# This file is the FAR causal sibling of `transformer_anyflow.py`. Shared submodules are duplicated
+# via `# Copied from` so `make fix-copies` keeps both files in sync; this keeps each transformer
+# variant readable in isolation. The FAR architecture comes from Gu et al., 2025
+# (arXiv:2503.19325); the dual-timestep flow-map embedding is AnyFlow's contribution
+# (Yuchao Gu, Guian Fang et al., arXiv:2605.13724).
+
+import math
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.attention.flex_attention import create_block_mask
+
+from ...configuration_utils import ConfigMixin, register_to_config
+from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
+from ...utils import BaseOutput, apply_lora_scale, logging
+from ..attention import AttentionModuleMixin, FeedForward
+from ..attention_dispatch import dispatch_attention_fn
+from ..embeddings import PixArtAlphaTextProjection, TimestepEmbedding, Timesteps, get_1d_rotary_pos_embed
+from ..modeling_outputs import Transformer2DModelOutput
+from ..modeling_utils import ModelMixin
+from ..normalization import FP32LayerNorm, RMSNorm
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+# Copied from diffusers.models.transformers.transformer_anyflow.apply_rotary_emb
+def apply_rotary_emb(hidden_states: torch.Tensor, freqs: torch.Tensor):
+    # MPS / NPU backends do not support complex128 / float64; fall back to float32 on those devices.
+    is_mps = hidden_states.device.type == "mps"
+    is_npu = hidden_states.device.type == "npu"
+    rotary_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+    x_rotated = torch.view_as_complex(hidden_states.to(rotary_dtype).unflatten(3, (-1, 2)))
+    x_out = torch.view_as_real(x_rotated * freqs).flatten(3, 4)
+    return x_out.type_as(hidden_states)
+
+
+@dataclass
+class AnyFlowFARTransformerOutput(BaseOutput):
+    """
+    Output dataclass for ``AnyFlowFARTransformer3DModel``'s causal forward paths.
+
+    Args:
+        sample (`torch.Tensor` or `None`):
+            Predicted denoising target for the autoregressive chunk. ``None`` for the cache-prefill path, which only
+            writes the KV cache and produces no usable sample.
+        kv_cache (`list[dict[str, torch.Tensor]]`, *optional*):
+            Per-block KV cache state used by subsequent autoregressive steps.
+    """
+
+    sample: Optional[torch.Tensor] = None
+    kv_cache: Optional[List[Dict[str, torch.Tensor]]] = None
+
+
+class AnyFlowCausalAttnProcessor:
+    """
+    Causal self-attention processor for AnyFlow FAR. Routes through
+    :func:`~diffusers.models.attention_dispatch.dispatch_attention_fn` with the ``flex`` backend and a precomputed
+    :class:`~torch.nn.attention.flex_attention.BlockMask`. Supports KV-cache prefill (cache-write step) and
+    autoregressive read (cache-read step).
+
+    Requires the ``flex`` attention backend — the ``BlockMask`` produced by
+    :class:`AnyFlowFARTransformer3DModel._build_causal_mask` is consumed only by the flex backend. A clear
+    :class:`ValueError` is raised if a non-flex backend is configured via ``_attention_backend``.
+    """
+
+    _attention_backend = "flex"
+    _parallel_config = None
+
+    _SUPPORTED_BACKENDS = ("flex", "_native_flex")
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AnyFlowCausalAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[Any] = None,
+        rotary_emb: Optional[Dict[str, torch.Tensor]] = None,
+        kv_cache: Optional[Dict[str, torch.Tensor]] = None,
+        kv_cache_flag: Optional[Dict[str, Any]] = None,
+    ) -> torch.Tensor:
+        if self._attention_backend not in self._SUPPORTED_BACKENDS:
+            raise ValueError(
+                f"AnyFlowCausalAttnProcessor requires the 'flex' attention backend "
+                f"(got {self._attention_backend!r}). FAR causal generation builds a "
+                f"flex_attention.BlockMask which is only consumed by the flex backend in "
+                f"`dispatch_attention_fn`."
+            )
+
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Layout (B, H, L, D) is required by KV-cache slicing and rotary application.
+        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+        if kv_cache is not None:
+            if kv_cache_flag["is_cache_step"]:
+                kv_cache["compressed_cache"][0, :, :, : kv_cache_flag["num_compressed_tokens"], :] = key[
+                    :, :, : kv_cache_flag["num_compressed_tokens"]
+                ]
+                kv_cache["compressed_cache"][1, :, :, : kv_cache_flag["num_compressed_tokens"], :] = value[
+                    :, :, : kv_cache_flag["num_compressed_tokens"]
+                ]
+                kv_cache["full_cache"][0, :, :, : kv_cache_flag["num_full_tokens"], :] = key[
+                    :, :, kv_cache_flag["num_compressed_tokens"] :
+                ]
+                kv_cache["full_cache"][1, :, :, : kv_cache_flag["num_full_tokens"], :] = value[
+                    :, :, kv_cache_flag["num_compressed_tokens"] :
+                ]
+            else:
+                key = torch.cat(
+                    [
+                        kv_cache["compressed_cache"][0, :, :, : kv_cache_flag["num_cached_compressed_tokens"], :],
+                        kv_cache["full_cache"][0, :, :, : kv_cache_flag["num_cached_full_tokens"], :],
+                        key,
+                    ],
+                    dim=2,
+                )
+                value = torch.cat(
+                    [
+                        kv_cache["compressed_cache"][1, :, :, : kv_cache_flag["num_cached_compressed_tokens"], :],
+                        kv_cache["full_cache"][1, :, :, : kv_cache_flag["num_cached_full_tokens"], :],
+                        value,
+                    ],
+                    dim=2,
+                )
+
+        if rotary_emb is not None:
+            query = apply_rotary_emb(query, rotary_emb["query"])
+            key = apply_rotary_emb(key, rotary_emb["key"])
+
+        # BlockMask block-size is 128 — pad seq_len to a multiple of 128. Tiny dummy components may
+        # have head_dim < 16; flex_attention requires head_dim >= 16, so right-pad q/k/v on the head
+        # dim with zeros and override `scale` so the result matches the original head_dim.
+        seq_len = query.shape[2]
+        head_dim = query.shape[3]
+        padded_length = int(math.ceil(seq_len / 128.0) * 128.0 - seq_len)
+        if padded_length > 0:
+            pad_shape = [query.shape[0], query.shape[1], padded_length, head_dim]
+            query = torch.cat([query, torch.zeros(pad_shape, device=query.device, dtype=query.dtype)], dim=2)
+            key = torch.cat([key, torch.zeros(pad_shape, device=key.device, dtype=key.dtype)], dim=2)
+            value = torch.cat([value, torch.zeros(pad_shape, device=value.device, dtype=value.dtype)], dim=2)
+
+        head_pad = max(0, 16 - head_dim)
+        scale = 1.0 / (head_dim**0.5) if head_pad > 0 else None
+        if head_pad > 0:
+            query = F.pad(query, (0, head_pad))
+            key = F.pad(key, (0, head_pad))
+            value = F.pad(value, (0, head_pad))
+
+        # `dispatch_attention_fn` expects (B, L, H, D); the flex backend permutes back to
+        # (B, H, L, D) internally before calling flex_attention — same kernel call as the bare
+        # flex_attention path, same numerics. Verified against
+        # `attention_dispatch._native_flex_attention`.
+        hidden_states = dispatch_attention_fn(
+            query.transpose(1, 2),
+            key.transpose(1, 2),
+            value.transpose(1, 2),
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            scale=scale,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        # `dispatch_attention_fn` returns (B, L, H, D). Trim head pad on the last axis, then trim
+        # seq pad on dim=1, then fold heads back into the channel dim.
+        if head_pad > 0:
+            hidden_states = hidden_states[..., :head_dim]
+        if padded_length > 0:
+            hidden_states = hidden_states[:, :seq_len, :, :]
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowAttnProcessor
+class AnyFlowAttnProcessor:
+    """
+    Bidirectional self-attention processor for AnyFlow. Routes through
+    :func:`~diffusers.models.attention_dispatch.dispatch_attention_fn` so any SDPA-compatible backend is supported
+    (SDPA, flash-attn, xformers, flex, …). FAR causal generation lives in
+    :class:`~diffusers.models.transformers.transformer_anyflow_far.AnyFlowCausalAttnProcessor`.
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AnyFlowAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: "AnyFlowAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[Any] = None,
+        rotary_emb: Optional[Dict[str, torch.Tensor]] = None,
+    ) -> torch.Tensor:
+        if encoder_hidden_states is None:
+            encoder_hidden_states = hidden_states
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # Layout (B, H, L, D) for rotary application; transposed to (B, L, H, D) before dispatch.
+        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
+
+        if rotary_emb is not None:
+            query = apply_rotary_emb(query, rotary_emb["query"])
+            key = apply_rotary_emb(key, rotary_emb["key"])
+
+        hidden_states = dispatch_attention_fn(
+            query.transpose(1, 2),
+            key.transpose(1, 2),
+            value.transpose(1, 2),
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowCrossAttnProcessor
+class AnyFlowCrossAttnProcessor:
+    """
+    Cross-attention processor for AnyFlow. Always uses the dispatched SDPA-compatible backend; no rotary embedding or
+    KV cache is applied to the text→video cross-attention path.
+    """
+
+    _attention_backend = None
+    _parallel_config = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "AnyFlowCrossAttnProcessor requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0 or higher."
+            )
+
+    def __call__(
+        self,
+        attn: "AnyFlowAttention",
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(encoder_hidden_states)
+        value = attn.to_v(encoder_hidden_states)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        # (B, L, H, D) layout for dispatch_attention_fn.
+        query = query.unflatten(2, (attn.heads, -1))
+        key = key.unflatten(2, (attn.heads, -1))
+        value = value.unflatten(2, (attn.heads, -1))
+
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)
+        hidden_states = hidden_states.type_as(query)
+        hidden_states = attn.to_out[0](hidden_states)
+        hidden_states = attn.to_out[1](hidden_states)
+        return hidden_states
+
+
+# Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowAttention with AnyFlowAttnProcessor->AnyFlowCausalAttnProcessor
+class AnyFlowAttention(torch.nn.Module, AttentionModuleMixin):
+    """
+    Attention module used by :class:`AnyFlowTransformerBlock`. Layout matches the legacy
+    :class:`~diffusers.models.attention_processor.Attention` so existing AnyFlow checkpoints load bit-exactly into this
+    class.
+    """
+
+    _default_processor_cls = AnyFlowCausalAttnProcessor
+    _available_processors = [AnyFlowCausalAttnProcessor, AnyFlowCrossAttnProcessor]
+
+    def __init__(
+        self,
+        dim: int,
+        heads: int,
+        dim_head: int,
+        eps: float = 1e-6,
+        processor: Optional[Any] = None,
+    ):
+        super().__init__()
+        self.heads = heads
+        self.inner_dim = heads * dim_head
+
+        self.to_q = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_k = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_v = torch.nn.Linear(dim, self.inner_dim, bias=True)
+        self.to_out = torch.nn.ModuleList(
+            [
+                torch.nn.Linear(self.inner_dim, dim, bias=True),
+                torch.nn.Dropout(0.0),
+            ]
+        )
+        # ``rms_norm_across_heads`` per-axis: normalize Q and K across the entire ``heads * dim_head``
+        # channel axis. We use diffusers' RMSNorm (rather than ``torch.nn.RMSNorm``) so the numerics
+        # match the legacy Attention class that produced the released checkpoints.
+        self.norm_q = RMSNorm(self.inner_dim, eps=eps)
+        self.norm_k = RMSNorm(self.inner_dim, eps=eps)
+
+        self.set_processor(processor if processor is not None else self._default_processor_cls())
+
+    def forward(self, hidden_states: torch.Tensor, **kwargs) -> torch.Tensor:
+        return self.processor(self, hidden_states, **kwargs)
+
+
+# Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowImageEmbedding
+class AnyFlowImageEmbedding(torch.nn.Module):
+    def __init__(self, in_features: int, out_features: int):
+        super().__init__()
+
+        self.norm1 = FP32LayerNorm(in_features)
+        self.ff = FeedForward(in_features, out_features, mult=1, activation_fn="gelu")
+        self.norm2 = FP32LayerNorm(out_features)
+
+    def forward(self, encoder_hidden_states_image: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.norm1(encoder_hidden_states_image)
+        hidden_states = self.ff(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+        return hidden_states
+
+
+class AnyFlowDualTimestepTextImageEmbeddingCausal(nn.Module):
+    """Causal variant of :class:`AnyFlowDualTimestepTextImageEmbedding`.
+
+    Splits the per-frame timestep stream into a full-resolution suffix (length ``far_cfg["num_full_frames"]``) and a
+    FAR-compressed prefix, expanding each segment by its own ``token_per_frame`` factor so the assembled time embedding
+    aligns with the chunk-mixed token sequence. Optionally concatenates a ``clean_timestep`` embedding for the training
+    rollout.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        gate_value: float,
+        deltatime_type: str,
+        time_freq_dim: int,
+        time_proj_dim: int,
+        text_embed_dim: int,
+        image_embed_dim: Optional[int] = None,
+    ):
+        super().__init__()
+
+        self.timesteps_proj = Timesteps(num_channels=time_freq_dim, flip_sin_to_cos=True, downscale_freq_shift=0)
+        self.time_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+        self.delta_embedder = TimestepEmbedding(in_channels=time_freq_dim, time_embed_dim=dim)
+        self.act_fn = nn.SiLU()
+        self.time_proj = nn.Linear(dim, time_proj_dim)
+        self.text_embedder = PixArtAlphaTextProjection(text_embed_dim, dim, act_fn="gelu_tanh")
+
+        self.image_embedder = None
+        if image_embed_dim is not None:
+            self.image_embedder = AnyFlowImageEmbedding(image_embed_dim, dim)
+
+        self.register_buffer("delta_emb_gate", torch.tensor([gate_value], dtype=torch.float32), persistent=False)
+        self.deltatime_type = deltatime_type
+
+    # Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowDualTimestepTextImageEmbedding.forward_timestep
+    def forward_timestep(
+        self, timestep: torch.Tensor, delta_timestep: torch.Tensor, encoder_hidden_states, token_per_frame
+    ):
+        batch_size, num_frames = timestep.shape
+        timestep = timestep.reshape(-1)
+        delta_timestep = delta_timestep.reshape(-1)
+
+        timestep = self.timesteps_proj(timestep)
+
+        time_embedder_dtype = next(iter(self.time_embedder.parameters())).dtype
+        if timestep.dtype != time_embedder_dtype and time_embedder_dtype != torch.int8:
+            timestep = timestep.to(time_embedder_dtype)
+        temb = self.time_embedder(timestep).type_as(encoder_hidden_states)
+
+        delta_timestep = self.timesteps_proj(delta_timestep)
+
+        delta_embedder_dtype = next(iter(self.delta_embedder.parameters())).dtype
+        if delta_timestep.dtype != delta_embedder_dtype and delta_embedder_dtype != torch.int8:
+            delta_timestep = delta_timestep.to(delta_embedder_dtype)
+        delta_emb = self.delta_embedder(delta_timestep).type_as(encoder_hidden_states)
+
+        gate = self.delta_emb_gate.to(delta_embedder_dtype)
+
+        rt_emb = (1 - gate) * temb + gate * delta_emb
+        timestep_proj = self.time_proj(self.act_fn(rt_emb))
+
+        rt_emb = rt_emb.unflatten(0, (batch_size, num_frames)).repeat_interleave(token_per_frame, dim=1)
+        timestep_proj = timestep_proj.unflatten(0, (batch_size, num_frames)).repeat_interleave(token_per_frame, dim=1)
+
+        return rt_emb, timestep_proj
+
+    def forward(
+        self,
+        timestep: torch.Tensor,
+        r_timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        far_cfg=None,
+        clean_timestep=None,
+    ):
+        if self.deltatime_type == "r":
+            delta_timestep = r_timestep
+        elif self.deltatime_type == "t-r":
+            delta_timestep = timestep - r_timestep
+        else:
+            raise NotImplementedError
+
+        full_frame_timestep, full_frame_timestep_proj = self.forward_timestep(
+            timestep[:, -far_cfg["num_full_frames"] :],
+            delta_timestep[:, -far_cfg["num_full_frames"] :],
+            encoder_hidden_states,
+            far_cfg["full_token_per_frame"],
+        )
+        compressed_frame_timestep, compressed_frame_timestep_proj = self.forward_timestep(
+            timestep[:, : -far_cfg["num_full_frames"]],
+            delta_timestep[:, : -far_cfg["num_full_frames"]],
+            encoder_hidden_states,
+            far_cfg["compressed_token_per_frame"],
+        )
+
+        if clean_timestep is not None:
+            clean_timestep, clean_timestep_proj = self.forward_timestep(
+                clean_timestep, clean_timestep, encoder_hidden_states, far_cfg["full_token_per_frame"]
+            )
+            timestep = torch.cat([compressed_frame_timestep, full_frame_timestep, clean_timestep], dim=1)
+            timestep_proj = torch.cat(
+                [compressed_frame_timestep_proj, full_frame_timestep_proj, clean_timestep_proj], dim=1
+            )
+        else:
+            timestep = torch.cat([compressed_frame_timestep, full_frame_timestep], dim=1)
+            timestep_proj = torch.cat([compressed_frame_timestep_proj, full_frame_timestep_proj], dim=1)
+
+        encoder_hidden_states = self.text_embedder(encoder_hidden_states)
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states_image = self.image_embedder(encoder_hidden_states_image)
+
+        return timestep, timestep_proj, encoder_hidden_states, encoder_hidden_states_image
+
+
+# Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowTransformerBlock
+class AnyFlowTransformerBlock(nn.Module):
+    """AnyFlow transformer block.
+
+    The self-attention processor is chosen at construction by ``is_causal``: the bidirectional transformer passes
+    ``is_causal=False`` (the default), the FAR causal transformer passes ``is_causal=True``. The forward pass is
+    identical in both modes — only the processor differs, so all causal-specific machinery (BlockMask, KV cache) lives
+    inside the processor.
+    """
+
+    def __init__(
+        self,
+        dim: int,
+        ffn_dim: int,
+        num_heads: int,
+        cross_attn_norm: bool = False,
+        eps: float = 1e-6,
+        is_causal: bool = False,
+    ):
+        super().__init__()
+
+        self.is_causal = is_causal
+
+        # 1. Self-attention. The causal processor lives in the FAR sibling module; lazy-import to
+        # avoid a circular import at module load time.
+        if is_causal:
+            from .transformer_anyflow_far import AnyFlowCausalAttnProcessor
+
+            self_attn_processor = AnyFlowCausalAttnProcessor()
+        else:
+            self_attn_processor = AnyFlowAttnProcessor()
+
+        self.norm1 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+        self.attn1 = AnyFlowAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            processor=self_attn_processor,
+        )
+
+        # 2. Cross-attention
+        self.attn2 = AnyFlowAttention(
+            dim=dim,
+            heads=num_heads,
+            dim_head=dim // num_heads,
+            eps=eps,
+            processor=AnyFlowCrossAttnProcessor(),
+        )
+        self.norm2 = FP32LayerNorm(dim, eps, elementwise_affine=True) if cross_attn_norm else nn.Identity()
+
+        # 3. Feed-forward
+        self.ffn = FeedForward(dim, inner_dim=ffn_dim, activation_fn="gelu-approximate")
+        self.norm3 = FP32LayerNorm(dim, eps, elementwise_affine=False)
+
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 6, dim) / dim**0.5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        temb: torch.Tensor,
+        rotary_emb: torch.Tensor,
+        attention_mask: torch.Tensor,
+        kv_cache=None,
+        kv_cache_flag=None,
+    ) -> torch.Tensor:
+        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+            self.scale_shift_table + temb.float()
+        ).chunk(6, dim=2)
+        shift_msa, scale_msa, gate_msa, c_shift_msa, c_scale_msa, c_gate_msa = (
+            shift_msa.squeeze(2),
+            scale_msa.squeeze(2),
+            gate_msa.squeeze(2),
+            c_shift_msa.squeeze(2),
+            c_scale_msa.squeeze(2),
+            c_gate_msa.squeeze(2),
+        )  # noqa: E501
+
+        # 1. Self-attention
+        norm_hidden_states = (self.norm1(hidden_states.float()) * (1 + scale_msa) + shift_msa).type_as(hidden_states)
+        attn1_kwargs = {
+            "hidden_states": norm_hidden_states,
+            "rotary_emb": rotary_emb,
+            "attention_mask": attention_mask,
+        }
+        # KV cache kwargs are only consumed by the FAR causal processor; the bidi processor
+        # doesn't accept them, so we forward them only when they're actually populated.
+        if kv_cache is not None:
+            attn1_kwargs["kv_cache"] = kv_cache
+            attn1_kwargs["kv_cache_flag"] = kv_cache_flag
+        attn_output = self.attn1(**attn1_kwargs)
+        hidden_states = (hidden_states.float() + attn_output * gate_msa).type_as(hidden_states)
+
+        # 2. Cross-attention
+        norm_hidden_states = self.norm2(hidden_states.float()).type_as(hidden_states)
+        attn_output = self.attn2(hidden_states=norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+        hidden_states = hidden_states + attn_output
+
+        # 3. Feed-forward
+        norm_hidden_states = (self.norm3(hidden_states.float()) * (1 + c_scale_msa) + c_shift_msa).type_as(
+            hidden_states
+        )
+        ff_output = self.ffn(norm_hidden_states)
+        hidden_states = (hidden_states.float() + ff_output.float() * c_gate_msa).type_as(hidden_states)
+
+        return hidden_states
+
+
+class AnyFlowCausalRotaryPosEmbed(nn.Module):
+    """
+    Rotary positional embedding for the FAR causal transformer.
+
+    Produces position frequencies for both the full-resolution noisy chunk(s) and the FAR-compressed context chunk(s);
+    the compressed branch downscales the per-axis frequency table via complex average pooling so the compressed grid
+    stays aligned with the full grid.
+    """
+
+    def __init__(
+        self,
+        attention_head_dim: int,
+        patch_size: Tuple[int, int, int],
+        compressed_patch_size: Tuple[int, int, int],
+        max_seq_len: int,
+        theta: float = 10000.0,
+    ):
+        super().__init__()
+
+        self.attention_head_dim = attention_head_dim
+        self.patch_size = patch_size
+        self.compressed_patch_size = compressed_patch_size
+        self.max_seq_len = max_seq_len
+        self.theta = theta
+
+        # Frequency table is lazily built per-device in ``_build_freqs``: MPS / NPU don't support
+        # complex128, so we downcast to complex64 there.
+        self._freqs_cache: Optional[Tuple[Any, torch.Tensor]] = None
+
+    # Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowRotaryPosEmbed._build_freqs
+    def _build_freqs(self, device: torch.device) -> torch.Tensor:
+        cache_key = (device.type, str(device))
+        if self._freqs_cache is not None and self._freqs_cache[0] == cache_key:
+            return self._freqs_cache[1]
+
+        is_mps = device.type == "mps"
+        is_npu = device.type == "npu"
+        freqs_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+
+        h_dim = w_dim = 2 * (self.attention_head_dim // 6)
+        t_dim = self.attention_head_dim - h_dim - w_dim
+
+        freqs_list = []
+        for dim in (t_dim, h_dim, w_dim):
+            f = get_1d_rotary_pos_embed(
+                dim,
+                self.max_seq_len,
+                self.theta,
+                use_real=False,
+                repeat_interleave_real=False,
+                freqs_dtype=freqs_dtype,
+            )
+            freqs_list.append(f.to(device))
+        freqs = torch.cat(freqs_list, dim=1)
+        self._freqs_cache = (cache_key, freqs)
+        return freqs
+
+    def avg_pool_complex(self, freq: torch.Tensor, kernel_size: int, stride: int):
+        real = freq.real  # [B, C, L], float
+        real = real.transpose(0, 1).unsqueeze(0)
+        imag = freq.imag  # [B, C, L], float
+        imag = imag.transpose(0, 1).unsqueeze(0)
+
+        pr = F.avg_pool1d(real, kernel_size, stride)
+        pi = F.avg_pool1d(imag, kernel_size, stride)
+
+        pr = pr.squeeze(0).transpose(0, 1)
+        pi = pi.squeeze(0).transpose(0, 1)
+
+        norm = torch.sqrt(pr**2 + pi**2)
+        pr_unit = pr / norm
+        pi_unit = pi / norm
+
+        return torch.complex(pr_unit, pi_unit)
+
+    def _forward_compressed_frame(self, num_frames, height, width, device):
+        ppf, pph, ppw = num_frames, height, width
+        # Tiny dummy components (e.g. height=16/width=16 with compressed_patch_size=(1,4,4) and
+        # an upstream VAE stride of 8) can produce 0-element grids; the .view(0, k, 1, -1) reshape
+        # below would be ambiguous. Real ckpts use 60x104 latents and never hit this path.
+        freqs_full = self._build_freqs(device)
+        if min(ppf, pph, ppw) <= 0:
+            freq_channels = self.attention_head_dim // 2
+            return torch.empty((ppf, pph, ppw, freq_channels), dtype=freqs_full.dtype, device=device)
+        downscale = [self.compressed_patch_size[i] // self.patch_size[i] for i in range(len(self.patch_size))]
+
+        freqs = freqs_full.split_with_sizes(
+            [
+                self.attention_head_dim // 2 - 2 * (self.attention_head_dim // 6),
+                self.attention_head_dim // 6,
+                self.attention_head_dim // 6,
+            ],
+            dim=1,
+        )
+
+        freqs_f = self.avg_pool_complex(freqs[0], kernel_size=downscale[0], stride=downscale[0])
+        freqs_h = self.avg_pool_complex(freqs[1], kernel_size=downscale[1], stride=downscale[1])
+        freqs_w = self.avg_pool_complex(freqs[2], kernel_size=downscale[2], stride=downscale[2])
+
+        freqs_f = freqs_f[:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_h = freqs_h[:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_w = freqs_w[:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+
+        freqs = torch.cat([freqs_f, freqs_h, freqs_w], dim=-1)
+        return freqs
+
+    # Copied from diffusers.models.transformers.transformer_anyflow.AnyFlowRotaryPosEmbed._forward_full_frame
+    def _forward_full_frame(self, num_frames, height, width, device) -> torch.Tensor:
+        ppf, pph, ppw = num_frames, height, width
+
+        freqs_full = self._build_freqs(device)
+        if min(ppf, pph, ppw) <= 0:
+            freq_channels = self.attention_head_dim // 2
+            return torch.empty((ppf, pph, ppw, freq_channels), dtype=freqs_full.dtype, device=device)
+
+        freqs = freqs_full.split_with_sizes(
+            [
+                self.attention_head_dim // 2 - 2 * (self.attention_head_dim // 6),
+                self.attention_head_dim // 6,
+                self.attention_head_dim // 6,
+            ],
+            dim=1,
+        )
+
+        freqs_f = freqs[0][:ppf].view(ppf, 1, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_h = freqs[1][:pph].view(1, pph, 1, -1).expand(ppf, pph, ppw, -1)
+        freqs_w = freqs[2][:ppw].view(1, 1, ppw, -1).expand(ppf, pph, ppw, -1)
+        freqs = torch.cat([freqs_f, freqs_h, freqs_w], dim=-1)
+        return freqs
+
+    def forward(self, far_cfg, device, clean_hidden_states=None):
+        full_frame_freqs = self._forward_full_frame(
+            num_frames=far_cfg["total_frames"],
+            height=far_cfg["full_frame_shape"][0],
+            width=far_cfg["full_frame_shape"][1],
+            device=device,
+        )
+        compressed_frame_freqs = self._forward_compressed_frame(
+            num_frames=far_cfg["total_frames"],
+            height=far_cfg["compressed_frame_shape"][0],
+            width=far_cfg["compressed_frame_shape"][1],
+            device=device,
+        )
+
+        compressed_frame_freqs, full_frame_freqs = (
+            compressed_frame_freqs[: far_cfg["num_compressed_frames"]],
+            full_frame_freqs[far_cfg["num_compressed_frames"] :],
+        )
+
+        compressed_frame_freqs = compressed_frame_freqs.flatten(start_dim=0, end_dim=2)
+        full_frame_freqs = full_frame_freqs.flatten(start_dim=0, end_dim=2)
+
+        if clean_hidden_states is not None:
+            freqs = torch.cat([compressed_frame_freqs, full_frame_freqs, full_frame_freqs], dim=0)
+        else:
+            freqs = torch.cat([compressed_frame_freqs, full_frame_freqs], dim=0)
+
+        freqs = freqs[None, None, ...]
+
+        return {"query": freqs, "key": freqs}
+
+
+class AnyFlowFARTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
+    r"""
+    Causal (FAR) 3D Transformer for AnyFlow flow-map sampling with frame-level autoregressive generation.
+
+    Extends the v0.35.1 Wan2.1 backbone with:
+
+    * **FAR causal block-mask** via :func:`torch.nn.attention.flex_attention`, supporting frame-level autoregressive
+      generation (FAR; [Gu et al., 2025](https://arxiv.org/abs/2503.19325)).
+    * **Compressed-frame patch embedding** ``far_patch_embedding`` for context (already-generated) frames, initialized
+      from ``patch_embedding`` via trilinear interpolation so a freshly constructed model is already at a reasonable
+      starting point even before LoRA fine-tuning.
+    * **Dual-timestep flow-map embedding** for any-step sampling (same as ``AnyFlowTransformer3DModel``).
+
+    Use ``AnyFlowTransformer3DModel`` instead for plain bidirectional T2V — that variant skips the FAR causal masking
+    and ``far_patch_embedding`` and is ~5–10% smaller.
+
+    Args:
+        patch_size (`Tuple[int]`, defaults to `(1, 2, 2)`):
+            3D patch dimensions for full-resolution chunks.
+        compressed_patch_size (`Tuple[int]`, defaults to `(1, 4, 4)`):
+            Larger patch dimensions for the FAR-compressed (context) chunks.
+        full_chunk_limit (`int`, defaults to `3`):
+            Maximum number of full-resolution chunks before earlier chunks are demoted to compressed FAR context. The
+            released checkpoints use ``3``.
+        num_attention_heads (`int`, defaults to `40`):
+            Number of attention heads.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of channels in each head.
+        in_channels (`int`, defaults to `16`):
+            The number of channels in the input latent.
+        out_channels (`int`, defaults to `16`):
+            The number of channels in the output latent.
+        text_dim (`int`, defaults to `4096`):
+            Input dimension for text embeddings (UMT5).
+        freq_dim (`int`, defaults to `256`):
+            Dimension for sinusoidal time embeddings.
+        ffn_dim (`int`, defaults to `13824`):
+            Intermediate dimension in feed-forward network.
+        num_layers (`int`, defaults to `40`):
+            Number of transformer blocks.
+        cross_attn_norm (`bool`, defaults to `True`):
+            Enable cross-attention normalization.
+        eps (`float`, defaults to `1e-6`):
+            Epsilon for normalization layers.
+        image_dim (`Optional[int]`, *optional*, defaults to `None`):
+            Image embedding dimension for I2V conditioning.
+        rope_max_seq_len (`int`, defaults to `1024`):
+            Maximum sequence length used to precompute rotary position frequencies.
+        gate_value (`float`, defaults to `0.25`):
+            Mixing gate between source-timestep and delta-timestep embeddings.
+        deltatime_type (`str`, defaults to `'r'`):
+            Either ``"r"`` (delta is the target timestep) or ``"t-r"`` (delta is the absolute interval).
+
+    .. note::
+        ``chunk_partition`` is **not** a model config field — it is a per-call argument passed to :meth:`forward`.
+        Different inference setups (varying ``num_frames`` or full-vs-compressed schedules) therefore do not require
+        separate checkpoints.
+    """
+
+    _supports_gradient_checkpointing = True
+    _skip_layerwise_casting_patterns = ["patch_embedding", "far_patch_embedding", "condition_embedder", "norm"]
+    _no_split_modules = ["AnyFlowTransformerBlock"]
+    _keep_in_fp32_modules = ["time_embedder", "scale_shift_table", "norm1", "norm2", "norm3"]
+    _repeated_blocks = ["AnyFlowTransformerBlock"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: Tuple[int] = (1, 2, 2),
+        compressed_patch_size: Tuple[int] = (1, 4, 4),
+        full_chunk_limit: int = 3,
+        num_attention_heads: int = 40,
+        attention_head_dim: int = 128,
+        in_channels: int = 16,
+        out_channels: int = 16,
+        text_dim: int = 4096,
+        freq_dim: int = 256,
+        ffn_dim: int = 13824,
+        num_layers: int = 40,
+        cross_attn_norm: bool = True,
+        eps: float = 1e-6,
+        image_dim: Optional[int] = None,
+        rope_max_seq_len: int = 1024,
+        gate_value: float = 0.25,
+        deltatime_type: str = "r",
+    ) -> None:
+        super().__init__()
+
+        inner_dim = num_attention_heads * attention_head_dim
+        out_channels = out_channels or in_channels
+
+        # 1. Patch & position embedding (full + FAR-compressed branches).
+        self.rope = AnyFlowCausalRotaryPosEmbed(
+            attention_head_dim, patch_size, compressed_patch_size, rope_max_seq_len
+        )
+        self.patch_embedding = nn.Conv3d(in_channels, inner_dim, kernel_size=patch_size, stride=patch_size)
+
+        self.far_patch_embedding = nn.Conv3d(
+            in_channels, inner_dim, kernel_size=compressed_patch_size, stride=compressed_patch_size
+        )
+        # Warm-start the compressed branch from the full-resolution branch by trilinear interpolation. This
+        # matches FAR-Dev's `setup_far_model()` initialization. State-dict loading will overwrite these
+        # weights for trained checkpoints; the warm-start only matters when constructing a fresh model.
+        original_weight = self.patch_embedding.weight.data.view(-1, 1, *patch_size)
+        new_weight = F.interpolate(original_weight, size=compressed_patch_size, mode="trilinear", align_corners=False)
+        new_weight = new_weight.view(inner_dim, in_channels, *compressed_patch_size)
+        with torch.no_grad():
+            self.far_patch_embedding.weight.copy_(new_weight)
+            self.far_patch_embedding.bias.copy_(self.patch_embedding.bias)
+
+        # 2. Condition embedding (always dual-timestep for AnyFlow distilled checkpoints).
+        self.condition_embedder = AnyFlowDualTimestepTextImageEmbeddingCausal(
+            dim=inner_dim,
+            gate_value=gate_value,
+            deltatime_type=deltatime_type,
+            time_freq_dim=freq_dim,
+            time_proj_dim=inner_dim * 6,
+            text_embed_dim=text_dim,
+            image_embed_dim=image_dim,
+        )
+
+        # 3. Transformer blocks (causal self-attn processor)
+        self.blocks = nn.ModuleList(
+            [
+                AnyFlowTransformerBlock(inner_dim, ffn_dim, num_attention_heads, cross_attn_norm, eps, is_causal=True)
+                for _ in range(num_layers)
+            ]
+        )
+
+        # 4. Output norm & projection
+        self.norm_out = FP32LayerNorm(inner_dim, eps, elementwise_affine=False)
+        self.proj_out = nn.Linear(inner_dim, out_channels * math.prod(patch_size))
+        self.scale_shift_table = nn.Parameter(torch.randn(1, 2, inner_dim) / inner_dim**0.5)
+
+        self.gradient_checkpointing = False
+
+    @apply_lora_scale("attention_kwargs")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        timestep: torch.Tensor,
+        r_timestep: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        chunk_partition: List[int],
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        clean_hidden_states: Optional[torch.Tensor] = None,
+        clean_timestep: Optional[torch.Tensor] = None,
+        kv_cache: Optional[List[Dict[str, torch.Tensor]]] = None,
+        kv_cache_flag: Optional[Dict[str, Any]] = None,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+    ) -> Union[Transformer2DModelOutput, AnyFlowFARTransformerOutput, Tuple]:
+        """
+        FAR causal forward pass. Dispatches to one of three internal paths:
+
+        * ``kv_cache is None`` → causal training rollout (returns :class:`Transformer2DModelOutput`).
+        * ``kv_cache is not None`` and ``kv_cache_flag["is_cache_step"]`` → cache-prefill (returns
+          :class:`AnyFlowFARTransformerOutput` with ``sample=None``).
+        * Otherwise → autoregressive inference step (returns :class:`AnyFlowFARTransformerOutput`).
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Latent input of shape ``(B, F, C, H, W)``.
+            timestep (`torch.Tensor`):
+                Source (noisier) flow-map timestep `t`.
+            r_timestep (`torch.Tensor`):
+                Target (cleaner) flow-map timestep `r`.
+            encoder_hidden_states (`torch.Tensor`):
+                UMT5 text embeddings.
+            chunk_partition (`List[int]`):
+                Per-chunk frame counts; total must match the number of latent frames in ``hidden_states``.
+            encoder_hidden_states_image (`torch.Tensor`, *optional*):
+                I2V image embedding; concatenated before text tokens when provided.
+            clean_hidden_states (`torch.Tensor`, *optional*):
+                Clean (noise-free) conditioning frames used by the training rollout.
+            clean_timestep (`torch.Tensor`, *optional*):
+                Timesteps for the clean conditioning frames in the training rollout.
+            kv_cache (`List[Dict[str, torch.Tensor]]`, *optional*):
+                Per-block KV cache for autoregressive inference. `None` selects the training path.
+            kv_cache_flag (`Dict[str, Any]`, *optional*):
+                KV-cache metadata (e.g. ``is_cache_step`` flag and token counts).
+            attention_kwargs (`dict`, *optional*):
+                Forwarded to the attention processors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                If `False`, returns positional tuples instead of an output dataclass.
+        """
+        # `attention_kwargs` is consumed by the @apply_lora_scale decorator on this method;
+        # it does not need to thread through to the inner _forward_* paths.
+        common = {
+            "hidden_states": hidden_states,
+            "chunk_partition": chunk_partition,
+            "timestep": timestep,
+            "r_timestep": r_timestep,
+            "encoder_hidden_states": encoder_hidden_states,
+            "encoder_hidden_states_image": encoder_hidden_states_image,
+            "return_dict": return_dict,
+        }
+        if kv_cache is not None:
+            common["kv_cache"] = kv_cache
+            common["kv_cache_flag"] = kv_cache_flag
+            if kv_cache_flag is not None and kv_cache_flag.get("is_cache_step"):
+                return self._forward_cache(
+                    clean_hidden_states=clean_hidden_states,
+                    clean_timestep=clean_timestep,
+                    **common,
+                )
+            return self._forward_inference(**common)
+        return self._forward_train(
+            clean_hidden_states=clean_hidden_states,
+            clean_timestep=clean_timestep,
+            **common,
+        )
+
+    def _unpack_latent_sequence(self, latents, num_frames, height, width, patch_size):
+        batch_size, num_patches, channels = latents.shape
+        height, width = height // patch_size, width // patch_size
+
+        latents = latents.view(
+            batch_size * num_frames, height, width, patch_size, patch_size, channels // (patch_size * patch_size)
+        )
+
+        latents = latents.permute(0, 5, 1, 3, 2, 4)
+        latents = latents.reshape(
+            batch_size, num_frames, channels // (patch_size * patch_size), height * patch_size, width * patch_size
+        )
+        return latents
+
+    def _forward_far_patchify(self, hidden_states, far_cfg, clean_hidden_states=None):
+        full_hidden_states, compressed_hidden_states = (
+            hidden_states[:, :, far_cfg["num_compressed_frames"] :],
+            hidden_states[:, :, : far_cfg["num_compressed_frames"]],
+        )  # noqa: E501
+
+        patchified_full_hidden_states = (
+            self.patch_embedding(full_hidden_states).flatten(start_dim=2, end_dim=4).transpose(1, 2)
+        )
+        if clean_hidden_states is not None:
+            clean_hidden_states = (
+                self.patch_embedding(clean_hidden_states).flatten(start_dim=2, end_dim=4).transpose(1, 2)
+            )
+            patchified_full_hidden_states = torch.cat([patchified_full_hidden_states, clean_hidden_states], dim=1)
+
+        if far_cfg["num_compressed_frames"] > 0:
+            patchified_compressed_hidden_states = (
+                self.far_patch_embedding(compressed_hidden_states).flatten(start_dim=2, end_dim=4).transpose(1, 2)
+            )
+            hidden_states = torch.cat([patchified_compressed_hidden_states, patchified_full_hidden_states], dim=1)
+        else:
+            hidden_states = patchified_full_hidden_states
+        return hidden_states
+
+    def _forward_far_patchify_inference(self, hidden_states):
+        hidden_states = self.patch_embedding(hidden_states).flatten(start_dim=2, end_dim=4).transpose(1, 2)
+        return hidden_states
+
+    def _build_causal_mask(self, far_cfg, clean_hidden_states, device, dtype):
+        chunk_partition = far_cfg["chunk_partition"]
+
+        noise_seq_len = clean_seq_len = far_cfg["num_full_frames"] * far_cfg["full_token_per_frame"]
+        context_seq_len = far_cfg["num_compressed_frames"] * far_cfg["compressed_token_per_frame"]
+
+        noise_start = context_seq_len
+        noise_end = noise_start + noise_seq_len
+
+        clean_start = context_seq_len + noise_seq_len
+        clean_end = clean_start + clean_seq_len
+
+        if clean_hidden_states is not None:
+            real_seq_len = context_seq_len + noise_seq_len + clean_seq_len
+        else:
+            real_seq_len = context_seq_len + noise_seq_len
+
+        padded_seq_len = int(math.ceil(real_seq_len / 128.0) * 128.0)
+
+        if clean_hidden_states is not None:
+            context_chunk_partition, noise_chunk_partition = (
+                chunk_partition[: far_cfg["num_compressed_chunk"]],
+                chunk_partition[far_cfg["num_compressed_chunk"] :],
+            )  # noqa: E501
+
+            if len(context_chunk_partition) != 0:
+                context_frame_idx = torch.cat(
+                    [
+                        torch.ones(chunk_len * far_cfg["compressed_token_per_frame"], device=device) * chunk_idx
+                        for chunk_idx, chunk_len in enumerate(context_chunk_partition)
+                    ]
+                )  # noqa: E501
+            else:
+                context_frame_idx = None
+            noise_frame_idx = clean_frame_idx = torch.cat(
+                [
+                    torch.ones(chunk_len * far_cfg["full_token_per_frame"], device=device)
+                    * (chunk_idx + len(context_chunk_partition))
+                    for chunk_idx, chunk_len in enumerate(noise_chunk_partition)
+                ]
+            )  # noqa: E501
+            pad_frame_idx = torch.zeros(padded_seq_len - real_seq_len, device=device)
+
+            if len(context_chunk_partition) != 0:
+                frame_idx = torch.cat([context_frame_idx, noise_frame_idx, clean_frame_idx, pad_frame_idx], dim=0)
+            else:
+                frame_idx = torch.cat([noise_frame_idx, clean_frame_idx, pad_frame_idx], dim=0)
+
+            def mask_mod(b, h, q_idx, kv_idx):
+                # q_idx, kv_idx: LongTensor, range: [0, padded_seq_len)
+
+                # 1) whether is padding
+                is_padding = (q_idx >= real_seq_len) | (kv_idx >= real_seq_len)
+
+                # 2) chunk causal
+                base = frame_idx[q_idx] >= frame_idx[kv_idx]
+
+                # 3) interval mask
+                q_is_noise = (q_idx >= noise_start) & (q_idx < noise_end)
+                q_is_clean = (q_idx >= clean_start) & (q_idx < clean_end)
+
+                k_is_noise = (kv_idx >= noise_start) & (kv_idx < noise_end)
+                k_is_clean = (kv_idx >= clean_start) & (kv_idx < clean_end)
+
+                # 4) clean -> noise: disallowed
+                is_clean_to_noise = q_is_clean & k_is_noise
+
+                # 5) noise -> noise: only same frame
+                same_frame_idx = frame_idx[q_idx] == frame_idx[kv_idx]
+
+                noise_to_noise = q_is_noise & k_is_noise
+                noise_to_clean = q_is_noise & k_is_clean
+
+                noise_to_noise_allow = noise_to_noise & same_frame_idx
+                noise_to_noise_mask = (~noise_to_noise) | noise_to_noise_allow
+
+                noise_to_clean_same = noise_to_clean & same_frame_idx
+                noise_to_clean_disallow = noise_to_clean_same
+
+                # attention mask is chunk casual
+                allowed = base & ~is_padding & ~is_clean_to_noise & noise_to_noise_mask & ~noise_to_clean_disallow
+                return allowed
+
+            return create_block_mask(
+                mask_mod,
+                B=None,
+                H=None,
+                Q_LEN=padded_seq_len,
+                KV_LEN=padded_seq_len,
+                device=device,
+                _compile=False,
+            )
+        else:
+            context_chunk_partition, noise_chunk_partition = (
+                chunk_partition[: far_cfg["num_compressed_chunk"]],
+                chunk_partition[far_cfg["num_compressed_chunk"] :],
+            )  # noqa: E501
+
+            if len(context_chunk_partition) != 0:
+                context_frame_idx = torch.cat(
+                    [
+                        torch.ones(chunk_len * far_cfg["compressed_token_per_frame"], device=device) * chunk_idx
+                        for chunk_idx, chunk_len in enumerate(context_chunk_partition)
+                    ]
+                )  # noqa: E501
+            else:
+                context_frame_idx = None
+
+            noise_frame_idx = torch.cat(
+                [
+                    torch.ones(chunk_len * far_cfg["full_token_per_frame"], device=device)
+                    * (chunk_idx + len(context_chunk_partition))
+                    for chunk_idx, chunk_len in enumerate(noise_chunk_partition)
+                ]
+            )  # noqa: E501
+            pad_frame_idx = torch.zeros(padded_seq_len - real_seq_len, device=device)
+
+            if len(context_chunk_partition) != 0:
+                frame_idx = torch.cat([context_frame_idx, noise_frame_idx, pad_frame_idx], dim=0)
+            else:
+                frame_idx = torch.cat([noise_frame_idx, pad_frame_idx], dim=0)
+
+            def mask_mod(b, h, q_idx, kv_idx):
+                is_padding = (q_idx >= real_seq_len) | (kv_idx >= real_seq_len)
+                base = frame_idx[q_idx] >= frame_idx[kv_idx]
+                return base & ~is_padding
+
+            return create_block_mask(
+                mask_mod,
+                B=None,
+                H=None,
+                Q_LEN=padded_seq_len,
+                KV_LEN=padded_seq_len,
+                device=device,
+                _compile=False,
+            )
+
+    def _forward_inference(
+        self,
+        hidden_states: torch.Tensor,
+        chunk_partition,
+        timestep: torch.LongTensor,
+        r_timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        kv_cache=None,
+        kv_cache_flag=None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+
+        full_token_per_frame = (height // self.config.patch_size[1]) * (width // self.config.patch_size[2])
+        compressed_token_per_frame = (height // self.config.compressed_patch_size[1]) * (
+            width // self.config.compressed_patch_size[2]
+        )
+
+        total_chunks = 1 + kv_cache_flag["num_cached_chunks"]
+
+        if total_chunks >= self.config.full_chunk_limit:
+            num_full_chunk, num_compressed_chunk = (
+                self.config.full_chunk_limit,
+                total_chunks - self.config.full_chunk_limit,
+            )
+        else:
+            num_full_chunk, num_compressed_chunk = total_chunks, 0
+
+        kv_cache_flag["num_cached_full_tokens"] = (
+            sum(chunk_partition[num_compressed_chunk : num_compressed_chunk + (num_full_chunk - 1)])
+            * full_token_per_frame
+        )  # noqa: E501
+        kv_cache_flag["num_cached_compressed_tokens"] = (
+            sum(chunk_partition[:num_compressed_chunk]) * compressed_token_per_frame
+        )
+
+        far_cfg = {
+            "total_frames": sum(chunk_partition),
+            "num_full_frames": sum(chunk_partition[num_compressed_chunk:]),
+            "num_compressed_frames": sum(chunk_partition[:num_compressed_chunk]),
+            "full_frame_shape": (height // self.config.patch_size[1], width // self.config.patch_size[2]),
+            "compressed_frame_shape": (
+                height // self.config.compressed_patch_size[1],
+                width // self.config.compressed_patch_size[2],
+            ),
+            "full_token_per_frame": full_token_per_frame,
+            "compressed_token_per_frame": compressed_token_per_frame,
+        }
+
+        # step 3: generate attention mask
+        attention_mask = None
+        hidden_states = self._forward_far_patchify_inference(hidden_states)
+
+        rotary_emb = self.rope(far_cfg=far_cfg, device=hidden_states.device)
+        rotary_emb["query"] = rotary_emb["query"][:, :, -hidden_states.shape[1] :]
+
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep,
+            r_timestep,
+            encoder_hidden_states,
+            encoder_hidden_states_image,
+            far_cfg=far_cfg,  # noqa: E501
+        )
+        timestep_proj = timestep_proj.unflatten(2, (6, -1))
+
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        # 4. Transformer blocks
+        for index_block, block in enumerate(self.blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    timestep_proj,
+                    rotary_emb,
+                    attention_mask,
+                    kv_cache[index_block],
+                    kv_cache_flag,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    encoder_hidden_states,
+                    timestep_proj,
+                    rotary_emb,
+                    attention_mask,
+                    kv_cache[index_block],
+                    kv_cache_flag,
+                )
+
+        # 5. Output norm, projection & unpatchify
+        shift, scale = (self.scale_shift_table + temb.unsqueeze(2)).chunk(2, dim=2)
+        shift, scale = shift.squeeze(2), scale.squeeze(2)
+
+        # Move the shift and scale tensors to the same device as hidden_states.
+        # When using multi-GPU inference via accelerate these will be on the
+        # first device rather than the last device, which hidden_states ends up
+        # on.
+        shift = shift.to(hidden_states.device)
+        scale = scale.to(hidden_states.device)
+
+        hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
+
+        output = self.proj_out(hidden_states)
+        output = self._unpack_latent_sequence(
+            output, num_frames=chunk_partition[-1], height=height, width=width, patch_size=self.config.patch_size[1]
+        )
+
+        if not return_dict:
+            return output, kv_cache
+
+        return AnyFlowFARTransformerOutput(sample=output, kv_cache=kv_cache)
+
+    def _forward_cache(
+        self,
+        hidden_states: torch.Tensor,
+        chunk_partition,
+        timestep: torch.LongTensor,
+        r_timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        clean_hidden_states=None,
+        clean_timestep=None,
+        kv_cache=None,
+        kv_cache_flag=None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        if clean_hidden_states is not None:
+            clean_hidden_states = clean_hidden_states.permute(0, 2, 1, 3, 4)
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+
+        full_token_per_frame = (height // self.config.patch_size[1]) * (width // self.config.patch_size[2])
+        compressed_token_per_frame = (height // self.config.compressed_patch_size[1]) * (
+            width // self.config.compressed_patch_size[2]
+        )
+        total_chunks = len(chunk_partition)
+
+        full_chunk_limit = self.config.full_chunk_limit - 1
+
+        if total_chunks > full_chunk_limit:
+            num_full_chunk, num_compressed_chunk = full_chunk_limit, total_chunks - full_chunk_limit
+        else:
+            num_full_chunk, num_compressed_chunk = total_chunks, 0
+
+        far_cfg = {
+            "total_frames": sum(chunk_partition),
+            "num_full_chunk": num_full_chunk,
+            "num_full_frames": sum(chunk_partition[num_compressed_chunk:]),
+            "num_compressed_chunk": num_compressed_chunk,
+            "num_compressed_frames": sum(chunk_partition[:num_compressed_chunk]),
+            "full_frame_shape": (height // self.config.patch_size[1], width // self.config.patch_size[2]),
+            "compressed_frame_shape": (
+                height // self.config.compressed_patch_size[1],
+                width // self.config.compressed_patch_size[2],
+            ),
+            "full_token_per_frame": full_token_per_frame,
+            "compressed_token_per_frame": compressed_token_per_frame,
+            "chunk_partition": chunk_partition,
+        }
+
+        kv_cache_flag["num_full_tokens"] = far_cfg["num_full_frames"] * far_cfg["full_token_per_frame"]
+        kv_cache_flag["num_compressed_tokens"] = (
+            far_cfg["num_compressed_frames"] * far_cfg["compressed_token_per_frame"]
+        )
+
+        # step 3: generate attention mask
+        attention_mask = self._build_causal_mask(
+            far_cfg, clean_hidden_states=clean_hidden_states, device=hidden_states.device, dtype=hidden_states.dtype
+        )
+
+        rotary_emb = self.rope(far_cfg=far_cfg, clean_hidden_states=clean_hidden_states, device=hidden_states.device)
+        hidden_states = self._forward_far_patchify(
+            hidden_states, far_cfg=far_cfg, clean_hidden_states=clean_hidden_states
+        )
+
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep,
+            r_timestep,
+            encoder_hidden_states,
+            encoder_hidden_states_image,
+            far_cfg=far_cfg,
+            clean_timestep=clean_timestep,
+        )
+        timestep_proj = timestep_proj.unflatten(2, (6, -1))
+
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        # 4. Transformer blocks
+        for index_block, block in enumerate(self.blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    timestep_proj,
+                    rotary_emb,
+                    attention_mask,
+                    kv_cache[index_block],
+                    kv_cache_flag,
+                )
+            else:
+                hidden_states = block(
+                    hidden_states,
+                    encoder_hidden_states,
+                    timestep_proj,
+                    rotary_emb,
+                    attention_mask,
+                    kv_cache[index_block],
+                    kv_cache_flag,
+                )
+
+        if not return_dict:
+            return None, kv_cache
+
+        return AnyFlowFARTransformerOutput(sample=None, kv_cache=kv_cache)
+
+    def _forward_train(
+        self,
+        hidden_states: torch.Tensor,
+        chunk_partition,
+        timestep: torch.LongTensor,
+        r_timestep: torch.LongTensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_image: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+        clean_hidden_states=None,
+        clean_timestep=None,
+    ) -> Union[torch.Tensor, Dict[str, torch.Tensor]]:
+        hidden_states = hidden_states.permute(0, 2, 1, 3, 4)
+        if clean_hidden_states is not None:
+            clean_hidden_states = clean_hidden_states.permute(0, 2, 1, 3, 4)
+
+        batch_size, num_channels, num_frames, height, width = hidden_states.shape
+
+        full_token_per_frame = (height // self.config.patch_size[1]) * (width // self.config.patch_size[2])
+        compressed_token_per_frame = (height // self.config.compressed_patch_size[1]) * (
+            width // self.config.compressed_patch_size[2]
+        )
+        total_chunks = len(chunk_partition)
+
+        if total_chunks > self.config.full_chunk_limit:
+            num_full_chunk, num_compressed_chunk = (
+                self.config.full_chunk_limit,
+                total_chunks - self.config.full_chunk_limit,
+            )
+        else:
+            num_full_chunk, num_compressed_chunk = total_chunks, 0
+
+        far_cfg = {
+            "total_frames": sum(chunk_partition),
+            "num_full_chunk": num_full_chunk,
+            "num_full_frames": sum(chunk_partition[num_compressed_chunk:]),
+            "num_compressed_chunk": num_compressed_chunk,
+            "num_compressed_frames": sum(chunk_partition[:num_compressed_chunk]),
+            "full_frame_shape": (height // self.config.patch_size[1], width // self.config.patch_size[2]),
+            "compressed_frame_shape": (
+                height // self.config.compressed_patch_size[1],
+                width // self.config.compressed_patch_size[2],
+            ),
+            "full_token_per_frame": full_token_per_frame,
+            "compressed_token_per_frame": compressed_token_per_frame,
+            "chunk_partition": chunk_partition,
+        }
+
+        # step 3: generate attention mask
+        attention_mask = self._build_causal_mask(
+            far_cfg, clean_hidden_states=clean_hidden_states, device=hidden_states.device, dtype=hidden_states.dtype
+        )
+
+        rotary_emb = self.rope(far_cfg=far_cfg, clean_hidden_states=clean_hidden_states, device=hidden_states.device)
+
+        hidden_states = self._forward_far_patchify(
+            hidden_states, far_cfg=far_cfg, clean_hidden_states=clean_hidden_states
+        )
+
+        temb, timestep_proj, encoder_hidden_states, encoder_hidden_states_image = self.condition_embedder(
+            timestep,
+            r_timestep,
+            encoder_hidden_states,
+            encoder_hidden_states_image,
+            far_cfg=far_cfg,
+            clean_timestep=clean_timestep,
+        )
+        timestep_proj = timestep_proj.unflatten(2, (6, -1))
+
+        if encoder_hidden_states_image is not None:
+            encoder_hidden_states = torch.concat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
+
+        # 4. Transformer blocks
+        for index_block, block in enumerate(self.blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    timestep_proj,
+                    rotary_emb,
+                    attention_mask,
+                )
+            else:
+                hidden_states = block(hidden_states, encoder_hidden_states, timestep_proj, rotary_emb, attention_mask)
+
+        # 5. Output norm, projection & unpatchify
+        shift, scale = (self.scale_shift_table + temb.unsqueeze(2)).chunk(2, dim=2)
+        shift, scale = shift.squeeze(2), scale.squeeze(2)
+
+        # Move the shift and scale tensors to the same device as hidden_states.
+        # When using multi-GPU inference via accelerate these will be on the
+        # first device rather than the last device, which hidden_states ends up
+        # on.
+        shift = shift.to(hidden_states.device)
+        scale = scale.to(hidden_states.device)
+
+        hidden_states = (self.norm_out(hidden_states.float()) * (1 + scale) + shift).type_as(hidden_states)
+
+        if clean_hidden_states is not None:
+            hidden_states = hidden_states[
+                :, : -(far_cfg["num_full_frames"] * far_cfg["full_token_per_frame"])
+            ]  # remove clean copy
+        output = self.proj_out(
+            hidden_states[:, far_cfg["num_compressed_frames"] * far_cfg["compressed_token_per_frame"] :]
+        )  # remove far context
+        output = self._unpack_latent_sequence(
+            output,
+            num_frames=far_cfg["num_full_frames"],
+            height=height,
+            width=width,
+            patch_size=self.config.patch_size[1],
+        )  # noqa: E501
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
diff --git a/src/diffusers/pipelines/__init__.py b/src/diffusers/pipelines/__init__.py
index d4b3974322b4..c0d12121d5e8 100644
--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -164,6 +164,10 @@
         "AnimateDiffVideoToVideoPipeline",
         "AnimateDiffVideoToVideoControlNetPipeline",
     ]
+    _import_structure["anyflow"] = [
+        "AnyFlowPipeline",
+        "AnyFlowFARPipeline",
+    ]
     _import_structure["bria"] = ["BriaPipeline"]
     _import_structure["bria_fibo"] = ["BriaFiboPipeline", "BriaFiboEditPipeline"]
     _import_structure["flux2"] = [
@@ -603,6 +607,10 @@
             AnimateDiffVideoToVideoControlNetPipeline,
             AnimateDiffVideoToVideoPipeline,
         )
+        from .anyflow import (
+            AnyFlowFARPipeline,
+            AnyFlowPipeline,
+        )
         from .audioldm2 import (
             AudioLDM2Pipeline,
             AudioLDM2ProjectionModel,
diff --git a/src/diffusers/pipelines/anyflow/__init__.py b/src/diffusers/pipelines/anyflow/__init__.py
new file mode 100644
index 000000000000..10603cdedc3b
--- /dev/null
+++ b/src/diffusers/pipelines/anyflow/__init__.py
@@ -0,0 +1,48 @@
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+_dummy_objects = {}
+_import_structure = {}
+
+
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["pipeline_anyflow"] = ["AnyFlowPipeline"]
+    _import_structure["pipeline_anyflow_far"] = ["AnyFlowFARPipeline"]
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .pipeline_anyflow import AnyFlowPipeline
+        from .pipeline_anyflow_far import AnyFlowFARPipeline
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
diff --git a/src/diffusers/pipelines/anyflow/pipeline_anyflow.py b/src/diffusers/pipelines/anyflow/pipeline_anyflow.py
new file mode 100644
index 000000000000..4339f5b77bcd
--- /dev/null
+++ b/src/diffusers/pipelines/anyflow/pipeline_anyflow.py
@@ -0,0 +1,642 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from diffusers.pipelines.wan.pipeline_wan.WanPipeline (v0.35.1) for any-step flow-map sampling.
+
+import html
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import regex as re
+import torch
+from transformers import AutoTokenizer, UMT5EncoderModel
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import WanLoraLoaderMixin
+from ...models import AnyFlowTransformer3DModel, AutoencoderKLWan
+from ...schedulers import FlowMapEulerDiscreteScheduler
+from ...utils import is_ftfy_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import AnyFlowPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import torch
+        >>> from diffusers import AnyFlowPipeline
+        >>> from diffusers.utils import export_to_video
+
+        >>> pipe = AnyFlowPipeline.from_pretrained(
+        ...     "nvidia/AnyFlow-Wan2.1-T2V-14B-Diffusers", torch_dtype=torch.bfloat16
+        ... ).to("cuda")
+
+        >>> prompt = "A red panda eating bamboo in a forest, cinematic lighting"
+        >>> video = pipe(prompt, num_inference_steps=4, num_frames=33).frames[0]
+        >>> export_to_video(video, "anyflow_t2v.mp4", fps=16)
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.wan.pipeline_wan.basic_clean
+def basic_clean(text):
+    if is_ftfy_available():
+        text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+# Copied from diffusers.pipelines.wan.pipeline_wan.whitespace_clean
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+# Copied from diffusers.pipelines.wan.pipeline_wan.prompt_clean
+def prompt_clean(text):
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+class AnyFlowPipeline(DiffusionPipeline, WanLoraLoaderMixin):
+    r"""
+    Bidirectional text-to-video generation pipeline for AnyFlow flow-map-distilled checkpoints, introduced in
+    [AnyFlow](https://huggingface.co/papers/2605.13724) by Yuchao Gu, Guian Fang et al.
+
+    AnyFlow learns arbitrary-interval transitions :math:`z_t \to z_r` rather than the fixed :math:`z_t \to z_0` mapping
+    of consistency models, so a single distilled checkpoint can be evaluated at 1, 2, 4, 8, 16... NFE without
+    retraining. This pipeline operates over the full video tensor in one bidirectional pass; for frame-level
+    autoregressive (causal) generation use ``AnyFlowFARPipeline``.
+
+    Sampling is plain Euler in mean-velocity form (``z_r = z_t - (t - r) * u``) with no re-noising. The released NVIDIA
+    checkpoints fold classifier-free guidance into the model weights, so the default ``guidance_scale=1.0`` is the
+    recommended setting.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        tokenizer ([`AutoTokenizer`]):
+            Tokenizer from [google/umt5-xxl](https://huggingface.co/google/umt5-xxl).
+        text_encoder ([`UMT5EncoderModel`]):
+            [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) text encoder.
+        transformer ([`AnyFlowTransformer3DModel`]):
+            Bidirectional flow-map 3D Transformer.
+        vae ([`AutoencoderKLWan`]):
+            VAE that encodes/decodes videos to and from latent representations.
+        scheduler ([`FlowMapEulerDiscreteScheduler`]):
+            Flow-map sampler. The pipeline drives ``scheduler.step(..., timestep, sample, r_timestep)`` per inference
+            step.
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        text_encoder: UMT5EncoderModel,
+        transformer: AnyFlowTransformer3DModel,
+        vae: AutoencoderKLWan,
+        scheduler: FlowMapEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: str | list[str] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 226,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt = [prompt_clean(u) for u in prompt]
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
+        seq_lens = mask.gt(0).sum(dim=1).long()
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
+        prompt_embeds = torch.stack(
+            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
+        )
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str | list[str],
+        negative_prompt: str | list[str] | None = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: torch.Tensor | None = None,
+        negative_prompt_embeds: torch.Tensor | None = None,
+        max_sequence_length: int = 226,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `list[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        height,
+        width,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        video=None,
+        video_latents=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 16 != 0 or width % 16 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+
+        if video is not None and video_latents is not None:
+            raise ValueError("Provide either `video` or `video_latents`, not both.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"  # noqa: E501
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        generator: torch.Generator | list[torch.Generator] | None = None,
+        latents: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_latent_frames,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1.0
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    def encode_video(self, video: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """Encode a pixel-space video into AnyFlow's latent layout.
+
+        Mirrors the single-helper convention of other diffusers pipelines (cf.
+        ``WanImageToVideoPipeline.encode_image``): wraps preprocessing, VAE encoding, and latent normalization into one
+        call. Output layout is ``(B, T_latent, C, H, W)``, which is what the AnyFlow transformer expects for
+        conditioning frames.
+        """
+        video = self.video_processor.preprocess_video(video, height=height, width=width).to(
+            dtype=self.vae.dtype, device=self._execution_device
+        )
+        # ``self.vae._encode`` expects (B, C, T, H, W); the AnyFlow rollout consumes (B, T_latent, C, H, W).
+        moments = self.vae._encode(video)
+        mu = torch.chunk(moments, 2, dim=1)[0]
+
+        latents_mean = torch.tensor(self.vae.config.latents_mean, device=mu.device).view(1, -1, 1, 1, 1)
+        latents_std = (1.0 / torch.tensor(self.vae.config.latents_std, device=mu.device)).view(1, -1, 1, 1, 1)
+        latents = ((mu.float() - latents_mean) * latents_std).to(mu)
+        return latents.permute(0, 2, 1, 3, 4)
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        video: Optional[torch.Tensor] = None,
+        video_latents: Optional[torch.Tensor] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 1.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+        use_mean_velocity: bool = True,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, pass `prompt_embeds` instead.
+            video (`torch.Tensor`, *optional*):
+                Pre-VAE conditioning frames of shape `(B, T, C, H, W)` in `[0, 1]`. When provided, the pipeline
+                VAE-encodes them and keeps the corresponding latent prefix fixed during sampling. Mutually exclusive
+                with `video_latents`.
+            video_latents (`torch.Tensor`, *optional*):
+                Pre-encoded VAE latents in the AnyFlow layout `(B, T_latent, C, H_latent, W_latent)`. Skips VAE
+                encoding on the pipeline side. Mutually exclusive with `video`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during video generation. Ignored when not using guidance
+                (`guidance_scale < 1`).
+            height (`int`, defaults to `480`):
+                The height in pixels of the generated video.
+            width (`int`, defaults to `832`):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `81`):
+                The number of frames in the generated video. Must satisfy `(num_frames - 1) % vae_scale_factor_temporal
+                == 0`.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps. Distilled AnyFlow checkpoints support any-step sampling, so values as
+                low as `1`, `2`, `4`, or `8` are typical.
+            guidance_scale (`float`, defaults to `1.0`):
+                Classifier-free guidance scale. The released AnyFlow checkpoints fuse CFG into the weights during
+                training; keep at `1.0` unless you know your checkpoint expects otherwise.
+            num_videos_per_prompt (`int`, *optional*, defaults to `1`):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents to use as inputs. If not provided, latents are sampled from the supplied
+                `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to tweak text inputs (e.g., prompt weighting). If not
+                provided, embeddings are generated from `prompt`.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                The output format. One of `"pil"`, `"np"`, `"pt"`, or `"latent"`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return an [`AnyFlowPipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function or [`PipelineCallback`] called at the end of each inference step. See
+                [`callbacks`](../callbacks) for details.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*, defaults to `["latents"]`):
+                The tensor inputs forwarded to the callback. Must be a subset of `self._callback_tensor_inputs`.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum text-encoder sequence length. Longer prompts are truncated.
+            use_mean_velocity (`bool`, defaults to `True`):
+                When `True`, the flow-map model is conditioned on both the source timestep `t` and the target timestep
+                `r` to predict a mean velocity, matching the training-time behavior. Disable to mirror raw Euler
+                stepping (`r = t`).
+
+        Examples:
+
+        Returns:
+            [`~AnyFlowPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, [`AnyFlowPipelineOutput`] is returned, otherwise a `tuple` whose first
+                element is the generated video.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            prompt_embeds,
+            negative_prompt_embeds,
+            video=video,
+            video_latents=video_latents,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+        self._num_timesteps = num_inference_steps
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+
+        transformer_dtype = self.transformer.dtype
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+        # 4. Prepare latent variables. ``prepare_latents`` returns the standard ``(B, C, T, H, W)``
+        # diffusers layout; the AnyFlow rollout expects ``(B, T, C, H, W)`` so we permute here.
+        num_channels_latents = self.transformer.config.in_channels
+        init_latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            num_frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+        init_latents = init_latents.permute(0, 2, 1, 3, 4).to(transformer_dtype)
+
+        # 5. Encode conditioning frames (or accept pre-encoded latents).
+        if video is not None:
+            video_latents = self.encode_video(video, height=height, width=width)
+        context_length = video_latents.shape[1] if video_latents is not None else 0
+
+        # 6. Denoising loop (inlined; follows the `WanPipeline.__call__` convention).
+        latents = init_latents
+        if negative_prompt_embeds is not None:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps  # length N; `step` resolves the next sigma internally.
+
+        with self.progress_bar(total=len(timesteps)) as progress_bar:
+            for i, t in enumerate(timesteps):
+                if self.interrupt:
+                    continue
+
+                # `r` is the target timestep for this step; equals the next sigma scaled to
+                # train-timestep units. The scheduler stores it on `sigmas[i + 1]`.
+                r = self.scheduler.sigmas[i + 1] * self.scheduler.config.num_train_timesteps
+                if t == r:
+                    progress_bar.update()
+                    continue
+
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+
+                timestep = t.expand(latent_model_input.shape[0]).unsqueeze(-1)
+                timestep = timestep.repeat((1, latent_model_input.shape[1]))
+
+                if use_mean_velocity:
+                    r_timestep = r.expand(latent_model_input.shape[0]).unsqueeze(-1)
+                    r_timestep = r_timestep.repeat((1, latent_model_input.shape[1]))
+                else:
+                    r_timestep = timestep
+
+                if video_latents is not None:
+                    latent_model_input[:, :context_length, ...] = video_latents
+                    timestep[:, :context_length] = 0
+
+                noise_pred = self.transformer(
+                    hidden_states=latent_model_input,
+                    timestep=timestep,
+                    r_timestep=r_timestep,
+                    encoder_hidden_states=prompt_embeds,
+                    attention_kwargs=attention_kwargs,
+                    return_dict=False,
+                )[0]
+
+                if self.do_classifier_free_guidance:
+                    noise_uncond, noise_pred = noise_pred.chunk(2)
+                    noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+
+                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs or []:
+                        if k == "latents":
+                            callback_kwargs[k] = latents
+                        elif k == "prompt_embeds":
+                            callback_kwargs[k] = prompt_embeds
+                        elif k == "negative_prompt_embeds":
+                            callback_kwargs[k] = negative_prompt_embeds
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                progress_bar.update()
+
+        if video_latents is not None:
+            latents[:, :context_length, ...] = video_latents
+        latents = latents.permute(0, 2, 1, 3, 4)
+
+        if not output_type == "latent":
+            latents = latents.to(self.vae.dtype)
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnyFlowPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/anyflow/pipeline_anyflow_far.py b/src/diffusers/pipelines/anyflow/pipeline_anyflow_far.py
new file mode 100644
index 000000000000..0b1e0efa3404
--- /dev/null
+++ b/src/diffusers/pipelines/anyflow/pipeline_anyflow_far.py
@@ -0,0 +1,794 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Adapted from diffusers.pipelines.wan.pipeline_wan.WanPipeline (v0.35.1) for FAR causal flow-map sampling.
+
+import copy
+import html
+from typing import Any, Callable, Dict, List, Optional, Union
+
+import regex as re
+import torch
+from tqdm import tqdm
+from transformers import AutoTokenizer, UMT5EncoderModel
+
+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
+from ...loaders import WanLoraLoaderMixin
+from ...models import AnyFlowFARTransformer3DModel, AutoencoderKLWan
+from ...schedulers import FlowMapEulerDiscreteScheduler
+from ...utils import is_ftfy_available, logging, replace_example_docstring
+from ...utils.torch_utils import randn_tensor
+from ...video_processor import VideoProcessor
+from ..pipeline_utils import DiffusionPipeline
+from .pipeline_output import AnyFlowPipelineOutput
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+if is_ftfy_available():
+    import ftfy
+
+
+EXAMPLE_DOC_STRING = """
+    Examples:
+        ```python
+        >>> import numpy as np
+        >>> import torch
+        >>> from diffusers import AnyFlowFARPipeline
+        >>> from diffusers.utils import export_to_video, load_image
+
+        >>> pipe = AnyFlowFARPipeline.from_pretrained(
+        ...     "nvidia/AnyFlow-FAR-Wan2.1-1.3B-Diffusers", torch_dtype=torch.bfloat16
+        ... ).to("cuda")
+
+        >>> # Single-frame I2V: wrap the conditioning image as a (1, 1, 3, H, W) tensor in [0, 1].
+        >>> first_frame = load_image("path/to/first_frame.png").resize((832, 480))
+        >>> arr = np.asarray(first_frame).astype("float32") / 255.0
+        >>> context = torch.from_numpy(arr).permute(2, 0, 1).unsqueeze(0).unsqueeze(1).to("cuda")
+
+        >>> video = pipe(
+        ...     prompt="a cat walks across a sunlit lawn",
+        ...     video=context,
+        ...     num_inference_steps=4,
+        ...     num_frames=81,
+        ... ).frames[0]
+        >>> export_to_video(video, "anyflow_far.mp4", fps=16)
+        ```
+"""
+
+
+# Copied from diffusers.pipelines.wan.pipeline_wan.basic_clean
+def basic_clean(text):
+    if is_ftfy_available():
+        text = ftfy.fix_text(text)
+    text = html.unescape(html.unescape(text))
+    return text.strip()
+
+
+# Copied from diffusers.pipelines.wan.pipeline_wan.whitespace_clean
+def whitespace_clean(text):
+    text = re.sub(r"\s+", " ", text)
+    text = text.strip()
+    return text
+
+
+# Copied from diffusers.pipelines.wan.pipeline_wan.prompt_clean
+def prompt_clean(text):
+    text = whitespace_clean(basic_clean(text))
+    return text
+
+
+class AnyFlowFARPipeline(DiffusionPipeline, WanLoraLoaderMixin):
+    r"""
+    Causal (FAR-based) text-to-video / image-to-video / video-to-video pipeline for AnyFlow checkpoints, introduced in
+    [AnyFlow](https://huggingface.co/papers/2605.13724) by Yuchao Gu, Guian Fang et al.
+
+    The pipeline drives a frame-level autoregressive sampling loop over chunks: each chunk is denoised with flow-map
+    steps while attending only to past chunks via block-sparse causal attention, and intermediate KV cache is reused
+    across chunks.
+
+    The task mode (T2V / I2V / V2V) is selected by which conditioning argument is passed to ``__call__``:
+
+    - both ``video=None`` and ``video_latents=None`` — pure text-to-video.
+    - ``video=<tensor of shape (B, T, C, H, W) in [0, 1] with T = 4n + 1>`` — pre-VAE conditioning frames; the pipeline
+      VAE-encodes them. Pass a single-frame video for I2V or a multi-frame clip for V2V.
+    - ``video_latents=<latent tensor of shape (B, T_latent, C, H_latent, W_latent)>`` — already-encoded latents in the
+      FAR layout (skips the VAE encode step).
+
+    The FAR backbone is the causal Wan2.1 variant introduced by FAR (Gu et al., 2025; arXiv:2503.19325). Inference is
+    plain Euler in mean-velocity form per chunk with no re-noising. Joint T2V / I2V / V2V is supported by a single
+    distilled model.
+
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
+    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
+
+    Args:
+        tokenizer ([`AutoTokenizer`]):
+            Tokenizer from [google/umt5-xxl](https://huggingface.co/google/umt5-xxl).
+        text_encoder ([`UMT5EncoderModel`]):
+            [google/umt5-xxl](https://huggingface.co/google/umt5-xxl) text encoder.
+        transformer ([`AnyFlowFARTransformer3DModel`]):
+            FAR causal flow-map 3D Transformer.
+        vae ([`AutoencoderKLWan`]):
+            VAE that encodes/decodes videos to and from latent representations.
+        scheduler ([`FlowMapEulerDiscreteScheduler`]):
+            Flow-map sampler.
+    """
+
+    model_cpu_offload_seq = "text_encoder->transformer->vae"
+    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    # Default chunk partition for the released NVIDIA AnyFlow-FAR checkpoints (81 frames at the diffusers
+    # VAE temporal stride of 4 → 21 latent frames split into 1 + 3*6 + 2 = [1, 3, 3, 3, 3, 3, 3, 2]). Override
+    # via the ``chunk_partition`` argument to ``__call__`` for other frame counts.
+    default_chunk_partition: List[int] = [1, 3, 3, 3, 3, 3, 3, 2]
+
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        text_encoder: UMT5EncoderModel,
+        transformer: AnyFlowFARTransformer3DModel,
+        vae: AutoencoderKLWan,
+        scheduler: FlowMapEulerDiscreteScheduler,
+    ):
+        super().__init__()
+
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+            scheduler=scheduler,
+        )
+
+        self.vae_scale_factor_temporal = self.vae.config.scale_factor_temporal if getattr(self, "vae", None) else 4
+        self.vae_scale_factor_spatial = self.vae.config.scale_factor_spatial if getattr(self, "vae", None) else 8
+        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline._get_t5_prompt_embeds
+    def _get_t5_prompt_embeds(
+        self,
+        prompt: str | list[str] = None,
+        num_videos_per_prompt: int = 1,
+        max_sequence_length: int = 226,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        device = device or self._execution_device
+        dtype = dtype or self.text_encoder.dtype
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        prompt = [prompt_clean(u) for u in prompt]
+        batch_size = len(prompt)
+
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=max_sequence_length,
+            truncation=True,
+            add_special_tokens=True,
+            return_attention_mask=True,
+            return_tensors="pt",
+        )
+        text_input_ids, mask = text_inputs.input_ids, text_inputs.attention_mask
+        seq_lens = mask.gt(0).sum(dim=1).long()
+
+        prompt_embeds = self.text_encoder(text_input_ids.to(device), mask.to(device)).last_hidden_state
+        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+        prompt_embeds = [u[:v] for u, v in zip(prompt_embeds, seq_lens)]
+        prompt_embeds = torch.stack(
+            [torch.cat([u, u.new_zeros(max_sequence_length - u.size(0), u.size(1))]) for u in prompt_embeds], dim=0
+        )
+
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        _, seq_len, _ = prompt_embeds.shape
+        prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
+        prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
+
+        return prompt_embeds
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.encode_prompt
+    def encode_prompt(
+        self,
+        prompt: str | list[str],
+        negative_prompt: str | list[str] | None = None,
+        do_classifier_free_guidance: bool = True,
+        num_videos_per_prompt: int = 1,
+        prompt_embeds: torch.Tensor | None = None,
+        negative_prompt_embeds: torch.Tensor | None = None,
+        max_sequence_length: int = 226,
+        device: torch.device | None = None,
+        dtype: torch.dtype | None = None,
+    ):
+        r"""
+        Encodes the prompt into text encoder hidden states.
+
+        Args:
+            prompt (`str` or `list[str]`, *optional*):
+                prompt to be encoded
+            negative_prompt (`str` or `list[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. If not defined, one has to pass
+                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+                less than `1`).
+            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
+                Whether to use classifier free guidance or not.
+            num_videos_per_prompt (`int`, *optional*, defaults to 1):
+                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            device: (`torch.device`, *optional*):
+                torch device
+            dtype: (`torch.dtype`, *optional*):
+                torch dtype
+        """
+        device = device or self._execution_device
+
+        prompt = [prompt] if isinstance(prompt, str) else prompt
+        if prompt is not None:
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        if prompt_embeds is None:
+            prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        if do_classifier_free_guidance and negative_prompt_embeds is None:
+            negative_prompt = negative_prompt or ""
+            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
+
+            if prompt is not None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+
+            negative_prompt_embeds = self._get_t5_prompt_embeds(
+                prompt=negative_prompt,
+                num_videos_per_prompt=num_videos_per_prompt,
+                max_sequence_length=max_sequence_length,
+                device=device,
+                dtype=dtype,
+            )
+
+        return prompt_embeds, negative_prompt_embeds
+
+    def check_inputs(
+        self,
+        prompt,
+        negative_prompt,
+        height,
+        width,
+        prompt_embeds=None,
+        negative_prompt_embeds=None,
+        video=None,
+        video_latents=None,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        if height % 16 != 0 or width % 16 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
+
+        if video is not None and video_latents is not None:
+            raise ValueError("Provide either `video` or `video_latents`, not both.")
+        if video is not None and (video.shape[1] - 1) % 4 != 0:
+            raise ValueError(f"`video` must have `(num_frames - 1) % 4 == 0`, got num_frames={video.shape[1]}.")
+
+        if callback_on_step_end_tensor_inputs is not None and not all(
+            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
+        ):
+            raise ValueError(
+                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"  # noqa: E501
+            )
+
+        if prompt is not None and prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif negative_prompt is not None and negative_prompt_embeds is not None:
+            raise ValueError(
+                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`: {negative_prompt_embeds}. Please make sure to"
+                " only forward one of the two."
+            )
+        elif prompt is None and prompt_embeds is None:
+            raise ValueError(
+                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
+            )
+        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        elif negative_prompt is not None and (
+            not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list)
+        ):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+    # Copied from diffusers.pipelines.wan.pipeline_wan.WanPipeline.prepare_latents
+    def prepare_latents(
+        self,
+        batch_size: int,
+        num_channels_latents: int = 16,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        generator: torch.Generator | list[torch.Generator] | None = None,
+        latents: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        if latents is not None:
+            return latents.to(device=device, dtype=dtype)
+
+        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
+        shape = (
+            batch_size,
+            num_channels_latents,
+            num_latent_frames,
+            int(height) // self.vae_scale_factor_spatial,
+            int(width) // self.vae_scale_factor_spatial,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+
+        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        return latents
+
+    @property
+    def guidance_scale(self):
+        return self._guidance_scale
+
+    @property
+    def do_classifier_free_guidance(self):
+        return self._guidance_scale > 1.0
+
+    @property
+    def num_timesteps(self):
+        return self._num_timesteps
+
+    @property
+    def interrupt(self):
+        return self._interrupt
+
+    @property
+    def attention_kwargs(self):
+        return self._attention_kwargs
+
+    def encode_video(self, video: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """Encode a pixel-space video into AnyFlow-FAR's latent layout.
+
+        Mirrors the single-helper convention of other diffusers pipelines. Output layout is ``(B, T_latent, C,
+        H_latent, W_latent)`` — the per-frame layout the FAR rollout consumes.
+        """
+        video = self.video_processor.preprocess_video(video, height=height, width=width).to(
+            dtype=self.vae.dtype, device=self._execution_device
+        )
+        moments = self.vae._encode(video)
+        mu = torch.chunk(moments, 2, dim=1)[0]
+
+        latents_mean = torch.tensor(self.vae.config.latents_mean, device=mu.device).view(1, -1, 1, 1, 1)
+        latents_std = (1.0 / torch.tensor(self.vae.config.latents_std, device=mu.device)).view(1, -1, 1, 1, 1)
+        latents = ((mu.float() - latents_mean) * latents_std).to(mu)
+        return latents.permute(0, 2, 1, 3, 4)
+
+    def encode_kv_cache(
+        self, kv_cache, kv_cache_flag, chunk_partition, chunk_idx, output, prompt_embeds, negative_prompt_embeds
+    ):
+        kv_cache_flag["is_cache_step"] = True
+
+        if self.do_classifier_free_guidance:
+            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+
+        latents = output[:, : sum(chunk_partition)]
+        latent_model_input = (
+            torch.cat([latents] * 2).to(self.transformer.dtype)
+            if self.do_classifier_free_guidance
+            else latents.to(self.transformer.dtype)
+        )
+
+        timestep = torch.tensor([0], device=latents.device).expand(latent_model_input.shape[0]).unsqueeze(-1)
+        timestep = timestep.repeat((1, latent_model_input.shape[1]))
+
+        r_timestep = torch.tensor([0], device=latents.device).expand(latent_model_input.shape[0]).unsqueeze(-1)
+        r_timestep = r_timestep.repeat((1, latent_model_input.shape[1]))
+
+        _, kv_cache = self.transformer(
+            hidden_states=latent_model_input,
+            chunk_partition=chunk_partition,
+            timestep=timestep,
+            r_timestep=r_timestep,
+            encoder_hidden_states=prompt_embeds,
+            attention_kwargs=self.attention_kwargs,
+            return_dict=False,
+            # kv-cache related
+            kv_cache=kv_cache,
+            kv_cache_flag=copy.deepcopy(kv_cache_flag),
+        )
+
+        kv_cache_flag["num_cached_chunks"] += 1
+        kv_cache_flag["is_cache_step"] = False
+
+        return kv_cache
+
+    @torch.no_grad()
+    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def __call__(
+        self,
+        prompt: Union[str, List[str]] = None,
+        video: Optional[torch.Tensor] = None,
+        video_latents: Optional[torch.Tensor] = None,
+        negative_prompt: Union[str, List[str]] = None,
+        height: int = 480,
+        width: int = 832,
+        num_frames: int = 81,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 1.0,
+        num_videos_per_prompt: Optional[int] = 1,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.Tensor] = None,
+        prompt_embeds: Optional[torch.Tensor] = None,
+        negative_prompt_embeds: Optional[torch.Tensor] = None,
+        output_type: Optional[str] = "np",
+        return_dict: bool = True,
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
+        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        max_sequence_length: int = 512,
+        use_mean_velocity: bool = True,
+        use_kv_cache: bool = True,
+        chunk_partition: Optional[List[int]] = None,
+    ):
+        r"""
+        The call function to the pipeline for generation.
+
+        Args:
+            prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide the video generation. If not defined, pass `prompt_embeds` instead.
+            video (`torch.Tensor`, *optional*):
+                Pre-VAE conditioning frames of shape `(B, T, C, H, W)` in `[0, 1]` (`T = 4n + 1`). When provided, the
+                pipeline VAE-encodes them and keeps the corresponding latent prefix fixed during sampling. Mutually
+                exclusive with `video_latents`.
+            video_latents (`torch.Tensor`, *optional*):
+                Pre-encoded VAE latents in the FAR layout `(B, T_latent, C, H_latent, W_latent)`. Skips VAE encoding on
+                the pipeline side. Mutually exclusive with `video`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to avoid during video generation. Ignored when not using guidance
+                (`guidance_scale < 1`).
+            height (`int`, defaults to `480`):
+                The height in pixels of the generated video.
+            width (`int`, defaults to `832`):
+                The width in pixels of the generated video.
+            num_frames (`int`, defaults to `81`):
+                The number of frames in the generated video. Must satisfy `(num_frames - 1) % vae_scale_factor_temporal
+                == 0`.
+            num_inference_steps (`int`, defaults to `50`):
+                The number of denoising steps per chunk. Distilled AnyFlow-FAR checkpoints support any-step sampling
+                (1, 2, 4, 8, ...).
+            guidance_scale (`float`, defaults to `1.0`):
+                Classifier-free guidance scale. The released AnyFlow checkpoints fuse CFG into the weights during
+                training; keep at `1.0` unless the checkpoint requires otherwise.
+            num_videos_per_prompt (`int`, *optional*, defaults to `1`):
+                The number of videos to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                Generator used to seed sampling.
+            latents (`torch.Tensor`, *optional*):
+                Pre-generated noisy latents. If not provided, latents are sampled from the supplied `generator`.
+            prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated text embeddings. If not provided, embeddings are generated from `prompt`.
+            negative_prompt_embeds (`torch.Tensor`, *optional*):
+                Pre-generated negative text embeddings.
+            output_type (`str`, *optional*, defaults to `"np"`):
+                Output format. One of `"pil"`, `"np"`, `"pt"`, or `"latent"`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether to return an [`AnyFlowPipelineOutput`] instead of a plain tuple.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            callback_on_step_end (`Callable`, *optional*):
+                A function or [`PipelineCallback`] called at the end of each inference step.
+            callback_on_step_end_tensor_inputs (`List[str]`, *optional*, defaults to `["latents"]`):
+                Tensor inputs forwarded to the callback. Must be a subset of `self._callback_tensor_inputs`.
+            max_sequence_length (`int`, defaults to `512`):
+                The maximum text-encoder sequence length.
+            use_mean_velocity (`bool`, defaults to `True`):
+                When `True`, condition the flow-map model on both the source timestep `t` and the target timestep `r`
+                to predict a mean velocity. Disable to mirror raw Euler stepping.
+            use_kv_cache (`bool`, defaults to `True`):
+                Reuse the FAR attention KV cache across causal chunks. Disable only for debugging.
+            chunk_partition (`List[int]`, *optional*):
+                Per-chunk frame counts. Defaults to `default_chunk_partition` (matched to the released 81-frame
+                checkpoints). When you change `num_frames`, supply a `chunk_partition` that sums to `(num_frames - 1)
+                // vae_scale_factor_temporal + 1`.
+
+        Examples:
+
+        Returns:
+            [`~AnyFlowPipelineOutput`] or `tuple`:
+                If `return_dict` is `True`, an [`AnyFlowPipelineOutput`] is returned, otherwise a `tuple` whose first
+                element is the generated video.
+        """
+
+        # 1. Check inputs. Raise error if not correct
+        self.check_inputs(
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            prompt_embeds,
+            negative_prompt_embeds,
+            video=video,
+            video_latents=video_latents,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+
+        if num_frames % self.vae_scale_factor_temporal != 1:
+            logger.warning(
+                f"`num_frames - 1` has to be divisible by {self.vae_scale_factor_temporal}. Rounding to the nearest number."
+            )
+            num_frames = num_frames // self.vae_scale_factor_temporal * self.vae_scale_factor_temporal + 1
+        num_frames = max(num_frames, 1)
+
+        self._guidance_scale = guidance_scale
+        self._attention_kwargs = attention_kwargs
+        self._interrupt = False
+        self._num_timesteps = num_inference_steps
+
+        device = self._execution_device
+
+        # 2. Define call parameters
+        if prompt is not None and isinstance(prompt, str):
+            batch_size = 1
+        elif prompt is not None and isinstance(prompt, list):
+            batch_size = len(prompt)
+        else:
+            batch_size = prompt_embeds.shape[0]
+
+        # 3. Encode input prompt
+        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            do_classifier_free_guidance=self.do_classifier_free_guidance,
+            num_videos_per_prompt=num_videos_per_prompt,
+            prompt_embeds=prompt_embeds,
+            negative_prompt_embeds=negative_prompt_embeds,
+            max_sequence_length=max_sequence_length,
+            device=device,
+        )
+
+        transformer_dtype = self.transformer.dtype
+        prompt_embeds = prompt_embeds.to(transformer_dtype)
+        if negative_prompt_embeds is not None:
+            negative_prompt_embeds = negative_prompt_embeds.to(transformer_dtype)
+
+        # 4. Prepare latent variables
+        num_channels_latents = self.transformer.config.in_channels
+        init_latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            height,
+            width,
+            num_frames,
+            torch.float32,
+            device,
+            generator,
+            latents,
+        )
+        # ``prepare_latents`` returns the standard ``(B, C, T, H, W)`` diffusers layout. The FAR
+        # rollout permutes to ``(B, T, C, H, W)`` once before chunking.
+        init_latents = init_latents.to(transformer_dtype).permute(0, 2, 1, 3, 4)
+
+        # 5. Resolve conditioning latents (pre-encoded or pixel-space).
+        if video is not None:
+            video_latents = self.encode_video(video, height=height, width=width)
+
+        if chunk_partition is None:
+            chunk_partition = list(self.default_chunk_partition)
+        if init_latents.shape[1] != sum(chunk_partition):
+            raise ValueError(
+                f"chunk_partition={chunk_partition} sums to {sum(chunk_partition)}, but the input latent "
+                f"sequence has {init_latents.shape[1]} frames; pass an explicit chunk_partition that matches "
+                "your num_frames if you are not using the default 81-frame schedule."
+            )
+
+        full_token_per_frame = (init_latents.shape[3] // self.transformer.config.patch_size[1]) * (
+            init_latents.shape[4] // self.transformer.config.patch_size[2]
+        )
+        compressed_token_per_frame = (init_latents.shape[3] // self.transformer.config.compressed_patch_size[1]) * (
+            init_latents.shape[4] // self.transformer.config.compressed_patch_size[2]
+        )
+
+        # 6. Allocate KV cache (across chunks). The cache stays None when use_kv_cache=False.
+        if use_kv_cache:
+            kv_cache_batch_size = (
+                init_latents.shape[0] * 2 if self.do_classifier_free_guidance else init_latents.shape[0]
+            )
+            kv_cache = {}
+            for layer_idx in range(self.transformer.config.num_layers):
+                kv_cache[layer_idx] = {
+                    "full_cache": torch.zeros(
+                        (
+                            2,
+                            kv_cache_batch_size,
+                            self.transformer.config.num_attention_heads,
+                            self.transformer.config.full_chunk_limit * max(chunk_partition) * full_token_per_frame,
+                            self.transformer.config.attention_head_dim,
+                        ),
+                        device=init_latents.device,
+                        dtype=init_latents.dtype,
+                    ),
+                    "compressed_cache": torch.zeros(
+                        (
+                            2,
+                            kv_cache_batch_size,
+                            self.transformer.config.num_attention_heads,
+                            (len(chunk_partition) - self.transformer.config.full_chunk_limit + 1)
+                            * max(chunk_partition)
+                            * compressed_token_per_frame,
+                            self.transformer.config.attention_head_dim,
+                        ),
+                        device=init_latents.device,
+                        dtype=init_latents.dtype,
+                    ),
+                }
+            kv_cache_flag = {"num_cached_chunks": 0, "is_cache_step": False}
+        else:
+            kv_cache = None
+            kv_cache_flag = None
+
+        output = torch.zeros_like(init_latents)
+
+        # 7. Apply conditioning prefix.
+        if video_latents is not None:
+            output[:, : video_latents.shape[1]] = video_latents
+            num_context_chunks = next(
+                i + 1 for i in range(len(chunk_partition)) if sum(chunk_partition[: i + 1]) >= video_latents.shape[1]
+            )
+        else:
+            num_context_chunks = 0
+
+        # Each non-context chunk runs `num_inference_steps` denoising steps that fire
+        # callback_on_step_end; context chunks only encode KV cache and never call back.
+        self._num_timesteps = (len(chunk_partition) - num_context_chunks) * num_inference_steps
+
+        # 8. Denoising loop (inlined; outer over chunks, inner over timesteps). Mirrors
+        # `WanAnimatePipeline.__call__`'s nested-loop convention (cf. pipeline_wan_animate.py:1035).
+        # `encode_kv_cache` is kept as a method because it is its own coherent operation (a single
+        # cache-prefill call on the transformer); inlining it would obscure the read.
+        encoder_hidden_states = (
+            torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+            if (negative_prompt_embeds is not None)
+            else prompt_embeds
+        )
+        outer_progress_bar_config = getattr(self, "_progress_bar_config", {}).copy() or {}
+        chunk_progress_bar_config = {**outer_progress_bar_config, "position": 0, "desc": "Chunks"}
+        for chunk_idx in tqdm(range(len(chunk_partition)), **chunk_progress_bar_config):
+            if chunk_idx >= num_context_chunks:
+                chunk_latents = init_latents[
+                    :, sum(chunk_partition[:chunk_idx]) : sum(chunk_partition[: chunk_idx + 1])
+                ]
+                this_chunk_partition = chunk_partition[: chunk_idx + 1]
+
+                self.scheduler.set_timesteps(num_inference_steps, device=device)
+                timesteps = self.scheduler.timesteps  # length N; `step` resolves the next sigma internally.
+                inner_progress_bar_config = {
+                    **outer_progress_bar_config,
+                    "position": 1,
+                    "leave": False,
+                    "desc": f"Chunk {chunk_idx} Inference Steps",
+                }
+                for i, t in enumerate(tqdm(timesteps, **inner_progress_bar_config)):
+                    r = self.scheduler.sigmas[i + 1] * self.scheduler.config.num_train_timesteps
+                    if t == r:
+                        continue
+
+                    latent_model_input = (
+                        torch.cat([chunk_latents] * 2) if self.do_classifier_free_guidance else chunk_latents
+                    )
+                    timestep = t.expand(latent_model_input.shape[0]).unsqueeze(-1)
+                    timestep = timestep.repeat((1, latent_model_input.shape[1]))
+                    if use_mean_velocity:
+                        r_timestep = r.expand(latent_model_input.shape[0]).unsqueeze(-1)
+                        r_timestep = r_timestep.repeat((1, latent_model_input.shape[1]))
+                    else:
+                        r_timestep = timestep
+
+                    noise_pred, _ = self.transformer(
+                        hidden_states=latent_model_input,
+                        timestep=timestep,
+                        r_timestep=r_timestep,
+                        encoder_hidden_states=encoder_hidden_states,
+                        attention_kwargs=attention_kwargs,
+                        return_dict=False,
+                        chunk_partition=this_chunk_partition,
+                        kv_cache=kv_cache,
+                        kv_cache_flag=copy.deepcopy(kv_cache_flag),
+                    )
+                    if self.do_classifier_free_guidance:
+                        noise_uncond, noise_pred = noise_pred.chunk(2)
+                        noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
+
+                    chunk_latents = self.scheduler.step(noise_pred, t, chunk_latents, return_dict=False)[0]
+
+                    if callback_on_step_end is not None:
+                        callback_kwargs = {}
+                        for k in callback_on_step_end_tensor_inputs or []:
+                            if k == "latents":
+                                callback_kwargs[k] = chunk_latents
+                            elif k == "prompt_embeds":
+                                callback_kwargs[k] = prompt_embeds
+                            elif k == "negative_prompt_embeds":
+                                callback_kwargs[k] = negative_prompt_embeds
+                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                        chunk_latents = callback_outputs.pop("latents", chunk_latents)
+                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                output[:, sum(chunk_partition[:chunk_idx]) : sum(chunk_partition[: chunk_idx + 1])] = chunk_latents
+
+            # Cache the KVs for this chunk so subsequent chunks can attend back to it.
+            if chunk_idx < len(chunk_partition) - 1:
+                kv_cache = self.encode_kv_cache(
+                    kv_cache,
+                    kv_cache_flag,
+                    chunk_partition=chunk_partition[: chunk_idx + 1],
+                    chunk_idx=chunk_idx,
+                    output=output,
+                    prompt_embeds=prompt_embeds,
+                    negative_prompt_embeds=negative_prompt_embeds,
+                )
+
+        latents = output.permute(0, 2, 1, 3, 4)
+
+        if not output_type == "latent":
+            latents = latents.to(self.vae.dtype)
+            latents_mean = (
+                torch.tensor(self.vae.config.latents_mean)
+                .view(1, self.vae.config.z_dim, 1, 1, 1)
+                .to(latents.device, latents.dtype)
+            )
+            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
+                latents.device, latents.dtype
+            )
+            latents = latents / latents_std + latents_mean
+            video = self.vae.decode(latents, return_dict=False)[0]
+            video = self.video_processor.postprocess_video(video, output_type=output_type)
+        else:
+            video = latents
+
+        # Offload all models
+        self.maybe_free_model_hooks()
+
+        if not return_dict:
+            return (video,)
+
+        return AnyFlowPipelineOutput(frames=video)
diff --git a/src/diffusers/pipelines/anyflow/pipeline_output.py b/src/diffusers/pipelines/anyflow/pipeline_output.py
new file mode 100644
index 000000000000..5e3668769a21
--- /dev/null
+++ b/src/diffusers/pipelines/anyflow/pipeline_output.py
@@ -0,0 +1,34 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+
+import torch
+
+from ...utils import BaseOutput
+
+
+@dataclass
+class AnyFlowPipelineOutput(BaseOutput):
+    r"""
+    Output class for AnyFlow pipelines.
+
+    Args:
+        frames (`torch.Tensor`, `np.ndarray`, or list[list[PIL.Image.Image]]):
+            list of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
+            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
+            `(batch_size, num_frames, channels, height, width)`.
+    """
+
+    frames: torch.Tensor
diff --git a/src/diffusers/pipelines/auto_pipeline.py b/src/diffusers/pipelines/auto_pipeline.py
index 2876798e14bd..8ef87eb3d1bf 100644
--- a/src/diffusers/pipelines/auto_pipeline.py
+++ b/src/diffusers/pipelines/auto_pipeline.py
@@ -20,6 +20,7 @@
 from ..configuration_utils import ConfigMixin
 from ..models.controlnets import ControlNetUnionModel
 from ..utils import is_sentencepiece_available
+from .anyflow import AnyFlowFARPipeline, AnyFlowPipeline
 from .aura_flow import AuraFlowPipeline
 from .chroma import ChromaPipeline
 from .cogview3 import CogView3PlusPipeline
@@ -249,18 +250,21 @@
 
 AUTO_TEXT2VIDEO_PIPELINES_MAPPING = OrderedDict(
     [
+        ("anyflow", AnyFlowPipeline),
         ("wan", WanPipeline),
     ]
 )
 
 AUTO_IMAGE2VIDEO_PIPELINES_MAPPING = OrderedDict(
     [
+        ("anyflow-far", AnyFlowFARPipeline),
         ("wan-i2v", WanImageToVideoPipeline),
     ]
 )
 
 AUTO_VIDEO2VIDEO_PIPELINES_MAPPING = OrderedDict(
     [
+        ("anyflow-far", AnyFlowFARPipeline),
         ("wan", WanVideoToVideoPipeline),
     ]
 )
diff --git a/src/diffusers/schedulers/__init__.py b/src/diffusers/schedulers/__init__.py
index b1f75bed7dc5..447586c6f436 100644
--- a/src/diffusers/schedulers/__init__.py
+++ b/src/diffusers/schedulers/__init__.py
@@ -59,6 +59,7 @@
     _import_structure["scheduling_edm_euler"] = ["EDMEulerScheduler"]
     _import_structure["scheduling_euler_ancestral_discrete"] = ["EulerAncestralDiscreteScheduler"]
     _import_structure["scheduling_euler_discrete"] = ["EulerDiscreteScheduler"]
+    _import_structure["scheduling_flow_map_euler_discrete"] = ["FlowMapEulerDiscreteScheduler"]
     _import_structure["scheduling_flow_match_euler_discrete"] = ["FlowMatchEulerDiscreteScheduler"]
     _import_structure["scheduling_flow_match_heun_discrete"] = ["FlowMatchHeunDiscreteScheduler"]
     _import_structure["scheduling_flow_match_lcm"] = ["FlowMatchLCMScheduler"]
@@ -165,6 +166,7 @@
         from .scheduling_edm_euler import EDMEulerScheduler
         from .scheduling_euler_ancestral_discrete import EulerAncestralDiscreteScheduler
         from .scheduling_euler_discrete import EulerDiscreteScheduler
+        from .scheduling_flow_map_euler_discrete import FlowMapEulerDiscreteScheduler
         from .scheduling_flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler
         from .scheduling_flow_match_heun_discrete import FlowMatchHeunDiscreteScheduler
         from .scheduling_flow_match_lcm import FlowMatchLCMScheduler
diff --git a/src/diffusers/schedulers/scheduling_flow_map_euler_discrete.py b/src/diffusers/schedulers/scheduling_flow_map_euler_discrete.py
new file mode 100644
index 000000000000..70cdf1f3c61c
--- /dev/null
+++ b/src/diffusers/schedulers/scheduling_flow_map_euler_discrete.py
@@ -0,0 +1,265 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..utils import BaseOutput, logging
+from .scheduling_utils import SchedulerMixin
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+@dataclass
+class FlowMapEulerDiscreteSchedulerOutput(BaseOutput):
+    """
+    Output class for the scheduler's `step` function output.
+
+    Args:
+        prev_sample (`torch.Tensor`):
+            Computed sample :math:`z_r` at the target flow-map timestep `r_timestep`. Should be used as the next
+            denoising input.
+    """
+
+    prev_sample: torch.Tensor
+
+
+class FlowMapEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
+    """
+    Euler-style sampler for flow-map-distilled diffusion models.
+
+    Flow-map models learn arbitrary-interval transitions :math:`z_t \\to z_r` rather than the fixed :math:`z_t \\to
+    z_0` mapping of consistency models, so a single distilled checkpoint can be evaluated at 1, 2, 4, 8, ... NFE
+    without retraining. The `step` method advances the sample from `timestep` to `r_timestep` along the predicted
+    velocity.
+
+    Introduced in [AnyFlow: Any-Step Video Diffusion Model with On-Policy Flow Map
+    Distillation](https://huggingface.co/papers/2605.13724) by Yuchao Gu, Guian Fang et al.
+
+    This scheduler inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the
+    generic methods implemented for all schedulers (loading, saving, etc.).
+
+    Args:
+        num_train_timesteps (`int`, defaults to 1000):
+            The number of diffusion steps used to train the underlying flow-map model.
+        shift (`float`, defaults to 1.0):
+            Multiplicative timestep shift applied to the inference schedule. ``shift=1.0`` is the identity; values
+            greater than 1.0 push the schedule toward more denoising at later steps (e.g., ``shift=5`` matches the
+            Wan2.1 default).
+    """
+
+    _compatibles = []
+    order = 1
+
+    @register_to_config
+    def __init__(
+        self,
+        num_train_timesteps: int = 1000,
+        shift: float = 1.0,
+    ):
+        # `_step_index` and `_begin_index` mirror `FlowMatchEulerDiscreteScheduler`'s state machine:
+        # `_step_index` advances on every `step()` so callbacks and composable schedulers can read it;
+        # `_begin_index` is honoured on the very first `step()` after `set_timesteps` to support
+        # mid-schedule restarts (e.g. image-to-image style use).
+        self._step_index: Optional[int] = None
+        self._begin_index: Optional[int] = None
+        self.set_timesteps(num_train_timesteps, device="cpu")
+
+    @property
+    def step_index(self) -> Optional[int]:
+        """The index counter for current timestep. Returns ``None`` before the first :meth:`step` call after
+        :meth:`set_timesteps`."""
+        return self._step_index
+
+    @property
+    def begin_index(self) -> Optional[int]:
+        """The index for the first timestep — set by :meth:`set_begin_index`. Defaults to ``None``."""
+        return self._begin_index
+
+    def set_begin_index(self, begin_index: int = 0):
+        """Set the begin index for the scheduler. Pipelines that start mid-schedule (e.g. image-to-image)
+        call this between :meth:`set_timesteps` and the first :meth:`step` to anchor the rollout."""
+        self._begin_index = begin_index
+
+    def scale_model_input(self, sample: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        """No-op identity scaling. Provided for API compatibility with other Diffusers schedulers."""
+        return sample
+
+    def scale_noise(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> torch.FloatTensor:
+        """Linearly interpolate ``sample`` toward ``noise`` according to the normalized ``timestep``."""
+        timestep = timestep.to(device=sample.device, dtype=sample.dtype)
+
+        timestep = timestep / self.config.num_train_timesteps
+        timestep = timestep.view(*timestep.shape, *([1] * (noise.ndim - timestep.ndim)))
+        sample = timestep * noise + (1.0 - timestep) * sample
+        return sample
+
+    def apply_shift(self, sigmas: torch.Tensor) -> torch.Tensor:
+        """Apply the configured shift transformation to a sigma tensor."""
+        if self.config.shift == 1.0:
+            return sigmas
+        return self.config.shift * sigmas / (1 + (self.config.shift - 1) * sigmas)
+
+    def set_timesteps(
+        self,
+        num_inference_steps: Optional[int] = None,
+        device: Union[str, torch.device] = None,
+    ) -> None:
+        """Build the inference timestep schedule.
+
+        Internally tracks ``self.sigmas`` of length ``num_inference_steps + 1`` (linspace endpoints :math:`[1, ...,
+        0]`); ``self.timesteps`` exposes the first ``num_inference_steps`` sigmas scaled by ``num_train_timesteps`` —
+        i.e. one timestep per inference step, matching :class:`~diffusers.schedulers.FlowMatchEulerDiscreteScheduler`.
+        The final sigma (== 0) is the implicit r-endpoint of the last step.
+        """
+        # MPS / NPU don't support float64 — build the schedule in float64 on CPU and only move
+        # the final tensors to the requested device (with a float32 downcast for MPS / NPU).
+        device_obj = torch.device(device) if device is not None and not isinstance(device, torch.device) else device
+        is_mps = device_obj is not None and device_obj.type == "mps"
+        is_npu = device_obj is not None and device_obj.type == "npu"
+        out_dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+
+        sigmas = torch.linspace(1.0, 0.0, num_inference_steps + 1, dtype=torch.float64)
+        sigmas = self.apply_shift(sigmas)
+
+        self.num_inference_steps = num_inference_steps
+        self.sigmas = sigmas.to(device=device, dtype=out_dtype)
+        self.timesteps = (self.sigmas[:-1] * self.config.num_train_timesteps).to(device=device, dtype=out_dtype)
+        # Reset the state machine — first `step()` after this will re-initialize `_step_index`.
+        self._step_index = None
+        self._begin_index = None
+
+    def _init_step_index(self, timestep: Union[float, torch.FloatTensor]) -> None:
+        """Initialize ``self._step_index`` on the first :meth:`step` call after :meth:`set_timesteps`.
+
+        Off-schedule timesteps are allowed (any-step sampling is documented in :meth:`step`); in that case the
+        counter starts at 0 so it can still be used as an observable rollout marker.
+        """
+        if self._begin_index is not None:
+            self._step_index = self._begin_index
+            return
+        idx = self.index_for_timestep(timestep)
+        self._step_index = idx if idx is not None else 0
+
+    def index_for_timestep(self, timestep: Union[float, torch.FloatTensor]) -> Optional[int]:
+        """Return the index of ``timestep`` on the current schedule, or ``None`` if off-schedule.
+
+        Lookup is done against ``self.timesteps`` with a small fp tolerance. Used to recover the corresponding sigma
+        without assuming the linear ``timesteps = sigmas * num_train_timesteps`` relationship — that way a custom
+        schedule (e.g. non-linear shift, manually-set timesteps) still resolves correctly.
+        """
+        if self.timesteps is None:
+            return None
+        t_value = float(timestep.flatten()[0].item()) if torch.is_tensor(timestep) else float(timestep)
+        diffs = (self.timesteps.float() - t_value).abs()
+        idx = int(diffs.argmin().item())
+        if diffs[idx].item() > 1e-3:
+            return None
+        return idx
+
+    def step(
+        self,
+        model_output: torch.FloatTensor,
+        timestep: Union[float, torch.FloatTensor],
+        sample: torch.FloatTensor,
+        r_timestep: Optional[Union[float, torch.FloatTensor]] = None,
+        return_dict: bool = True,
+    ) -> Union[FlowMapEulerDiscreteSchedulerOutput, Tuple[torch.Tensor]]:
+        """
+        Advance ``sample`` from ``timestep`` to ``r_timestep`` using the model-predicted velocity.
+
+        Unlike a standard Euler scheduler, both endpoints of the interval can be caller-provided so that any-step
+        sampling is possible: a single model call can step from `t` to any chosen target `r` (including `r=0` for a
+        one-shot generation). When ``r_timestep`` is omitted, it defaults to the next timestep on the schedule
+        (matching ``FlowMatchEulerDiscreteScheduler`` semantics).
+
+        Internally the source and target sigmas are recovered by indexing ``self.sigmas`` via
+        :meth:`index_for_timestep` rather than by dividing the input timesteps by ``num_train_timesteps``, so any
+        schedule whose timestep / sigma relationship is non-linear (for example a custom shift) stays correct. For an
+        off-schedule ``r_timestep``, the scheduler falls back to ``r_timestep / num_train_timesteps`` so any-step
+        sampling outside the schedule remains supported.
+
+        Args:
+            model_output (`torch.Tensor`):
+                Direct output from the flow-map model (predicted mean velocity).
+            timestep (`float` or `torch.Tensor`):
+                Source timestep ``t`` in the same units as ``self.timesteps``.
+            sample (`torch.Tensor`):
+                Current sample :math:`z_t`.
+            r_timestep (`float` or `torch.Tensor`, *optional*):
+                Target timestep ``r``. Defaults to the next timestep on the schedule when ``None``; pass an explicit
+                value for any-step sampling. ``r_timestep == timestep`` is a no-op.
+            return_dict (`bool`, defaults to `True`):
+                Whether to return a [`FlowMapEulerDiscreteSchedulerOutput`] (the default) or a plain tuple.
+
+        Returns:
+            [`FlowMapEulerDiscreteSchedulerOutput`] or `tuple`:
+                When ``return_dict=True``, returns a [`FlowMapEulerDiscreteSchedulerOutput`] whose ``prev_sample`` is
+                :math:`z_r`. Otherwise returns a 1-tuple ``(prev_sample,)``.
+        """
+        if self.sigmas is None or self.timesteps is None:
+            raise ValueError("`set_timesteps` has not been called.")
+
+        # `_step_index` is maintained purely as observable state for callbacks / composable schedulers.
+        # Sigma resolution stays a pure function of the passed-in (`timestep`, `r_timestep`) so the call is
+        # idempotent — calling `step` twice with the same arguments always returns the same `prev_sample`.
+        if self._step_index is None:
+            self._init_step_index(timestep)
+
+        # Resolve source sigma via index lookup; fall back to / num_train_timesteps only if `timestep` is off-schedule.
+        t_idx = self.index_for_timestep(timestep)
+        if t_idx is not None:
+            sigma_t = self.sigmas[t_idx].to(device=sample.device, dtype=self.sigmas.dtype)
+        else:
+            t_value = timestep.to(self.sigmas.dtype) if torch.is_tensor(timestep) else torch.tensor(timestep)
+            sigma_t = (t_value / self.config.num_train_timesteps).to(device=sample.device, dtype=self.sigmas.dtype)
+
+        # Resolve target sigma. None defaults to sigmas[t_idx + 1] when on-schedule; otherwise the caller's
+        # explicit `r_timestep` is used (sigma lookup first, fall back to scaling for off-schedule any-step).
+        if r_timestep is None:
+            if t_idx is None:
+                raise ValueError(
+                    "`r_timestep` is None but `timestep` is not on the current schedule; "
+                    "pass an explicit `r_timestep` for any-step sampling outside the schedule."
+                )
+            sigma_r = self.sigmas[t_idx + 1].to(device=sample.device, dtype=self.sigmas.dtype)
+        else:
+            r_idx = self.index_for_timestep(r_timestep)
+            if r_idx is not None:
+                sigma_r = self.sigmas[r_idx].to(device=sample.device, dtype=self.sigmas.dtype)
+            else:
+                r_value = r_timestep.to(self.sigmas.dtype) if torch.is_tensor(r_timestep) else torch.tensor(r_timestep)
+                sigma_r = (r_value / self.config.num_train_timesteps).to(device=sample.device, dtype=self.sigmas.dtype)
+
+        sigma_t = sigma_t.view(*sigma_t.shape, *([1] * (model_output.ndim - sigma_t.ndim)))
+        sigma_r = sigma_r.view(*sigma_r.shape, *([1] * (model_output.ndim - sigma_r.ndim)))
+        prev_sample = sample - (sigma_t - sigma_r) * model_output
+        prev_sample = prev_sample.to(model_output.dtype)
+
+        # Advance state machine so downstream callbacks / composable schedulers observe correct `step_index`.
+        self._step_index += 1
+
+        if not return_dict:
+            return (prev_sample,)
+
+        return FlowMapEulerDiscreteSchedulerOutput(prev_sample=prev_sample)
diff --git a/src/diffusers/utils/dummy_pt_objects.py b/src/diffusers/utils/dummy_pt_objects.py
index 0ce20a4f7d97..8317a58b3cd6 100644
--- a/src/diffusers/utils/dummy_pt_objects.py
+++ b/src/diffusers/utils/dummy_pt_objects.py
@@ -435,6 +435,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class AnyFlowFARTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
+class AnyFlowTransformer3DModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class AsymmetricAutoencoderKL(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -3002,6 +3032,21 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch"])
 
 
+class FlowMapEulerDiscreteScheduler(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch"])
+
+
 class FlowMatchEulerDiscreteScheduler(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/src/diffusers/utils/dummy_torch_and_transformers_objects.py b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
index 1e9bb67a768a..d8965054560c 100644
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -917,6 +917,36 @@ def from_pretrained(cls, *args, **kwargs):
         requires_backends(cls, ["torch", "transformers"])
 
 
+class AnyFlowFARPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
+class AnyFlowPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class AudioLDM2Pipeline(metaclass=DummyObject):
     _backends = ["torch", "transformers"]
 
diff --git a/tests/models/transformers/test_models_transformer_anyflow.py b/tests/models/transformers/test_models_transformer_anyflow.py
new file mode 100644
index 000000000000..5011222f17c9
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_anyflow.py
@@ -0,0 +1,120 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+
+from diffusers import AnyFlowTransformer3DModel
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TorchCompileTesterMixin,
+    TrainingTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class AnyFlowTransformer3DTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return AnyFlowTransformer3DModel
+
+    @property
+    def output_shape(self) -> tuple[int, ...]:
+        return (1, 2, 4, 16, 16)
+
+    @property
+    def input_shape(self) -> tuple[int, ...]:
+        return (1, 2, 4, 16, 16)
+
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict[str, int | list[int] | tuple | str | bool]:
+        return {
+            "patch_size": (1, 2, 2),
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "in_channels": 4,
+            "out_channels": 4,
+            "text_dim": 16,
+            "freq_dim": 256,
+            "ffn_dim": 32,
+            "num_layers": 2,
+            "cross_attn_norm": True,
+            "rope_max_seq_len": 32,
+            "gate_value": 0.25,
+            "deltatime_type": "r",
+        }
+
+    def get_dummy_inputs(self) -> dict[str, torch.Tensor]:
+        batch_size = 1
+        num_frames = 2
+        num_channels = 4
+        height = 16
+        width = 16
+        text_seq_len = 12
+        text_dim = 16
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_frames, num_channels, height, width),
+                generator=self.generator,
+                device=torch_device,
+                dtype=self.torch_dtype,
+            ),
+            "timestep": torch.full((batch_size, num_frames), 500.0, device=torch_device, dtype=self.torch_dtype),
+            "r_timestep": torch.full((batch_size, num_frames), 250.0, device=torch_device, dtype=self.torch_dtype),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, text_seq_len, text_dim),
+                generator=self.generator,
+                device=torch_device,
+                dtype=self.torch_dtype,
+            ),
+        }
+
+
+class TestAnyFlowTransformer3D(AnyFlowTransformer3DTesterConfig, ModelTesterMixin):
+    """Core model tests for AnyFlow Transformer 3D (bidirectional variant)."""
+
+
+class TestAnyFlowTransformer3DMemory(AnyFlowTransformer3DTesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for AnyFlow Transformer 3D."""
+
+
+class TestAnyFlowTransformer3DTraining(AnyFlowTransformer3DTesterConfig, TrainingTesterMixin):
+    """Training tests for AnyFlow Transformer 3D."""
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"AnyFlowTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+
+class TestAnyFlowTransformer3DAttention(AnyFlowTransformer3DTesterConfig, AttentionTesterMixin):
+    """Attention processor tests for AnyFlow Transformer 3D."""
+
+
+class TestAnyFlowTransformer3DCompile(AnyFlowTransformer3DTesterConfig, TorchCompileTesterMixin):
+    """Torch compile tests for AnyFlow Transformer 3D."""
diff --git a/tests/models/transformers/test_models_transformer_anyflow_far.py b/tests/models/transformers/test_models_transformer_anyflow_far.py
new file mode 100644
index 000000000000..23ceded0aa8f
--- /dev/null
+++ b/tests/models/transformers/test_models_transformer_anyflow_far.py
@@ -0,0 +1,189 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import AnyFlowFARTransformer3DModel
+from diffusers.models.transformers.transformer_anyflow_far import (
+    AnyFlowCausalAttnProcessor,
+    AnyFlowFARTransformerOutput,
+)
+from diffusers.utils.torch_utils import randn_tensor
+
+from ...testing_utils import enable_full_determinism, torch_device
+from ..testing_utils import (
+    AttentionTesterMixin,
+    BaseModelTesterConfig,
+    MemoryTesterMixin,
+    ModelTesterMixin,
+    TrainingTesterMixin,
+)
+
+
+enable_full_determinism()
+
+
+class AnyFlowFARTransformer3DTesterConfig(BaseModelTesterConfig):
+    @property
+    def model_class(self):
+        return AnyFlowFARTransformer3DModel
+
+    @property
+    def output_shape(self) -> tuple[int, ...]:
+        return (1, 2, 4, 16, 16)
+
+    @property
+    def input_shape(self) -> tuple[int, ...]:
+        return (1, 4, 4, 16, 16)  # 2 compressed + 2 full frames
+
+    @property
+    def main_input_name(self) -> str:
+        return "hidden_states"
+
+    @property
+    def generator(self):
+        return torch.Generator("cpu").manual_seed(0)
+
+    def get_init_dict(self) -> dict[str, int | list[int] | tuple | str | bool]:
+        return {
+            "patch_size": (1, 2, 2),
+            "compressed_patch_size": (1, 4, 4),
+            "full_chunk_limit": 3,
+            "num_attention_heads": 2,
+            "attention_head_dim": 12,
+            "in_channels": 4,
+            "out_channels": 4,
+            "text_dim": 16,
+            "freq_dim": 256,
+            "ffn_dim": 32,
+            "num_layers": 2,
+            "cross_attn_norm": True,
+            "rope_max_seq_len": 32,
+            "gate_value": 0.25,
+            "deltatime_type": "r",
+        }
+
+    def get_dummy_inputs(self) -> dict[str, "torch.Tensor"]:
+        batch_size = 1
+        # Training-rollout path: chunk_partition sums to total frames; two single-frame chunks.
+        chunk_partition = [2, 2]
+        num_frames = sum(chunk_partition)
+        num_channels = 4
+        height = 16
+        width = 16
+        text_seq_len = 12
+        text_dim = 16
+
+        return {
+            "hidden_states": randn_tensor(
+                (batch_size, num_frames, num_channels, height, width),
+                generator=self.generator,
+                device=torch_device,
+                dtype=self.torch_dtype,
+            ),
+            "timestep": torch.full((batch_size, num_frames), 500.0, device=torch_device, dtype=self.torch_dtype),
+            "r_timestep": torch.full((batch_size, num_frames), 250.0, device=torch_device, dtype=self.torch_dtype),
+            "encoder_hidden_states": randn_tensor(
+                (batch_size, text_seq_len, text_dim),
+                generator=self.generator,
+                device=torch_device,
+                dtype=self.torch_dtype,
+            ),
+            "chunk_partition": chunk_partition,
+        }
+
+
+class TestAnyFlowFARTransformer3D(AnyFlowFARTransformer3DTesterConfig, ModelTesterMixin):
+    """Core model tests for AnyFlow FAR causal Transformer 3D."""
+
+
+class TestAnyFlowFARTransformer3DMemory(AnyFlowFARTransformer3DTesterConfig, MemoryTesterMixin):
+    """Memory optimization tests for AnyFlow FAR Transformer 3D."""
+
+
+class TestAnyFlowFARTransformer3DTraining(AnyFlowFARTransformer3DTesterConfig, TrainingTesterMixin):
+    """Training tests for AnyFlow FAR Transformer 3D."""
+
+    def test_gradient_checkpointing_is_applied(self):
+        expected_set = {"AnyFlowFARTransformer3DModel"}
+        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
+
+    # FAR causal self-attention routes through `flex_attention`, whose backward kernel is
+    # GPU-only (`torch.nn.attention.flex_attention` raises NotImplementedError on CPU). The
+    # bidi transformer test file covers training on the SDPA path; FAR training correctness
+    # is exercised end-to-end on H200 via the pipeline replay (L2=0 against NVlabs/AnyFlow).
+    @unittest.skipIf(torch_device == "cpu", "FlexAttention has no CPU backward kernel.")
+    def test_training(self):
+        super().test_training()
+
+    @unittest.skipIf(torch_device == "cpu", "FlexAttention has no CPU backward kernel.")
+    def test_training_with_ema(self):
+        super().test_training_with_ema()
+
+    @unittest.skipIf(torch_device == "cpu", "FlexAttention has no CPU backward kernel.")
+    def test_gradient_checkpointing_equivalence(self, loss_tolerance=1e-5, param_grad_tol=5e-5, skip=None):
+        super().test_gradient_checkpointing_equivalence(loss_tolerance, param_grad_tol, skip)
+
+
+class TestAnyFlowFARTransformer3DAttention(AnyFlowFARTransformer3DTesterConfig, AttentionTesterMixin):
+    """Attention processor tests for AnyFlow FAR Transformer 3D."""
+
+
+# Torch-compile mixin intentionally skipped: FAR's `_build_causal_mask` uses
+# `flex_attention.create_block_mask(_compile=False)`, which conflicts with the tracer
+# assumptions made by the standard TorchCompileTesterMixin. The bidi transformer test file
+# covers compile behavior; the FAR causal path is bit-exact-validated end-to-end on H200
+# through the pipeline replay rather than per-module compile.
+
+
+class AnyFlowCausalAttnProcessorTest(unittest.TestCase):
+    """Stand-alone smoke tests for the FAR causal attention processor.
+
+    These cover behaviors not reached by the generated model mixins:
+    * the backend gate (only the flex backend is accepted; non-flex backends raise),
+    * the `AnyFlowFARTransformerOutput` dataclass is importable for downstream typing.
+    """
+
+    def test_default_backend_is_flex(self):
+        processor = AnyFlowCausalAttnProcessor()
+        self.assertEqual(processor._attention_backend, "flex")
+
+    def test_unsupported_backend_raises(self):
+        processor = AnyFlowCausalAttnProcessor()
+        processor._attention_backend = "sage"
+
+        class _DummyAttn:
+            heads = 1
+            norm_q = norm_k = None
+
+            def to_q(self, x):
+                return x
+
+            def to_k(self, x):
+                return x
+
+            def to_v(self, x):
+                return x
+
+            to_out = [lambda x: x, lambda x: x]
+
+        with self.assertRaises(ValueError):
+            processor(_DummyAttn(), torch.zeros(1, 4, 4))
+
+    def test_output_dataclass_exposed(self):
+        # Downstream type-checking + autodoc rely on these attributes existing.
+        self.assertTrue(hasattr(AnyFlowFARTransformerOutput, "sample"))
+        self.assertTrue(hasattr(AnyFlowFARTransformerOutput, "kv_cache"))
diff --git a/tests/pipelines/anyflow/__init__.py b/tests/pipelines/anyflow/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/pipelines/anyflow/test_anyflow.py b/tests/pipelines/anyflow/test_anyflow.py
new file mode 100644
index 000000000000..20ec1f859089
--- /dev/null
+++ b/tests/pipelines/anyflow/test_anyflow.py
@@ -0,0 +1,135 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AnyFlowPipeline,
+    AnyFlowTransformer3DModel,
+    AutoencoderKLWan,
+    FlowMapEulerDiscreteScheduler,
+)
+
+from ...testing_utils import enable_full_determinism
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AnyFlowPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pipeline_class = AnyFlowPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMapEulerDiscreteScheduler(num_train_timesteps=1000, shift=5.0)
+        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
+        text_encoder = T5EncoderModel(config)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        transformer = AnyFlowTransformer3DModel(
+            patch_size=(1, 2, 2),
+            num_attention_heads=2,
+            attention_head_dim=12,
+            in_channels=16,
+            out_channels=16,
+            text_dim=32,
+            freq_dim=256,
+            ffn_dim=32,
+            num_layers=2,
+            cross_attn_norm=True,
+            rope_max_seq_len=32,
+            gate_value=0.25,
+            deltatime_type="r",
+        )
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "negative",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": 16,
+            "width": 16,
+            "num_frames": 9,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (9, 3, 16, 16))
+
+    @unittest.skip("AnyFlow uses mixed-precision flow-map sampling; FP16 round-trip is not numerically stable.")
+    def test_save_load_float16(self):
+        pass
+
+    @unittest.skip("AnyFlow's custom attention processor does not support sliced attention.")
+    def test_attention_slicing_forward_pass(self):
+        pass
diff --git a/tests/pipelines/anyflow/test_anyflow_far.py b/tests/pipelines/anyflow/test_anyflow_far.py
new file mode 100644
index 000000000000..8086afef6d65
--- /dev/null
+++ b/tests/pipelines/anyflow/test_anyflow_far.py
@@ -0,0 +1,157 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+from transformers import AutoConfig, AutoTokenizer, T5EncoderModel
+
+from diffusers import (
+    AnyFlowFARPipeline,
+    AnyFlowFARTransformer3DModel,
+    AutoencoderKLWan,
+    FlowMapEulerDiscreteScheduler,
+)
+
+from ...testing_utils import enable_full_determinism
+from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
+from ..test_pipelines_common import PipelineTesterMixin
+
+
+enable_full_determinism()
+
+
+class AnyFlowFARPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    """
+    Fast tests for the FAR-causal AnyFlow pipeline. Only T2V is exercised here; the I2V / TV2V branches are
+    only meaningful at the spatial resolutions used by released checkpoints and are covered in the slow
+    integration tests below.
+    """
+
+    pipeline_class = AnyFlowFARPipeline
+    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
+    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
+    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    required_optional_params = frozenset(
+        [
+            "num_inference_steps",
+            "generator",
+            "latents",
+            "return_dict",
+            "callback_on_step_end",
+            "callback_on_step_end_tensor_inputs",
+        ]
+    )
+    test_xformers_attention = False
+    supports_dduf = False
+
+    def get_dummy_components(self):
+        torch.manual_seed(0)
+        vae = AutoencoderKLWan(
+            base_dim=3,
+            z_dim=16,
+            dim_mult=[1, 1, 1, 1],
+            num_res_blocks=1,
+            temperal_downsample=[False, True, True],
+        )
+
+        torch.manual_seed(0)
+        scheduler = FlowMapEulerDiscreteScheduler(num_train_timesteps=1000, shift=5.0)
+        config = AutoConfig.from_pretrained("hf-internal-testing/tiny-random-t5")
+        text_encoder = T5EncoderModel(config)
+        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
+
+        torch.manual_seed(0)
+        transformer = AnyFlowFARTransformer3DModel(
+            patch_size=(1, 2, 2),
+            compressed_patch_size=(1, 4, 4),
+            full_chunk_limit=3,
+            num_attention_heads=2,
+            attention_head_dim=12,
+            in_channels=16,
+            out_channels=16,
+            text_dim=32,
+            freq_dim=256,
+            ffn_dim=32,
+            num_layers=2,
+            cross_attn_norm=True,
+            rope_max_seq_len=32,
+            gate_value=0.25,
+            deltatime_type="r",
+        )
+
+        components = {
+            "transformer": transformer,
+            "vae": vae,
+            "scheduler": scheduler,
+            "text_encoder": text_encoder,
+            "tokenizer": tokenizer,
+        }
+        return components
+
+    def get_dummy_inputs(self, device, seed=0):
+        if str(device).startswith("mps"):
+            generator = torch.manual_seed(seed)
+        else:
+            generator = torch.Generator(device=device).manual_seed(seed)
+        # num_frames=9 -> 3 latent frames (VAE temporal stride 4); use a matching
+        # chunk_partition so the FAR pipeline's pre-flight assertion passes.
+        inputs = {
+            "prompt": "dance monkey",
+            "negative_prompt": "negative",
+            "generator": generator,
+            "num_inference_steps": 2,
+            "guidance_scale": 6.0,
+            "height": 16,
+            "width": 16,
+            "num_frames": 9,
+            "max_sequence_length": 16,
+            "output_type": "pt",
+            "chunk_partition": [1, 1, 1],
+        }
+        return inputs
+
+    def test_inference(self):
+        device = "cpu"
+
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+
+        inputs = self.get_dummy_inputs(device)
+        video = pipe(**inputs).frames
+        generated_video = video[0]
+
+        self.assertEqual(generated_video.shape, (9, 3, 16, 16))
+
+    @unittest.skip("AnyFlow uses mixed-precision flow-map sampling; FP16 round-trip is not numerically stable.")
+    def test_save_load_float16(self):
+        pass
+
+    @unittest.skip("AnyFlow's custom attention processor does not support sliced attention.")
+    def test_attention_slicing_forward_pass(self):
+        pass
+
+    @unittest.skip(
+        "PipelineTesterMixin.test_callback_inputs zeroes latents on the final step and asserts the "
+        "*entire* output is zero. AnyFlowFARPipeline runs a chunk-wise FAR rollout where each chunk "
+        "produces an independent slice of the output buffer; zeroing latents in the final chunk only "
+        "zeroes that chunk's slice while earlier chunks (already written) stay non-zero. "
+        "The callback API itself works correctly (test_callback_cfg passes); only this specific "
+        "global-output assertion is incompatible with chunk-wise generation by construction."
+    )
+    def test_callback_inputs(self):
+        pass
diff --git a/tests/schedulers/test_scheduler_flow_map_euler_discrete.py b/tests/schedulers/test_scheduler_flow_map_euler_discrete.py
new file mode 100644
index 000000000000..049e7455883b
--- /dev/null
+++ b/tests/schedulers/test_scheduler_flow_map_euler_discrete.py
@@ -0,0 +1,168 @@
+# Copyright 2026 The AnyFlow Team, NVIDIA Corp., and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import torch
+
+from diffusers import FlowMapEulerDiscreteScheduler
+from diffusers.schedulers.scheduling_flow_map_euler_discrete import FlowMapEulerDiscreteSchedulerOutput
+
+
+class FlowMapEulerDiscreteSchedulerTest(unittest.TestCase):
+    """
+    The flow-map scheduler has a non-standard ``step`` signature that takes both ``timestep`` and
+    ``r_timestep`` (the target timestep), so it cannot use ``SchedulerCommonTest``. The tests below
+    exercise the contract that the scheduler exposes to ``AnyFlowPipeline`` and ``AnyFlowFARPipeline``.
+    """
+
+    scheduler_class = FlowMapEulerDiscreteScheduler
+
+    def get_default_config(self, **kwargs):
+        config = {
+            "num_train_timesteps": 1000,
+            "shift": 1.0,
+        }
+        config.update(**kwargs)
+        return config
+
+    def test_instantiation_with_defaults(self):
+        scheduler = self.scheduler_class(**self.get_default_config())
+        self.assertEqual(scheduler.config.num_train_timesteps, 1000)
+        self.assertEqual(scheduler.config.shift, 1.0)
+
+    def test_set_timesteps_endpoints(self):
+        scheduler = self.scheduler_class(**self.get_default_config())
+        for nfe in [1, 2, 4, 8, 16]:
+            scheduler.set_timesteps(num_inference_steps=nfe)
+            # `timesteps` is N-length (mirrors FlowMatchEulerDiscreteScheduler); the final
+            # r-endpoint sigma=0 lives in the internal `sigmas` buffer of length N+1.
+            self.assertEqual(scheduler.timesteps.shape, (nfe,))
+            self.assertEqual(scheduler.sigmas.shape, (nfe + 1,))
+            self.assertAlmostEqual(scheduler.timesteps[0].item(), 1000.0, places=4)
+            self.assertAlmostEqual(scheduler.sigmas[-1].item(), 0.0, places=4)
+
+    def test_apply_shift_identity(self):
+        scheduler = self.scheduler_class(**self.get_default_config(shift=1.0))
+        sigmas = torch.linspace(0.0, 1.0, 10)
+        torch.testing.assert_close(scheduler.apply_shift(sigmas), sigmas)
+
+    def test_apply_shift_monotonic(self):
+        scheduler = self.scheduler_class(**self.get_default_config(shift=5.0))
+        sigmas = torch.linspace(0.01, 0.99, 16)
+        shifted = scheduler.apply_shift(sigmas)
+        # shift > 1 must monotonically map [0,1] to [0,1] and increase intermediate values
+        self.assertTrue(torch.all(shifted >= 0))
+        self.assertTrue(torch.all(shifted <= 1))
+        self.assertTrue(torch.all(shifted[1:] - shifted[:-1] >= -1e-6))
+
+    def test_step_shape_preserved(self):
+        scheduler = self.scheduler_class(**self.get_default_config())
+        scheduler.set_timesteps(num_inference_steps=4)
+
+        sample = torch.randn(2, 16, 21, 30, 52)  # B, C, T, H, W (Wan2.1 latent shape)
+        model_output = torch.randn_like(sample)
+        timestep = scheduler.timesteps[0:1]
+        r_timestep = scheduler.timesteps[1:2]
+
+        output = scheduler.step(model_output, timestep, sample, r_timestep=r_timestep)
+        self.assertIsInstance(output, FlowMapEulerDiscreteSchedulerOutput)
+        prev_sample = output.prev_sample
+        self.assertEqual(prev_sample.shape, sample.shape)
+        self.assertEqual(prev_sample.dtype, model_output.dtype)
+
+        # return_dict=False yields a tuple with the same prev_sample.
+        (prev_sample_tuple,) = scheduler.step(model_output, timestep, sample, r_timestep=r_timestep, return_dict=False)
+        torch.testing.assert_close(prev_sample_tuple, prev_sample)
+
+    def test_step_zero_interval_is_identity(self):
+        # When timestep == r_timestep the update collapses to the input sample.
+        scheduler = self.scheduler_class(**self.get_default_config())
+        scheduler.set_timesteps(num_inference_steps=4)
+
+        sample = torch.randn(1, 4, 8, 8, 8)
+        model_output = torch.randn_like(sample)
+        t = scheduler.timesteps[2:3]
+
+        prev_sample = scheduler.step(model_output, t, sample, r_timestep=t).prev_sample
+        torch.testing.assert_close(prev_sample, sample.to(model_output.dtype))
+
+    def test_step_one_shot_sampling(self):
+        # Flow-map promise: stepping straight from t=T to r=0 produces a clean sample in a single call.
+        scheduler = self.scheduler_class(**self.get_default_config(shift=5.0))
+        scheduler.set_timesteps(num_inference_steps=1)
+        # `timesteps` is N=1 (just t=T); r=0 comes from the schedule's terminal sigma.
+        # Pass r_timestep=None so step() resolves it via self.sigmas[-1] * num_train_timesteps.
+        timesteps = scheduler.timesteps
+
+        sample = torch.randn(1, 4, 4, 4)
+        model_output = torch.randn_like(sample)
+
+        prev_sample = scheduler.step(
+            model_output,
+            timesteps[0:1],
+            sample,
+        ).prev_sample
+        self.assertEqual(prev_sample.shape, sample.shape)
+        self.assertFalse(torch.allclose(prev_sample, sample))
+
+    def test_step_index_advances(self):
+        # After `set_timesteps`, `step_index` is None. Each `step` call advances it; `begin_index` defaults to None.
+        scheduler = self.scheduler_class(**self.get_default_config())
+        scheduler.set_timesteps(num_inference_steps=4)
+        self.assertIsNone(scheduler.step_index)
+        self.assertIsNone(scheduler.begin_index)
+
+        sample = torch.randn(1, 4, 4, 4)
+        for i, t in enumerate(scheduler.timesteps):
+            scheduler.step(torch.randn_like(sample), t, sample)
+            self.assertEqual(scheduler.step_index, i + 1)
+
+    def test_step_off_schedule_anystep_supported(self):
+        # Documented contract: `step` accepts off-schedule (timestep, r_timestep) pairs and falls back to
+        # `t/num_train_timesteps` for both. State machine must not block this (regression: an earlier draft
+        # raised in `_init_step_index` for off-schedule t, which silently broke any-step sampling).
+        scheduler = self.scheduler_class(**self.get_default_config())
+        scheduler.set_timesteps(num_inference_steps=8)
+
+        sample = torch.randn(1, 4, 4, 4)
+        model_output = torch.randn_like(sample)
+        t_off = torch.tensor([777.7])
+        r_off = torch.tensor([123.4])
+
+        prev = scheduler.step(model_output, t_off, sample, r_timestep=r_off).prev_sample
+        self.assertEqual(prev.shape, sample.shape)
+        # step_index initialized to 0 (observable counter) and advanced after the call.
+        self.assertEqual(scheduler.step_index, 1)
+
+    def test_set_begin_index_anchors_step_index(self):
+        # `set_begin_index(k)` makes the first `step` initialize `_step_index = k` (mid-schedule restart).
+        scheduler = self.scheduler_class(**self.get_default_config())
+        scheduler.set_timesteps(num_inference_steps=4)
+        scheduler.set_begin_index(2)
+        self.assertEqual(scheduler.begin_index, 2)
+
+        sample = torch.randn(1, 4, 4, 4)
+        scheduler.step(torch.randn_like(sample), scheduler.timesteps[0], sample)
+        self.assertEqual(scheduler.step_index, 3)  # 2 -> 3 after one step
+
+    def test_scale_noise_endpoints(self):
+        scheduler = self.scheduler_class(**self.get_default_config())
+        sample = torch.zeros(2, 4, 4, 4)
+        noise = torch.ones_like(sample)
+        # t=0 -> all sample, t=num_train_timesteps -> all noise.
+        zero_t = torch.tensor([0.0])
+        torch.testing.assert_close(scheduler.scale_noise(sample, zero_t, noise), sample)
+        full_t = torch.tensor([float(scheduler.config.num_train_timesteps)])
+        torch.testing.assert_close(scheduler.scale_noise(sample, full_t, noise), noise)