✨ [Core] Add FreeU mechanism #5164

Merged
merged 63 commits into main from add-freeU on Oct 5, 2023

Changes from 16 commits
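This PR adds FreeU (see https://github.com/ChenyangSi/FreeU), a training-free mechanism that re-weights the UNet decoder at inference time: backbone features are scaled by the factors `b1`/`b2`, while the low-frequency components of the skip-connection features are damped in Fourier space by `s1`/`s2`. As a rough usage sketch of the resulting API (the pipeline-level wrapper is not part of the diff shown below, and the parameter values are the ones recommended by the FreeU authors for SD 1.x, so treat both as assumptions rather than as this PR's final interface):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# FreeU values recommended for SD 1.x (assumption; check the FreeU repo).
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)
image = pipe("an astronaut riding a horse on mars").images[0]

pipe.disable_freeu()  # restores the vanilla UNet forward pass
```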
Commits (63)
eec915a
✨ Added Fourier filter function to upsample blocks
kadirnar Sep 24, 2023
8f2ee6b
🔧 Update Fourier_filter for float16 support
kadirnar Sep 24, 2023
ea33fc5
✨ Added UNetFreeUConfig to UNet model for FreeU adaptation 🛠️
kadirnar Sep 24, 2023
6d77fff
move unet to its original form and add fourier_filter to torch_utils.
sayakpaul Sep 27, 2023
7c4676b
implement freeU enable mechanism
sayakpaul Sep 27, 2023
a152d91
implement disable mechanism
sayakpaul Sep 27, 2023
c0de18d
resolution index.
sayakpaul Sep 27, 2023
a0ced0e
correct resolution idx condition.
sayakpaul Sep 27, 2023
19d5ab0
fix copies.
sayakpaul Sep 27, 2023
f3f9441
no need to use resolution_idx in vae.
sayakpaul Sep 27, 2023
266987c
spell out the kwargs
sayakpaul Sep 27, 2023
83846f5
proper config property
sayakpaul Sep 27, 2023
cb6ebe8
fix attribution setting
sayakpaul Sep 27, 2023
9c8eacd
place unet hasattr properly.
sayakpaul Sep 27, 2023
988ef76
fix: attribute access.
sayakpaul Sep 27, 2023
755a5c8
proper disable
sayakpaul Sep 27, 2023
37b091e
remove validation method.
sayakpaul Sep 27, 2023
4fd4adf
debug
sayakpaul Sep 27, 2023
40291ba
debug
sayakpaul Sep 27, 2023
2e15e94
debug
sayakpaul Sep 27, 2023
1e9c79a
debug
sayakpaul Sep 27, 2023
883fc9b
debug
sayakpaul Sep 27, 2023
ba08f30
debug
sayakpaul Sep 27, 2023
540974a
potential fix.
sayakpaul Sep 27, 2023
d38c251
add: doc.
sayakpaul Sep 27, 2023
785c0a0
fix copies
sayakpaul Sep 27, 2023
466e054
add: tests.
sayakpaul Sep 27, 2023
80e560e
add: support freeU in SDXL.
sayakpaul Sep 28, 2023
670b34b
set default value of resolution idx.
sayakpaul Sep 28, 2023
c5fc938
set default values for resolution_idx.
sayakpaul Sep 28, 2023
05ea56e
fix copies
sayakpaul Sep 28, 2023
93ee867
fix rest.
sayakpaul Sep 28, 2023
535eb59
fix copies
sayakpaul Sep 28, 2023
d277e64
address PR comments.
sayakpaul Sep 28, 2023
bb2d368
run fix-copies
sayakpaul Sep 28, 2023
e95b186
move apply_free_u to utils and other minors.
sayakpaul Sep 30, 2023
902cf7d
introduce support for video (unet3D)
sayakpaul Sep 30, 2023
15b1052
minor ups
sayakpaul Oct 2, 2023
7dcc939
consistent fix-copies.
sayakpaul Oct 2, 2023
3bf28bb
consistent stuff
sayakpaul Oct 2, 2023
64ade67
Merge branch 'main' into add-freeU
sayakpaul Oct 2, 2023
08d61fa
Merge branch 'main' into add-freeU
sayakpaul Oct 2, 2023
d68663c
fix-copies
sayakpaul Oct 2, 2023
0e0af08
add: rest
sayakpaul Oct 2, 2023
8d0a204
add: docs.
sayakpaul Oct 2, 2023
55ad535
fix: tests
sayakpaul Oct 2, 2023
797b4b9
fix: doc path
sayakpaul Oct 2, 2023
86419b9
Merge branch 'main' into add-freeU
sayakpaul Oct 2, 2023
5e27ff7
Merge branch 'main' into add-freeU
patrickvonplaten Oct 4, 2023
2ad4953
Merge branch 'main' into add-freeU
sayakpaul Oct 5, 2023
1a8e5d1
Apply suggestions from code review
sayakpaul Oct 5, 2023
adc9d5c
style up
sayakpaul Oct 5, 2023
c4f99d4
move to techniques.
sayakpaul Oct 5, 2023
518e4b1
add: slow test for sd freeu.
sayakpaul Oct 5, 2023
dee3781
add: slow test for sd freeu.
sayakpaul Oct 5, 2023
fc39d22
add: slow test for sd freeu.
sayakpaul Oct 5, 2023
053f3ed
add: slow test for sd freeu.
sayakpaul Oct 5, 2023
d8ef3a1
add: slow test for sd freeu.
sayakpaul Oct 5, 2023
3da96e2
add: slow test for sd freeu.
sayakpaul Oct 5, 2023
d8c8771
add: slow test for video with freeu
sayakpaul Oct 5, 2023
0d34cf0
add: slow test for video with freeu
sayakpaul Oct 5, 2023
8e72f85
add: slow test for video with freeu
sayakpaul Oct 5, 2023
aa1a061
style
sayakpaul Oct 5, 2023
86 changes: 86 additions & 0 deletions src/diffusers/models/unet_2d_blocks.py
@@ -19,6 +19,7 @@
from torch import nn

from ..utils import is_torch_version, logging
from ..utils.torch_utils import fourier_filter
from .activations import get_activation
from .attention import AdaGroupNorm
from .attention_processor import Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0
@@ -249,6 +250,7 @@ def get_up_block(
add_upsample,
resnet_eps,
resnet_act_fn,
resolution_idx,
transformer_layers_per_block=1,
num_attention_heads=None,
resnet_groups=None,
@@ -281,6 +283,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -295,6 +298,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -314,6 +318,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -337,6 +342,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -362,6 +368,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
resnet_eps=resnet_eps,
resnet_act_fn=resnet_act_fn,
@@ -377,6 +384,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -390,6 +398,7 @@ def get_up_block(
out_channels=out_channels,
prev_output_channel=prev_output_channel,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -402,6 +411,7 @@ def get_up_block(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -415,6 +425,7 @@ def get_up_block(
num_layers=num_layers,
in_channels=in_channels,
out_channels=out_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -430,6 +441,7 @@ def get_up_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -441,6 +453,7 @@ def get_up_block(
in_channels=in_channels,
out_channels=out_channels,
temb_channels=temb_channels,
resolution_idx=resolution_idx,
dropout=dropout,
add_upsample=add_upsample,
resnet_eps=resnet_eps,
@@ -1993,6 +2006,7 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2075,6 +2089,8 @@ def __init__(
else:
self.upsamplers = None

self.resolution_idx = resolution_idx

def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1.0):
for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
@@ -2103,6 +2119,7 @@ def __init__(
out_channels: int,
prev_output_channel: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
transformer_layers_per_block: int = 1,
@@ -2181,6 +2198,7 @@ def __init__(
self.upsamplers = None

self.gradient_checkpointing = False
self.resolution_idx = resolution_idx

def forward(
self,
@@ -2194,11 +2212,34 @@ def forward(
encoder_attention_mask: Optional[torch.FloatTensor] = None,
):
lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
is_freeu_enabled = (
getattr(self, "s1", None)
and getattr(self, "s2", None)
and getattr(self, "b1", None)
and getattr(self, "b2", None)
)

for resnet, attn in zip(self.resnets, self.attentions):
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]

# Courtesy:
# https://github.com/ChenyangSi/FreeU
# https://github.com/lyn-rgb/FreeU_Diffusers
            if is_freeu_enabled:
# --------------- FreeU code -----------------------
# Only operate on the first two stages
if self.resolution_idx == 0:
num_half_channels = hidden_states.shape[1] // 2
hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * self.b1
res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
if self.resolution_idx == 1:
num_half_channels = hidden_states.shape[1] // 2
hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * self.b2
res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
# ---------------------------------------------------------

hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

if self.training and self.gradient_checkpointing:
@@ -2252,6 +2293,7 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2292,12 +2334,36 @@ def __init__(
self.upsamplers = None

self.gradient_checkpointing = False
self.resolution_idx = resolution_idx

def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1.0):
is_freeu_enabled = (
getattr(self, "s1", None)
and getattr(self, "s2", None)
and getattr(self, "b1", None)
and getattr(self, "b2", None)
)

for resnet in self.resnets:
# pop res hidden states
res_hidden_states = res_hidden_states_tuple[-1]
res_hidden_states_tuple = res_hidden_states_tuple[:-1]

# Courtesy:
# https://github.com/ChenyangSi/FreeU
# https://github.com/lyn-rgb/FreeU_Diffusers
if is_freeu_enabled:
# --------------- FreeU code -----------------------
# Only operate on the first two stages
if self.resolution_idx == 0:
                    num_half_channels = hidden_states.shape[1] // 2
hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * self.b1
res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
if self.resolution_idx == 1:
                    num_half_channels = hidden_states.shape[1] // 2
hidden_states[:, :num_half_channels] = hidden_states[:, :num_half_channels] * self.b2
res_hidden_states = fourier_filter(res_hidden_states, threshold=1, scale=self.s2)

hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

if self.training and self.gradient_checkpointing:
@@ -2331,6 +2397,7 @@ def __init__(
self,
in_channels: int,
out_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2370,6 +2437,8 @@ def __init__(
else:
self.upsamplers = None

self.resolution_idx = resolution_idx

def forward(self, hidden_states, temb=None, scale: float = 1.0):
for resnet in self.resnets:
hidden_states = resnet(hidden_states, temb=temb, scale=scale)
@@ -2386,6 +2455,7 @@ def __init__(
self,
in_channels: int,
out_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2449,6 +2519,8 @@ def __init__(
else:
self.upsamplers = None

self.resolution_idx = resolution_idx

def forward(self, hidden_states, temb=None, scale: float = 1.0):
for resnet, attn in zip(self.resnets, self.attentions):
hidden_states = resnet(hidden_states, temb=temb, scale=scale)
@@ -2469,6 +2541,7 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2553,6 +2626,8 @@ def __init__(
self.skip_norm = None
self.act = None

self.resolution_idx = resolution_idx

def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None, scale: float = 1.0):
for resnet in self.resnets:
# pop res hidden states
@@ -2589,6 +2664,7 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2651,6 +2727,8 @@ def __init__(
self.skip_norm = None
self.act = None

self.resolution_idx = resolution_idx

def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample=None, scale: float = 1.0):
for resnet in self.resnets:
# pop res hidden states
@@ -2684,6 +2762,7 @@ def __init__(
prev_output_channel: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2743,6 +2822,7 @@ def __init__(
self.upsamplers = None

self.gradient_checkpointing = False
self.resolution_idx = resolution_idx

def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1.0):
for resnet in self.resnets:
@@ -2784,6 +2864,7 @@ def __init__(
out_channels: int,
prev_output_channel: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 1,
resnet_eps: float = 1e-6,
@@ -2873,6 +2954,7 @@ def __init__(
self.upsamplers = None

self.gradient_checkpointing = False
self.resolution_idx = resolution_idx

def forward(
self,
@@ -2947,6 +3029,7 @@ def __init__(
in_channels: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 5,
resnet_eps: float = 1e-5,
@@ -2988,6 +3071,7 @@ def __init__(
self.upsamplers = None

self.gradient_checkpointing = False
self.resolution_idx = resolution_idx

def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None, scale: float = 1.0):
res_hidden_states_tuple = res_hidden_states_tuple[-1]
@@ -3027,6 +3111,7 @@ def __init__(
in_channels: int,
out_channels: int,
temb_channels: int,
resolution_idx: int,
dropout: float = 0.0,
num_layers: int = 4,
resnet_eps: float = 1e-5,
@@ -3104,6 +3189,7 @@ def __init__(
self.upsamplers = None

self.gradient_checkpointing = False
self.resolution_idx = resolution_idx

def forward(
self,
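The `fourier_filter` helper imported at the top of this file lives in `..utils.torch_utils` and its body is not part of this diff. A minimal sketch of the operation the FreeU code above relies on, assuming the behavior of the reference implementations linked in the comments (FFT the skip features, scale the low-frequency box at the center of the shifted spectrum by `scale`, then invert):

```python
import torch
import torch.fft as fft


def fourier_filter(x_in: torch.Tensor, threshold: int, scale: float) -> torch.Tensor:
    """Scale the low-frequency components of a (B, C, H, W) feature map by `scale`."""
    x = x_in
    _, _, H, W = x.shape

    # Half-precision FFT only supports power-of-two spatial sizes, so upcast
    # otherwise (this mirrors the "float16 support" fix in the commit history above).
    if (H & (H - 1)) != 0 or (W & (W - 1)) != 0:
        x = x.to(dtype=torch.float32)

    x_freq = fft.fftn(x, dim=(-2, -1))
    x_freq = fft.fftshift(x_freq, dim=(-2, -1))  # move the DC component to the center

    # A box of side 2 * threshold around the center selects the low frequencies.
    mask = torch.ones_like(x_freq.real)
    crow, ccol = H // 2, W // 2
    mask[..., crow - threshold : crow + threshold, ccol - threshold : ccol + threshold] = scale
    x_freq = x_freq * mask

    x_freq = fft.ifftshift(x_freq, dim=(-2, -1))
    return fft.ifftn(x_freq, dim=(-2, -1)).real.to(dtype=x_in.dtype)
```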
15 changes: 15 additions & 0 deletions src/diffusers/models/unet_2d_condition.py
@@ -542,6 +542,7 @@ def __init__(
add_upsample=add_upsample,
resnet_eps=norm_eps,
resnet_act_fn=act_fn,
resolution_idx=i,
resnet_groups=norm_num_groups,
cross_attention_dim=reversed_cross_attention_dim[i],
num_attention_heads=reversed_num_attention_heads[i],
@@ -731,6 +732,20 @@ def _set_gradient_checkpointing(self, module, value=False):
if hasattr(module, "gradient_checkpointing"):
module.gradient_checkpointing = value

def enable_freeu(self, **kwargs):
for i, upsample_block in enumerate(self.up_blocks):
setattr(upsample_block, "b1", kwargs["b1"])
setattr(upsample_block, "b2", kwargs["b2"])
setattr(upsample_block, "s1", kwargs["s1"])
setattr(upsample_block, "s2", kwargs["s2"])

def disable_freeu(self):
freeu_keys = {"s1", "s2", "b1", "b2"}
for i, upsample_block in enumerate(self.up_blocks):
for k in freeu_keys:
                if hasattr(upsample_block, k) or getattr(upsample_block, k, None) is not None:
                    setattr(upsample_block, k, None)

def forward(
self,
sample: torch.FloatTensor,
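Because `enable_freeu` stores the four scalars directly on every up block, and the block `forward` methods above only check for them with `getattr`, FreeU can also be toggled on a bare UNet. A small sketch under that assumption (the checkpoint name is illustrative):

```python
from diffusers import UNet2DConditionModel

unet = UNet2DConditionModel.from_pretrained(
    "runwayml/stable-diffusion-v1-5", subfolder="unet"
)

unet.enable_freeu(b1=1.2, b2=1.4, s1=0.9, s2=0.2)
assert unet.up_blocks[0].b1 == 1.2  # the scalars now live on each up block

unet.disable_freeu()
assert unet.up_blocks[0].b1 is None  # blocks fall back to the vanilla path
```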
1 change: 1 addition & 0 deletions src/diffusers/models/vae.py
@@ -213,6 +213,7 @@ def __init__(
attention_head_dim=output_channel,
temb_channels=temb_channels,
resnet_time_scale_shift=norm_type,
resolution_idx=None,
)
self.up_blocks.append(up_block)
prev_output_channel = output_channel