17 changes: 15 additions & 2 deletions docs/source/en/api/pipelines/stable_diffusion/gligen.md
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

# GLIGEN (Grounded Language-to-Image Generation)

The GLIGEN model was created by researchers and engineers from [University of Wisconsin-Madison, Columbia University, and Microsoft](https://github.com/gligen/GLIGEN). The [`StableDiffusionGLIGENPipeline`] can generate photorealistic images conditioned on grounding inputs. Along with text and bounding boxes, if input images are given, this pipeline can insert objects described by text at the region defined by bounding boxes. Otherwise, it'll generate an image described by the caption/prompt and insert objects described by text at the region defined by bounding boxes. It's trained on COCO2014D and COCO2014CD datasets, and the model uses a frozen CLIP ViT-L/14 text encoder to condition itself on grounding inputs.
The GLIGEN model was created by researchers and engineers from [University of Wisconsin-Madison, Columbia University, and Microsoft](https://github.com/gligen/GLIGEN). The [`StableDiffusionGLIGENPipeline`] and [`StableDiffusionGLIGENTextImagePipeline`] can generate photorealistic images conditioned on grounding inputs. [`StableDiffusionGLIGENPipeline`] is conditioned on text and bounding boxes; if input images are given as well, [`StableDiffusionGLIGENTextImagePipeline`] can insert objects described by text into the regions defined by the bounding boxes. Otherwise, it generates an image described by the caption/prompt and inserts objects described by text into the regions defined by the bounding boxes. The models are trained on the COCO2014D and COCO2014CD datasets and use a frozen CLIP ViT-L/14 text encoder to condition on the grounding inputs.

The abstract from the [paper](https://huggingface.co/papers/2301.07093) is:

@@ -26,7 +26,7 @@ If you want to use one of the official checkpoints for a task, explore the [glig

</Tip>
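
A minimal usage sketch of the new text-image pipeline. The checkpoint id is a placeholder (pick one from the gligen Hub organization mentioned above), and the `gligen_*` argument names follow the text-only pipeline's grounding arguments, so treat them as assumptions until checked against the API docs below:

```python
import torch
from diffusers import StableDiffusionGLIGENTextImagePipeline
from diffusers.utils import load_image

# Placeholder checkpoint id — substitute a real GLIGEN text-image checkpoint.
pipe = StableDiffusionGLIGENTextImagePipeline.from_pretrained(
    "<gligen-text-image-checkpoint>", torch_dtype=torch.float16
).to("cuda")

# A reference image of the object to insert, plus the region to place it in.
reference = load_image("<url-or-path-to-object-image>")
image = pipe(
    prompt="a flower sitting on the beach",
    gligen_phrases=["a flower"],             # text describing each grounded object
    gligen_images=[reference],               # one reference image per grounded object
    gligen_boxes=[[0.0, 0.09, 0.53, 0.76]],  # normalized xyxy box per object
    gligen_scheduled_sampling_beta=1.0,      # fraction of steps with grounding active
    num_inference_steps=50,
).images[0]
image.save("gligen_text_image.png")
```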

This pipeline was contributed by [Nikhil Gajendrakumar](https://github.com/nikhil-masterful).
[`StableDiffusionGLIGENPipeline`] was contributed by [Nikhil Gajendrakumar](https://github.com/nikhil-masterful) and [`StableDiffusionGLIGENTextImagePipeline`] was contributed by [Nguyễn Công Tú Anh](https://github.com/tuanh123789).

## StableDiffusionGLIGENPipeline

@@ -41,6 +41,19 @@ This pipeline was contributed by [Nikhil Gajendrakumar](https://github.com/nikhi
- prepare_latents
- enable_fuser

## StableDiffusionGLIGENTextImagePipeline

[[autodoc]] StableDiffusionGLIGENTextImagePipeline
- all
- __call__
- enable_vae_slicing
- disable_vae_slicing
- enable_vae_tiling
- disable_vae_tiling
- enable_model_cpu_offload
- prepare_latents
- enable_fuser

## StableDiffusionPipelineOutput

[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
587 changes: 587 additions & 0 deletions scripts/convert_gligen_to_diffusers.py

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions src/diffusers/__init__.py
@@ -66,6 +66,7 @@
AutoPipelineForImage2Image,
AutoPipelineForInpainting,
AutoPipelineForText2Image,
CLIPImageProjection,
ConsistencyModelPipeline,
DanceDiffusionPipeline,
DDIMPipeline,
@@ -176,6 +177,7 @@
StableDiffusionDepth2ImgPipeline,
StableDiffusionDiffEditPipeline,
StableDiffusionGLIGENPipeline,
StableDiffusionGLIGENTextImagePipeline,
StableDiffusionImageVariationPipeline,
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipeline,
2 changes: 1 addition & 1 deletion src/diffusers/models/attention.py
@@ -154,7 +154,7 @@ def __init__(
self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn, final_dropout=final_dropout)

# 4. Fuser
if attention_type == "gated":
if attention_type == "gated" or attention_type == "gated-text-image":
self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

# let chunk size default to None
79 changes: 66 additions & 13 deletions src/diffusers/models/embeddings.py
@@ -563,7 +563,7 @@ def __call__(self, x):


class PositionNet(nn.Module):
def __init__(self, positive_len, out_dim, fourier_freqs=8):
def __init__(self, positive_len, out_dim, feature_type="text-only", fourier_freqs=8):
super().__init__()
self.positive_len = positive_len
self.out_dim = out_dim
@@ -573,30 +573,83 @@ def __init__(self, positive_len, out_dim, fourier_freqs=8):

if isinstance(out_dim, tuple):
out_dim = out_dim[0]
self.linears = nn.Sequential(
nn.Linear(self.positive_len + self.position_dim, 512),
nn.SiLU(),
nn.Linear(512, 512),
nn.SiLU(),
nn.Linear(512, out_dim),
)

self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
if feature_type == "text-only":
self.linears = nn.Sequential(
nn.Linear(self.positive_len + self.position_dim, 512),
nn.SiLU(),
nn.Linear(512, 512),
nn.SiLU(),
nn.Linear(512, out_dim),
)
self.null_positive_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))

elif feature_type == "text-image":
self.linears_text = nn.Sequential(
nn.Linear(self.positive_len + self.position_dim, 512),
nn.SiLU(),
nn.Linear(512, 512),
nn.SiLU(),
nn.Linear(512, out_dim),
)
self.linears_image = nn.Sequential(
nn.Linear(self.positive_len + self.position_dim, 512),
nn.SiLU(),
nn.Linear(512, 512),
nn.SiLU(),
nn.Linear(512, out_dim),
)
self.null_text_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))
self.null_image_feature = torch.nn.Parameter(torch.zeros([self.positive_len]))

self.null_position_feature = torch.nn.Parameter(torch.zeros([self.position_dim]))

def forward(self, boxes, masks, positive_embeddings):
def forward(
self,
boxes,
masks,
positive_embeddings=None,
phrases_masks=None,
image_masks=None,
phrases_embeddings=None,
image_embeddings=None,
):
masks = masks.unsqueeze(-1)

# embedding position (it may include padding as a placeholder)
xyxy_embedding = self.fourier_embedder(boxes) # B*N*4 -> B*N*C

# learnable null embedding
positive_null = self.null_positive_feature.view(1, 1, -1)
xyxy_null = self.null_position_feature.view(1, 1, -1)

# replace padding with learnable null embedding
positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null
xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null

objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
# PositionNet with text-only information
if positive_embeddings is not None:
# learnable null embedding
positive_null = self.null_positive_feature.view(1, 1, -1)

# replace padding with learnable null embedding
positive_embeddings = positive_embeddings * masks + (1 - masks) * positive_null

objs = self.linears(torch.cat([positive_embeddings, xyxy_embedding], dim=-1))

# PositionNet with text and image information
else:
phrases_masks = phrases_masks.unsqueeze(-1)
image_masks = image_masks.unsqueeze(-1)

# learnable null embedding
text_null = self.null_text_feature.view(1, 1, -1)
image_null = self.null_image_feature.view(1, 1, -1)

# replace padding with learnable null embedding
phrases_embeddings = phrases_embeddings * phrases_masks + (1 - phrases_masks) * text_null
image_embeddings = image_embeddings * image_masks + (1 - image_masks) * image_null

objs_text = self.linears_text(torch.cat([phrases_embeddings, xyxy_embedding], dim=-1))
objs_image = self.linears_image(torch.cat([image_embeddings, xyxy_embedding], dim=-1))
objs = torch.cat([objs_text, objs_image], dim=1)

return objs
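
To make the two-branch layout above concrete, here is a small forward-pass sketch of the `text-image` mode (tensor shapes and the import path are assumptions based on the diff above). The text and image grounding tokens are concatenated along the sequence dimension:

```python
import torch
from diffusers.models.embeddings import PositionNet  # path per the diff above

net = PositionNet(positive_len=768, out_dim=768, feature_type="text-image")

B, N = 1, 2                  # batch size, number of grounding boxes
boxes = torch.rand(B, N, 4)  # normalized xyxy coordinates
masks = torch.ones(B, N)     # 1 = real box, 0 = padding

objs = net(
    boxes,
    masks,
    phrases_masks=torch.ones(B, N),
    image_masks=torch.ones(B, N),
    phrases_embeddings=torch.randn(B, N, 768),
    image_embeddings=torch.randn(B, N, 768),
)
print(objs.shape)  # torch.Size([1, 4, 768]): N text tokens, then N image tokens
```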
8 changes: 6 additions & 2 deletions src/diffusers/models/unet_2d_condition.py
@@ -565,13 +565,17 @@ def __init__(
block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
)

if attention_type == "gated":
if attention_type in ["gated", "gated-text-image"]:
positive_len = 768
if isinstance(cross_attention_dim, int):
positive_len = cross_attention_dim
elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
positive_len = cross_attention_dim[0]
self.position_net = PositionNet(positive_len=positive_len, out_dim=cross_attention_dim)

feature_type = "text-only" if attention_type == "gated" else "text-image"
self.position_net = PositionNet(
positive_len=positive_len, out_dim=cross_attention_dim, feature_type=feature_type
)

@property
def attn_processors(self) -> Dict[str, AttentionProcessor]:
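
As a quick sanity check of the wiring above, a toy UNet config (all argument values here are illustrative assumptions, not taken from any released checkpoint) shows that `attention_type="gated-text-image"` alone selects the two-branch `PositionNet`:

```python
from diffusers import UNet2DConditionModel

# Toy config: sizes chosen small for illustration only.
unet = UNet2DConditionModel(
    sample_size=32,
    block_out_channels=(32, 64),
    down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
    up_block_types=("UpBlock2D", "CrossAttnUpBlock2D"),
    layers_per_block=1,
    cross_attention_dim=64,
    attention_head_dim=8,
    attention_type="gated-text-image",  # maps to feature_type="text-image"
)
print(type(unet.position_net).__name__)             # PositionNet
print(hasattr(unet.position_net, "linears_text"))   # True: text-image branch
```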
2 changes: 2 additions & 0 deletions src/diffusers/pipelines/__init__.py
@@ -94,6 +94,7 @@
StableDiffusionDepth2ImgPipeline,
StableDiffusionDiffEditPipeline,
StableDiffusionGLIGENPipeline,
StableDiffusionGLIGENTextImagePipeline,
StableDiffusionImageVariationPipeline,
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipeline,
@@ -111,6 +112,7 @@
StableUnCLIPImg2ImgPipeline,
StableUnCLIPPipeline,
)
from .stable_diffusion.clip_image_project_model import CLIPImageProjection
from .stable_diffusion_safe import StableDiffusionPipelineSafe
from .stable_diffusion_xl import (
StableDiffusionXLImg2ImgPipeline,
2 changes: 2 additions & 0 deletions src/diffusers/pipelines/stable_diffusion/__init__.py
@@ -42,10 +42,12 @@ class StableDiffusionPipelineOutput(BaseOutput):
except OptionalDependencyNotAvailable:
from ...utils.dummy_torch_and_transformers_objects import * # noqa F403
else:
from .clip_image_project_model import CLIPImageProjection
from .pipeline_cycle_diffusion import CycleDiffusionPipeline
from .pipeline_stable_diffusion import StableDiffusionPipeline
from .pipeline_stable_diffusion_attend_and_excite import StableDiffusionAttendAndExcitePipeline
from .pipeline_stable_diffusion_gligen import StableDiffusionGLIGENPipeline
from .pipeline_stable_diffusion_gligen_text_image import StableDiffusionGLIGENTextImagePipeline
from .pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipeline
from .pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipeline
from .pipeline_stable_diffusion_inpaint_legacy import StableDiffusionInpaintPipelineLegacy
29 changes: 29 additions & 0 deletions src/diffusers/pipelines/stable_diffusion/clip_image_project_model.py
@@ -0,0 +1,29 @@
# Copyright 2023 The GLIGEN Authors and HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from torch import nn

from ...configuration_utils import ConfigMixin, register_to_config
from ...models.modeling_utils import ModelMixin


class CLIPImageProjection(ModelMixin, ConfigMixin):
@register_to_config
def __init__(self, hidden_size: int = 768):
super().__init__()
self.hidden_size = hidden_size
self.project = nn.Linear(self.hidden_size, self.hidden_size, bias=False)

def forward(self, x):
return self.project(x)
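
A minimal sketch of exercising this projection layer on its own (that the 768 hidden size matches CLIP ViT-L/14 embeddings is an assumption consistent with the default above; the top-level export comes from the `__init__.py` change earlier in this diff):

```python
import torch
from diffusers import CLIPImageProjection  # exported per the __init__.py diff above

# Project a batch of CLIP image embeddings into the text-embedding space
# the UNet's grounding branch expects.
proj = CLIPImageProjection(hidden_size=768)
image_embeds = torch.randn(1, 768)  # e.g. a pooled CLIPVisionModel output
projected = proj(image_embeds)
print(projected.shape)  # torch.Size([1, 768])
```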