16 changes: 12 additions & 4 deletions examples/research_projects/anytext/README.md
@@ -1,20 +1,27 @@
-# AnyTextPipeline Pipeline
+# AnyTextPipeline

Project page: https://aigcdesigngroup.github.io/homepage_anytext

"AnyText comprises a diffusion pipeline with two primary elements: an auxiliary latent module and a text embedding module. The former uses inputs like text glyph, position, and masked image to generate latent features for text generation or editing. The latter employs an OCR model for encoding stroke data as embeddings, which blend with image caption embeddings from the tokenizer to generate texts that seamlessly integrate with the background. We employed text-control diffusion loss and text perceptual loss for training to further enhance writing accuracy."

-Each text line that needs to be generated should be enclosed in double quotes. For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054).
+> **Note:** Each text line that needs to be generated should be enclosed in double quotes.
+
+For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054).
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/b87ec9d2f265b448dd947c9d4a0da389/anytext.ipynb)

```py
+# This example requires the `anytext_controlnet.py` file:
+# !git clone --depth 1 https://github.com/huggingface/diffusers.git
+# %cd diffusers/examples/research_projects/anytext
+# Let's choose a font file shared by an HF staff member:
+# !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf
+
import torch
from diffusers import DiffusionPipeline
from anytext_controlnet import AnyTextControlNetModel
from diffusers.utils import load_image

-# I chose a font file shared by an HF staff:
-# !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf

anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
variant="fp16",)
@@ -26,6 +33,7 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial
# generate image
prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream'
draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png")
+# There are two modes: "generate" and "edit". "edit" mode requires the `ori_image` parameter for the image to be edited.
image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos,
).images[0]
image
```
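For the "edit" mode mentioned in the comment above, here is a minimal sketch under the same pipeline setup; the image file names are hypothetical placeholders, not assets shipped with the example:

```py
# A sketch of "edit" mode, reusing `pipe` from the example above.
# "original_scene.png" and "edit_mask.png" are hypothetical file names.
from diffusers.utils import load_image

ori_image = load_image("original_scene.png")  # image whose text will be edited
draw_pos = load_image("edit_mask.png")  # position mask marking where the new text goes
prompt = 'a street sign that says "Any" "Text"'
image = pipe(prompt, num_inference_steps=20, mode="edit",
             draw_pos=draw_pos, ori_image=ori_image).images[0]
```

Apart from `mode="edit"` and the extra `ori_image` argument, the call mirrors the generate-mode example.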
16 changes: 11 additions & 5 deletions examples/research_projects/anytext/anytext.py
@@ -146,14 +146,17 @@ def _is_whitespace(self, char):
EXAMPLE_DOC_STRING = """
Examples:
```py
+>>> # This example requires the `anytext_controlnet.py` file:
+>>> # !git clone --depth 1 https://github.com/huggingface/diffusers.git
+>>> # %cd diffusers/examples/research_projects/anytext
+>>> # Let's choose a font file shared by an HF staff member:
+>>> # !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf
+
>>> import torch
>>> from diffusers import DiffusionPipeline
>>> from anytext_controlnet import AnyTextControlNetModel
>>> from diffusers.utils import load_image

->>> # I chose a font file shared by an HF staff:
->>> !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf
-
>>> anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
... variant="fp16",)
>>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial-unicode-ms.ttf",
@@ -165,6 +168,7 @@ def _is_whitespace(self, char):
>>> # generate image
>>> prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream'
>>> draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png")
+>>> # There are two modes: "generate" and "edit". "edit" mode requires the `ori_image` parameter for the image to be edited.
>>> image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos,
... ).images[0]
>>> image
@@ -257,11 +261,11 @@ def forward(
            idx = tokenized_text[i] == self.placeholder_token.to(device)
            if sum(idx) > 0:
                if i >= len(self.text_embs_all):
-                    print("truncation for log images...")
+                    logger.warning("truncation for log images...")
                    break
                text_emb = torch.cat(self.text_embs_all[i], dim=0)
                if sum(idx) != len(text_emb):
-                    print("truncation for long caption...")
+                    logger.warning("truncation for long caption...")
                text_emb = text_emb.to(embedded_text.device)
                embedded_text[i][idx] = text_emb[: sum(idx)]
        return embedded_text
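The switch from `print` to `logger.warning` assumes a module-level `logger`; diffusers modules conventionally create one with the library's logging utility. A sketch, assuming `anytext.py` follows that pattern:

```py
from diffusers.utils import logging

logger = logging.get_logger(__name__)  # module-level logger, as in other diffusers pipelines
```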
@@ -1058,6 +1062,8 @@ def forward(
raise ValueError(f"Can't read ori_image image from {ori_image}!")
elif isinstance(ori_image, torch.Tensor):
ori_image = ori_image.cpu().numpy()
elif isinstance(ori_image, PIL.Image.Image):
ori_image = np.array(ori_image.convert("RGB"))
else:
if not isinstance(ori_image, np.ndarray):
raise ValueError(f"Unknown format of ori_image: {type(ori_image)}")