From b19a024513b3165e309bdfb2a4fa043d6a5066c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Tue, 11 Mar 2025 10:39:59 +0300 Subject: [PATCH 1/3] [Documentation] Update README and example code with additional usage instructions for AnyText --- examples/research_projects/anytext/README.md | 10 ++++++++-- examples/research_projects/anytext/anytext.py | 7 +++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/research_projects/anytext/README.md b/examples/research_projects/anytext/README.md index f5f4fe59ddfd..4ec726627d69 100644 --- a/examples/research_projects/anytext/README.md +++ b/examples/research_projects/anytext/README.md @@ -6,15 +6,20 @@ Project page: https://aigcdesigngroup.github.io/homepage_anytext Each text line that needs to be generated should be enclosed in double quotes. For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054). +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/b87ec9d2f265b448dd947c9d4a0da389/anytext.ipynb) ```py +# This example requires the `anytext_controlnet.py` file: +# !git clone --depth 1 https://github.com/huggingface/diffusers.git +# %cd diffusers/examples/research_projects/anytext +# Let's choose a font file shared by an HF staff: +# !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf + import torch from diffusers import DiffusionPipeline from anytext_controlnet import AnyTextControlNetModel from diffusers.utils import load_image -# I chose a font file shared by an HF staff: -# !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16, variant="fp16",) @@ -26,6 +31,7 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", 
font_path="arial # generate image prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream' draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png") +# There are two modes: "generate" and "edit". "edit" mode requires the `ori_image` parameter, i.e., the image to be edited. image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos, ).images[0] image diff --git a/examples/research_projects/anytext/anytext.py b/examples/research_projects/anytext/anytext.py index 518452f97942..2b5aea72863e 100644 --- a/examples/research_projects/anytext/anytext.py +++ b/examples/research_projects/anytext/anytext.py @@ -146,6 +146,12 @@ def _is_whitespace(self, char): EXAMPLE_DOC_STRING = """ Examples: ```py + >>> # This example requires the `anytext_controlnet.py` file: + >>> # !git clone --depth 1 https://github.com/huggingface/diffusers.git + >>> # %cd diffusers/examples/research_projects/anytext + >>> # Let's choose a font file shared by an HF staff: + >>> # !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf + >>> import torch >>> from diffusers import DiffusionPipeline >>> from anytext_controlnet import AnyTextControlNetModel @@ -165,6 +171,7 @@ def _is_whitespace(self, char): >>> # generate image >>> prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream' >>> draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png") + >>> # There are two modes: "generate" and "edit". "edit" mode requires the `ori_image` parameter, i.e., the image to be edited. >>> image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos, ... 
).images[0] >>> image From fb5b9f6302f0260f71eeb9fe84db6f1a6026d678 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Tue, 11 Mar 2025 12:54:42 +0300 Subject: [PATCH 2/3] [Documentation] Update README for AnyTextPipeline and improve logging in code --- examples/research_projects/anytext/README.md | 6 ++++-- examples/research_projects/anytext/anytext.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/examples/research_projects/anytext/README.md b/examples/research_projects/anytext/README.md index 4ec726627d69..3a67efd8b2f4 100644 --- a/examples/research_projects/anytext/README.md +++ b/examples/research_projects/anytext/README.md @@ -1,10 +1,12 @@ -# AnyTextPipeline Pipeline +# AnyTextPipeline Project page: https://aigcdesigngroup.github.io/homepage_anytext "AnyText comprises a diffusion pipeline with two primary elements: an auxiliary latent module and a text embedding module. The former uses inputs like text glyph, position, and masked image to generate latent features for text generation or editing. The latter employs an OCR model for encoding stroke data as embeddings, which blend with image caption embeddings from the tokenizer to generate texts that seamlessly integrate with the background. We employed text-control diffusion loss and text perceptual loss for training to further enhance writing accuracy." -Each text line that needs to be generated should be enclosed in double quotes. For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054). +> **Note:** Each text line that needs to be generated should be enclosed in double quotes. + +For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054). 
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/b87ec9d2f265b448dd947c9d4a0da389/anytext.ipynb) diff --git a/examples/research_projects/anytext/anytext.py b/examples/research_projects/anytext/anytext.py index 2b5aea72863e..dc48760ee596 100644 --- a/examples/research_projects/anytext/anytext.py +++ b/examples/research_projects/anytext/anytext.py @@ -264,11 +264,11 @@ def forward( idx = tokenized_text[i] == self.placeholder_token.to(device) if sum(idx) > 0: if i >= len(self.text_embs_all): - print("truncation for log images...") + logger.warning("truncation for log images...") break text_emb = torch.cat(self.text_embs_all[i], dim=0) if sum(idx) != len(text_emb): - print("truncation for long caption...") + logger.warning("truncation for long caption...") text_emb = text_emb.to(embedded_text.device) embedded_text[i][idx] = text_emb[: sum(idx)] return embedded_text @@ -1065,6 +1065,8 @@ def forward( raise ValueError(f"Can't read ori_image image from {ori_image}!") elif isinstance(ori_image, torch.Tensor): ori_image = ori_image.cpu().numpy() + elif isinstance(ori_image, PIL.Image.Image): + ori_image = np.array(ori_image.convert("RGB")) else: if not isinstance(ori_image, np.ndarray): raise ValueError(f"Unknown format of ori_image: {type(ori_image)}") From fcc4959aaddde8b73336dd519f113a738fc02abb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tolga=20Cang=C3=B6z?= Date: Sat, 15 Mar 2025 08:03:42 +0300 Subject: [PATCH 3/3] Remove wget command for font file from example docstring in anytext.py --- examples/research_projects/anytext/anytext.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/examples/research_projects/anytext/anytext.py b/examples/research_projects/anytext/anytext.py index dc48760ee596..5c30b24efe88 100644 --- a/examples/research_projects/anytext/anytext.py +++ b/examples/research_projects/anytext/anytext.py @@ -157,9 +157,6 @@ def _is_whitespace(self, char): >>> from 
anytext_controlnet import AnyTextControlNetModel >>> from diffusers.utils import load_image - >>> # I chose a font file shared by an HF staff: - >>> !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf - >>> anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16, ... variant="fp16",) >>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial-unicode-ms.ttf",