CLIP Guided Images Mixing With Stable Diffusion

The CLIP-guided Stable Diffusion image-mixing pipeline combines two images using standard diffusion models. It can optionally use a CoCa model to generate the image descriptions, so you do not have to write them by hand. The script was contributed by [Karachev Denis](https://github.com/TheDenk) and the notebook by [Parag Ekbote](https://github.com/ParagEkbote).

In [1]:
pip install torch matplotlib Pillow diffusers transformers open_clip_torch

Collecting open_clip_torch
  Downloading open_clip_torch-2.30.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open_clip_torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting timm (from open_clip_torch)
  Downloading timm-1.0.14-py3-none-any.whl.metadata (50 kB)
Downloading open_clip_torch-2.30.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m127.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ftfy-6.3.1-py3-none-any.whl (44 kB)
Downloading timm-1.0.14-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m141.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy, timm, open_clip_torch
Successfully installed ftfy-6.3.1 open_clip_torch-2.30.0 timm-1.0.14
Note: you may need to restart the kernel to use updated packages.


In [4]:
import PIL
import torch
import requests
import open_clip
from open_clip import SimpleTokenizer
from io import BytesIO
from diffusers import DiffusionPipeline
from transformers import CLIPImageProcessor, CLIPModel


def download_image(url, timeout=30.0):
    """Download an image over HTTP and return it as an RGB PIL image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up; forwarded
            unchanged to ``requests.get``.

    Returns:
        The decoded image, converted to RGB mode.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # Without a timeout a stalled server would hang the notebook indefinitely.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to PIL,
    # which would otherwise fail with a much less helpful decoding error.
    response.raise_for_status()
    return PIL.Image.open(BytesIO(response.content)).convert("RGB")

# Auxiliary models handed to the mixing pipeline
CLIP_CHECKPOINT = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"

# CLIP image preprocessor and half-precision CLIP model from the same checkpoint.
feature_extractor = CLIPImageProcessor.from_pretrained(CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT, torch_dtype=torch.float16)

# CoCa model, moved to the GPU.
coca_model = open_clip.create_model("coca_ViT-L-14", pretrained="laion2B-s13B-b90k")
coca_model = coca_model.to("cuda")
# Plain attribute assignment: this tags the model object with a `dtype`
# attribute but performs no conversion of the weights.
# NOTE(review): presumably the custom pipeline reads `coca_model.dtype` - confirm.
coca_model.dtype = torch.float16

# Inference-time (is_train=False) preprocessing for CoCa inputs; mean/std are
# passed as None when the visual tower does not expose them.
coca_visual = coca_model.visual
coca_transform = open_clip.image_transform(
    coca_visual.image_size,
    is_train=False,
    mean=getattr(coca_visual, "image_mean", None),
    std=getattr(coca_visual, "image_std", None),
)
coca_tokenizer = SimpleTokenizer()

# Build the image-mixing pipeline
# Extra components passed to the custom pipeline on top of the base checkpoint.
mixing_components = {
    "clip_model": clip_model,
    "feature_extractor": feature_extractor,
    "coca_model": coca_model,
    "coca_tokenizer": coca_tokenizer,
    "coca_transform": coca_transform,
}
mixing_pipeline = DiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    custom_pipeline="clip_guided_images_mixing_stable_diffusion",
    torch_dtype=torch.float16,
    **mixing_components,
)
# Attention slicing is switched on before the pipeline is moved to the GPU.
mixing_pipeline.enable_attention_slicing()
mixing_pipeline = mixing_pipeline.to("cuda")

# Pipeline running
# Seeded CUDA generator, handed to the pipeline call below.
generator = torch.Generator(device="cuda").manual_seed(17)

# `download_image` is defined once, earlier in this cell; it is not redefined here.
content_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/boromir.jpg")
style_image = download_image("https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/gigachad.jpg")

pipe_images = mixing_pipeline(
    num_inference_steps=50,
    content_image=content_image,
    style_image=style_image,
    noise_strength=0.65,
    slerp_latent_style_strength=0.9,
    slerp_prompt_style_strength=0.1,
    slerp_clip_image_style_strength=0.1,
    guidance_scale=9.0,
    batch_size=1,
    clip_guidance_scale=100,
    generator=generator,
).images

# Save the first image from the pipeline output to disk.
output_path = "mixed_output.jpg"
pipe_images[0].save(output_path)
print(f"Image saved successfully at {output_path}")

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

Image saved successfully at mixed_output.jpg
