In [None]:
!pip install -qU diffusers transformers accelerate huggingface_hub safetensors

# Textual Inversion

Textual inversion enables a model like Stable Diffusion to learn a new concept from just a few sample images. This gives us more control over the generated images and allows us to tailor the model towards specific concepts.

In [None]:
from diffusers import StableDiffusionPipeline
from diffusers.utils import make_image_grid

## Stable Diffusion 1 and 2

We need a pre-learned concept from the **Stable Diffusion Conceptualizer**.

In [None]:
# `torch` supplies the dtype used below; import it here so this cell runs on a fresh kernel
# (the notebook otherwise only imports torch in a later cell).
import torch

# Hub identifiers for the base model and for the pre-learned concept embedding.
pretrained_model_name_or_path = 'stable-diffusion-v1-5/stable-diffusion-v1-5'
repo_id_embeds = 'sd-concepts-library/cat-toy'

# Load the base pipeline with float16 weights and move it to the GPU.
pipeline = StableDiffusionPipeline.from_pretrained(
    pretrained_model_name_or_path,
    torch_dtype=torch.float16,
    use_safetensors=True
).to('cuda')

# Load the pre-learned concept from its Hub repository into the pipeline.
pipeline.load_textual_inversion(repo_id_embeds)

Create a prompt with the pre-learned concept by using the special placeholder token `<cat-toy>`, and choose the number of samples and rows of images we would like to generate:

In [None]:
# Prompt that references the learned concept, plus the layout of the output grid.
prompt = ' a grafitti in a favela wall with a <cat-toy> on it'
num_rows = 2
num_samples_per_row = 2

# Sampling settings shared by every pipeline call below.
sampling_kwargs = dict(
    num_images_per_prompt=num_samples_per_row,
    num_inference_steps=50,
    guidance_scale=7.5,
)

# One pipeline call per grid row; the images of each call are appended in order.
all_images = []
for _ in range(num_rows):
    images = pipeline(prompt, **sampling_kwargs).images
    all_images += images

make_image_grid(all_images, rows=num_rows, cols=num_samples_per_row)

## Stable Diffusion XL

SDXL has two text encoders so we will need two textual inversion embeddings.

In [None]:
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# Download the SDXL embedding file from the Hub and keep its local path.
f = hf_hub_download(
    repo_id='dn118/unaestheticXL',
    filename='unaestheticXLv31.safetensors',
)

# Read the safetensors file into a dict of tensors and display it.
state_dict = load_file(f)
state_dict

There are two tensors: `'clip_g'` corresponds to the bigger text encoder in SDXL and refers to `pipe.text_encoder_2`, and `'clip_l'` refers to `pipe.text_encoder`.

In [None]:
from diffusers import AutoPipelineForText2Image
import torch

# Load the SDXL base pipeline (fp16 variant, float16 weights) and move it to the GPU.
pipe = AutoPipelineForText2Image.from_pretrained(
    'stabilityai/stable-diffusion-xl-base-1.0',
    variant='fp16',
    torch_dtype=torch.float16,
)
pipe = pipe.to('cuda')

# Register the same token with both text encoders: the 'clip_g' tensor goes with
# text_encoder_2/tokenizer_2 and the 'clip_l' tensor with text_encoder/tokenizer.
for state_key, text_encoder, tokenizer in (
    ('clip_g', pipe.text_encoder_2, pipe.tokenizer_2),
    ('clip_l', pipe.text_encoder, pipe.tokenizer),
):
    pipe.load_textual_inversion(
        state_dict[state_key],
        token='unaestheticXLv31',
        text_encoder=text_encoder,
        tokenizer=tokenizer,
    )

In [None]:
prompt = "a woman standing in front of a mountain"

# Seeded CUDA generator passed to the pipeline call below.
generator = torch.Generator(device='cuda')
generator.manual_seed(111)

# The embedding should be used as a negative embedding, so its token is
# passed through `negative_prompt` rather than the prompt itself.
result = pipe(
    prompt,
    generator=generator,
    negative_prompt='unaestheticXLv31',
)
image = result.images[0]
image