In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# from dotenv import load_dotenv

# load_dotenv()

True

In [1]:
from stable_diffusion_backend.ml.image_generation import text_to_img

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
## Generation inputs; the commented lines are alternative settings to try.
prompt = "a realistic happy dog playing in the grass"

# negative_prompt = "render, cartoon, cgi, render, illustration, painting, drawing"
# negative_prompt = "cgi, 3d render, bad quality, worst quality, text, signature, watermark, extra limbs"
negative_prompt = None

# seed = 2084045068
seed = None  # presumably lets text_to_img choose its own seed -- TODO confirm

# clip_skip = None
# clip_skip = 2
clip_skip = 1

## Collect every argument in one place so the call itself stays short.
generation_kwargs = dict(
    prompt=prompt,
    negative_prompt=negative_prompt,
    seed=seed,
    num_inference_steps=7,
    cfg_scale=2.0,
    clip_skip=clip_skip,
)
image = text_to_img(**generation_kwargs)

## Last expression of the cell: rendered as the cell output.
image

## Intermediate Images

- [Diffusers Issue](https://github.com/huggingface/diffusers/issues/3579)


In [None]:
import logging
import os
import shutil
from base64 import b64encode
from io import BytesIO

import discord
import matplotlib.pyplot as plt

## Basic libraries
import numpy as np
import torch
from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel

# matplotlib inline
## For video display
from IPython.display import HTML, display

## disable warnings
# logging.disable(logging.WARNING)
## Imaging  library
from PIL import Image
from torchvision import transforms as tfms
from tqdm.auto import tqdm

## Import the CLIP artifacts
from transformers import CLIPTextModel, CLIPTokenizer

## Checkpoints the individual Stable Diffusion components are loaded from.
clip_checkpoint = "openai/clip-vit-large-patch14"
sd_checkpoint = "CompVis/stable-diffusion-v1-4"
## Every component is requested in half precision.
half_precision = {"torch_dtype": torch.float16}

## CLIP tokenizer and text encoder; the encoder is moved to the GPU.
## NOTE(review): torch_dtype is also passed to the tokenizer, where it
## presumably has no effect -- kept as-is, confirm before removing.
tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint, **half_precision)
text_encoder = CLIPTextModel.from_pretrained(clip_checkpoint, **half_precision)
text_encoder = text_encoder.to("cuda")

## VAE used to move between pixel space and latent space.
vae = AutoencoderKL.from_pretrained(sd_checkpoint, subfolder="vae", **half_precision)
vae = vae.to("cuda")

## Scheduler, with an initial number of sampling steps
## (prompt_2_img sets the step count again on every call).
scheduler_config = {
    "beta_start": 0.00085,
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "num_train_timesteps": 1000,
}
scheduler = LMSDiscreteScheduler(**scheduler_config)
scheduler.set_timesteps(50)

## U-Net that predicts the noise residual.
unet = UNet2DConditionModel.from_pretrained(
    sd_checkpoint, subfolder="unet", **half_precision
)
unet = unet.to("cuda")


## Helper functions
def load_image(p, size=(512, 512)):
    """
    Function to load an image from a defined path

    Args:
        p: path (or file object) handed to ``Image.open``.
        size: ``(width, height)`` the image is resized to. Defaults to
            ``(512, 512)``, the value that used to be hard-coded.

    Returns:
        The image converted to RGB and resized to ``size``.
    """
    # The context manager closes the underlying file as soon as the
    # converted/resized copy has been produced.
    with Image.open(p) as source:
        return source.convert("RGB").resize(size)


def pil_to_latents(image):
    """
    Function to convert image to latents

    Args:
        image: image accepted by ``torchvision.transforms.ToTensor``
            (presumably a PIL image such as the one returned by
            ``load_image`` -- confirm against callers).

    Returns:
        A sample from the VAE's latent distribution, multiplied by 0.18215
        (the same factor ``latents_to_pil`` divides out again). The result
        carries no autograd history.
    """
    # Add a batch dimension and rescale: x * 2 - 1.
    pixels = tfms.ToTensor()(image).unsqueeze(0) * 2.0 - 1.0
    pixels = pixels.to(device="cuda", dtype=torch.float16)
    # Inference only: skip autograd bookkeeping, consistent with the
    # decoding path in latents_to_pil.
    with torch.no_grad():
        return vae.encode(pixels).latent_dist.sample() * 0.18215


def latents_to_pil(latents):
    """
    Function to decode a batch of latents into a list of images
    """
    # Undo the 0.18215 latent scaling before handing the batch to the VAE.
    rescaled = latents * (1 / 0.18215)
    with torch.no_grad():
        decoded = vae.decode(rescaled).sample
    # Shift/scale the decoder output by x / 2 + 0.5 and clip it to [0, 1].
    decoded = (decoded / 2 + 0.5).clamp(0, 1)
    # Move the second axis last and switch to NumPy.
    pixel_batch = decoded.detach().cpu().permute(0, 2, 3, 1).numpy()
    pixel_batch = (pixel_batch * 255).round().astype("uint8")
    pil_images = []
    for pixels in pixel_batch:
        pil_images.append(Image.fromarray(pixels))
    return pil_images


def text_enc(prompts, maxlen=None):
    """
    A function to take textual prompts and convert them into embeddings

    Args:
        prompts: the prompt text(s) passed to the tokenizer.
        maxlen: length every prompt is padded/truncated to. Defaults to
            ``tokenizer.model_max_length``.

    Returns:
        The first output of the text encoder, cast to half precision.
        The result carries no autograd history.
    """
    if maxlen is None:
        maxlen = tokenizer.model_max_length
    tokens = tokenizer(
        prompts,
        padding="max_length",
        max_length=maxlen,
        truncation=True,
        return_tensors="pt",
    )
    # Inference only: without no_grad the encoder would record an autograd
    # graph that nothing in this notebook ever uses.
    with torch.no_grad():
        return text_encoder(tokens.input_ids.to("cuda"))[0].half()


def _compose_preview_grid(images, width, height):
    """
    Function to tile up to four images onto a transparent 2x2 RGBA canvas
    """
    canvas = Image.new("RGBA", (width * 2, height * 2), (255, 255, 255, 0))
    # Slots are filled left-to-right, top-to-bottom. zip() stops at the
    # shorter sequence, so images beyond the fourth are simply not shown.
    slots = ((0, 0), (width, 0), (0, height), (width, height))
    for img, slot in zip(images, slots):
        canvas.paste(img, slot)
    return canvas


async def prompt_2_img(
    prompt,
    g=7.5,
    batch_size=1,
    seed=0,
    steps=70,
    height=512,
    width=512,
    message=None,
    context=None,
):
    """
    Diffusion process to convert prompt to image

    Args:
        prompt: text prompt; it is repeated ``batch_size`` times.
        g: guidance scale applied as ``u + g * (t - u)``.
        batch_size: number of images generated in parallel.
        seed: value for ``torch.manual_seed``. A falsy seed (``0`` or
            ``None``) leaves the RNG unseeded.
        steps: number of scheduler timesteps.
        height, width: output size in pixels; latents use 1/8 of each.
        message: optional message object that is edited with a preview
            every 10 steps (needs ``.content`` and an awaitable
            ``.edit``). Previews are skipped when it is ``None``.
        context: optional object whose ``.author.mention`` is shown in the
            progress text. Previews are skipped when it is ``None``.

    Returns:
        The list of images produced by ``latents_to_pil`` for the final
        latents.
    """
    preview_interval = 10  # steps between two progress previews
    next_preview_step = 0

    # Converting textual prompts to embedding
    text = text_enc([prompt] * batch_size)

    # Adding an unconditional prompt , helps in the generation process
    uncond = text_enc([""] * batch_size, text.shape[1])
    emb = torch.cat([uncond, text])

    # Setting the seed
    if seed:
        torch.manual_seed(seed)

    # Initiating random noise (channel count read from the model config;
    # direct attribute access on the model is deprecated in diffusers)
    latents = torch.randn(
        (batch_size, unet.config.in_channels, height // 8, width // 8)
    )

    # Setting number of steps in scheduler
    scheduler.set_timesteps(steps)

    # Adding noise to the latents
    latents = latents.to("cuda").half() * scheduler.init_noise_sigma

    # Iterating through defined steps
    for i, ts in enumerate(tqdm(scheduler.timesteps)):
        # We need to scale the i/p latents to match the variance
        inp = scheduler.scale_model_input(torch.cat([latents] * 2), ts)

        # Predicting noise residual using U-Net
        with torch.no_grad():
            u, t = unet(inp, ts, encoder_hidden_states=emb).sample.chunk(2)

        # Performing Guidance
        pred = u + g * (t - u)

        # Conditioning  the latents
        latents = scheduler.step(pred, ts, latents).prev_sample

        # Previews need both a message to edit and a context to mention;
        # with the default None arguments this part is skipped entirely.
        wants_preview = (
            message is not None
            and context is not None
            and context.author
            and message.content
        )
        if wants_preview and i == next_preview_step:
            canvas = _compose_preview_grid(latents_to_pil(latents), width, height)
            image_bytes = BytesIO()
            # Save the image data to the BytesIO object
            canvas.save(image_bytes, format="PNG")
            image_bytes.seek(0)
            file = discord.File(image_bytes, filename=f"steps_{i:04}.png")
            # Share of completed steps as a whole-number percentage.
            percent = i * 100 // steps
            await message.edit(
                content=f"**{prompt}** - {context.author.mention} ({percent}%) ({'fast' if steps <= 70 else 'slow'})",
                attachments=[file],
            )
            next_preview_step += preview_interval

    # Returning the latent representation to output an image of 3x512x512
    return latents_to_pil(latents)