## Generate 128 random latent vectors (64x64), pass them into the autoencoder, and time it

In [None]:
# Default env type is "TEST", which makes the setup cell use local paths.
# Change this to any other value when running on Colab or Kaggle so that the
# setup cell clones the repository and downloads the model weights instead.
ENV_TYPE = "TEST"

In [None]:
import os
import sys

# Choose where the model weights and the project sources live.
if ENV_TYPE == "TEST":
    # Local/test run: fixed model directory, parent directory as the source root
    model_path = "/input/models"
    base_directory = "../"
else:
    # clone the repository
    !git clone https://github.com/kk-digital/kcg-ml-sd1p4.git

    # move to the repo (this changes the working directory for every later cell)
    %cd kcg-ml-sd1p4/

    # The weights are downloaded into the current directory further below
    model_path = "./"
    # Get the current directory
    base_directory = os.getcwd()
    # NOTE(review): this appends 'kcg-ml' to the current directory, while the
    # cloned directory is named 'kcg-ml-sd1p4' -- confirm that sub-directory exists.
    base_directory = os.path.join(base_directory, 'kcg-ml')
    # download model weights
    !wget https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt

# Insert the paths into sys.path
# (position 0 so that base_directory is searched before the other entries)
sys.path.insert(0, base_directory)

In [2]:
import torch

# Select the compute device once; `device` is a module-level name that the
# cells below read.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("GPU is available" if device == "cuda"
      else "Warning: GPU is not available, running on CPU")


GPU is available


In [None]:
# Install requirements
# NOTE(review): only diffusers and Pillow are pinned to a version here; the
# remaining packages are installed unpinned, and both `pip` and `pip3` are used.
!pip install diffusers==0.11.1
!pip install transformers scipy ftfy accelerate
!pip3 install labml
!pip3 install labml-nn
!pip3 install pytorch-lightning
!pip install openai-clip
!pip install Pillow==9.0.0

In [None]:
"""
---
title: Generate images using stable diffusion with a prompt
summary: >
 Generate images using stable diffusion with a prompt
---

# Generate images using [stable diffusion](../index.html) with a prompt
"""

import os
from pathlib import Path

import torch

from labml import lab, monit
from stable_diffusion.latent_diffusion import LatentDiffusion
from stable_diffusion.sampler.ddim import DDIMSampler
from stable_diffusion.sampler.ddpm import DDPMSampler
from stable_diffusion.utils.model import load_model, save_images, set_seed


class Txt2Img:
    """
    ### Text to image class

    Bundles a latent diffusion model with a sampler and exposes the
    text-to-image pipeline both as a single call (`__call__`) and as three
    separate stages (`generate_text_embeddings`, `generate_latent_space`,
    `generate_image`).
    """
    # Quoted so that defining the class does not require the project import
    # to be resolvable at class-definition time.
    model: "LatentDiffusion"

    def __init__(self, *,
                 checkpoint_path: Path,
                 sampler_name: str,
                 n_steps: int = 50,
                 ddim_eta: float = 0.0,
                 ):
        """
        :param checkpoint_path: is the path of the checkpoint
        :param sampler_name: is the name of the sampler, `'ddim'` or `'ddpm'`
        :param n_steps: is the number of sampling steps (used by the DDIM sampler)
        :param ddim_eta: is the DDIM sampling $\\eta$ constant (used by the DDIM sampler)
        """
        self.load(checkpoint_path, sampler_name, n_steps, ddim_eta)

    def load(self,
             checkpoint_path: Path,
             sampler_name: str,
             n_steps: int = 50,
             ddim_eta: float = 0.0,
             ):
        """
        Load the model from `checkpoint_path`, move it to the notebook-level
        `device` and create the requested sampler.

        :param checkpoint_path: is the path of the checkpoint
        :param sampler_name: is the name of the sampler, `'ddim'` or `'ddpm'`
        :param n_steps: is the number of sampling steps (only passed to the DDIM sampler)
        :param ddim_eta: is the DDIM sampling $\\eta$ constant (only passed to the DDIM sampler)
        :raises ValueError: if `sampler_name` is neither `'ddim'` nor `'ddpm'`
        """
        # Validate first: otherwise an unknown name would load the whole
        # checkpoint and then leave `self.sampler` silently undefined.
        if sampler_name not in ('ddim', 'ddpm'):
            raise ValueError(
                f"Unknown sampler_name {sampler_name!r}; expected 'ddim' or 'ddpm'")

        self.model = load_model(checkpoint_path)
        # `device` is the module-level device string chosen earlier in the notebook
        self.device = torch.device(device)
        self.model.to(self.device)

        if sampler_name == 'ddim':
            self.sampler = DDIMSampler(self.model,
                                       n_steps=n_steps,
                                       ddim_eta=ddim_eta)
        else:
            self.sampler = DDPMSampler(self.model)

    def unload(self):
        """
        Drop the references to the model and the sampler.
        """
        self.model = None
        self.sampler = None

    @torch.no_grad()
    def __call__(self, *,
                 dest_path: str,
                 batch_size: int = 3,
                 prompt: str,
                 h: int = 512, w: int = 512,
                 uncond_scale: float = 7.5,
                 ):
        r"""
        Run the full pipeline (text conditioning, latent sampling, decoding)
        and save the resulting images.

        :param dest_path: is the path to store the generated images
        :param batch_size: is the number of images to generate in a batch
        :param prompt: is the prompt to generate images with
        :param h: is the height of the image
        :param w: is the width of the image
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$
            (NOTE(review): formula kept from the original docstring; confirm it
            against the sampler implementation.)
        """
        # Number of channels in the image
        c = 4
        # Image to latent space resolution reduction
        f = 8

        # Make a batch of prompts
        prompts = batch_size * [prompt]

        # AMP auto casting on the device the model was moved to
        with torch.autocast(self.device.type):
            # If unconditional scaling is not $1$ get the embeddings for empty prompts (no conditioning).
            if uncond_scale != 1.0:
                un_cond = self.model.get_text_conditioning(batch_size * [""])
            else:
                un_cond = None
            # Get the prompt embeddings
            cond = self.model.get_text_conditioning(prompts)
            # [Sample in the latent space](../sampler/index.html).
            # The requested shape is `[batch_size, c, h / f, w / f]`
            x = self.sampler.sample(cond=cond,
                                    shape=[batch_size, c, h // f, w // f],
                                    uncond_scale=uncond_scale,
                                    uncond_cond=un_cond)
            # Decode the image from the [autoencoder](../model/autoencoder.html)
            images = self.model.autoencoder_decode(x)

        # Save images
        save_images(images, dest_path, 'txt_')

    # functions for pipeline
    @torch.no_grad()
    def generate_text_embeddings(self, prompt, batch_size=4, uncond_scale=7.5):
        """
        Stage 1: compute the text conditioning for a batch of identical prompts.

        :param prompt: is the prompt to generate images with
        :param batch_size: is the number of copies of the prompt in the batch
        :param uncond_scale: is the unconditional guidance scale; when it is
            exactly `1.0` no empty-prompt conditioning is computed
        :return: `(cond, un_cond)`, where `un_cond` is `None` if `uncond_scale == 1.0`
        """
        # Make a batch of prompts
        prompts = batch_size * [prompt]

        # Gradients are already disabled by the decorator, so no inner
        # `torch.no_grad()` block is needed here.
        # If unconditional scaling is not $1$ get the embeddings for empty prompts (no conditioning).
        if uncond_scale != 1.0:
            un_cond = self.model.get_text_conditioning(batch_size * [""])
        else:
            un_cond = None
        # Get the prompt embeddings
        cond = self.model.get_text_conditioning(prompts)

        # return the embeddings
        return cond, un_cond

    @torch.no_grad()
    def generate_latent_space(self, cond, un_cond, batch_size=4, uncond_scale=7.5, h=512, w=512):
        """
        Stage 2: sample latents with the configured sampler.

        :param cond: is the prompt conditioning, e.g. from `generate_text_embeddings`
        :param un_cond: is the empty-prompt conditioning, or `None`
        :param batch_size: is the number of latents to sample
        :param uncond_scale: is the unconditional guidance scale passed to the sampler
        :param h: is the height of the image
        :param w: is the width of the image
        :return: whatever the sampler returns for the requested shape
            `[batch_size, 4, h // 8, w // 8]`
        """
        # Number of channels in the image
        c = 4
        # Image to latent space resolution reduction
        f = 8

        # AMP auto casting on the device the model was moved to
        with torch.autocast(self.device.type):
            # [Sample in the latent space](../sampler/index.html).
            # The requested shape is `[batch_size, c, h / f, w / f]`
            x = self.sampler.sample(cond=cond,
                                    shape=[batch_size, c, h // f, w // f],
                                    uncond_scale=uncond_scale,
                                    uncond_cond=un_cond)

        # return the sampled latents
        return x

    @torch.no_grad()
    def generate_image(self, x):
        """
        Stage 3: decode latents with the model's autoencoder.

        :param x: are the latents to decode
        :return: the result of `self.model.autoencoder_decode(x)`
        """
        # AMP auto casting on the device the model was moved to
        with torch.autocast(self.device.type):
            # Decode the image from the [autoencoder](../model/autoencoder.html)
            image = self.model.autoencoder_decode(x)

        # return the decoded images
        return image


In [None]:
from typing import List

import torch
from torch import nn
from transformers import CLIPTokenizer, CLIPTextModel

class CLIPTextEmbedder(nn.Module):
    """
    ## CLIP Text Embedder

    Tokenizes prompts with a CLIP tokenizer and returns the last hidden state
    of the CLIP text model for them.
    """

    def __init__(self, version: str = "openai/clip-vit-large-patch14", max_length: int = 77):
        """
        :param version: is the model version
        :param max_length: is the max length of the tokenized prompt
        """
        super().__init__()
        self.max_length = max_length

        # Tokenizer and text model both come from the same pretrained version
        self.tokenizer = CLIPTokenizer.from_pretrained(version)
        text_model = CLIPTextModel.from_pretrained(version)
        text_model = text_model.eval()

        # `device` is the module-level device string chosen earlier in the notebook
        self.device = torch.device(device)
        # Keep the text model on that device
        self.transformer = text_model.to(self.device)

    def forward(self, prompts: List[str]):
        """
        :param prompts: are the list of prompts to embed
        :return: the `last_hidden_state` of the text model for the tokenized prompts
        """
        # Every prompt is truncated/padded to exactly `max_length` tokens
        encoded = self.tokenizer.batch_encode_plus(
            prompts,
            truncation=True,
            max_length=self.max_length,
            padding="max_length",
            return_tensors="pt",
        )
        # The token ids must live on the same device as the text model
        token_ids = encoded["input_ids"].to(self.device)
        model_output = self.transformer(input_ids=token_ids)
        return model_output.last_hidden_state


# Example usage: embed one prompt, then show the embedding's size and values
x = CLIPTextEmbedder()
out = x.forward(prompts=["space marines"])
print(out.size())
print(out)


## Return 128 images & calculate time

In [None]:
import time
import torch

# Initialize the Txt2Img class
txt2img = Txt2Img(checkpoint_path=os.path.join(model_path, 'v1-5-pruned-emaonly.ckpt'),
                  sampler_name='ddim',
                  n_steps=50,
                  ddim_eta=0.0)

# Define parameters for generating images
# N determines how many images will be generated in a single pass
N = 128
# h and w are divided by f below to get the latent resolution; c is the
# number of latent channels
h = 64
w = 64
c = 4
f = 8

# Create random latent vectors of shape [N, c, h // f, w // f]
latent_vectors = torch.randn(N, c, h // f, w // f).to(txt2img.device)

# CUDA work is queued asynchronously, so without synchronizing the timer could
# include still-pending earlier work (e.g. the transfer above) ...
if txt2img.device.type == "cuda":
    torch.cuda.synchronize()

# Time the decoding process (perf_counter is the clock meant for intervals)
start_time = time.perf_counter()

# Decode images from latent vectors
images = txt2img.generate_image(latent_vectors)

# ... and could stop before the decode has actually finished on the GPU.
if txt2img.device.type == "cuda":
    torch.cuda.synchronize()

# Calculate and print the time taken
# NOTE(review): this is a single, un-warmed run, so any one-time start-up cost
# of the first decode is included in the figure.
end_time = time.perf_counter()
time_elapsed = end_time - start_time
time_per_image = time_elapsed / N
print(f"number of images: {N}")
print(f"Time taken to generate images: {time_elapsed:.4f} seconds")
print(f"Time taken per image: {time_per_image:.4f} seconds")

# Return 128 random prompts & calculate time

In [None]:
import random
import numpy as np
import time

# Phrases that are passed to prompt_generator below as its sampling pool.
# NOTE(review): 'steal' looks like a typo for 'steel' (it sits next to 'stone'
# and 'rock') -- confirm before changing, since it alters the generated prompts.
phrase_list = ['2d', 'pixel art', 'cave', 'scifi', 'side scrolling', 'chibi', 'waifu', 'space ship', 'desert', 'city', 'wasteland', 'mega structure', 'steal', 'stone', 'rock']

def prompt_generator(phrase_list, prompt_word_length=32, prompt_number=1):
    """
    Build `prompt_number` random prompts out of `phrase_list`.

    Each prompt is grown by repeatedly drawing a phrase with `random.choice`.
    A drawn phrase is kept only if the prompt, counting one separating space
    per phrase, still fits into `prompt_word_length`; the first phrase that
    does not fit ends that prompt.

    :param phrase_list: is the sequence of phrases to draw from
    :param prompt_word_length: is the size budget of one prompt; despite the
        name it is measured in characters, not words
    :param prompt_number: is the number of prompts to build
    :return: the list of prompts, stripped of surrounding whitespace
    """
    generated = []
    for _ in range(prompt_number):
        pieces = []
        # Characters used so far, counting one space after every kept phrase
        used = 0
        while used < prompt_word_length:
            candidate = random.choice(phrase_list)
            needed = len(candidate) + 1
            if used + needed > prompt_word_length:
                break
            pieces.append(candidate)
            used += needed
        generated.append(' '.join(pieces).strip())
    return generated

# Time only the prompt generation. perf_counter is used because the interval
# is very short and it is the clock intended for measuring elapsed time.
start_time = time.perf_counter()
prompts = prompt_generator(phrase_list, prompt_word_length=32, prompt_number=128)
end_time = time.perf_counter()

# Printing happens after the timer has stopped, so it is not part of the result
print('Generated prompts:')
for prompt in prompts:
    print(prompt)

print('Execution time:', end_time - start_time, 'seconds')
