In [1]:
import os
import sys
import torch
import time

base_directory = "../"
sys.path.insert(0, base_directory)

from stable_diffusion2.latent_diffusion import LatentDiffusion
from stable_diffusion2.stable_diffusion import StableDiffusion
from stable_diffusion2.utils.model import *
from stable_diffusion2.utils.utils import SectionManager as section
from stable_diffusion2.utils.utils import *
from stable_diffusion2.model.clip.clip_embedder import CLIPTextEmbedder



from stable_diffusion2.model.unet.unet import UNetModel

from pathlib import Path

In [6]:
CHECKPOINT_PATH = os.path.abspath('../input/model/v1-5-pruned-emaonly.ckpt')

EMBEDDER_PATH = os.path.abspath('../input/model/clip/clip_embedder.ckpt')
TOKENIZER_PATH = os.path.abspath('../input/model/clip/clip_tokenizer.ckpt')
TRANSFORMER_PATH = os.path.abspath('../input/model/clip/clip_transformer.ckpt')

UNET_PATH = os.path.abspath('../input/model/unet/unet.ckpt')

AUTOENCODER_PATH = os.path.abspath('../input/model/autoencoder/autoencoder.ckpt')
ENCODER_PATH = os.path.abspath('../input/model/autoencoder/encoder.ckpt')
DECODER_PATH = os.path.abspath('../input/model/autoencoder/decoder.ckpt')

LATENT_DIFFUSION_PATH = os.path.abspath('../input/model/latent_diffusion/latent_diffusion.ckpt')

In [7]:
device = get_device()
latent_diffusion_model = LatentDiffusion(linear_start=0.00085,
            linear_end=0.0120,
            n_steps=1000,
            latent_scaling_factor=0.18215
            ).to(device)
latent_diffusion_model.load_submodel_tree(device = device)

INFO: Using CUDA device: NVIDIA GeForce RTX 3080 Ti


In [14]:
@torch.no_grad()
def generate_images(self, *,
             seed: int = 0,
             batch_size: int = 1,
             prompt: str,
             h: int = 512, w: int = 512,
             uncond_scale: float = 7.5,
             low_vram: bool = False,
             noise_fn = torch.randn,
             temperature: float = 1.0,
             ):
    """
    :param seed: the seed to use when generating the images
    :param dest_path: is the path to store the generated images
    :param batch_size: is the number of images to generate in a batch
    :param prompt: is the prompt to generate images with
    :param h: is the height of the image
    :param w: is the width of the image
    :param uncond_scale: is the unconditional guidance scale $s$. This is used for
        $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$
    :param low_vram: whether to limit VRAM usage
    """
    # Number of channels in the image
    c = 4
    # Image to latent space resolution reduction
    f = 8
    if seed == 0:
        seed = time.time_ns() % 2**32
    set_seed(seed)
    # Adjust batch size based on VRAM availability
    if low_vram:
        batch_size = 1
    # Make a batch of prompts
    prompts = batch_size * [prompt]
    # AMP auto casting
    autocast = get_autocast()
    with autocast:
        un_cond, cond = self.get_text_conditioning(uncond_scale, prompts, batch_size)
        # [Sample in the latent space](../sampler/index.html).
        # `x` will be of shape `[batch_size, c, h / f, w / f]`
        x = self.sampler.sample(cond=cond,
                                shape=[batch_size, c, h // f, w // f],
                                uncond_scale=uncond_scale,
                                uncond_cond=un_cond,
                                noise_fn=noise_fn,
                                temperature=temperature)
        return self.decode_image(x)

In [15]:
StableDiffusionModel.generate_images = generate_images
stable_diffusion_model = StableDiffusionModel()
stable_diffusion_model.initialize_from_model(latent_diffusion_model)

INFO: Using CUDA device: NVIDIA GeForce RTX 3080 Ti


In [16]:
def logistic_distribution(loc, scale):
    base_distribution = torch.distributions.Uniform(0, 1)
    transforms = [torch.distributions.transforms.SigmoidTransform().inv, torch.distributions.transforms.AffineTransform(loc=loc, scale=scale)]
    logistic = torch.distributions.TransformedDistribution(base_distribution, transforms)
    return logistic
noise_fn = lambda shape, device = device: logistic_distribution(loc=0.0, scale=0.49000000953674316).sample(shape).to(device)

In [17]:
imgs = stable_diffusion_model.generate_images(
    prompt = 'A woman with flowers in her hair in a courtyard, in the style of Frank Frazetta',
    seed = 2982,
    noise_fn = noise_fn
    )

save_images(imgs, 'test.png')

['A woman with flowers in her hair in a courtyard, in the style of Frank Frazetta']


HTML(value='<pre  style="overflow-x: scroll;">Sample...</pre>')

In [10]:
from scripts2.stable_diffusion_base_script import StableDiffusionBaseScript
class Txt2Img(StableDiffusionBaseScript):
    """
    ### Text to image class
    """


    @torch.no_grad()
    def generate_images(self, *,
                 seed: int = 0,
                 batch_size: int = 1,
                 prompt: str,
                 h: int = 512, w: int = 512,
                 uncond_scale: float = 7.5,
                 low_vram: bool = False,
                 noise_fn = torch.randn,
                 temperature: float = 1.0,
                 ):
        """
        :param seed: the seed to use when generating the images
        :param dest_path: is the path to store the generated images
        :param batch_size: is the number of images to generate in a batch
        :param prompt: is the prompt to generate images with
        :param h: is the height of the image
        :param w: is the width of the image
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$
        :param low_vram: whether to limit VRAM usage
        """
        # Number of channels in the image
        c = 4
        # Image to latent space resolution reduction
        f = 8

        if seed == 0:
            seed = time.time_ns() % 2**32

        set_seed(seed)
        # Adjust batch size based on VRAM availability
        if low_vram:
            batch_size = 1

        # Make a batch of prompts
        prompts = batch_size * [prompt]

        # AMP auto casting
        autocast = get_autocast()
        with autocast:
            un_cond, cond = self.get_text_conditioning(uncond_scale, prompts, batch_size)

            # [Sample in the latent space](../sampler/index.html).
            # `x` will be of shape `[batch_size, c, h / f, w / f]`
            x = self.sampler.sample(cond=cond,
                                    shape=[batch_size, c, h // f, w // f],
                                    uncond_scale=uncond_scale,
                                    uncond_cond=un_cond,
                                    noise_fn=noise_fn,
                                    temperature=temperature)

            return self.decode_image(x)

In [11]:
txt2img = Txt2Img(checkpoint_path=None)
txt2img.initialize_from_model(latent_diffusion_model)


INFO: Using CUDA device: NVIDIA GeForce RTX 3080 Ti


In [12]:
imgs = txt2img.generate_images(
    prompt = 'A woman with flowers in her hair in a courtyard, in the style of Frank Frazetta',
    seed = 2982,
    noise_fn = noise_fn
    )

save_images(imgs, 'test.png')

['A woman with flowers in her hair in a courtyard, in the style of Frank Frazetta']


HTML(value='<pre  style="overflow-x: scroll;">Sample...</pre>')

In [6]:
img = load_img("./scripts2/test_img.jpg").to(device)

In [7]:
encoded_img = latent_diffusion_model.autoencoder_encode(img)

In [8]:
decoded_img = latent_diffusion_model.autoencoder_decode(encoded_img)

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB (GPU 0; 12.00 GiB total capacity; 11.01 GiB already allocated; 0 bytes free; 11.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [1]:
import os
import sys
import torch
import time

base_directory = "../"
sys.path.insert(0, base_directory)

from stable_diffusion2.latent_diffusion import LatentDiffusion
from stable_diffusion2.stable_diffusion import StableDiffusion
from stable_diffusion2.utils.model import *
from stable_diffusion2.utils.utils import SectionManager as section
from stable_diffusion2.utils.utils import *
from stable_diffusion2.model.clip.clip_embedder import CLIPTextEmbedder



from stable_diffusion2.model.unet.unet import UNetModel

from pathlib import Path

In [2]:
device = check_device()
encoder = initialize_encoder(device = device)
decoder = initialize_decoder(device = device)

INFO: `device` is None. Falling back to current device.
INFO: Using CUDA device 0: NVIDIA GeForce RTX 3080 Ti.
Starting section: encoder initialization...
INFO: Using CUDA device 0: NVIDIA GeForce RTX 3080 Ti.
Finished section: encoder initialization in 0.27 seconds

Starting section: decoder initialization...
INFO: Using CUDA device 0: NVIDIA GeForce RTX 3080 Ti.
Finished section: decoder initialization in 0.27 seconds



In [3]:
get_memory_status()

Free: 10730 MiB
Total: 12287 MiB
Used: 1557 MiB


(tensor(10730), tensor(12287))

In [4]:
autoencoder = load_autoencoder(device = device)

Starting section: autoencoder model loading, from c:\Users\igor-\.cloned\kcg-ml-sd1p4\input\model\autoencoder\autoencoder.ckpt...
INFO: Using CUDA device 0: NVIDIA GeForce RTX 3080 Ti.
Finished section: autoencoder model loading, from c:\Users\igor-\.cloned\kcg-ml-sd1p4\input\model\autoencoder\autoencoder.ckpt in 0.00 seconds

Starting section: casting autoencoder model to device and evaling...
Finished section: casting autoencoder model to device and evaling in 0.00 seconds



In [5]:
get_memory_status()

Free: 10730 MiB
Total: 12287 MiB
Used: 1557 MiB


(tensor(10730), tensor(12287))

In [6]:
autoencoder.encoder = encoder
autoencoder.decoder = decoder

In [8]:
img = load_img("./scripts2/test_img.jpg").to(device)
encoded_img = autoencoder.encode(img)

In [9]:
get_memory_status()

Free: 1199 MiB
Total: 12287 MiB
Used: 11088 MiB


(tensor(1199), tensor(12287))

In [12]:
sample = encoded_img.sample()

In [13]:
sample.shape

torch.Size([1, 4, 64, 64])

In [14]:
torch.cuda.empty_cache()

In [15]:
get_memory_status()

Free: 1260 MiB
Total: 12287 MiB
Used: 11027 MiB


(tensor(1260), tensor(12287))

In [11]:
with section("autoencoder initialization from saved submodels"):
    autoencoder_3 = Autoencoder(emb_channels=4, z_channels=4)
    autoencoder_3.load_submodels(encoder_path=ENCODER_PATH, decoder_path=DECODER_PATH)

Starting autoencoder initialization from saved submodels...
Finished autoencoder initialization from saved submodels in 0.25 seconds


In [12]:
encoded_img_3 = autoencoder_3.encoder(img)

In [13]:
encoded_img.shape, encoded_img_3.shape

(torch.Size([1, 8, 64, 64]), torch.Size([1, 8, 64, 64]))

In [14]:
torch.norm(encoded_img_3 - encoded_img)

tensor(0., device='cuda:0', grad_fn=<LinalgVectorNormBackward0>)

In [None]:
latent_diffusion_model = 