In [62]:
from diffusers import StableDiffusionPipeline
import torch
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", variant="fp16", dtype=torch.float16, use_auth_token=False).to("cuda")

Keyword arguments {'dtype': torch.float16, 'use_auth_token': False} are not expected by StableDiffusionPipeline and will be ignored.


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

In [63]:
prompt_0 = "Horse grazing in the meadows"
prompt_1 = "Zebra grazing in the meadows"

In [64]:
from base64 import b64encode
import numpy as np
from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
from huggingface_hub import notebook_login

# For video display:
from IPython.display import HTML
from matplotlib import pyplot as plt
from pathlib import Path
from PIL import Image
from torch import autocast
from torchvision import transforms as tfms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer, logging
import os

In [65]:
!ls ~/.cache/huggingface/hub # check if model is cached

models--CompVis--stable-diffusion-v1-4	models--openai--clip-vit-large-patch14


In [66]:
torch.manual_seed(1)
if not (Path.home()/'.cache/huggingface'/'token').exists(): notebook_login()

In [67]:
torch_device = "cuda" if torch.cuda.is_available() else "mps" if torch.backens.mps.is_available() else "cpu"
print(torch_device)

cuda


In [68]:
# load the autoencoder model which will be used to decode the latents into image space
ae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")

In [69]:
# Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

In [70]:
# load the unet model which predicts the amount of noise in the latents

unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")

# many options for noise scheduler are available. we go with the  Linear MultiStep Discrete Scheduler (LMSDiscrete).
scheduler = LMSDiscreteScheduler(beta_start = 0.00085, beta_end = 0.012, beta_schedule = "scaled_linear", num_train_timesteps = 1000)



In [71]:
# moving all the models to gpu
vae = ae.to(torch_device)
text_encoder = text_encoder.to(torch_device)
unet = unet.to(torch_device)

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 14.12 MiB is free. Process 6301 has 14.72 GiB memory in use. Of the allocated memory 14.41 GiB is allocated by PyTorch, and 191.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def denoise(prompt):
  height, width = 512, 512 # default resolution of image
  num_inference_steps = 50 # number of inference steps
  guidance_scale = 7.5 # the sweet spot to balance image generation diversity and adherence to prompt
  generator = torch.manual_seed(42) # to replicate results
  batch_size = 1

  # get the text embedding for the prompts
  text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")

  with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
  max_length = text_input.input_ids.shape[-1]

  # get the text embedding for the unconditional image generation
  uncond_str = [""] * batch_size
  uncond_input = tokenizer(uncond_str, padding = "max_length", max_length = max_length, return_tensors = "pt")

  with torch.no_grad():
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

  text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) # concatenate both unconditional and prompt conditional embeddings
  scheduler.set_timesteps(num_inference_steps)

  # prepare the latents

  latents = torch.randn(
      batch_size, unet.config.in_channels, height // 8, width // 8,
      generator = generator
  ) # random noise scaled down to 1/8 th the resolution of original image
  latents = latents.to(torch_device) # moving to GPU
  latents = latents * scheduler.init_noise_sigma # scaling the latents
  with autocast("cuda") : # convert to the correct type in cuda (memory efficient and accurate)
    for i, t in tqdm(enumerate(scheduler.timesteps), total = len(scheduler.timesteps)):
      # expand the latents to the UNet model
      latent_model_input = torch.cat([latents] * 2)
      # scale the latents - pre-conditioning process
      latent_model_input = scheduler.scale_model_input(latent_model_input, t)

      with torch.no_grad():
        # predict the noise in the latent space at time step t
        noise_pred = unet(latent_model_input, t, encoder_hidden_states = text_embeddings).sample
      noise_pred_uncond,noise_pred_text = noise_pred.chunk(2)
      # mix them in the required proportion - similar to Kalman filter update
      direction = noise_pred_text - noise_pred_uncond
      direction = direction / (direction.norm(p=2,dim=(1,2,3), keepdim=True) + 1e-8)   # normalize

      noise_pred = noise_pred_uncond + guidance_scale * direction
      # update the latents by removing the predicted noise
      latents = scheduler.step(noise_pred, t, latents).prev_sample
  return latents


In [None]:
latents1 = denoise(prompt_0)
latents2 = denoise(prompt_1)

In [None]:
def latent_to_pil(latent):
  # batch of latents -> image
  latent = 1/0.18125 * latent
  with torch.no_grad():
    image = vae.decode(latent).sample
  # reverse process of pil_to_latent
  image = (image / 2 + 0.5).clamp(0,1)
  image = image.detach().cpu().permute(0,2,3,1).numpy()
  image = (image * 255).round().astype(np.uint8)
  pil_image = [Image.fromarray(img) for img in image]
  return pil_image

In [None]:
horse_img = latent_to_pil(latents1)
zebra_img = latent_to_pil(latents2)

In [None]:
horse_img[0]

In [None]:
zebra_img[0]

In [None]:
print(latents1.shape, latents2.shape)

In [None]:
def otsu_threshold(diff_tensor):
    vals = diff_tensor.flatten()
    # Flatten and compute histogram (256 bins for [0,1] range)
    hist = torch.histc(diff_tensor.flatten(), bins=256, min=vals.min(), max=vals.max())
    hist = hist / hist.sum()  # Normalize to PDF

    # Cumulative sums for weights and means
    w = torch.cumsum(hist, dim=0)  # Class probabilities
    mu = torch.cumsum(hist * torch.arange(256, dtype=torch.float32) / 255.0, dim=0)

    # Total mean
    mu_total = mu[-1]

    # Between-class variance: w * (1-w) * (mu_t - mu)^2
    mu_b = mu.clone()
    variance = w * (1.0 - w) * ((mu_total * w - mu_b) ** 2)

    # Find max variance threshold (avoid endpoints)
    thresh_idx = variance[1:-1].argmax() + 1
    return thresh_idx.float() / 255.0  # Scale back to [0,1]

In [None]:
# normalize to [0,1]
diff = torch.norm(latents1-latents2, dim=1, keepdim=True).to("cpu")
dmin, dmax = diff.min(), diff.max()
norm_diff = (diff - dmin) / (dmax - dmin + 1e-8)

threshold = otsu_threshold(norm_diff)

# horse region = 1, background = 0
binary_mask = (norm_diff > threshold).float().to(diff.device)

In [None]:
binary_mask.to(torch_device)

In [None]:
prompt = prompt_1
height, width = 512, 512 # default resolution of image  num_inference_steps = 50 # number of inference steps
guidance_scale = 7.5 # the sweet spot to balance image generation diversity and adherence to prompt
generator = torch.manual_seed(42) # to replicate results
batch_size = 1
num_inference_steps = 50

# get the text embedding for the prompts
text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")

with torch.no_grad():
  text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
max_length = text_input.input_ids.shape[-1]
# get the text embedding for the unconditional image generation
uncond_str = [""] * batch_size
uncond_input = tokenizer(uncond_str, padding = "max_length", max_length = max_length, return_tensors = "pt")

with torch.no_grad():
 uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

text_embeddings = torch.cat([uncond_embeddings, text_embeddings]) # concatenate both unconditional and prompt conditional embeddings
scheduler.set_timesteps(num_inference_steps)

# prepare the latents

latents = latents1.clone()
latents = latents.to(torch_device) # moving to GPU
latents = latents * scheduler.init_noise_sigma # scaling the latents

# Move binary_mask to the same device as latents
binary_mask = binary_mask.to(torch_device)

with autocast("cuda") : # convert to the correct type in cuda (memory efficient and accurate)
    for i, t in tqdm(enumerate(scheduler.timesteps), total = len(scheduler.timesteps)):
      # expand the latents to the UNet model
      latent_model_input = torch.cat([latents] * 2)
      # scale the latents - pre-conditioning process
      latent_model_input = scheduler.scale_model_input(latent_model_input, t)

      with torch.no_grad():
        # predict the noise in the latent space at time step t
        noise_pred = unet(latent_model_input, t, encoder_hidden_states = text_embeddings).sample
      noise_pred_uncond,noise_pred_text = noise_pred.chunk(2)
      # mix them in the required proportion - similar to Kalman filter update
      direction = noise_pred_text - noise_pred_uncond
      direction = direction / (direction.norm(p=2,dim=(1,2,3), keepdim=True) + 1e-8)   # normalize

      noise_pred = noise_pred_uncond + guidance_scale * direction
      # update the latents by removing the predicted noise
      new_latent = scheduler.step(noise_pred, t, latents).prev_sample
      latents = binary_mask * new_latent + (1-binary_mask) * latents

In [None]:
zebra_img2 = latent_to_pil(latents)

In [None]:
zebra_img2[0]