<a href="https://colab.research.google.com/github/danielpatrickhug/karpathy_stablediffusionwalk_notebook_adaption/blob/main/stable_diffusion_walk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stable Diffusion Walk
Notebook adaptation of @karpathy's [stablediffusionwalk](https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355) gist.

## Setup

In [None]:
# Install the model libraries used by this notebook (diffusers, transformers, scipy).
# NOTE(review): versions are unpinned and --upgrade pulls the latest release; the code
# below relies on specific scheduler/pipeline call conventions, so consider pinning.
!pip install --upgrade diffusers transformers scipy
# Python bindings used in the video-encoding cell (`import ffmpeg`).
!pip install ffmpeg-python
# NOTE(review): ftfy is not imported anywhere in this notebook; presumably an optional
# dependency of the tokenizer stack — confirm it is still needed.
!pip install ftfy

## Nvidia Info
- Check if a GPU is initialized
    - If a GPU is not initialized, go to Edit/Notebook settings and set "Hardware Accelerator" to GPU

In [None]:
# Print the GPU status reported by the nvidia-smi command-line tool.
!nvidia-smi

## Hugging Face Auth

In [None]:
# Log in with the Hugging Face CLI; the model-loading cell passes use_auth_token=True
# and so expects a stored token.
!huggingface-cli login

## Imports

In [None]:
import torch
from torch import autocast
import os
from diffusers import StableDiffusionPipeline
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
from PIL import Image
from IPython import display
import numpy as np
import pandas as pd
import inspect
from IPython.display import Image as IPImage
from types import SimpleNamespace

# Seed torch's RNG, which the torch.randn calls in the dream loop draw from.
torch.manual_seed(7)
# Device string used for the pipeline, the text-encoder inputs and the sampled latents.
device = "cuda"

## Function definition

In [None]:
@torch.no_grad()
def diffuse(
        pipe,
        cond_embeddings, # text conditioning, should be (1, 77, 768)
        cond_latents,    # image conditioning, should be (1, 4, 64, 64)
        num_inference_steps,
        guidance_scale,
        eta,
    ):
    """Denoise one latent with classifier-free guidance and decode it to an image.

    Args:
        pipe: pipeline object exposing ``tokenizer``, ``text_encoder``, ``unet``,
            ``vae`` and ``scheduler``.
        cond_embeddings: text-encoder output for the prompt; its second dimension
            is used as the token length of the unconditional (empty) prompt.
        cond_latents: starting noise latent; the unconditional embedding is moved
            to this tensor's device.
        num_inference_steps: number of scheduler steps to run.
        guidance_scale: weight applied to (text - unconditional) noise predictions.
        eta: forwarded to ``scheduler.step`` only when that method accepts ``eta``.

    Returns:
        A ``numpy.uint8`` array holding the first decoded image, channels last
        (assuming the VAE returns a batch in N, C, H, W layout).
    """
    # `.device` is valid for CPU and CUDA tensors alike; `get_device()` returns an
    # int (-1 on CPU) that cannot be passed to `.to()`.
    latents_device = cond_latents.device

    # classifier-free guidance: add the unconditional (empty prompt) embedding
    max_length = cond_embeddings.shape[1] # 77
    uncond_input = pipe.tokenizer([""], padding="max_length", max_length=max_length, return_tensors="pt")
    uncond_embeddings = pipe.text_encoder(uncond_input.input_ids.to(latents_device))[0]
    text_embeddings = torch.cat([uncond_embeddings, cond_embeddings])

    # init the scheduler
    set_timesteps_params = inspect.signature(pipe.scheduler.set_timesteps).parameters
    extra_set_kwargs = {}
    if "offset" in set_timesteps_params:
        extra_set_kwargs["offset"] = 1
    pipe.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs)

    # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas.
    # This must come after set_timesteps(): that call rebuilds `sigmas` for the
    # requested step count, so reading sigmas[0] earlier picks up a stale table.
    if isinstance(pipe.scheduler, LMSDiscreteScheduler):
        cond_latents = cond_latents * pipe.scheduler.sigmas[0]

    # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
    # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
    # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
    # and should be between [0, 1]
    step_params = inspect.signature(pipe.scheduler.step).parameters
    extra_step_kwargs = {}
    if "eta" in step_params:
        extra_step_kwargs["eta"] = eta

    # diffuse!
    for i, t in enumerate(pipe.scheduler.timesteps):

        # expand the latents for classifier free guidance (uncond + cond in one batch)
        latent_model_input = torch.cat([cond_latents] * 2)
        if isinstance(pipe.scheduler, LMSDiscreteScheduler):
            sigma = pipe.scheduler.sigmas[i]
            latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

        # predict the noise residual
        noise_pred = pipe.unet(latent_model_input, t, encoder_hidden_states=text_embeddings)["sample"]

        # cfg: push the prediction away from the unconditional one
        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

        # compute the previous noisy sample x_t -> x_t-1
        # (the LMS scheduler is stepped with the loop index, the others with the timestep)
        if isinstance(pipe.scheduler, LMSDiscreteScheduler):
            cond_latents = pipe.scheduler.step(noise_pred, i, cond_latents, **extra_step_kwargs)["prev_sample"]
        else:
            cond_latents = pipe.scheduler.step(noise_pred, t, cond_latents, **extra_step_kwargs)["prev_sample"]

    # scale and decode the image latents with vae
    cond_latents = 1 / 0.18215 * cond_latents
    image = pipe.vae.decode(cond_latents)

    # generate output numpy image as uint8: map [-1, 1] -> [0, 1] -> [0, 255]
    image = (image[0] / 2 + 0.5).clamp(0, 1)
    image = image.cpu().permute(0, 2, 3, 1).numpy()
    image = (image[0] * 255).astype(np.uint8)

    return image


def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    """Spherically interpolate between two arrays ``v0`` and ``v1``.

    Args:
        t: interpolation fraction; 0 yields ``v0`` and 1 yields ``v1``.
        v0: start array, either a ``numpy.ndarray`` or an object with
            ``.device`` and ``.cpu().numpy()`` (e.g. a torch tensor).
        v1: end array of the same kind and shape as ``v0``.
        DOT_THRESHOLD: when the absolute cosine between the inputs exceeds this
            value, plain linear interpolation is used instead.

    Returns:
        The interpolated array. NumPy inputs give a NumPy array; other inputs
        are converted back with ``torch.from_numpy`` and moved to ``v0``'s device.
    """
    # Always define the flag: previously it was only set on the torch path, so
    # NumPy inputs raised UnboundLocalError at the final check.
    inputs_are_torch = not isinstance(v0, np.ndarray)
    if inputs_are_torch:
        input_device = v0.device
        v0 = v0.cpu().numpy()
        v1 = v1.cpu().numpy()

    # cosine of the angle between the two (flattened) inputs
    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    if np.abs(dot) > DOT_THRESHOLD:
        # nearly (anti-)parallel: sin(theta) is ~0, so fall back to lerp
        v2 = (1 - t) * v0 + t * v1
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        sin_theta_t = np.sin(theta_t)
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1

    if inputs_are_torch:
        v2 = torch.from_numpy(v2).to(input_device)

    return v2

## Dreaming

### Load Stable Diffusion Model

In [None]:
# Model identifier passed to from_pretrained below.
model_checkpoint = "CompVis/stable-diffusion-v1-4"


# Build the LMS scheduler explicitly and hand it to the pipeline; diffuse() has
# LMS-specific branches keyed on this scheduler type.
lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
# use_auth_token=True: presumably reads the token stored by the login cell — confirm.
pipe = StableDiffusionPipeline.from_pretrained(model_checkpoint, scheduler=lms, use_auth_token=True)

pipe = pipe.to(device)
# NOTE(review): the three per-module moves below look redundant after pipe.to(device)
# — confirm before removing.
pipe.unet.to(device)
pipe.vae.to(device)
pipe.text_encoder.to(device)

# Marker that the cell ran to the end.
print('w00t')

### Prompt and output paths

In [None]:
# Text prompt; it is tokenized and encoded once and reused for every frame.
prompt = "blueberry spaghetti" 

In [None]:
# NOTE(review): this path points inside Google Drive, but no cell visible here mounts
# Drive — presumably it must be mounted first, otherwise frames land on local disk; confirm.
root_dir = "/content/drive/MyDrive/StableDreams"
# Sub-directory name for this run; also used as the mp4 file name later on.
dream_name = "Blueberry"
outdir = os.path.join(root_dir, dream_name)
# Create the output directory (and parents) if it does not exist yet.
os.makedirs(outdir, exist_ok=True)

In [None]:
def DreamArgs():
    """Collect the run settings and return them as a dict via ``locals()``.

    Every local assigned below becomes one key of the returned dict, so adding a
    local adds a setting. The ``#@param`` markers are Colab form annotations.

    Keys:
        height, width: output size; the latents are sampled at 1/8 of each.
        max_frames: upper bound on ``frame_index`` in the dream loop's ``while``.
        num_steps: number of interpolation points between two noise samples.
        num_inference_steps, guidance_scale, eta: forwarded to ``diffuse``.
        quality: ``quality`` argument used when saving each JPEG frame.
    """
    height = 512 #@param {type:"number"}
    width = 512 #@param {type:"number"}
    max_frames = 10 #@param {type:"number"}
    num_steps = 200 #@param {type:"number"}
    num_inference_steps = 50 #@param {type:"number"}
    guidance_scale = 7.5 #@param {type:"number"}
    eta = 0.0 #@param {type:"number"}
    quality = 90 #@param {type:"number"}
    return locals()

# Expose the settings as attributes (args.height, args.num_steps, ...).
args = SimpleNamespace(**DreamArgs())

In [None]:
# get the conditional text embeddings based on the prompt
text_input = pipe.tokenizer(prompt, padding="max_length", max_length=pipe.tokenizer.model_max_length, truncation=True, return_tensors="pt")
# Inference only: without no_grad the embedding would keep the text encoder's
# autograd graph (and its activations) alive for as long as it is referenced.
with torch.no_grad():
    cond_embeddings = pipe.text_encoder(text_input.input_ids.to(device))[0] # shape [1, 77, 768]

In [None]:
# sample a source
init1 = torch.randn((1, pipe.unet.in_channels, args.height // 8, args.width // 8), device=device)

# iterate the loop: each pass walks from the current noise sample to a fresh one
frame_index = 0
while frame_index < args.max_frames:

    # sample the destination
    init2 = torch.randn((1, pipe.unet.in_channels, args.height // 8, args.width // 8), device=device)

    for t in np.linspace(0, 1, args.num_steps):
        # Enforce max_frames inside the segment too; checking only in the outer
        # `while` let a whole segment (num_steps frames) run past the limit.
        if frame_index >= args.max_frames:
            break

        init = slerp(float(t), init1, init2)

        print("dreaming... ", frame_index)
        with autocast("cuda"):
            image = diffuse(pipe, cond_embeddings, init, args.num_inference_steps, args.guidance_scale, args.eta)
        im = Image.fromarray(image)
        # zero-padded names keep the frames in order for the glob in the encoding cell
        outpath = os.path.join(outdir, 'frame%06d.jpg' % frame_index)
        im.save(outpath, quality=args.quality)
        frame_index += 1

    # the destination becomes the next segment's source
    init1 = init2

### Dream Seam

In [None]:
import ffmpeg
mp4_path = f'{outdir}/{dream_name}.mp4'
# Stitch every JPEG frame in the output directory into one mp4 at 10 fps.
# overwrite_output=True lets the cell be re-run: without it ffmpeg refuses to
# replace an existing mp4 and the call fails.
(
    ffmpeg
    .input(f"{outdir}/*.jpg", pattern_type='glob', framerate=10)
    .output(mp4_path)
    .run(overwrite_output=True)
)

In [None]:
from base64 import b64encode
# Read the encoded video through a context manager so the file handle is closed.
with open(mp4_path, 'rb') as video_file:
    mp4 = video_file.read()
# Embed the whole file as a base64 data URL so the player needs no separate file.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()

display.display( display.HTML(f'<video controls loop><source src="{data_url}" type="video/mp4"></video>') )
