# Quantization Stable Diffusion

_Authored by: [Thomas Liang](https://github.com/thliang01)_


- [ ] TODO: write a description of how to quantize Stable Diffusion models

## Install the required Python packages

In [None]:
! pip install --upgrade diffusers accelerate transformers safetensors datasets quanto
! pip install -q numpy Pillow torchmetrics[image] torch-fidelity

## Import modules

In [None]:
import torch
import numpy as np
import os

import time

from PIL import Image
from IPython import display as IPdisplay
from tqdm.auto import tqdm

from diffusers import DiffusionPipeline
from diffusers import DDIMScheduler
from transformers import logging

logging.set_verbosity_error()

### Check CUDA is available

In [None]:
# Report whether a CUDA device is visible to PyTorch.
print(torch.cuda.is_available())

# Prefer the GPU when present; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Base Model

In [None]:
model_name_or_path = "stabilityai/stable-diffusion-xl-base-1.0"
scheduler = DDIMScheduler.from_pretrained(model_name_or_path, subfolder="scheduler")

# Generation settings. These are arguments of the pipeline *call*, not of
# `from_pretrained`; pass them when invoking the pipeline.
num_inference_steps = 50
height = 512
width = 512
generator = torch.manual_seed(42)

# Load the fp16 weights with the DDIM scheduler and move the pipeline to the
# selected device. `height`, `width`, `generator` and `num_inference_steps`
# are deliberately not passed here: `from_pretrained` does not accept them and
# would drop them with an "unexpected keyword arguments" warning.
pipeline = DiffusionPipeline.from_pretrained(
    model_name_or_path,
    scheduler=scheduler,
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
).to(device)

## Display images

In [None]:
prompt = "a photo of an astronaut riding a horse on mars"
# Resolution, step count and RNG are call-time arguments of the pipeline, so
# they are supplied here; otherwise the pipeline falls back to its defaults.
images = pipeline(
    prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator,
).images[0]
images

## Evaluating Diffusion Models (default)

* CLIP score
* PickScore

### CLIP score

In [None]:
# Text prompts used to compute the CLIP score of the base pipeline.
prompts = [
    "a photo of an astronaut riding a horse on mars",
    "A high tech solarpunk utopia in the Amazon rainforest",
    "A pikachu fine dining with a view to the Eiffel Tower",
    "A mecha robot in a favela in expressionist style",
    "an insect robot preparing a delicious meal",
    "A small cabin on top of a snowy mountain in the style of Disney, artstation",
]

# One image per prompt, returned as a NumPy batch rather than PIL images.
images = pipeline(
    prompts,
    num_images_per_prompt=1,
    output_type="np",
    height=height,
    width=width,
).images

print(images.shape)
# Expected output: (6, 512, 512, 3)

In [None]:
from functools import partial
from torchmetrics.functional.multimodal import clip_score

# Bind the CLIP checkpoint once so callers only supply images and text.
clip_score_fn = partial(
    clip_score,
    model_name_or_path="openai/clip-vit-base-patch16",
)

def calculate_clip_score(images, prompts):
    """Return the CLIP score of ``images`` against ``prompts``, rounded to 4 decimals.

    ``images`` is multiplied by 255 and cast to uint8, converted to a tensor,
    and its last axis is moved to position 1 before being handed to
    ``clip_score_fn`` (assumes a NumPy batch laid out as (N, H, W, C) with
    values in [0, 1] -- TODO confirm against the caller).
    """
    pixels_uint8 = (images * 255).astype("uint8")
    batch = torch.from_numpy(pixels_uint8).permute(0, 3, 1, 2)
    score = clip_score_fn(batch, prompts).detach()
    return round(float(score), 4)

# Score the batch generated above and report it.
sd_clip_score = calculate_clip_score(images, prompts)
print(f"CLIP score: {sd_clip_score}")

### PickScore

In [None]:
# import
from transformers import AutoProcessor, AutoModel
from PIL import Image
import torch

# load model
# Pick the GPU only when one is actually available; a hard-coded "cuda" makes
# the `.to(device)` calls fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1"
processor = AutoProcessor.from_pretrained(processor_name_or_path)
# Inference only: switch to eval mode before moving to the device.
model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)

In [None]:
# Score function adapted from their docs
def get_scores(prompt, images):
    """Score how well each image matches ``prompt`` using the loaded score model.

    Args:
        prompt: Text to score against. Only the scores for the first prompt
            are returned (row 0 of the text/image similarity matrix).
        images: A single image or a batch of images accepted by ``processor``
            (e.g. PIL images or NumPy arrays). Float NumPy arrays with values
            in [0, 1] -- the format produced by ``output_type="np"`` -- are
            converted to uint8 first.

    Returns:
        list[float]: one score per image.
    """
    # The image processor rescales pixels by 1/255 by default and therefore
    # expects values in 0-255. Feeding it floats already in [0, 1] would
    # rescale them twice and produce near-black inputs, so convert them.
    if (
        isinstance(images, np.ndarray)
        and images.dtype.kind == "f"
        and images.size > 0
        and images.max() <= 1.0
    ):
        images = (images * 255).round().astype("uint8")

    # preprocess
    image_inputs = processor(
        images=images,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(device)

    text_inputs = processor(
        text=prompt,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        # embed, then L2-normalise so the dot product is a cosine similarity
        image_embs = model.get_image_features(**image_inputs)
        image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)

        text_embs = model.get_text_features(**text_inputs)
        text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)

        # score: scaled similarity of the first prompt against every image
        scores = model.logit_scale.exp() * (text_embs @ image_embs.T)[0]

    return scores.cpu().tolist()

In [None]:
# Score the current `images` against the astronaut prompt.
get_scores(prompt="a photo of an astronaut riding a horse on mars", images=images)

In [None]:
# Score the same `images` against a different prompt, for comparison.
get_scores(prompt="a photo of a pretty flower", images=images)

In [None]:
from datasets import load_dataset

# Load the dataset and take the captions of its `validation_unique` split.
# Note: this rebinds `prompts`, replacing the hand-written prompt list.
pap = load_dataset("yuvalkirstain/pickapic_v1_no_images")
prompts = pap["validation_unique"]["caption"]

# Peek at the first three captions.
prompts[:3]

#### Measuring the effect of CFG_Scale on Score


In [None]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

# For each guidance scale, generate one image for each of the first five
# prompts and record the mean score at that scale.
average_scores = []
cfg_scales = [2, 5, 9, 12, 30]
for cfg_scale in cfg_scales:
    scores = []
    for i, prompt in enumerate(prompts[:5]):
        print(f"Scale {cfg_scale}, prompt {i}")
        # Re-seed before every image so each (scale, prompt) pair starts from
        # the same noise; re-using an already-consumed generator would mix
        # the effect of the seed with the effect of the guidance scale.
        generator = torch.manual_seed(42)
        im = pipeline(
            prompt,
            num_inference_steps=50,
            # Same resolution as the other evaluation calls in this notebook.
            height=height,
            width=width,
            generator=generator,
            guidance_scale=cfg_scale,
        ).images[0]
        scores.append(get_scores(prompt, im)[0])
        clear_output(wait=True)
    average_scores.append(sum(scores) / len(scores))

plt.plot(cfg_scales, average_scores)

#### Using A Score Model for Re-Ranking

In [None]:
def generate_good_image(prompt, guidance_scales=(7.5, 5.0)):
    """Generate candidates at several guidance scales and return the best-scoring one.

    One image is generated per entry of ``guidance_scales``; the candidates are
    then ranked with ``get_scores`` and the highest-scoring image is returned
    (the first one on ties).

    Args:
        prompt: Text prompt to generate for and to score against.
        guidance_scales: Guidance scales to sample, one candidate each. The
            scales are given explicitly because relying on the pipeline
            default for one of the calls gives 5.0 for SDXL, i.e. the same
            scale as the explicit ``guidance_scale=5`` call.

    Returns:
        The candidate image with the highest score.
    """
    candidates = []
    for scale in guidance_scales:
        candidates += pipeline(
            prompt,
            num_inference_steps=50,
            num_images_per_prompt=1,
            height=height,
            width=width,
            guidance_scale=scale,
        ).images
    # Score them and pick the best one
    scores = get_scores(prompt, candidates)
    return candidates[scores.index(max(scores))]

generate_good_image("a photo of an astronaut riding a horse on mars")

## Quantization Stable Diffusion

* Post Training Quantization

In [None]:
import torch
from quanto import freeze, qint8, quantize

# Hugging Face id of the checkpoint to quantize.
# NOTE(review): `model` is also the name bound to the PickScore network in
# this notebook; this assignment replaces that binding.
model = "stabilityai/stable-diffusion-xl-base-1.0"

print(model)

In [None]:
def PTQ(torch_dtype, unet_dtype=None, device="cuda"):
    """Load the diffusion pipeline and optionally quantize its UNet weights.

    Args:
        torch_dtype: dtype used to load the pipeline weights.
        unet_dtype: quanto weight type (e.g. ``qint8``) for the UNet. When
            falsy, the pipeline is returned unquantized.
        device: device the pipeline is moved to before quantization.

    Returns:
        The loaded pipeline with its progress bar disabled.
    """
    # Load from `model_name_or_path` rather than the global `model`: that name
    # is also bound to the PickScore network elsewhere in this notebook, so
    # calling this function after the scoring cells would pass a module object
    # instead of a checkpoint id.
    # Call-time options (height, width, generator, num_inference_steps) are
    # not `from_pretrained` arguments and would be ignored with a warning, so
    # they are not passed here.
    pipe = DiffusionPipeline.from_pretrained(
        model_name_or_path,
        torch_dtype=torch_dtype,
        scheduler=scheduler,
        use_safetensors=True,
    ).to(device)

    if unet_dtype:
        # Quantize the UNet weights, then freeze the quantized module.
        quantize(pipe.unet, weights=unet_dtype)
        freeze(pipe.unet)

    pipe.set_progress_bar_config(disable=True)
    return pipe

In [None]:
# fp16 pipeline whose UNet weights are quantized to qint8.
qpipe = PTQ(
    torch_dtype=torch.float16,
    unet_dtype=qint8,
)

In [None]:
# Print the pipeline returned by PTQ to inspect it after quantization.
print(qpipe)

In [None]:
prompt = "a photo of an astronaut riding a horse on mars"
# Resolution, step count and RNG are call-time arguments of the pipeline, so
# they are supplied here; otherwise the pipeline falls back to its defaults.
images = qpipe(
    prompt,
    height=height,
    width=width,
    num_inference_steps=num_inference_steps,
    generator=generator,
).images[0]
images

## Evaluating Diffusion Models (After Post-Training Quantization)

* CLIP score
* PickScore

In [None]:
### CLIP score
# Text prompts used to compute the CLIP score of the quantized pipeline.
prompts = [
    "a photo of an astronaut riding a horse on mars",
    "A high tech solarpunk utopia in the Amazon rainforest",
    "A pikachu fine dining with a view to the Eiffel Tower",
    "A mecha robot in a favela in expressionist style",
    "an insect robot preparing a delicious meal",
    "A small cabin on top of a snowy mountain in the style of Disney, artstation",
]

# One image per prompt, returned as a NumPy batch rather than PIL images.
images = qpipe(
    prompts,
    num_images_per_prompt=1,
    output_type="np",
    height=height,
    width=width,
).images

print(images.shape)
# Expected output: (6, 512, 512, 3)
from functools import partial
from torchmetrics.functional.multimodal import clip_score

# Bind the CLIP checkpoint once so callers only supply images and text.
clip_score_fn = partial(
    clip_score,
    model_name_or_path="openai/clip-vit-base-patch16",
)

def calculate_clip_score(images, prompts):
    """Return the CLIP score of ``images`` against ``prompts``, rounded to 4 decimals.

    ``images`` is multiplied by 255 and cast to uint8, converted to a tensor,
    and its last axis is moved to position 1 before being handed to
    ``clip_score_fn`` (assumes a NumPy batch laid out as (N, H, W, C) with
    values in [0, 1] -- TODO confirm against the caller).
    """
    pixels_uint8 = (images * 255).astype("uint8")
    batch = torch.from_numpy(pixels_uint8).permute(0, 3, 1, 2)
    score = clip_score_fn(batch, prompts).detach()
    return round(float(score), 4)

# Score the batch produced by the quantized pipeline and report it.
sd_clip_score = calculate_clip_score(images, prompts)
print(f"CLIP score: {sd_clip_score}")

In [None]:
### PickScore
# import
from transformers import AutoProcessor, AutoModel
from PIL import Image
import torch

# load model
# Pick the GPU only when one is actually available; a hard-coded "cuda" makes
# the `.to(device)` calls fail on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor_name_or_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
model_pretrained_name_or_path = "yuvalkirstain/PickScore_v1"
processor = AutoProcessor.from_pretrained(processor_name_or_path)
# Rebinds `model` to the PickScore network (the quantization section bound the
# same name to a checkpoint id string). Inference only, hence eval mode.
model = AutoModel.from_pretrained(model_pretrained_name_or_path).eval().to(device)
# Score function adapted from their docs
def get_scores(prompt, images):
    """Score how well each image matches ``prompt`` using the loaded score model.

    Args:
        prompt: Text to score against. Only the scores for the first prompt
            are returned (row 0 of the text/image similarity matrix).
        images: A single image or a batch of images accepted by ``processor``
            (e.g. PIL images or NumPy arrays). Float NumPy arrays with values
            in [0, 1] -- the format produced by ``output_type="np"`` -- are
            converted to uint8 first.

    Returns:
        list[float]: one score per image.
    """
    # The image processor rescales pixels by 1/255 by default and therefore
    # expects values in 0-255. Feeding it floats already in [0, 1] would
    # rescale them twice and produce near-black inputs, so convert them.
    if (
        isinstance(images, np.ndarray)
        and images.dtype.kind == "f"
        and images.size > 0
        and images.max() <= 1.0
    ):
        images = (images * 255).round().astype("uint8")

    # preprocess
    image_inputs = processor(
        images=images,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(device)

    text_inputs = processor(
        text=prompt,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        # embed, then L2-normalise so the dot product is a cosine similarity
        image_embs = model.get_image_features(**image_inputs)
        image_embs = image_embs / torch.norm(image_embs, dim=-1, keepdim=True)

        text_embs = model.get_text_features(**text_inputs)
        text_embs = text_embs / torch.norm(text_embs, dim=-1, keepdim=True)

        # score: scaled similarity of the first prompt against every image
        scores = model.logit_scale.exp() * (text_embs @ image_embs.T)[0]

    return scores.cpu().tolist()
# These statements sit in the middle of a cell, where a bare expression is not
# displayed; print the values explicitly so the results are visible.
print(get_scores("a photo of an astronaut riding a horse on mars", images))
print(get_scores("a photo of a pretty flower", images))
from datasets import load_dataset
# Load the dataset and take the captions of its `validation_unique` split.
# Note: this rebinds `prompts`, replacing the hand-written prompt list.
pap = load_dataset("yuvalkirstain/pickapic_v1_no_images")
prompts = pap['validation_unique']['caption']
print(prompts[:3])
#### Measuring the effect of CFG_Scale on Score

import matplotlib.pyplot as plt
from IPython.display import clear_output

# For each guidance scale, generate one image with the quantized pipeline for
# each of the first five prompts and record the mean score at that scale.
average_scores = []
cfg_scales = [2, 5, 9, 12, 30]
for cfg_scale in cfg_scales:
    scores = []
    for i, prompt in enumerate(prompts[:5]):
        print(f"Scale {cfg_scale}, prompt {i}")
        # Re-seed before every image so each (scale, prompt) pair starts from
        # the same noise; re-using an already-consumed generator would mix
        # the effect of the seed with the effect of the guidance scale.
        generator = torch.manual_seed(42)
        im = qpipe(
            prompt,
            num_inference_steps=50,
            # Same resolution as the other evaluation calls in this notebook.
            height=height,
            width=width,
            generator=generator,
            guidance_scale=cfg_scale,
        ).images[0]
        scores.append(get_scores(prompt, im)[0])
        clear_output(wait=True)
    average_scores.append(sum(scores) / len(scores))

plt.plot(cfg_scales, average_scores)
#### Using A Score Model for Re-Ranking
def generate_good_image(prompt, guidance_scales=(7.5, 5.0)):
    """Generate candidates with the quantized pipeline and return the best-scoring one.

    One image is generated per entry of ``guidance_scales``; the candidates are
    then ranked with ``get_scores`` and the highest-scoring image is returned
    (the first one on ties).

    Args:
        prompt: Text prompt to generate for and to score against.
        guidance_scales: Guidance scales to sample, one candidate each. The
            scales are given explicitly because relying on the pipeline
            default for one of the calls gives 5.0 for SDXL, i.e. the same
            scale as the explicit ``guidance_scale=5`` call.

    Returns:
        The candidate image with the highest score.
    """
    candidates = []
    for scale in guidance_scales:
        candidates += qpipe(
            prompt,
            num_inference_steps=50,
            num_images_per_prompt=1,
            height=height,
            width=width,
            guidance_scale=scale,
        ).images
    # Score them and pick the best one
    scores = get_scores(prompt, candidates)
    return candidates[scores.index(max(scores))]

generate_good_image("a photo of an astronaut riding a horse on mars")