In [None]:
# Stable Diffusion Pipeline

from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
import torch

# Checkpoint identifier passed to both `from_pretrained` calls below.
model_id = "stabilityai/stable-diffusion-2-1-base"

# Load the scheduler from the checkpoint's "scheduler" subfolder and hand it to
# the pipeline explicitly, so sampling uses EulerDiscreteScheduler.
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
# The pipeline is requested with torch.float16 as its dtype.
pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=scheduler,
    torch_dtype=torch.float16,
)
# Turn the pipeline's progress bar off.
pipe.set_progress_bar_config(disable=True)
# Move the pipeline to the GPU; this cell needs a CUDA device to run.
pipe = pipe.to("cuda")
# NOTE(review): presumably requires the optional `xformers` package — confirm.
pipe.enable_xformers_memory_efficient_attention()

with torch.inference_mode():
    prompt = "a photo of an astronaut riding a horse on mars"
    # Generation is currently disabled, so this block only defines `prompt`.
    # Uncomment the next line (and `image.show()` below) to render one sample.
    # image = pipe(prompt).images[0]

# image.show()

In [None]:
# MS-COCO
from torchvision.datasets import CocoCaptions

# Build the captions dataset from the local val2017 image folder and its
# captions annotation file. Both paths are relative to the working directory.
dataset = CocoCaptions(
    root="./datasets/ms-coco/val2017/",
    annFile="./datasets/ms-coco/annotations/captions_val2017.json",
)

# Sanity check: call `.show()` on the first element of the first sample
# (presumably a PIL image opened in an external viewer — confirm).
dataset[0][0].show()

In [None]:
# DiffusionDB
import numpy as np
from datasets import load_from_disk
import open_clip


# Load the raw DiffusionDB subset inside this cell. Without it, `dataset` is
# whatever an earlier cell left in the kernel (the CocoCaptions object when the
# notebook is run top to bottom), which has no `.filter` / `.remove_columns`.
# Reloading also keeps the cell re-runnable: `remove_columns` below would fail
# on a dataset whose columns were already dropped by a previous run.
dataset = load_from_disk("./datasets/diffusiondb/2m_random_50k")

# Keep only rows generated with one fixed configuration and low NSFW scores.
dataset = dataset.filter(
    # filter width and height
    lambda x: (x["width"] == 512 and x["height"] == 512)
    # filter diffusion hyperparameters
    and (x["step"] == 50 and x["cfg"] == 7 and x["sampler"] == "k_lms")
    # filter nsfw
    and (x["image_nsfw"] < 0.2 and x["prompt_nsfw"] < 0.1)
)

# Drop the metadata columns; the ones used for filtering above are now constant
# or no longer needed.
dataset = dataset.remove_columns(
    [
        "seed",
        "step",
        "cfg",
        "sampler",
        "width",
        "height",
        "user_name",
        "timestamp",
        "image_nsfw",
        "prompt_nsfw",
    ]
)

# Keep prompts whose tokenization contains between 1 and 73 token ids equal to 0.
# NOTE(review): presumably id 0 is the padding id of the open_clip tokenizer, so
# this drops prompts that fill/overflow the context window (no padding) as well
# as near-empty prompts (almost all padding) — confirm against open_clip.
tokenizer = open_clip.get_tokenizer("ViT-B-32")
filtered_dataset = dataset.filter(
    lambda x: 0 < (tokenizer(x["prompt"]) == 0).sum().item() < 74
)

In [None]:
# DiffusionDB
import numpy as np
from datasets import load_from_disk


# Rebind `dataset` to the pre-filtered DiffusionDB subset stored on disk.
# NOTE(review): no visible cell writes "./datasets/diffusiondb/filtered";
# presumably it was saved from `filtered_dataset` in an earlier session — confirm.
dataset = load_from_disk("./datasets/diffusiondb/filtered")
# Bare last expression so the notebook displays the dataset's repr.
dataset

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Language model used to score prompts in `calculate_perplexity` below.
# This cell needs a CUDA device.
device = torch.device("cuda")
# Note: rebinds `model_id`, which the Stable Diffusion cell also assigns.
model_id = "gpt2-large"
model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
# Note: rebinds `tokenizer`, which the DiffusionDB filtering cell sets to the
# open_clip tokenizer; after this cell it is the GPT-2 tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)


def calculate_perplexity(prompt, stride=512):
    """Return the perplexity of ``prompt`` under the module-level ``model``.

    The prompt is tokenized with the module-level ``tokenizer`` and scored with
    a sliding window: each window holds at most ``model.config.n_positions``
    tokens, starts ``stride`` tokens after the previous one, and only
    contributes loss for tokens that no earlier window has scored (the
    overlapping prefix is context only).

    The per-window mean loss returned by the model is weighted by the number of
    tokens that window actually scored, so the result is the exponential of the
    mean negative log-likelihood per scored token. An unweighted mean of window
    losses would over-weight a short final window.

    Args:
        prompt: Text to score. Must be a ``str``.
        stride: Number of tokens the window start advances per step.

    Returns:
        The perplexity as a Python float, or ``nan`` when the prompt yields no
        scorable token (empty prompt, or a single token with nothing to
        predict it from).
    """
    assert isinstance(prompt, str)
    encodings = tokenizer(prompt, return_tensors="pt")
    max_length = model.config.n_positions
    seq_len = encodings.input_ids.size(1)

    nll_sum = 0.0  # sum of per-token negative log-likelihoods over all windows
    n_scored = 0  # number of tokens contributing to nll_sum
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        # Mask the context-only prefix; -100 labels are ignored by the loss.
        target_ids[:, :-trg_len] = -100

        # The model shifts labels by one position internally (per the
        # transformers docs), so the first label of a window is never scored.
        # Count what is left to know how many tokens the mean loss covers.
        n_window = int((target_ids[:, 1:] != -100).sum().item())
        if n_window > 0:
            with torch.no_grad():
                outputs = model(input_ids, labels=target_ids)
            # outputs.loss is a mean over n_window tokens; turn it back into a sum.
            nll_sum = nll_sum + outputs.loss * n_window
            n_scored += n_window

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored == 0:
        # Nothing to predict: perplexity is undefined.
        return float("nan")
    return torch.exp(nll_sum / n_scored).item()


# Spot check: print one prompt from `dataset` next to its perplexity.
# The index is arbitrary; it must be smaller than len(dataset).
i = 500
print(dataset[i]["prompt"], calculate_perplexity(dataset[i]["prompt"]))