In [1]:
import os

import torch
from diffusers import StableDiffusionPipeline, DDPMScheduler, StableDiffusionInpaintPipeline
from osm_dataset import TextToImageDataset
from config import CHECKPOINTS_DIR, MODEL_NAME, DEVICE, DATA_DIR
from torch.utils.data import DataLoader


In [2]:

def get_prediction(
    pipe: "StableDiffusionPipeline",
    prompt: str,
    height: int = 256,
    width: int = 256,
):
    """Run ``pipe`` on a single prompt and return the first generated image.

    Args:
        pipe: Callable pipeline; it is invoked as
            ``pipe(prompt, height=..., width=...)`` and must return an object
            exposing an ``images`` sequence.
        prompt: Text prompt forwarded unchanged to the pipeline.
        height: Requested output height, forwarded to the pipeline
            (defaults to 256, the value that used to be hard-coded).
        width: Requested output width, forwarded to the pipeline
            (defaults to 256, the value that used to be hard-coded).

    Returns:
        The first element of the pipeline result's ``images``.
    """
    # Inference only: make sure no autograd graph is recorded for this call.
    with torch.no_grad():
        images = pipe(prompt, height=height, width=width).images

    return images[0]


In [3]:
# !python diff_hugging.py
# import diff_hugging
# diff_hugging.main()

In [4]:
# Load the scheduler configuration stored in the checkpoint's "scheduler"
# subfolder as a DDPMScheduler.
noise_scheduler = DDPMScheduler.from_pretrained(
    "models/wroclaw_v2",
    subfolder="scheduler",
)

# Build the pipeline from the same checkpoint, plugging in the scheduler loaded
# above and disabling the safety checker, then move it onto DEVICE.
pipeline = StableDiffusionPipeline.from_pretrained(
    "models/wroclaw_v2",
    scheduler=noise_scheduler,
    safety_checker=None,
).to(DEVICE)

The config attributes {'dropout': 0.0} were passed to UNet2DConditionModel, but are not expected and will be ignored. Please verify your config.json configuration file.
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .


In [5]:
# Dataset rooted at DATA_DIR, wrapped in a loader that yields one item per
# batch (DataLoader's default ordering, i.e. no shuffling).
ds = TextToImageDataset(DATA_DIR)
dl = DataLoader(dataset=ds, batch_size=1)

Resolving data files:   0%|          | 0/2145 [00:00<?, ?it/s]

Found cached dataset imagefolder (/home/marcin/.cache/huggingface/datasets/imagefolder/Wrocław, PL-16adbde1af182d57/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/2144 [00:00<?, ? examples/s]

In [6]:
from tqdm import tqdm
import torch.nn.functional as F

def evaluate(dl, pipeline, noise_scheduler=None):
    """Estimate the mean denoising MSE of ``pipeline`` over the batches in ``dl``.

    For every batch the images are encoded to VAE latents, noise is added at a
    randomly drawn timestep, and the UNet's prediction is compared (MSE) with
    the target selected by the scheduler's ``prediction_type``.

    The result is stochastic (random noise, random timesteps, sampled
    latents); call ``torch.manual_seed(...)`` beforehand for repeatable
    numbers.

    Args:
        dl: Iterable yielding ``(img, caption)`` pairs. Both items must support
            ``.to(DEVICE)``; ``img`` is fed to ``pipeline.vae.encode`` and
            ``caption`` to ``pipeline.text_encoder``.
            NOTE(review): presumably ``caption`` holds tokenized input ids --
            confirm against the dataset implementation.
        pipeline: Object exposing ``vae``, ``text_encoder``, ``unet`` and
            ``scheduler`` attributes.
        noise_scheduler: Scheduler providing ``config``, ``add_noise`` and
            (for v-prediction) ``get_velocity``. Defaults to
            ``pipeline.scheduler`` so the function no longer depends on a
            notebook-level global.

    Returns:
        float: MSE averaged over all evaluated samples. Each batch's mean loss
        is weighted by its batch size, so a smaller final batch does not skew
        the result.

    Raises:
        ValueError: If the scheduler's prediction type is neither ``"epsilon"``
            nor ``"v_prediction"``, or if ``dl`` yields no batches.
    """
    if noise_scheduler is None:
        noise_scheduler = pipeline.scheduler

    # The prediction type does not change between batches: validate it once.
    prediction_type = noise_scheduler.config.prediction_type
    if prediction_type not in ("epsilon", "v_prediction"):
        raise ValueError(
            f"Unknown prediction type {noise_scheduler.config.prediction_type}"
        )

    total_loss = 0.0
    num_samples = 0

    # Evaluation only: skip autograd bookkeeping for the VAE, text encoder and
    # UNet forward passes.
    with torch.no_grad():
        for img, caption in tqdm(dl):
            latents = pipeline.vae.encode(img.to(DEVICE)).latent_dist.sample()
            latents = latents * pipeline.vae.config.scaling_factor

            # Sample noise that we'll add to the latents
            noise = torch.randn_like(latents)

            bsz = latents.shape[0]
            # Sample a random timestep for each image
            timesteps = torch.randint(
                0,
                noise_scheduler.config.num_train_timesteps,
                (bsz,),
                device=latents.device,
            ).long()

            # Add noise to the latents according to the noise magnitude at
            # each timestep (this is the forward diffusion process)
            noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

            # Get the text embedding for conditioning
            encoder_hidden_states = pipeline.text_encoder(caption.to(DEVICE))[0]

            # Get the target for loss depending on the prediction type
            if prediction_type == "epsilon":
                target = noise
            else:  # "v_prediction" (validated above)
                target = noise_scheduler.get_velocity(latents, noise, timesteps)

            # Predict the noise residual and compute loss
            model_pred = pipeline.unet(
                noisy_latents, timesteps, encoder_hidden_states
            ).sample

            loss = F.mse_loss(
                model_pred.float(), target.float(), reduction="mean"
            )

            # Weight the per-batch mean by its size to get a per-sample mean.
            total_loss += loss.item() * bsz
            num_samples += bsz

    if num_samples == 0:
        raise ValueError("evaluate() received an empty dataloader; nothing to evaluate")

    return total_loss / num_samples

In [7]:
# Mean denoising loss of the loaded pipeline over the whole dataloader;
# left as the last expression so the notebook displays the value.
evaluate(dl=dl, pipeline=pipeline)

100%|██████████| 2144/2144 [03:25<00:00, 10.45it/s]


0.12338643512209016