# Install the diffusers library that encapsulates the model and the inference pipeline.

In [None]:
!pip install diffusers # state-of-the-art library for diffusion model training and inference, including text-to-image models
!pip install imscore # library and collection of models for aesthetic scoring
!pip install einops # library needed for preparing the inputs for imscore models
!pip install transformers==4.56.2 # state-of-the-art library for language model training and inference, needed by the diffusers library

# Import the necessary libraries

In [None]:
import torch
from diffusers import UNet2DConditionModel, StableDiffusionPipeline, DPMSolverMultistepScheduler

# Instantiate the original model with the original UNet weights from Hugging Face

In [None]:
# Load the Stable Diffusion pipeline (autoencoder, text encoder, and UNet)
# with float16 weights and move it to the GPU.
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1",
    torch_dtype=torch.float16,
    cache_dir="./cache",
).to("cuda")

# Noise scheduler according to which the predicted noise is removed from the
# noisy latent. It is rebuilt from the config of the pipeline's own scheduler.
# The pipeline was already moved to the GPU above, so it is not moved again
# after the scheduler swap.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# Run the pipeline without a safety checker.
pipe.safety_checker = None

# The seeded generator defines the noisy latent the UNet starts from.
# manual_seed() returns the generator, so it can be created and seeded at once.
generator = torch.Generator(device="cuda").manual_seed(0)

prompt = "A fox in an autumn forest."
# guidance_scale is how strongly the prompt guides the denoising process.
image_base = pipe(prompt=prompt, generator=generator, guidance_scale=5).images[0]
image_base

# Instantiate the fine-tuned UNet with its new weights. Ensure that the UNet weights (.safetensors) and config.json are under sd-finetune/unet

In [None]:
import os

# The fine-tuned UNet lives in a private Hugging Face repository, so an access
# token is needed. It is read from the HF_TOKEN environment variable instead of
# being written into the notebook: a token committed to source control must be
# treated as leaked and should be revoked and reissued.
# If HF_TOKEN is unset, token=None is passed; presumably the library then falls
# back to a locally stored login (e.g. `huggingface-cli login`) — confirm.
hf_token = os.environ.get("HF_TOKEN")

finetuned_unet = UNet2DConditionModel.from_pretrained(
    "KetevanK/SD_fine_tuning",
    subfolder="unet",
    token=hf_token,
    torch_dtype=torch.float16,
    cache_dir="./cache",
).to("cuda")

# Swap the base model's UNet for the fine-tuned UNet; every other pipeline
# component is left untouched.
pipe.unet = finetuned_unet

# The generator seed must be the same for both models. This ensures both UNets
# start from the same noisy latent vector, so the images are comparable.
generator = torch.Generator(device="cuda").manual_seed(0)

image_mine = pipe(prompt=prompt, generator=generator, guidance_scale=5).images[0]
image_mine

# Evaluate the generated images aesthetically with the HPSv2 aesthetic model

In [None]:
import numpy as np
from einops import rearrange
# imscore (https://github.com/RE-N-Y/imscore)
# library offers a collection of aesthetic scoring models and was used in
# my quantitative evaluation
from imscore.hps.model import HPSv2
from imscore.mps.model import MPS
from imscore.pickscore.model import PickScorer
from imscore.imreward.model import ImageReward
# Since the aesthetic scorer is a fine-tuned CLIP model itself,
# loading the model through the library will download the necessary files
# before the images can be evaluated. Downloading the files might take a bit
# of time.

# Exactly one `scorer = ...` line should be active. HPSv2 is the default; to
# use a different scorer, comment out the HPSv2 line and uncomment one of the
# alternatives below.
scorer = HPSv2.from_pretrained("RE-N-Y/hpsv21")
# scorer = MPS.from_pretrained("RE-N-Y/mpsv1")
# scorer = ImageReward.from_pretrained("RE-N-Y/ImageReward")
# scorer = PickScorer("yuvalkirstain/PickScore_v1")

def convert_to_torch_tensor(pixels):
    """Turn an image into a batched, channel-first tensor scaled by 1/255.

    ``pixels`` is converted with ``np.array`` and must come out as a 3-D
    ``(height, width, channels)`` array, because the "h w c" pattern below
    requires exactly three axes. The result has shape
    ``(1, channels, height, width)`` with every value divided by 255.0.
    """
    hwc = torch.tensor(np.array(pixels))
    # Move channels first and prepend a batch axis of size 1.
    batched_chw = rearrange(hwc, "h w c -> 1 c h w")
    return batched_chw / 255.0

# Score both images in one batch against the same prompt.
# Index 0 is the base model's image, index 1 is the fine-tuned model's image.
ims = [image_base, image_mine]
image_tensor = torch.cat(
    [convert_to_torch_tensor(im) for im in ims], dim=0
).to("cuda")
scorer.to("cuda").eval()

with torch.inference_mode():
    scores = scorer.score(image_tensor, [prompt] * len(ims))

# Softmax over all scores so they become relative preferences that sum to 1.
# torch.softmax replaces the hand-written exp()/exp().sum(): it evaluates the
# exponential once and is numerically stable for large scores. Flattening and
# reshaping keeps the normalisation over every element, as before, while
# preserving the shape returned by the scorer.
scores = torch.softmax(scores.flatten(), dim=0).reshape(scores.shape)
print(scores)

# if scores[0] > scores[1] → image_base is better
# if scores[0] < scores[1] → image_mine is better