**Notes**
1. To reproduce the images provided in our submission, the ideal runtime environment for this Jupyter notebook is Google Colab with an A100 GPU.
2. You may need a Hugging Face account to download the LoRA adapters, but registration is free.
3. If you run into an issue with insufficient GPU memory (which might happen if you use an L4 GPU instead of A100), try decreasing the size of a batch to 1.
4. Make sure to set the working directory of your Jupyter notebook to the project root directory (NOT the `notebooks` folder) to avoid import errors.

In [None]:
# Install and load necessary packages
%%capture
!pip install accelerate diffusers transformers
!pip install -U peft
!pip install git+https://github.com/openai/CLIP.git

from copy import deepcopy
import torch
from diffusers import StableDiffusionXLPipeline
import matplotlib.pyplot as plt
import random
# import custom inference & postprocessing functions we wrote
# make sure to set your pwd as the project root directory to avoid importing errors
from src.SDXL_inference import ScoreFusion_inference, postprocess_image 

## Initialize the pipeline and the two auxiliary U-Nets

In [None]:
# IMPORTANT: Need to run this code first to initialize the pipeline object, so that it initiliazes cross_attention_kwargs
%%capture
PROFESSION = 'mathematics scientist'
positive_prompt = f"a photo of a {PROFESSION}, looking at the camera, ultra quality, sharp focus"
negative_prompt = 'cartoon, anime, 3d, painting, b&w, low quality'
w1 = torch.tensor(0.5, device="cuda", dtype=torch.float16)
w2 = torch.tensor(0.5, device="cuda", dtype=torch.float16)
adapters=["white_male", "asian_female"]

model_dict = {
    "asian_female": "NYUAD-ComNets/Asian_Female_Profession_Model",
    "white_male": "NYUAD-ComNets/White_Male_Profession_Model"
}
models = [model_dict[name] for name in adapters]
pipeline = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", use_safetensors=True, torch_dtype=torch.float16).to("cuda")
for i,j in zip(models,adapters):
    pipeline.load_lora_weights(i, weight_name="pytorch_lora_weights.safetensors",adapter_name=j)

pipeline.set_adapters(adapters, adapter_weights=[1.0, 0.0])
unet1 = deepcopy(pipeline.unet).cuda()
pipeline.set_adapters(adapters, adapter_weights=[0.0, 1.0])
unet2 = deepcopy(pipeline.unet).cuda()
pipeline.set_adapters(adapters, adapter_weights=[0.0, 0.0]) # unset LoRA adapters

# Note: this is just for 'warmstarting' the pipeline, so disregard the images.
_ = pipeline(prompt=positive_prompt,negative_prompt=negative_prompt,
        num_inference_steps=2, num_images_per_prompt=1).images

## Sample images from the ScoreFusion model

In [None]:
# TOTAL_SIZE images are generated in NUM_BATCHES calls of BATCH_SIZE images each.
# If GPU memory is insufficient, raise NUM_BATCHES so that BATCH_SIZE shrinks.
TOTAL_SIZE, NUM_BATCHES = 16, 4
BATCH_SIZE, leftover = divmod(TOTAL_SIZE, NUM_BATCHES)
if leftover:
    # Truncating here would silently produce fewer than TOTAL_SIZE images.
    raise ValueError(
        f"TOTAL_SIZE ({TOTAL_SIZE}) must be divisible by NUM_BATCHES ({NUM_BATCHES})"
    )

generator = torch.Generator(device="cuda")
generator.manual_seed(0)  # set seed for reproducibility
torch.cuda.empty_cache()

# Run NUM_BATCHES batches of BATCH_SIZE images each (4 batches of 4 by default).
# The same generator is shared across batches, so its state carries over.
all_latents = []
for _ in range(NUM_BATCHES):
    images_latents = ScoreFusion_inference(
        pipeline=pipeline,
        l1=w1,
        l2=w2,
        unet1=unet1,
        unet2=unet2,
        num_images_per_prompt=BATCH_SIZE,
        num_inference_steps=100,
        prompt=positive_prompt,
        negative_prompt=negative_prompt,
        height=1024,
        width=1024,
        generator=generator,
        output_type='latent',  # don't use VAE in this step; do it outside the loop explicitly
    ).images
    all_latents.append(images_latents)
    torch.cuda.empty_cache()
batched_latents = torch.cat(all_latents, dim=0)

# Post-process the latents, mapping them back to pixel space using the VAE.
# Each latent is decoded on its own (as a batch of one) -- presumably to keep
# peak GPU memory low; TODO confirm against postprocess_image.
torch.cuda.empty_cache()
images = [postprocess_image(pipeline, latent.unsqueeze(0)).images[0] for latent in batched_latents]

In [None]:
# Show the first 16 generated images on a 4x4 grid with the axes hidden.
fig, axs = plt.subplots(nrows=4, ncols=4, figsize=(16, 16))
for ax, img in zip(axs.ravel(), images[:16]):
    ax.imshow(img)
    ax.set_axis_off()
fig.tight_layout()
plt.show()