# Handle Imports

This is set up in a way that (hopefully) allows one to run this notebook either in Colab (Pro version) or locally, provided the local machine has a beefy GPU.

In [None]:
# Detect whether the notebook is running inside Google Colab by probing for the `google.colab`
# package. Only a failed import is treated as "not Colab"; any other exception (including a
# KeyboardInterrupt while the cell runs) is allowed to propagate instead of being swallowed.
try:
    import google.colab  # noqa: F401  (imported only to test for its presence)
    USING_COLAB = True
except ImportError:
    USING_COLAB = False

In [None]:
# Colab-only dependency bootstrap: each third-party package the notebook needs is installed with
# its own pip call, in the order listed. folium, pillow and scikit-image are pinned to exact
# versions; the remaining packages resolve to whatever pip currently considers the latest release.
# NOTE(review): consider pinning the unpinned packages as well, and using `%pip` so the install is
# guaranteed to target the running kernel's environment.
if USING_COLAB:
    !pip install transformers
    !pip install diffusers
    !pip install folium==0.9.1
    !pip install huggingface-hub
    !pip install pillow==9.0.0
    !pip install scikit-image==0.19.2
    !pip install accelerate
    !pip install safetensors
    !pip install sentencepiece
    !pip install -U xformers
else:
    # Outside Colab nothing is installed; the environment is expected to be prepared already.
    print("Assuming you have installed the mcmc_visanagrams package.")

In [None]:
# Separating out the installation of the mcmc_visanagrams package so that it can be re-installed as
# needed without re-installing all the other packages (or checking for them).
# The package is installed directly from the `implement_superres_composed_embeddings` branch of the
# GitHub repository. Outside Colab this cell does nothing.
if USING_COLAB:
    # NOTE: You might be able to install this as an editable pip package with the `-e` flag (before
    # the URL), meaning that you could potentially edit the package while working in Colab and
    # push the changes to GitHub. This would circumvent the need to push code before testing it.
    !pip install git+https://github.com/joshroy01/mcmc_visanagrams.git@implement_superres_composed_embeddings

In [None]:
# Colab only: switch on Colab's custom widget manager.
# NOTE(review): presumably required so that third-party ipywidgets-based output renders in Colab
# -- confirm which widget in this notebook actually needs it.
if USING_COLAB:
    from google.colab import output
    output.enable_custom_widget_manager()

In [None]:
from pathlib import Path

import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from torch import autocast
from diffusers import DDIMScheduler, DiffusionPipeline

from mcmc_visanagrams.pipelines.if_pipeline import IFPipeline

from mcmc_visanagrams.utils.display import visualize_context

# Code for Samplers
from mcmc_visanagrams.samplers.annealed_ula_sampler import AnnealedULASampler
from mcmc_visanagrams.samplers.annealed_uha_sampler import AnnealedUHASampler

from mcmc_visanagrams.utils.output import load_context, load_model_spec

%load_ext autoreload
%autoreload 2

In [None]:
# Interactive Hugging Face Hub login. The stage-1 pipeline below is loaded with
# `use_auth_token=True`, so a stored Hub token is expected to exist before the models are fetched.
# NOTE(review): presumably needed because the DeepFloyd IF checkpoints are gated -- confirm.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Trial selection and model specification.
#
# DATA_PATH is re-derived from the current working directory every time the cell runs, so the
# cell can be re-executed safely. Locally, trials live under ./data/<TRIAL_NAME>; on Colab the
# trial files are expected directly in the working directory.
DATA_PATH = Path(".").resolve()
TRIAL_NAME = "single_prompt"
# TRIAL_NAME = "two_prompt/space_lower_left"

if USING_COLAB:
    # Assuming data was uploaded to colab not in a folder tree structure.
    TRIAL_PATH = DATA_PATH
else:
    DATA_PATH = DATA_PATH / "data"
    TRIAL_PATH = DATA_PATH / TRIAL_NAME

# Checkpoints used when the trial directory does not provide a loadable model spec.
DEFAULT_MODEL_SPEC = {"stage_1": 'DeepFloyd/IF-I-XL-v1.0', 'stage_2': 'DeepFloyd/IF-II-L-v1.0'}

try:
    model_spec = load_model_spec(TRIAL_PATH)
except Exception as err:
    # Best-effort fallback is kept, but the reason is reported so that a typo in TRIAL_NAME or a
    # malformed spec file does not go unnoticed. KeyboardInterrupt/SystemExit are not swallowed.
    print(f"Could not load a model spec from {TRIAL_PATH} ({err!r}); "
          "falling back to the default DeepFloyd IF checkpoints.")
    model_spec = dict(DEFAULT_MODEL_SPEC)

STAGE_1_PRETRAINED_MODEL_ID = model_spec["stage_1"]
STAGE_2_PRETRAINED_MODEL_ID = model_spec["stage_2"]

In [None]:
# Show which stage-1 / stage-2 checkpoints were selected (loaded from the trial or the fallback).
print(model_spec)

# **MCMC Sampling to Generate Tapestries**

In [None]:
# Locations of the two stage-1 artefacts stored in the trial directory.
context_path, latents_path = TRIAL_PATH / "context.json", TRIAL_PATH / "latents.pt"

# Read back what stage 1 produced: the prompt context first, then the saved latents.
context = load_context(context_path)
latents = torch.load(latents_path)

In [None]:
guidance_mag = 20.0

# Assign every caption a reproducible random RGB colour. Context keys are
# (scale, xstart, ystart) tuples; the squared scale (key[0]**2) is stored next to the colour.
# The global NumPy seed is fixed so the colours are identical on every run.
np.random.seed(1)
color_lookup = {}
for key, entry in context.items():
    color_lookup[entry['string']] = (np.random.uniform(size=(3, )), key[0]**2)

# Render the context layout.
# NOTE(review): 128 and 64 are passed straight to visualize_context; presumably the canvas size
# and the base tile size -- confirm against that helper.
plt.figure(figsize=(5, 5))
img = visualize_context(128, 64, context, color_lookup)

plt.imshow(img)

# Empty line plots act as legend handles. Iterating over color_lookup (which is keyed by caption)
# yields exactly one legend entry per caption, even when the same caption is used by several
# context entries.
for caption, (color, _) in color_lookup.items():
    plt.plot([], [], color=color, label=caption)

plt.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))

# Save both a vector and a raster copy; the PNG keeps the axes' face colour as its background.
plt.savefig('composite_captions.pdf', bbox_inches='tight')
plt.savefig('composite_captions.png', bbox_inches='tight', facecolor=plt.gca().get_facecolor())
# %download_file composite_captions.pdf
# %download_file composite_captions.png

In [None]:
# Pick the GPU when PyTorch can see one, otherwise the CPU, and report the choice.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
print(device)

# Stage 1 is still instantiated here because saving the stage-1 embeddings has not been
# implemented yet. The weights are loaded as the fp16 variant in half precision.
stage_1 = IFPipeline.from_pretrained(
    STAGE_1_PRETRAINED_MODEL_ID,
    variant="fp16",
    torch_dtype=torch.float16,
    use_auth_token=True,
)
# Memory savers, then drop the safety checker from the pipeline.
stage_1.enable_xformers_memory_efficient_attention()
stage_1.enable_model_cpu_offload()
stage_1.safety_checker = None

In [None]:
num_steps = 1000

# Number of Langevin MCMC steps run to sample between intermediate distributions; more steps
# improves sampling quality (10 and 15 were tried before settling on 20).
la_steps = 20

# Step sizes are twice the scheduler's betas.
betas = stage_1.scheduler.betas
la_step_sizes = 2 * betas

# Quantities derived from the same noise schedule.
alphas = 1 - betas
alphas_cumprod = np.cumprod(alphas)
scalar = np.sqrt(1 / (1 - alphas_cumprod))

# NOTE(review): the three trailing None arguments are passed positionally; their meaning is
# defined by AnnealedULASampler and is not visible from this notebook.
la_sampler = AnnealedULASampler(num_steps, la_steps, la_step_sizes, None, None, None)

In [None]:
seed = 0
# Number of intermediate transition distributions to specify
steps = 100

# Seeded generator for reproducible sampling. It is created on the `device` selected when the
# stage-1 pipeline was set up (CUDA when available, otherwise CPU) rather than on a hard-coded
# 'cuda', so this cell no longer fails on machines without a GPU.
generator = torch.Generator(device).manual_seed(seed)

# Stage-1 sampling is not re-run here; the latents are loaded from disk instead. For reference,
# this is the call that would regenerate them:
# with torch.no_grad():
#     latents = stage_1(context,
#                       la_sampler,
#                       height=128,
#                       width=128,
#                       generator=generator,
#                       num_inference_steps=steps)

In [None]:
# Preview the first stage-1 latent: move it to the CPU, reorder the axes with (1, 2, 0) and map
# the values from [-1, 1] to [0, 255].
image = np.transpose(latents[0].cpu().numpy(), (1, 2, 0))
image = (image + 1) / 2 * 255

# When enabled, values outside [0, 255] are clamped before the uint8 cast instead of wrapping.
CLIP_DYNAMIC_RANGE = True
if CLIP_DYNAMIC_RANGE:
    image = np.clip(image, 0.0, 255)

image = image.astype(np.uint8)
plt.imshow(image)

In [None]:
from mcmc_visanagrams.pipelines.if_super_resolution_pipeline import IFSuperResolutionPipeline

# Stage 2 uses the project's super-resolution pipeline (not diffusers' DiffusionPipeline). It is
# loaded without a text encoder, as the fp16 variant in half precision.
stage_2 = IFSuperResolutionPipeline.from_pretrained(
    STAGE_2_PRETRAINED_MODEL_ID,
    text_encoder=None,
    variant="fp16",
    torch_dtype=torch.float16,
)
# Same memory savers as for stage 1.
stage_2.enable_xformers_memory_efficient_attention()
stage_2.enable_model_cpu_offload()

In [None]:
# Scratch check of the reshape that splits a batch into two halves along a new leading axis.
# The shape and the split shape are each defined once and reused, so the printed values always
# describe the tensors that were actually built.
reshaped_shape = (4, 2, 3, 3)
# Cast to a plain int so torch.arange receives a Python integer rather than a NumPy scalar.
reshaped_size = int(np.prod(reshaped_shape))
t1 = torch.arange(reshaped_size).reshape(reshaped_shape)
print(t1)

# (2, -1, ...) keeps every trailing dimension and folds the batch axis into 2 groups.
s = t1.size()
split_shape = (2, -1, *s[1:])
print("Split shape:", split_shape)
split = t1.reshape(split_shape)

# First group -> "uncond" half, second group -> "text" half.
noise_pred_uncond, noise_pred_text = split[0], split[1]
print(noise_pred_uncond)


In [None]:
# Run the stage-2 (super-resolution) pipeline on the stage-1 latents.
#
# This is changed from the original notebook. They were conditioning the stage 2 diffusion of
# DeepFloyd on an empty string which seems a bit ridiculous to me. As an intermediate stage, I'm
# conditioning Stage 2 on the first context's string.
# The single-prompt variant is kept below for reference; the active call passes the whole
# `context` and the sampler to the pipeline instead of precomputed prompt embeddings.
# prompt = ""
# prompt = context[(2, 0, 0)]["string"]
# prompt_embeds, negative_embeds = stage_1.encode_prompt(prompt)
# from mcmc_visanagrams.utils.latents import extract_latents_stage_2

# Number of stage-2 inference steps (other values that were tried are left commented out).
# num_steps_stage_2 = steps
num_steps_stage_2 = 25
# num_steps_stage_2 = 50
# num_steps_stage_2 = 100
# num_steps_stage_2 = 250
# Stage 2 was loaded with text_encoder=None, so it borrows the encoder already held by stage 1.
stage_2.text_encoder = stage_1.text_encoder

# Inference only: no gradients are needed.
with torch.no_grad():
    images = stage_2(
        image=latents,
        context=context,
        sampler=la_sampler,
        height=256,
        width=256,
        #  prompt_embeds=prompt_embeds,
        #  negative_prompt_embeds=negative_embeds,
        generator=generator,
        # NOTE(review): noise_level is tied to the step count here -- confirm this coupling is
        # intentional and not a leftover from experimenting.
        noise_level=num_steps_stage_2,
        output_type="pt",
        num_inference_steps=num_steps_stage_2)


In [None]:
from mcmc_visanagrams.utils.display import image_from_latents

# Full-resolution stage-2 result, converted for display with the dynamic range clipped.
images_np = image_from_latents(images, clip_dynamic_range=True)
plt.imshow(images_np)

# Second figure: the same result shrunk to 128x128 with nearest-exact interpolation, converted
# for display in the same way.
downsampled = image_from_latents(
    torch.nn.functional.interpolate(images, size=(128, 128), mode='nearest-exact'),
    clip_dynamic_range=True,
)

plt.figure()
plt.imshow(downsampled)

In [None]:
# This cell was a verbatim copy of the preceding stage-2 display cell (same image_from_latents
# conversion, same 128x128 nearest-exact downsample, same two figures, same variable names). It
# has been reduced to this note so the figures are rendered only once; the cell can be deleted
# outright when the notebook is next tidied.

In [None]:
# Scratch comparison: splitting a batch in two via reshape(2, -1, ...) versus Tensor.chunk.
# The printed difference of the first halves shows whether the two approaches agree.
shape_test = (4, 2, 3, 3)
t1 = torch.arange(np.prod(shape_test)).reshape(shape_test)

# Reshape-based split: fold the leading axis into 2 groups, keep the trailing dimensions.
s = t1.size()
split_mcmc_shape = (2, -1, *s[1:])
split_mcmc = t1.reshape(split_mcmc_shape)
split_mcmc_0, split_mcmc_1 = split_mcmc[0], split_mcmc[1]

# Chunk-based split along the leading axis.
split_normal = torch.chunk(t1, 2, dim=0)
split_normal_0, split_normal_1 = split_normal

print(split_mcmc_0 - split_normal_0)


In [None]:
# Min-max normalised view of the first stage-2 image: instead of clipping, the full value range
# is stretched linearly onto [0, 255].
stage_2_hwc = images[0].detach().cpu().numpy().transpose(1, 2, 0)

# The arithmetic is deliberately out-of-place: Tensor.numpy() on a CPU tensor returns an array
# that shares memory with the tensor, so in-place updates here could silently rescale `images`
# itself and corrupt every later use of it.
value_min = stage_2_hwc.min()
value_range = stage_2_hwc.max() - value_min
if value_range > 0:
    images_np_scaled = (stage_2_hwc - value_min) / value_range * 255
else:
    # Constant (or all-NaN) image: there is no range to stretch, and dividing would only
    # produce NaNs, so show a black image instead.
    images_np_scaled = np.zeros_like(stage_2_hwc)

plt.figure()
plt.imshow(images_np_scaled.astype(np.uint8))

In [None]:
# np.save("oil_painting_swiss_alps_no_composed_diffusion_2.npy", images)