Install the following libraries before running this notebook:
+ tensorflow, numpy, matplotlib, tqdm, transformers

Import libraries

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import keras.backend as K
from tqdm import tqdm
from keras.utils import plot_model

A function to draw images

In [None]:
def draw(image_list, k=3, cmap='gray'):
    """Show the images of ``image_list`` on a grid with ``k`` columns.

    Images are placed row by row, so image ``index`` lands in row
    ``index // k`` and column ``index % k``. Nothing is drawn when
    ``image_list`` is empty.

    Args:
        image_list: Indexable collection of images (must support ``len()``
            and integer indexing); each item is passed to ``Axes.imshow``.
        k: Number of columns in the grid.
        cmap: Colormap forwarded to ``imshow``.
    """
    n = len(image_list)
    if n == 0:
        return

    nrows = (n-1)//k+1
    # squeeze=False makes subplots() always return a 2-D array of Axes.
    # With the default squeezing, a 1x1 grid comes back as a bare Axes and an
    # Nx1 grid as a 1-D array, which broke the indexing whenever k == 1.
    fig1, plots1 = plt.subplots(figsize=(5*k, 5*nrows), nrows=nrows, ncols=k, squeeze=False)

    # .flat walks the grid row by row, matching index = k*row + col.
    for index, ax in enumerate(plots1.flat):
        if index < n:
            ax.imshow(image_list[index], cmap=cmap)
        else:
            # Hide the empty frames of panels that have no image.
            ax.axis("off")

## 1). CIFAR-10

### 1.1) Diffusion Models

Setup Diffusion functions

In [None]:
# Noise schedule: T betas spaced linearly from beta_0 to beta_T.
T = 1000
beta_0, beta_T = 1e-4, 0.02

betas = tf.linspace(beta_0, beta_T, T)
alphas = 1.0 - betas
# Running product of the alphas, with one extra 1.0 appended at the end so
# that index -1 can stand in for alpha_0 = 1.0.
alpha_cum = tf.concat([K.cast(np.cumprod(alphas), "float32"), [1.0]], axis=0)

def forward_diffusion(x_0, t):
    """Noise ``x_0`` to the timestep(s) ``t`` in a single step.

    Looks up one cumulative alpha per entry of ``t`` in the module-level
    ``alpha_cum`` and mixes ``x_0`` with freshly sampled Gaussian noise:
    ``x_t = sqrt(a) * x_0 + sqrt(1 - a) * noise``.

    Args:
        x_0: Clean batch. The coefficients are reshaped to ``(-1, 1, 1, 1)``,
            so a 4-D input with the batch on axis 0 is assumed.
        t: Indices into ``alpha_cum``.

    Returns:
        Tuple ``(x_t, noise)`` with the noised batch and the noise that was added.
    """
    noise = tf.random.normal(shape=tf.shape(x_0))
    # One coefficient per gathered index, shaped to broadcast over the other axes.
    a_bar = K.reshape(K.gather(alpha_cum, t), (-1, 1, 1, 1))
    signal_scale = a_bar**0.5
    noise_scale = (1-a_bar)**0.5
    return signal_scale*x_0 + noise_scale*noise, noise

def denoise(model, x, y, T, n_steps=50, nu=0.5):
    """Generate samples by iteratively denoising ``x`` over a strided set of timesteps.

    Timesteps ``0, stride, 2*stride, ... < T`` are visited from the largest
    down to 0. At each one the model predicts the noise, an estimate of the
    clean sample is formed, and ``x`` is moved to the previous timestep.
    Cumulative alphas are read from the module-level ``alpha_cum``, whose last
    entry is expected to be 1.0 (it is addressed with index -1).

    Args:
        model: Noise predictor, called as ``model([x, t, y], training=False)``.
        x: Starting batch; axis 0 is the batch dimension.
        y: Conditioning input passed unchanged to ``model``.
        T: Upper bound (exclusive) of the timesteps used to index ``alpha_cum``.
        n_steps: Requested number of sampling steps; the stride between visited
            timesteps is ``T // n_steps``, but never less than 1.
        nu: Scale of the per-step noise standard deviation; 0 adds no noise.

    Returns:
        The final ``x`` clipped to [-1, 1] and rescaled to [0, 1].

    Raises:
        ValueError: If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    # prepare steps
    n = tf.shape(x)[0]
    # T // n_steps is 0 when n_steps > T and range() rejects a zero step,
    # so fall back to visiting every timestep in that case.
    stride = max(1, T // n_steps)
    to = list(range(0, T, stride))
    to.append(-1) # sentinel: alpha_cum[-1] is used as alpha_cum_0=1.0
    # Sampling
    for i in tqdm(list(reversed(range(len(to)-1)))):
        # Cumulative alphas of the current and the previous (less noisy) step.
        # For i == 0, to[i-1] wraps around to the -1 sentinel appended above.
        a_t = alpha_cum[to[i]]
        a_prev = alpha_cum[to[i-1]]
        # Get time
        t = tf.repeat(to[i], n)
        # Predict the noise
        predicted_noise = model([x, t, y], training=False)
        # Sample a noise
        noise = tf.random.normal(shape=(tf.shape(x)))
        # Calculate the std
        sigma = nu*((1-a_prev)/(1-a_t))**0.5*(1-a_t/a_prev)**0.5
        # Get the predicted x0
        predicted_x0 = (x-(1-a_t)**0.5*predicted_noise)/a_t**0.5
        # Calculate direction pointing to x_t; the clamp keeps the square root
        # real if the difference ends up slightly below zero.
        direction = tf.maximum(1-a_prev-sigma**2, 0.0)**0.5 * predicted_noise
        # Get x_t-1
        x = a_prev**0.5 * predicted_x0 + direction + sigma*noise
    # Clip to the range
    x = tf.clip_by_value(x, -1.0, 1.0)*0.5+0.5
    return x

Load the saved U-Net model

In [None]:
# Load the saved U-Net from ./models; it is passed to denoise() as the
# noise-prediction model further down.
unet = tf.keras.models.load_model("./models/unet_1000_cifar_cond")

See model structure

In [None]:
# Print the summary of the loaded U-Net.
unet.summary()

In [None]:
# Class ids 0..9, each repeated 10 times in a row -> 100 labels
# (10 elements from each of the 10 classes).
label = tf.repeat(tf.range(10), 10)
label

In [None]:
# Starting Gaussian noise for the sampler: 100 samples of shape 32x32x3,
# one per entry of `label`.
x = tf.random.normal(shape=(100, 32, 32, 3))

Launch the generation

In [None]:
# Sample with 50 strided steps instead of all T steps:
# 50 gives good results and is faster than 1000 steps.
ddpm_imgs = denoise(model=unet, x=x, T=T, y=label, n_steps=50)

In [None]:
# 10 columns: since labels come in runs of 10, each grid row shows one class.
draw(ddpm_imgs, k=10)

### 1.2) GANs

In [None]:
# Load the saved GAN model for this section; gan.predict() below is called
# with a latent batch and class labels.
gan = tf.keras.models.load_model("./models/ccifar_gan_200/")

See model structure

In [None]:
# Print the summary of the loaded GAN model.
gan.summary()

In [None]:
# Class ids 0..9, each repeated 10 times in a row -> 100 labels
# (10 elements from each of the 10 classes).
label = tf.repeat(tf.range(10), 10)
label

In [None]:
# Random Gaussian latent vectors: 100 samples of dimension 128, one per label.
latent = tf.random.normal(shape=(100, 128))

In [None]:
# Generate images and rescale them with x*0.5+0.5 for display.
# NOTE(review): assumes the generator output lies in [-1, 1] — confirm.
fake_imgs = gan.predict([latent, label])*0.5+0.5

In [None]:
# 10 columns: since labels come in runs of 10, each grid row shows one class.
draw(fake_imgs, k=10)

## 2). CUB-200

Import Text model: CLIP

If CLIP does not load properly, restart the kernel, re-run the library import cell, and then run the cell below again (without loading the previous models first).

In [None]:
from transformers import AutoTokenizer, AutoProcessor, TFCLIPModel

# Load the CLIP model together with its tokenizer and processor, all from the
# same "openai/clip-vit-base-patch32" checkpoint.
clip = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
# NOTE(review): `processor` is not used by any cell visible in this notebook.
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
def get_embeddings(text):
    """Encode ``text`` with the CLIP text model.

    Returns a tensor that concatenates, along axis 1, the pooled text output
    (with a length-1 axis inserted at position 1) followed by the last hidden
    states of the text model.
    """
    # The second string only takes part in padding; its row is dropped by [:1].
    # NOTE(review): presumably ". "*23 pins the padded length to a fixed size — confirm.
    batch = tokenizer([text,  ". "*23], padding=True, return_tensors="tf")
    input_ids = batch["input_ids"][:1]
    # An all-zero image is passed along with the tokens; only the text output is read.
    # NOTE(review): presumably required because the joint CLIP call expects pixel_values — confirm.
    clip_inputs = {
        "input_ids": input_ids,
        "pixel_values": np.zeros(shape=(1, 3, 224, 224)),
    }
    text_out = clip(clip_inputs)["text_model_output"]
    pooled = text_out["pooler_output"][:, None, :]
    return tf.concat((pooled, text_out["last_hidden_state"]), axis=1)

GANs

Load model

In [None]:
# Load the saved text-conditioned GAN. This rebinds `gan`, replacing the
# model loaded in the CIFAR-10 section.
gan = tf.keras.models.load_model("./models/attngan_350/")

See model structure

In [None]:
# Print the summary of the loaded GAN model.
gan.summary()

Text Prompt

In [None]:
# Prompt to condition on, and how many images to generate for it.
text = "a red bird, high resolution, high quality"
k = 5 # 5 images

In [None]:
# One random 100-dimensional latent vector per image to generate.
to_be_visualised = tf.random.normal(shape=(k, 100))
# Repeat the prompt embedding k times along axis 0 so each latent gets the same text.
prompt = tf.repeat(get_embeddings(text), k, 0)

In [None]:
# The model returns three outputs; the second one is discarded.
# NOTE(review): `weights` is presumably the attention weights — confirm against the model definition.
imgs, _, weights = gan.predict([to_be_visualised, prompt])

In [None]:
# Rescale with x*0.5+0.5 for display and draw with the default 3 columns.
# NOTE(review): assumes the generator output lies in [-1, 1] — confirm.
draw(imgs*0.5+0.5)