In [1]:
import os
import sys

# Make the local tf-transformers checkout importable.
# The location can be overridden with the TF_TRANSFORMERS_SRC environment
# variable; the default is the path this notebook was originally run with.
TF_TRANSFORMERS_SRC = os.environ.get(
    "TF_TRANSFORMERS_SRC",
    "/Users/sarathrnair/Projects/tf-transformers/src/",
)

# Guard the append so re-running this cell does not add duplicate entries.
if TF_TRANSFORMERS_SRC not in sys.path:
    sys.path.append(TF_TRANSFORMERS_SRC)

In [2]:
import tensorflow as tf
from unet import UnetModel
from tf_transformers.models import SentenceTransformer
from gaussian_diffusion import GaussianDiffusion



In [3]:
# Pretrained sentence-T5 checkpoint used as the text encoder.
model_name = 'sentence-transformers/sentence-t5-base'

# return_layer=True is forwarded to from_pretrained as in the original call
# (presumably returns a Keras layer rather than a full model — TODO confirm).
text_encoder = SentenceTransformer.from_pretrained(
    model_name,
    return_layer=True,
)

# Mark the encoder as non-trainable.
text_encoder.trainable = False

Metal device set to: Apple M1


INFO:absl:Successful ✅✅: Model checkpoints matched and loaded from /Users/sarathrnair/.cache/huggingface/hub/tftransformers__sentence-t5-base-sentence-transformers.main.d64dbdc4c8c15637da4215b81f38af99d48a586c/ckpt-1
INFO:absl:Successful ✅: Loaded model from tftransformers/sentence-t5-base-sentence-transformers


In [4]:
# Hyper-parameters for the text-conditioned U-Net. They stay as top-level
# names because later cells read them (e.g. `input_channels` in the config).
input_channels = 3
out_channels = 128
channel_mult = [1, 2, 3, 4]
num_res_blocks = 3
time_emb = 128
text_emb = 768

unet = UnetModel(
    # Must match the output dimension of the text encoder.
    text_embedding_dimension=text_emb,
    # Should be the same value in the BaseDiffusion model.
    time_embedding_dimension=time_emb,
    out_channels=out_channels,
    channel_mult=channel_mult,
    input_channels=input_channels,
    num_res_blocks=num_res_blocks,
    attention_resolutions=[32, 16, 8],
    cross_attention_resolutions=[32, 16, 8],
    use_scale_shift_norm=True,
)


In [5]:
# Report the parameter count of the text-conditioned U-Net built above.
unet.count_params()

97237763

In [6]:
# NOTE(review): exact duplicate of the previous cell (same call, same recorded
# output); it can be deleted without losing information.
unet.count_params()

97237763

In [17]:
# Diffusion settings handed to GaussianDiffusion; the image size and channel
# count are read again by the smoke-test cell that builds the dummy batch.
config = {
    'beta_schedule': 'linear',
    'diffusion_steps': 1000,
    'image_height': 32,
    'image_width': 32,
    'input_channels': input_channels,
}

# Combine the text encoder and the U-Net, then keep only the model returned
# by get_model() under the name `model`.
model = GaussianDiffusion(
    config,
    text_encoder_model=text_encoder,
    unet_model=unet,
).get_model()

In [18]:
# Smoke test: push one synthetic batch through the diffusion model to check
# that it accepts the input signature reported by `model.input`.

# Seed TensorFlow's global RNG so re-running this cell yields the same batch.
tf.random.set_seed(42)

batch_size = 4
text_sequence_length = 96
height = config['image_height']
width  = config['image_width']
in_channels = config['input_channels']
diffusion_steps = config['diffusion_steps']

image_shape = (batch_size, height, width, in_channels)

# Stand-in for a batch of original images (values in [0, 1)).
image = tf.random.uniform(image_shape)

# Random token ids and a random 0/1 attention mask for the text encoder.
input_ids = tf.random.uniform(minval=0, maxval=100, shape=(batch_size, text_sequence_length), dtype=tf.int32)
input_mask = tf.random.uniform(minval=0, maxval=2, shape=(batch_size, text_sequence_length), dtype=tf.int32)

# One diffusion step index per example; shape (1, batch_size) matches the
# (1, None) `time_steps` input reported by `model.input`.
time_steps = tf.random.uniform(minval=0, maxval=diffusion_steps, shape=(1, batch_size), dtype=tf.int32)

# Gaussian diffusion adds standard-normal noise, so sample the noise tensor
# from N(0, 1) rather than from a uniform distribution.
noise = tf.random.normal(image_shape)

inputs = {
    'input_pixels': image,
    'noise': noise,
    'input_ids': input_ids,
    'input_mask': input_mask,
    'time_steps': time_steps,
}

model_outputs = model(inputs)


In [8]:
# Inspect the named inputs (shape and dtype) the diffusion model expects.
model.input

{'input_pixels': <KerasTensor: shape=(None, 32, 32, 3) dtype=float32 (created by layer 'input_pixels')>,
 'time_steps': <KerasTensor: shape=(1, None) dtype=int32 (created by layer 'time_steps')>,
 'noise': <KerasTensor: shape=(None, 32, 32, 3) dtype=float32 (created by layer 'input_noise')>,
 'input_ids': <KerasTensor: shape=(None, None) dtype=int32 (created by layer 'input_ids')>,
 'input_mask': <KerasTensor: shape=(None, None) dtype=int32 (created by layer 'input_mask')>}

In [9]:
# Inspect the named outputs (shape and dtype) the diffusion model produces.
model.output

{'xpred': <KerasTensor: shape=(None, 32, 32, 3) dtype=float32 (created by layer 'diffusion')>,
 'noise': <KerasTensor: shape=(None, 32, 32, 3) dtype=float32 (created by layer 'diffusion')>}

In [11]:
# Serialize the diffusion model to /tmp/diffusion_temp2; the following cell
# reads this directory back with tf.saved_model.load.
model.save_serialized("/tmp/diffusion_temp2")

2022-06-20 16:12:58.108475: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /tmp/diffusion_temp2/assets


INFO:tensorflow:Assets written to: /tmp/diffusion_temp2/assets


In [12]:
# Reload the serialized model from /tmp/diffusion_temp2.
loaded = tf.saved_model.load("/tmp/diffusion_temp2")
# Keep the default serving signature of the loaded model.
# NOTE(review): this rebinds `model` from the object that was saved to the
# loaded signature, so earlier cells that use `model` (e.g. `model.input`,
# `model.save_serialized`) see a different object if re-run after this one.
model = loaded.signatures['serving_default']

### Without Text Encoder

In [3]:
# Unconditional setup: the same diffusion pipeline, built with no text encoder.
input_channels = 3

# Diffusion settings handed to GaussianDiffusion.
config = {
    'beta_schedule': 'linear',
    'diffusion_steps': 1000,
    'image_height': 32,
    'image_width': 32,
    'input_channels': input_channels,
}

# U-Net hyper-parameters.
out_channels = 128
channel_mult = [1, 2, 3, 4]
num_res_blocks = 3
time_emb = 128
text_emb = 768  # same value as the text-conditioned setup; not passed to this U-Net

unet = UnetModel(
    text_embedding_dimension=None,  # None
    # Should be the same value in the BaseDiffusion model.
    time_embedding_dimension=time_emb,
    out_channels=out_channels,
    channel_mult=channel_mult,
    input_channels=input_channels,
    num_res_blocks=num_res_blocks,
    attention_resolutions=[32, 16, 8],
    use_scale_shift_norm=True,
)

# Wrap the U-Net (text_encoder_model=None) and keep only the model returned
# by get_model() under the name `model`.
model = GaussianDiffusion(
    config,
    text_encoder_model=None,
    unet_model=unet,
).get_model()

Metal device set to: Apple M1


In [4]:
# Parameter count of the model returned by the U-Net's get_model().
unet.get_model().count_params()

87483267

In [7]:
# Parameter count of the full diffusion model; the recorded value equals the
# U-Net count recorded for `unet.get_model().count_params()`.
model.count_params()

87483267

In [5]:
# Inspect the named inputs of the diffusion model built without a text encoder.
model.input

{'input_pixels': <KerasTensor: shape=(None, 32, 32, 3) dtype=float32 (created by layer 'input_pixels')>,
 'time_steps': <KerasTensor: shape=(1, None) dtype=int32 (created by layer 'time_steps')>,
 'noise': <KerasTensor: shape=(None, 32, 32, 3) dtype=float32 (created by layer 'input_noise')>}

In [6]:
# Inspect the named inputs of the U-Net on its own.
# NOTE(review): the recorded output lists 64x64 pixels here, while `config`
# and `model.input` use 32x32 — confirm whether UnetModel needs the image
# size passed explicitly.
unet.input

{'input_pixels': <KerasTensor: shape=(None, 64, 64, 3) dtype=float32 (created by layer 'input_pixels')>,
 'time_steps': <KerasTensor: shape=(1, None) dtype=int32 (created by layer 'time_steps')>}