In [1]:
from guided_diffusion.script_util import create_model_and_diffusion, model_and_diffusion_defaults


In [6]:
# Build the unconditional 256x256 guided-diffusion model and load its pretrained weights.
import torch

# Choose the device first so the precision setting below can follow it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Half precision is only switched on when running on CUDA; the CPU fallback stays in fp32
# so that the fallback path and the precision setting agree with each other.
use_fp16 = device.type == "cuda"

# Start from the library defaults and print them for reference.
model_args = model_and_diffusion_defaults()
print(model_args)
# Override the defaults with the configuration used for this checkpoint.
model_args.update({
    'image_size': 256,
    'num_channels': 256,  # Output channels of the first conv layer.
    'num_res_blocks': 2,
    # Predict a variance term in addition to the noise, so the model output has
    # 6 channels (3 for the noise, 3 for the variance term).
    'learn_sigma': True,
    'class_cond': False,  # Disables class conditioning.
    'diffusion_steps': 1000,  # Number of timesteps in the noising / denoising process.
    # Linear schedule: the per-step noise level grows by a constant increment, so early
    # forward steps add only a little noise and later steps add progressively more.
    # The reverse (denoising) process walks the same steps in the opposite order.
    'noise_schedule': 'linear',
    # Self-attention is applied at the 32x32, 16x16 and 8x8 feature-map resolutions.
    'attention_resolutions': "32, 16, 8",
    # Each attention head has 64 channels, so the number of heads at a layer is that
    # layer's channel count divided by 64. Every head has its own Q, K, V projections
    # and computes scaled dot-product attention independently.
    'num_head_channels': 64,
    'use_fp16': use_fp16,  # Half precision (FP16) on CUDA only, see above.
    'resblock_updown': True,
})

# Create the model and the diffusion process from the merged arguments.
model, diffusion = create_model_and_diffusion(**model_args)
if use_fp16:
    model.convert_to_fp16()

# Load the pretrained checkpoint.
# map_location="cpu" keeps deserialisation independent of the device the file was saved
# from; the model is moved to `device` later. weights_only=True restricts unpickling to
# tensors and plain containers, which avoids executing arbitrary pickled code.
# NOTE(review): assumes the file is a plain state dict, as the load_state_dict call
# below already requires - confirm if the checkpoint format ever changes.
ckpt = torch.load(
    "output_hev/imagenet/checkpoint/256x256_diffusion_uncond.pt",
    map_location="cpu",
    weights_only=True,
)
model.load_state_dict(ckpt, strict=True)

# Switch to eval mode; as the last expression this also displays the model structure.
model.eval()

{'image_size': 64, 'num_channels': 128, 'num_res_blocks': 2, 'num_heads': 4, 'num_heads_upsample': -1, 'num_head_channels': -1, 'attention_resolutions': '16,8', 'channel_mult': '', 'dropout': 0.0, 'class_cond': False, 'use_checkpoint': False, 'use_scale_shift_norm': True, 'resblock_updown': False, 'use_fp16': False, 'use_new_attention_order': False, 'learn_sigma': False, 'diffusion_steps': 1000, 'noise_schedule': 'linear', 'timestep_respacing': '', 'use_kl': False, 'predict_xstart': False, 'rescale_timesteps': False, 'rescale_learned_sigmas': False}


  ckpt = torch.load("output_hev/imagenet/checkpoint/256x256_diffusion_uncond.pt")


UNetModel(
  (time_embed): Sequential(
    (0): Linear(in_features=256, out_features=1024, bias=True)
    (1): SiLU()
    (2): Linear(in_features=1024, out_features=1024, bias=True)
  )
  (input_blocks): ModuleList(
    (0): TimestepEmbedSequential(
      (0): Conv2d(3, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (1-2): 2 x TimestepEmbedSequential(
      (0): ResBlock(
        (in_layers): Sequential(
          (0): GroupNorm32(32, 256, eps=1e-05, affine=True)
          (1): SiLU()
          (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
        (h_upd): Identity()
        (x_upd): Identity()
        (emb_layers): Sequential(
          (0): SiLU()
          (1): Linear(in_features=1024, out_features=512, bias=True)
        )
        (out_layers): Sequential(
          (0): GroupNorm32(32, 256, eps=1e-05, affine=True)
          (1): SiLU()
          (2): Dropout(p=0.0, inplace=False)
          (3): Conv2d(256, 256, kernel_size=(3

In [3]:
from data.utils.dataloaders import get_imagenet_dataloader
# Keep the batch small: only a couple of images are needed for the forward pass.
imgnet_batch_size = 2
imgnet_dataloader = get_imagenet_dataloader(batch_size=imgnet_batch_size)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Pull a single batch from a fresh iterator over the dataloader, then unpack its
# two components.
first_batch = next(iter(imgnet_dataloader))
first_images, first_labels = first_batch

In [7]:

# Run one forward pass of the model on the first batch at a fixed timestep.
model.to(device=device)

desired_t = 0.5  # Timestep value fed to the model for every image in the batch
# NOTE(review): the model was configured with diffusion_steps=1000. If timesteps are
# step indices in [0, 1000), 0.5 sits almost at the noise-free end rather than halfway
# through the schedule - confirm the intended scale.
t = torch.full((first_images.shape[0],), desired_t, device=device)
# NOTE(review): the images are passed to the model as loaded; this cell does not apply
# any forward-noising step first - confirm that is intended.
first_images = first_images.to(device=device)

print(first_images.dtype, t.dtype)
# Inference only: with autograd disabled no graph is recorded, so the activations of
# this forward pass are not kept alive for a backward pass that never happens.
with torch.no_grad():
    noise_and_variance = model(first_images, t)

torch.float32 torch.float32


In [16]:
# Cut the model output into two equal chunks along dim 1: the first is kept as the
# noise prediction, the second as the variance output.
# NOTE(review): `variance` is the raw second half of the output, not necessarily a
# variance value that can be used directly - confirm how it is parameterised.
noise, variance = torch.chunk(noise_and_variance, chunks=2, dim=1)

In [59]:
# Peek at the prediction: index 0 along the first two dims, then the first ten
# entries along the third.
print(noise[0, 0, :10])

tensor([[ 4.4883, -0.8354,  4.2383,  ..., -0.1281, -0.3445, -0.3804],
        [ 4.6719,  0.1807,  0.0278,  ...,  0.0310, -0.0050, -0.0418],
        [ 4.0234,  0.4199, -0.3218,  ..., -0.5283, -0.4951, -0.4685],
        ...,
        [ 0.0731, -0.5625, -0.0626,  ..., -0.5273, -0.5132, -0.4624],
        [ 2.3848, -0.0480, -0.1290,  ...,  0.0052, -0.0585,  0.0618],
        [-0.1399,  0.4019, -0.2094,  ..., -0.3772,  0.0716, -0.6270]],
       device='cuda:0', dtype=torch.float16)
