## Compare the behavior of diffuser Trainer and train5

- Create a shared dataloader
- Create the same model
- Seed everything
- Compare the initial model weights (before any training)
- Train for one step
- Check the loss
- Compare the model weights again after the training step



In [1]:
import random
import sys
# Put the directory three levels above the working directory on the import
# path so the `examples.*` import below resolves. This is relative to wherever
# the notebook is launched from (presumably the repository root — confirm).
sys.path.append("../../../")

import os
# Set before the project import below pulls in its dependencies.
# NOTE(review): presumably this is the cuBLAS workspace setting that
# torch.use_deterministic_algorithms(True) (enabled later in this notebook)
# needs on CUDA — confirm against the PyTorch reproducibility notes.
os.environ["CUBLAS_WORKSPACE_CONFIG"]=":4096:8"

from examples.image_gen.nano_diffusion.train5 import train_loop

2024-10-09 21:11:15.993201: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-09 21:11:16.036015: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX512F AVX512_VNNI, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-09 21:11:18,092 - datasets - INFO - PyTorch version 2.0.1 available. (config.py:54)
2024-10-09 21:11:18,093 - datasets - INFO - TensorFlow version 2.13.1 available. (config.py:101)
2024-10-09 21:11:18,094 - datasets - INFO - JAX version 0.4.14 available. (config.py:114)


In [2]:
from torchvision.datasets import CIFAR10
from torchvision import transforms
from torch.utils.data import DataLoader
import torch
# Ask PyTorch for deterministic kernels everywhere, and cuDNN specifically.
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True

# Per-channel mean and std used to map ToTensor's output into [-1, 1].
channel_mean = (0.5, 0.5, 0.5)
channel_std = (0.5, 0.5, 0.5)

# No random augmentation here (RandomHorizontalFlip stays disabled), so the
# transform itself introduces no randomness into the batches.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
)

# Training split of CIFAR10, read in a fixed order (no shuffling).
dataset = CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform,
)
dataloader = DataLoader(dataset, shuffle=False, batch_size=32)


Files already downloaded and verified


In [3]:
from examples.image_gen.nano_diffusion.train5 import create_model
from torch import optim
import numpy as np

# Seed every RNG before building the model. `random` is seeded too, matching
# the three-way seeding done right before each training run in this notebook.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
denoising_model = create_model().to(device)

optimizer = optim.AdamW(denoising_model.parameters(), lr=1e-4)
# The lambda always returns 1, so the learning rate stays at its initial value.
lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda step: 1)

# Linear beta schedule over `num_timesteps` steps and the quantities derived from it.
num_timesteps = 1000
betas = torch.linspace(1e-4, 0.02, num_timesteps)
alphas = 1 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)
# The schedule tensors are moved to the same device as the model.
noise_schedule = {
    "betas": betas.to(device),
    "alphas": alphas.to(device),
    "alphas_cumprod": alphas_cumprod.to(device),
}



model params: 113.67 M


In [4]:
def print_params(model, max_params=12):
    """Print the mean value of the first few parameter tensors of ``model``.

    This is a cheap fingerprint for eyeballing whether two models hold the
    same weights; it is not an exact equality check.

    Args:
        model: any object exposing ``parameters()`` that yields tensors.
        max_params: maximum number of parameter tensors to print. The default
            of 12 matches the previous hard-coded behavior (indices 0..11).
    """
    for i, w in enumerate(model.parameters()):
        if i >= max_params:
            break
        # detach() keeps autograd out of the picture; item() yields a plain float.
        print(f"Parameter {i}: {w.detach().mean().item()}")


# Baseline fingerprint: parameter means of the freshly created model, taken
# before any training step has been run on it.
print_params(denoising_model)

Parameter 0: -0.0009031191002577543
Parameter 1: -0.03329317271709442
Parameter 2: 0.00011141580034745857
Parameter 3: -0.006176298949867487
Parameter 4: 1.8592058040667325e-05
Parameter 5: 0.0011070019099861383
Parameter 6: 1.0
Parameter 7: 0.0
Parameter 8: 6.0346192185534164e-05
Parameter 9: 0.002473791129887104
Parameter 10: -7.468119292752817e-06
Parameter 11: -0.004966080188751221


In [5]:
# Run exactly one optimisation step with the train5 loop.
# All three RNGs (torch, numpy, random) are reset to seed 0 right before it.
for reseed in (torch.manual_seed, np.random.seed, random.seed):
    reseed(0)

train5_kwargs = dict(
    denoising_model=denoising_model,
    train_dataloader=dataloader,
    val_dataloader=None,  # no validation data is passed for this run
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    noise_schedule=noise_schedule,
    n_T=num_timesteps,
    total_steps=1,  # a single step only
    device=device,
)
train_loop(**train5_kwargs)

current seed: 0
noise.mean(): tensor(-0.0030, device='cuda:0') shape: torch.Size([32, 3, 32, 32])
t: tensor(532.7188, device='cuda:0')
Step 0/1, Examples trained: 32, Train Loss: 1.0660, LR: 0.000100


32

In [6]:
# Switch to eval mode. The trailing semicolon hides the module repr without
# rebinding `_`, which IPython uses to hold the last cell output.
denoising_model.eval();

In [7]:
# Same fingerprint after the single train5 step; compare against the values
# printed before training to see how far each parameter mean moved.
print_params(denoising_model)


Parameter 0: -0.0009030032088048756
Parameter 1: -0.033293142914772034
Parameter 2: 0.00011159940186189488
Parameter 3: -0.006176290102303028
Parameter 4: 1.8253314920002595e-05
Parameter 5: 0.001105814822949469
Parameter 6: 1.000002145767212
Parameter 7: 1.0934378224192187e-05
Parameter 8: 5.899346433579922e-05
Parameter 9: 0.002464410848915577
Parameter 10: -7.634651410626248e-06
Parameter 11: -0.004975453019142151


In [8]:
# Re-seed before building the second model so its initial weights do not
# depend on how much randomness earlier cells have already consumed.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

denoising_model2 = create_model().to(device)
# These means should match the ones printed for the first model before training.
print_params(denoising_model2)

model params: 113.67 M
Parameter 0: -0.0009031191002577543
Parameter 1: -0.03329317271709442
Parameter 2: 0.00011141580034745857
Parameter 3: -0.006176298949867487
Parameter 4: 1.8592058040667325e-05
Parameter 5: 0.0011070019099861383
Parameter 6: 1.0
Parameter 7: 0.0
Parameter 8: 6.0346192185534164e-05
Parameter 9: 0.002473791129887104
Parameter 10: -7.468119292752817e-06
Parameter 11: -0.004966080188751221


In [9]:
# train with diffuser trainer
import random
from cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline import TrainingPipeline, Trainer, DDPMScheduler, Accelerator, ProjectConfiguration


class AdaptedDataset(torch.utils.data.Dataset):
    """Present an ``(image, label)`` dataset as dict samples.

    Each item of the wrapped dataset is unpacked into two values and returned
    as ``{"input": <first>, "target": <second>}``. Length is delegated to the
    wrapped dataset.
    """

    def __init__(self, dataset):
        # Keep a reference to the wrapped dataset; nothing is copied.
        self.dataset = dataset

    def __len__(self):
        wrapped = self.dataset
        return len(wrapped)

    def __getitem__(self, idx):
        # The wrapped item must unpack into exactly two values.
        img, label = self.dataset[idx]
        sample = {"input": img}
        sample["target"] = label
        return sample

# Same data, batch size and ordering as the train5 run, in the dict format
# produced by AdaptedDataset.
adapted_dataset = AdaptedDataset(dataset)
adapted_dataloader = DataLoader(adapted_dataset, batch_size=32, shuffle=False)

# Same optimizer and constant-LR scheduler configuration as the train5 run.
optimizer2 = optim.AdamW(denoising_model2.parameters(), lr=1e-4)
lr_scheduler2 = optim.lr_scheduler.LambdaLR(optimizer2, lr_lambda=lambda step: 1)

ddpm_scheduler = DDPMScheduler(
    num_train_timesteps=num_timesteps,
    beta_schedule="linear",
    prediction_type="epsilon",
)

# This comparison runs the Trainer without accelerate. Previously an
# Accelerator was constructed and then immediately replaced by None; it is now
# only built when explicitly requested.
USE_ACCELERATOR = False
accelerator_project_config = ProjectConfiguration(total_limit=1)
accelerator = None
if USE_ACCELERATOR:
    accelerator = Accelerator(
        gradient_accumulation_steps=1,
        mixed_precision=None,
        log_with=None,
        project_dir=None,
        project_config=accelerator_project_config,
    )

denoising_model2.train()
if accelerator is not None:
    denoising_model2, optimizer2, adapted_dataloader, lr_scheduler2 = accelerator.prepare(
        denoising_model2, optimizer2, adapted_dataloader, lr_scheduler2
    )
trainer = Trainer(
    model=denoising_model2,
    accelerator=accelerator,
    output_dir="data/tensorboard",
    noise_scheduler=ddpm_scheduler,
    gradient_accumulation_steps=1,
    optimizer=optimizer2,
    lr_scheduler=lr_scheduler2,
    logger=None,
    use_ema=False,
    train_batch_size=32,
    ddpm_num_inference_steps=num_timesteps,
    num_epochs=1,
    prediction_type="epsilon",
    num_update_steps_per_epoch=1,
    max_train_steps=1,
    device=device,
)

# Reset all three RNGs right before training, exactly as for the train5 run.
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

trainer.train(train_dataloader=adapted_dataloader)
if accelerator is not None:
    denoising_model2 = accelerator.unwrap_model(denoising_model2)
denoising_model2.eval()

print_params(denoising_model2)

# The printed means cover only the first few tensors, and equal means do not
# imply equal tensors, so compare every parameter element-wise as well.
max_abs_diff = max(
    (p_train5.detach() - p_trainer.detach()).abs().max().item()
    for p_train5, p_trainer in zip(
        denoising_model.parameters(), denoising_model2.parameters()
    )
)
print(f"max |w_train5 - w_trainer| over all parameters: {max_abs_diff:.3e}")
# NOTE(review): the 1e-6 tolerance is a judgment call; it is well below the
# 1e-4 learning rate used for the single step — tighten or loosen as needed.
assert max_abs_diff <= 1e-6, (
    f"train5 and diffuser Trainer weights diverged (max abs diff {max_abs_diff:.3e})"
)

2024-10-09 21:11:42,461 - cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline - INFO - ***** Running training ***** (pipeline.py:73)
2024-10-09 21:11:42,462 - cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline - INFO -   Num Epochs = 1 (pipeline.py:75)
2024-10-09 21:11:42,462 - cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline - INFO -   Instantaneous batch size per device = 32 (pipeline.py:76)
2024-10-09 21:11:42,462 - cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline - INFO -   Total train batch size (w. parallel, distributed & accumulation) = 32 (pipeline.py:77)
2024-10-09 21:11:42,462 - cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline - INFO -   Gradient Accumulation steps = 1 (pipeline.py:78)
2024-10-09 21:11:42,463 - cvlization.torch.training_pipeline.image_gen.diffuser_unconditional.pipeline - INFO -   Total optimization steps = 1 (pipeline.py:79)


  0%|          | 0/1 [00:00<?, ?it/s]

noise.mean(): tensor(-0.0030, device='cuda:0') shape: torch.Size([32, 3, 32, 32])
timesteps: tensor(532.7188, device='cuda:0')
loss: 1.0660488605499268 lr: 0.0001 step: 1
Parameter 0: -0.0009030032088048756
Parameter 1: -0.033293142914772034
Parameter 2: 0.00011159940186189488
Parameter 3: -0.006176290102303028
Parameter 4: 1.8253314920002595e-05
Parameter 5: 0.001105814822949469
Parameter 6: 1.000002145767212
Parameter 7: 1.0934378224192187e-05
Parameter 8: 5.899346433579922e-05
Parameter 9: 0.002464410848915577
Parameter 10: -7.634651410626248e-06
Parameter 11: -0.004975453019142151


In [10]:
# compare the two dataloaders
first_batch1 = next(iter(dataloader))
first_batch2 = next(iter(adapted_dataloader))
print(first_batch1[0].mean())
print(first_batch2["input"].mean())

# Matching means are only a hint; require the first batches to be identical
# element by element, for both the images and the labels.
assert torch.equal(first_batch1[0], first_batch2["input"]), \
    "first-batch images differ between the two dataloaders"
assert torch.equal(first_batch1[1], first_batch2["target"]), \
    "first-batch labels differ between the two dataloaders"



tensor(-0.1218)
tensor(-0.1218)


In [11]:
def draw_seeded_means(seed=0, shape=(32, 3, 32, 32), n_draws=2):
    """Seed torch, print the seed, then print the mean of ``n_draws`` normal samples.

    Args:
        seed: value passed to ``torch.manual_seed``.
        shape: shape of each ``torch.randn`` sample.
        n_draws: number of consecutive samples drawn after seeding.

    Returns:
        A 1-D tensor holding the ``n_draws`` sample means, in draw order.
    """
    torch.manual_seed(seed)
    print(torch.initial_seed())
    means = []
    for draw_idx in range(n_draws):
        # The sample is created without a device argument and only then moved
        # to `device`.
        # NOTE(review): if the training code samples noise directly on the
        # GPU, this exercises a different generator — confirm it is the
        # intended check.
        sample_mean = torch.randn(shape).to(device).mean()
        print(sample_mean)
        means.append(sample_mean)
    return torch.stack(means)


# Re-seeding with the same value must reproduce the same sequence of draws.
means_first_pass = draw_seeded_means()
means_second_pass = draw_seeded_means()
assert torch.equal(means_first_pass, means_second_pass), \
    "re-seeding torch did not reproduce the same random draws"

0
tensor(-0.0025, device='cuda:0')
tensor(-0.0046, device='cuda:0')
0
tensor(-0.0025, device='cuda:0')
tensor(-0.0046, device='cuda:0')
