In [1]:
from diffusers import UNet3DConditionModel

model = UNet3DConditionModel(
    sample_size=(64, 64, 64),
    in_channels=6,         # your actual input channels
    out_channels=3,        # your desired output channels
    block_out_channels=(64, 128, 256, 512),
    layers_per_block=2,
    down_block_types=("DownBlock3D", "DownBlock3D", "DownBlock3D", "DownBlock3D"),
    up_block_types=("UpBlock3D", "UpBlock3D", "UpBlock3D", "UpBlock3D"),
)

2025-07-17 14:47:01.756373: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-17 14:47:01.768809: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752731221.783602 1582842 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752731221.787833 1582842 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752731221.799533 1582842 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
"%.2E" % model.num_parameters(only_trainable=True)

'1.00E+08'

In [39]:
from diffusers import SchedulerMixin, ConfigMixin

class SchedulerSubNoiseWrapper(SchedulerMixin, ConfigMixin):
    def __init__(self, base_scheduler, noise_affected_channels):
        self.base_scheduler = base_scheduler
        self.noise_affected_channels = noise_affected_channels
        self.intern_config = {
            "noise_affected_channels": noise_affected_channels,
            "base_scheduler_config": base_scheduler.config,
            "base_class": base_scheduler.__class__.__name__
        }

    @property
    def config(self):
        # Return your wrapped config
        return self.base_scheduler.config
    
    def add_noise(self, sample, noise, timesteps):
        # Sample combinaison of ( Y of shape (B, out_channels), and extra info )
        # sample of shape (B, in_channels)
        # noise is expected to be of shape (B, out_channels)
        noised_channels = self.base_scheduler.add_noise(sample[:, :self.noise_affected_channels], noise, timesteps)
        return torch.cat([noised_channels, sample[:, self.noise_affected_channels:]], dim=1)

    def step(self, model_output, timestep, sample, **kwargs):
        noisy_part = sample[:, :self.noise_affected_channels]
        informative_part = sample[:, self.noise_affected_channels:]

        # Step only on noisy part
        step_output = self.base_scheduler.step(model_output, timestep, noisy_part, **kwargs)
        new_vel = step_output.prev_sample

        # Recombine full input, dim=1 because of batching
        full_sample = torch.cat([new_vel, informative_part], dim=1)

        # Return modified StepOutput with recombined tensor
        return type(step_output)(**{**step_output.__dict__, "prev_sample": full_sample})

        
    def save_config(self, **kwargs):
        return self.base_scheduler.save_config(**kwargs)

    def save_pretrained(self, save_directory):
        import json
        os.makedirs(save_directory, exist_ok=True)
        self.base_scheduler.save_pretrained(os.path.join(save_directory, "base_scheduler"))
        with open(os.path.join(save_directory, "wrapper_config.json"), "w") as f:
            json.dump(self.intern_config, f)

    @classmethod
    def from_pretrained(cls, save_directory, **kwargs):
        import importlib
        import json
        from diffusers import SchedulerMixin

        config_path = os.path.join(save_directory, "wrapper_config.json")
        with open(config_path, "r") as f:
            wd = json.load(f)
        base_name = wd["base_class"]
        base_scheduler = SchedulerMixin.from_pretrained(os.path.join(save_directory, "base_scheduler"))
        return cls(base_scheduler, wd["noise_affected_channels"])
    

In [40]:
from diffusers import DDPMScheduler

base_scheduler = DDPMScheduler(num_train_timesteps=50)
scheduler = SchedulerSubNoiseWrapper(base_scheduler, 3)

In [21]:
scheduler.base_scheduler

DDPMScheduler {
  "_class_name": "DDPMScheduler",
  "_diffusers_version": "0.34.0",
  "beta_end": 0.02,
  "beta_schedule": "linear",
  "beta_start": 0.0001,
  "clip_sample": true,
  "clip_sample_range": 1.0,
  "dynamic_thresholding_ratio": 0.995,
  "num_train_timesteps": 50,
  "prediction_type": "epsilon",
  "rescale_betas_zero_snr": false,
  "sample_max_value": 1.0,
  "steps_offset": 0,
  "thresholding": false,
  "timestep_spacing": "leading",
  "trained_betas": null,
  "variance_type": "fixed_small"
}

In [6]:
from dataclasses import dataclass

@dataclass
class TrainingConfig:
    train_batch_size = 2
    eval_batch_size = 1  # how many images to sample during evaluation
    num_epochs = 5
    gradient_accumulation_steps = 1
    learning_rate = 1e-4
    lr_warmup_steps = 500
    save_image_epochs = 10
    save_model_epochs = 5
    mixed_precision = "fp16"  # `no` for float32, `fp16` for automatic mixed precision
    output_dir = "UNet-flow-test"  # the model name locally and on the HF Hub

    push_to_hub = False  # whether to upload the saved model to the HF Hub
    hub_model_id = "<your-username>/<my-awesome-model>"  # the name of the repository to create on the HF Hub
    hub_private_repo = None
    overwrite_output_dir = True  # overwrite the old model when re-running the notebook
    seed = 0


config = TrainingConfig()

In [7]:
# from space_exploration.beans.training_bean import Training
from space_exploration.beans.dataset_bean import Dataset
from space_exploration.training.training import ModelTraining
from space_exploration.dataset.transforms.AllTransforms import TransformationReferences
from space_exploration.training.training_utils import prepare_dataset
dataset_name = "re200-sr05etot"
dataset = Dataset.get_dataset_or_fail(dataset_name)
ds = dataset.get_training_dataset(64, TransformationReferences.UNET_ADAPTER64.transformation, TransformationReferences.Y_ALONG_COMPONENT_NORMALIZE.transformation, 
                                  size=6)
train_dataloader = prepare_dataset(ds, batch_size=config.train_batch_size)

Loading std & mean of dataset re200-sr05etot
Loading stds & means of dataset re200-sr05etot
⌛ Initializing Dataset...
X...
[########################################] | 100% Completed | 344.78 ms
Y...
[########################################] | 100% Completed | 721.71 ms


In [8]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x1506a8229240>

In [9]:
import torch
from diffusers.optimization import get_cosine_schedule_with_warmup

optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps,
    num_training_steps=(len(train_dataloader) * config.num_epochs),
)

In [10]:
import numpy as np
model.to("cuda")
sample_input, sample_output = next(iter(ds))
sample_input = torch.tensor(np.array([sample_input]))
print("Input shape:", sample_input.shape)
sample_input = sample_input.to(model.device)


Input shape: torch.Size([1, 6, 64, 64, 64])


In [11]:
sample_output = torch.tensor(np.array([sample_output]))

In [12]:
with torch.no_grad():
    encoder_hidden_states = torch.ones((1, 1, 1024), device=model.device)
    forward = model(sample_input, timestep=0, encoder_hidden_states=encoder_hidden_states)
    print("Output shape:", forward.sample.shape)

Output shape: torch.Size([1, 3, 64, 64, 64])


In [13]:
import torch.nn.functional as F

noise_pred = forward.sample.to("cpu")
loss = F.mse_loss(noise_pred, sample_output)

In [14]:
loss

tensor(1.3968)

In [18]:
from accelerate import Accelerator
from huggingface_hub import create_repo, upload_folder
from tqdm.auto import tqdm
from pathlib import Path
import os
from diffusers import DDPMPipeline

def train_loop(config, model, noise_scheduler, optimizer, train_dataloader, lr_scheduler):
    # Initialize accelerator and tensorboard logging
    accelerator = Accelerator(
        mixed_precision=config.mixed_precision,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        log_with="tensorboard",
        project_dir=os.path.join(config.output_dir, "logs"),
    )
    if accelerator.is_main_process:
        if config.output_dir is not None:
            os.makedirs(config.output_dir, exist_ok=True)
        if config.push_to_hub:
            repo_id = create_repo(
                repo_id=config.hub_model_id or Path(config.output_dir).name, exist_ok=True
            ).repo_id
        accelerator.init_trackers("train_example")

    # Prepare everything
    # There is no specific order to remember, you just need to unpack the
    # objects in the same order you gave them to the prepare method.
    model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_dataloader, lr_scheduler
    )

    global_step = 0

    # Define mock hidden states
    encoder_hidden_states = torch.ones((config.train_batch_size, 1, 1024), device=model.device)
    
    # Now you train the model
    for epoch in range(config.num_epochs):
        progress_bar = tqdm(total=len(train_dataloader), disable=not accelerator.is_local_main_process)
        progress_bar.set_description(f"Epoch {epoch}")

        for step, (x, y) in enumerate(train_dataloader):

            bs = x.shape[0]
            noise_shape = y.shape
            noise = torch.randn(noise_shape, device=x.device)

            # Sample a random timestep for each image
            timesteps = torch.randint(
                0, noise_scheduler.config.num_train_timesteps, (bs,), device=x.device,
                dtype=torch.int64
            )

            # Add noise to the clean images according to the noise magnitude at each timestep
            # (this is the forward diffusion process)
            y_with_extra_info = torch.cat([y, x[:, 3:]], dim=1)
            noisy_input = noise_scheduler.add_noise(y_with_extra_info, noise, timesteps)

            with accelerator.accumulate(model):
                # Predict the noise residual
                noise_pred = model(noisy_input, timesteps, encoder_hidden_states, return_dict=False)[0]
                loss = F.mse_loss(noise_pred, noise)
                accelerator.backward(loss)

                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

            progress_bar.update(1)
            logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0], "step": global_step}
            progress_bar.set_postfix(**logs)
            accelerator.log(logs, step=global_step)
            global_step += 1

        # After each epoch you optionally sample some demo images with evaluate() and save the model
        if accelerator.is_main_process:
            pipeline = DDPMPipeline(unet=accelerator.unwrap_model(model), scheduler=noise_scheduler)

            # if (epoch + 1) % config.save_image_epochs == 0 or epoch == config.num_epochs - 1:
            #     evaluate(config, epoch, pipeline)

            if (epoch + 1) % config.save_model_epochs == 0 or epoch == config.num_epochs - 1:
                if config.push_to_hub:
                    upload_folder(
                        repo_id=repo_id,
                        folder_path=config.output_dir,
                        commit_message=f"Epoch {epoch}",
                        ignore_patterns=["step_*", "epoch_*"],
                    )
                else:
                    pipeline.save_pretrained(config.output_dir)

In [41]:
from accelerate import notebook_launcher

args = (config, model, scheduler, optimizer, train_dataloader, lr_scheduler)

notebook_launcher(train_loop, args, num_processes=1)

Launching training on one GPU.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [42]:
from diffusers import DiffusionPipeline

pipeline = DiffusionPipeline.from_pretrained(config.output_dir, use_safetensors=True)

Loading pipeline components...:   0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 239, in _feed
    reader_close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.10/threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "/data3/chancrin/code/3D-GAN/.venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/usr/lib/python3.10/threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.10/multiprocessing/queues.py", line 271, in _feed
    queue_sem.release()
ValueError: semaphore or lock released t

AttributeError: type object 'SchedulerMixin' has no attribute 'load_config'