28 commits
9637ce0 - Preference optimization with MaPO and Diffusion-DPO (rockerBOO, Jan 28, 2025)
2c646cc - Fix image size batch for SDXL (rockerBOO, Jun 19, 2024)
4df97b3 - Remove orpo weight (rockerBOO, Jun 24, 2024)
c213b56 - Fix preference caption prefix/suffix (rockerBOO, Jul 5, 2024)
5bc6d01 - Fix preference in controlnet subset (rockerBOO, Jul 14, 2024)
0da9e10 - Add dataset changes for ControlNet support (rockerBOO, Jan 28, 2025)
46414bb - Fix parameters (rockerBOO, Jan 28, 2025)
7452974 - Merge branch 'sd3' into po (rockerBOO, Apr 14, 2025)
d22c827 - Update PO cached latents, move out functions, update calls (rockerBOO, Apr 27, 2025)
78a2946 - Merge branch 'sd3' into po (rockerBOO, Apr 27, 2025)
61e3083 - Typo (rockerBOO, Apr 28, 2025)
10ce29f - Fix timestep/timestep refactor (rockerBOO, Apr 28, 2025)
d23e15a - Fix remaining test (rockerBOO, Apr 28, 2025)
8e8243a - Add DDO preference optimization (rockerBOO, Apr 29, 2025)
9a2101a - Add DDO loss (rockerBOO, Apr 30, 2025)
22447eb - Use mean, use ddo_loss (rockerBOO, Apr 30, 2025)
e61dd14 - Formatting (rockerBOO, Apr 30, 2025)
d8716a9 - Rework DDO loss (rockerBOO, May 2, 2025)
e4bdffd - Update diffusion_dpo, MaPO tests. Fix diffusion_dpo/MaPO (rockerBOO, May 5, 2025)
fe49729 - Fix names (rockerBOO, May 5, 2025)
971387e - Fix DDO arguments (rockerBOO, May 5, 2025)
4f27c6a - Add BPO, CPO, DDO, SDPO, SimPO (rockerBOO, Jun 3, 2025)
429b2ab - Merge branch 'sd3' into po (rockerBOO, Jun 3, 2025)
4152339 - Spelling (rockerBOO, Jun 3, 2025)
db05136 - Fix sigmas/timesteps (rockerBOO, Jun 4, 2025)
e9e9871 - Fix ImageInfo iterator (rockerBOO, Jun 6, 2025)
bf86471 - Fix sigmas (rockerBOO, Aug 3, 2025)
fb4eb3b - Merge branch 'sd3' into po (rockerBOO, Aug 3, 2025)
28 changes: 18 additions & 10 deletions flux_train_network.py
@@ -311,27 +311,34 @@ def shift_scale_latents(self, args, latents):
 
     def get_noise_pred_and_target(
         self,
-        args,
-        accelerator,
+        args: argparse.Namespace,
+        accelerator: Accelerator,
         noise_scheduler,
-        latents,
-        batch,
+        latents: torch.FloatTensor,
+        batch: dict[str, torch.Tensor],
         text_encoder_conds,
-        unet: flux_models.Flux,
+        unet,
         network,
-        weight_dtype,
-        train_unet,
+        weight_dtype: torch.dtype,
+        train_unet: bool,
         is_train=True,
-    ):
+        timesteps: torch.FloatTensor | None = None,
+    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.IntTensor, torch.Tensor | None]:
         # Sample noise that we'll add to the latents
         noise = torch.randn_like(latents)
         bsz = latents.shape[0]
 
         # get noisy model input and timesteps
-        noisy_model_input, timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timesteps(
+        noisy_model_input, rand_timesteps, sigmas = flux_train_utils.get_noisy_model_input_and_timestep(
             args, noise_scheduler, latents, noise, accelerator.device, weight_dtype
         )
 
+        if timesteps is None:
+            timesteps = rand_timesteps
+        else:
+            # Convert timesteps into sigmas
+            sigmas: torch.FloatTensor = timesteps / noise_scheduler.config.num_train_timesteps
+
         # pack latents and get img_ids
         packed_noisy_model_input = flux_utils.pack_latents(noisy_model_input)  # b, c, h*2, w*2 -> b, h*w, c*4
         packed_latent_height, packed_latent_width = noisy_model_input.shape[2] // 2, noisy_model_input.shape[3] // 2
@@ -364,6 +371,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t
             # grad is enabled even if unet is not in train mode, because Text Encoder is in train mode
             with torch.set_grad_enabled(is_train), accelerator.autocast():
                 # YiYi notes: divide it by 1000 for now because we scale it by 1000 in the transformer model (we should not keep it but I want to keep the inputs same for the model for testing)
+                accelerator.unwrap_model(unet).prepare_block_swap_before_forward()
                 model_pred = unet(
                     img=img,
                     img_ids=img_ids,
@@ -431,7 +439,7 @@ def call_dit(img, img_ids, t5_out, txt_ids, l_pooled, timesteps, guidance_vec, t
                 )
                 target[diff_output_pr_indices] = model_pred_prior.to(target.dtype)
 
-        return model_pred, target, timesteps, weighting
+        return model_pred, noisy_model_input, target, sigmas, timesteps, weighting
 
     def post_process_loss(self, loss, args, timesteps, noise_scheduler):
         return loss
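The signature change above threads an optional `timesteps` argument through `get_noise_pred_and_target` and returns `noisy_model_input` and `sigmas` alongside the prediction. That is what pairwise preference losses such as Diffusion-DPO need: the preferred and non-preferred latents must be noised at the same step so their prediction errors are comparable. A minimal sketch of how a caller inside the training step might use it; the `latents_w`/`latents_l` names are hypothetical and not from this PR, and the surrounding locals (`args`, `batch`, `unet`, etc.) are assumed to exist in that context:

# Sketch only: run the "winner" (preferred) half first, letting the method
# sample timesteps randomly, then reuse those timesteps for the "loser" half.
pred_w, noisy_w, target_w, sigmas, timesteps, weighting = self.get_noise_pred_and_target(
    args, accelerator, noise_scheduler, latents_w, batch,
    text_encoder_conds, unet, network, weight_dtype, train_unet,
    is_train=True,  # timesteps left as None, so they are drawn and returned
)
pred_l, noisy_l, target_l, _, _, _ = self.get_noise_pred_and_target(
    args, accelerator, noise_scheduler, latents_l, batch,
    text_encoder_conds, unet, network, weight_dtype, train_unet,
    is_train=True,
    timesteps=timesteps,  # same noise level for both halves of the pair
)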
10 changes: 10 additions & 0 deletions library/config_util.py
@@ -76,6 +76,11 @@ class BaseSubsetParams:
     validation_seed: int = 0
     validation_split: float = 0.0
     resize_interpolation: Optional[str] = None
+    preference: bool = False
+    preference_caption_prefix: Optional[str] = None
+    preference_caption_suffix: Optional[str] = None
+    non_preference_caption_prefix: Optional[str] = None
+    non_preference_caption_suffix: Optional[str] = None
 
 
 @dataclass
@@ -198,6 +203,11 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]
         "caption_suffix": str,
         "custom_attributes": dict,
         "resize_interpolation": str,
+        "preference": bool,
+        "preference_caption_prefix": str,
+        "preference_caption_suffix": str,
+        "non_preference_caption_prefix": str,
+        "non_preference_caption_suffix": str
     }
     # DO means DropOut
     DO_SUBSET_ASCENDABLE_SCHEMA = {
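The same five keys are added both as `BaseSubsetParams` fields and to the subset schema, so a subset can be flagged as a preference-pair source and given distinct caption decorations for its preferred and non-preferred images. A minimal sketch of the resulting params object, assuming all other `BaseSubsetParams` fields keep their defaults; the prefix strings are made-up examples:

from library.config_util import BaseSubsetParams

# Sketch only: flag a subset as a preference source and decorate the two
# caption streams differently. Unset options stay None, as in the dataclass.
subset = BaseSubsetParams(
    preference=True,
    preference_caption_prefix="high quality, ",
    non_preference_caption_prefix="low quality, ",
)
assert subset.preference_caption_suffix is None  # default from the dataclass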