From d5f7f8a53b87fca038eac7d6a80a6beba6f323b8 Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Wed, 5 Nov 2025 14:49:53 +0000 Subject: [PATCH 1/2] Add optional precision-preserving preprocessing --- .../unconditional_image_generation/README.md | 2 + .../train_unconditional.py | 57 +++++++++++++++++-- 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md index 22f982509bb1..e18ce4c02b0b 100644 --- a/examples/unconditional_image_generation/README.md +++ b/examples/unconditional_image_generation/README.md @@ -104,6 +104,8 @@ To use your own dataset, there are 2 ways: - you can either provide your own folder as `--train_data_dir` - or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument. +If your dataset contains 16 or 32-bit channels (for example, medical TIFFs), add the `--preserve_input_precision` flag so the preprocessing keeps the original precision while still training a 3-channel model. + Below, we explain both in more detail. #### Provide the dataset as a folder diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index 3ffeef13647c..0cc96220b932 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -52,6 +52,24 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): return res.expand(broadcast_shape) +def _ensure_three_channels(tensor: torch.Tensor) -> torch.Tensor: + """ + Ensure the tensor has exactly three channels (C, H, W) by repeating or truncating channels when needed. + """ + if tensor.ndim == 2: + tensor = tensor.unsqueeze(0) + channels = tensor.shape[0] + if channels == 3: + return tensor + if channels == 1: + return tensor.repeat(3, 1, 1) + if channels == 2: + return torch.cat([tensor, tensor[:1]], dim=0) + if channels > 3: + return tensor[:3] + raise ValueError(f"Unsupported number of channels: {channels}") + + def parse_args(): parser = argparse.ArgumentParser(description="Simple example of a training script.") parser.add_argument( @@ -260,6 +278,11 @@ def parse_args(): parser.add_argument( "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers." ) + parser.add_argument( + "--preserve_input_precision", + action="store_true", + help="Preserve 16/32-bit image precision by avoiding 8-bit RGB conversion while still producing 3-channel tensors.", + ) args = parser.parse_args() env_local_rank = int(os.environ.get("LOCAL_RANK", -1)) @@ -453,19 +476,41 @@ def load_model_hook(models, input_dir): # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder # Preprocessing the datasets and DataLoaders creation. + spatial_augmentations = [ + transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), + transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), + transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + ] + augmentations = transforms.Compose( - [ - transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR), - transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution), - transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x), + spatial_augmentations + + [ transforms.ToTensor(), transforms.Normalize([0.5], [0.5]), ] ) + precision_augmentations = transforms.Compose( + [ + transforms.PILToTensor(), + transforms.Lambda(_ensure_three_channels), + transforms.ConvertImageDtype(torch.float32), + ] + + spatial_augmentations + + [transforms.Normalize([0.5], [0.5])] + ) + def transform_images(examples): - images = [augmentations(image.convert("RGB")) for image in examples["image"]] - return {"input": images} + processed = [] + for image in examples["image"]: + if not args.preserve_input_precision: + processed.append(augmentations(image.convert("RGB"))) + else: + precise_image = image + if precise_image.mode == "P": + precise_image = precise_image.convert("RGB") + processed.append(precision_augmentations(precise_image)) + return {"input": processed} logger.info(f"Dataset size: {len(dataset)}") From 41290be20ef1cbca7e7e3c3b4041b6c3c3add19c Mon Sep 17 00:00:00 2001 From: Joseph Turian Date: Wed, 5 Nov 2025 16:35:44 +0000 Subject: [PATCH 2/2] Document decoder caveat for precision flag --- examples/unconditional_image_generation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/unconditional_image_generation/README.md b/examples/unconditional_image_generation/README.md index e18ce4c02b0b..6f8276a632f7 100644 --- a/examples/unconditional_image_generation/README.md +++ b/examples/unconditional_image_generation/README.md @@ -104,7 +104,7 @@ To use your own dataset, there are 2 ways: - you can either provide your own folder as `--train_data_dir` - or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument. -If your dataset contains 16 or 32-bit channels (for example, medical TIFFs), add the `--preserve_input_precision` flag so the preprocessing keeps the original precision while still training a 3-channel model. +If your dataset contains 16 or 32-bit channels (for example, medical TIFFs), add the `--preserve_input_precision` flag so the preprocessing keeps the original precision while still training a 3-channel model. Precision still depends on the decoder: Pillow keeps 16-bit grayscale and float inputs, but many 16-bit RGB files are decoded as 8-bit RGB, and the flag cannot recover precision lost at load time. Below, we explain both in more detail.