From 7100f710daac5e1acdd98d554861358a8529ed64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= Date: Wed, 21 Feb 2024 16:42:42 +0300 Subject: [PATCH 01/11] Fix typos --- examples/text_to_image/README.md | 2 +- examples/text_to_image/README_sdxl.md | 6 +++--- examples/text_to_image/train_text_to_image.py | 7 ++++--- examples/text_to_image/train_text_to_image_lora.py | 5 +++-- examples/text_to_image/train_text_to_image_lora_sdxl.py | 6 +++--- examples/text_to_image/train_text_to_image_sdxl.py | 8 ++++---- 6 files changed, 18 insertions(+), 16 deletions(-) diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index a56cccbcf5d7..f2931d3f347e 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -4,7 +4,7 @@ The `train_text_to_image.py` script shows how to fine-tune stable diffusion mode ___Note___: -___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___ +___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___ ## Running locally with PyTorch diff --git a/examples/text_to_image/README_sdxl.md b/examples/text_to_image/README_sdxl.md index 0d35b2a8ab9d..349feef5008e 100644 --- a/examples/text_to_image/README_sdxl.md +++ b/examples/text_to_image/README_sdxl.md @@ -2,7 +2,7 @@ The `train_text_to_image_sdxl.py` script shows how to fine-tune Stable Diffusion XL (SDXL) on your own dataset. -🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset. 🚨 +🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset. 🚨 ## Running locally with PyTorch @@ -238,8 +238,8 @@ accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lor --validation_epochs=20 \ --seed=1234 \ --output_dir="sd-pokemon-model-lora-sdxl" \ - --validation_prompt="cute dragon creature" - + --validation_prompt="cute dragon creature" + ``` diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 6fb8b17944eb..46516c9198ec 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -12,6 +12,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and +# limitations under the License. import argparse import logging @@ -395,7 +396,7 @@ def parse_args(): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -635,7 +636,7 @@ def load_model_hook(models, input_dir): ema_unet.to(accelerator.device) del load_model - for i in range(len(models)): + for _ in range(len(models)): # pop models so that they are not loaded again model = models.pop() @@ -810,7 +811,7 @@ def collate_fn(examples): if args.use_ema: ema_unet.to(accelerator.device) - # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 47e67f695b08..39590fa8666b 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python # coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. All rights reserved. # @@ -293,7 +294,7 @@ def parse_args(): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -454,7 +455,7 @@ def main(): vae.requires_grad_(False) text_encoder.requires_grad_(False) - # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index 79bc66288338..be178d36dcde 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -370,7 +370,7 @@ def parse_args(input_args=None): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -585,7 +585,7 @@ def main(args): text_encoder_two.requires_grad_(False) unet.requires_grad_(False) - # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": @@ -648,7 +648,7 @@ def unwrap_model(model): def save_model_hook(models, weights, output_dir): if accelerator.is_main_process: # there are only two options here. Either are just the unet attn processor layers - # or there are the unet and text encoder atten layers + # or there are the unet and text encoder attn layers unet_lora_layers_to_save = None text_encoder_one_lora_layers_to_save = None text_encoder_two_lora_layers_to_save = None diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 292e52bca0f8..04f8c3dba417 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -419,7 +419,7 @@ def parse_args(input_args=None): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -683,7 +683,7 @@ def main(args): # Set unet as trainable. unet.train() - # For mixed precision training we cast all non-trainable weigths to half-precision + # For mixed precision training we cast all non-trainable weights to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": @@ -738,7 +738,7 @@ def load_model_hook(models, input_dir): ema_unet.to(accelerator.device) del load_model - for i in range(len(models)): + for _ in range(len(models)): # pop models so that they are not loaded again model = models.pop() @@ -962,7 +962,7 @@ def collate_fn(examples): if accelerator.is_main_process: accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args)) - # Function for unwraping if torch.compile() was used in accelerate. + # Function for unwrapping if torch.compile() was used in accelerate. def unwrap_model(model): model = accelerator.unwrap_model(model) model = model._orig_mod if is_compiled_module(model) else model From 9e9c819858a9b80b9b208f358a0154c309f14394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= Date: Wed, 21 Feb 2024 16:43:11 +0300 Subject: [PATCH 02/11] Add license header to train_text_to_image_flax.py --- .../text_to_image/train_text_to_image_flax.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index ac6476fb0386..d1dc31e06403 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -1,3 +1,19 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import argparse import logging import math From 1ae2e982a02aab53b0865f6afcbd595842b2d026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= Date: Wed, 21 Feb 2024 16:46:06 +0300 Subject: [PATCH 03/11] Fix image saving bug in text_to_image scripts --- examples/text_to_image/train_text_to_image.py | 2 +- examples/text_to_image/train_text_to_image_lora.py | 7 ++++--- examples/text_to_image/train_text_to_image_sdxl.py | 7 ++++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index 46516c9198ec..f1f1f314dfe1 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -72,7 +72,7 @@ def save_model_card( repo_folder: str = None, ): img_str = "" - if len(images) > 0: + if images is not None: image_grid = make_image_grid(images, 1, len(args.validation_prompts)) image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png")) img_str += "![val_imgs_grid](./val_imgs_grid.png)\n" diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 39590fa8666b..47e0053f346f 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -61,9 +61,10 @@ def save_model_card( repo_id: str, images: list = None, base_model: str = None, dataset_name: str = None, repo_folder: str = None ): img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" model_description = f""" # LoRA text2image fine-tuning - {repo_id} diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 04f8c3dba417..2d77e9c8bfa3 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -74,9 +74,10 @@ def save_model_card( vae_path: str = None, ): img_str = "" - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" + if images is not None: + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" model_description = f""" # Text-to-image finetuning - {repo_id} From bc787907f1e5569ffab1e654183a84ab2750fcd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= Date: Wed, 21 Feb 2024 16:47:46 +0300 Subject: [PATCH 04/11] Fix exponential moving average (EMA) during training --- examples/text_to_image/train_text_to_image_sdxl.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 2d77e9c8bfa3..78021b5afed4 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -951,6 +951,9 @@ def collate_fn(examples): unet, optimizer, train_dataloader, lr_scheduler ) + if args.use_ema: + ema_unet.to(accelerator.device) + # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if overrode_max_train_steps: @@ -1126,6 +1129,8 @@ def compute_time_ids(original_size, crops_coords_top_left): # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: + if args.use_ema: + ema_unet.step(unet.parameters()) progress_bar.update(1) global_step += 1 accelerator.log({"train_loss": train_loss}, step=global_step) From 2b370a3979f11ce51b44a04d0372319026dbe3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:21:17 +0300 Subject: [PATCH 05/11] Discard changes to examples/text_to_image/README.md --- examples/text_to_image/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/text_to_image/README.md b/examples/text_to_image/README.md index f2931d3f347e..a56cccbcf5d7 100644 --- a/examples/text_to_image/README.md +++ b/examples/text_to_image/README.md @@ -4,7 +4,7 @@ The `train_text_to_image.py` script shows how to fine-tune stable diffusion mode ___Note___: -___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___ +___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset.___ ## Running locally with PyTorch From 7ab995d6c725d1ba2c42901879964e8373cd8d8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:21:25 +0300 Subject: [PATCH 06/11] Discard changes to examples/text_to_image/README_sdxl.md --- examples/text_to_image/README_sdxl.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/text_to_image/README_sdxl.md b/examples/text_to_image/README_sdxl.md index 349feef5008e..0d35b2a8ab9d 100644 --- a/examples/text_to_image/README_sdxl.md +++ b/examples/text_to_image/README_sdxl.md @@ -2,7 +2,7 @@ The `train_text_to_image_sdxl.py` script shows how to fine-tune Stable Diffusion XL (SDXL) on your own dataset. -🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset. 🚨 +🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparamters to get the best result on your dataset. 🚨 ## Running locally with PyTorch @@ -238,8 +238,8 @@ accelerate launch --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lor --validation_epochs=20 \ --seed=1234 \ --output_dir="sd-pokemon-model-lora-sdxl" \ - --validation_prompt="cute dragon creature" - + --validation_prompt="cute dragon creature" + ``` From 314befa7d3871308a6c796e0e85d536cdb9d2f6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:21:58 +0300 Subject: [PATCH 07/11] Discard changes to examples/text_to_image/train_text_to_image.py --- examples/text_to_image/train_text_to_image.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py index f1f1f314dfe1..6fb8b17944eb 100644 --- a/examples/text_to_image/train_text_to_image.py +++ b/examples/text_to_image/train_text_to_image.py @@ -12,7 +12,6 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. import argparse import logging @@ -72,7 +71,7 @@ def save_model_card( repo_folder: str = None, ): img_str = "" - if images is not None: + if len(images) > 0: image_grid = make_image_grid(images, 1, len(args.validation_prompts)) image_grid.save(os.path.join(repo_folder, "val_imgs_grid.png")) img_str += "![val_imgs_grid](./val_imgs_grid.png)\n" @@ -396,7 +395,7 @@ def parse_args(): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -636,7 +635,7 @@ def load_model_hook(models, input_dir): ema_unet.to(accelerator.device) del load_model - for _ in range(len(models)): + for i in range(len(models)): # pop models so that they are not loaded again model = models.pop() @@ -811,7 +810,7 @@ def collate_fn(examples): if args.use_ema: ema_unet.to(accelerator.device) - # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": From 98469c3705fa5e37cd3ad3b0cb1544f649fad29e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:22:08 +0300 Subject: [PATCH 08/11] Discard changes to examples/text_to_image/train_text_to_image_flax.py --- .../text_to_image/train_text_to_image_flax.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_flax.py b/examples/text_to_image/train_text_to_image_flax.py index d1dc31e06403..ac6476fb0386 100644 --- a/examples/text_to_image/train_text_to_image_flax.py +++ b/examples/text_to_image/train_text_to_image_flax.py @@ -1,19 +1,3 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import argparse import logging import math From 1c11a0de94007c247b1068014974de6d2d12f360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:22:26 +0300 Subject: [PATCH 09/11] Discard changes to examples/text_to_image/train_text_to_image_lora.py --- examples/text_to_image/train_text_to_image_lora.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py index 47e0053f346f..47e67f695b08 100644 --- a/examples/text_to_image/train_text_to_image_lora.py +++ b/examples/text_to_image/train_text_to_image_lora.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python # coding=utf-8 # Copyright 2024 The HuggingFace Inc. team. All rights reserved. # @@ -61,10 +60,9 @@ def save_model_card( repo_id: str, images: list = None, base_model: str = None, dataset_name: str = None, repo_folder: str = None ): img_str = "" - if images is not None: - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" model_description = f""" # LoRA text2image fine-tuning - {repo_id} @@ -295,7 +293,7 @@ def parse_args(): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -456,7 +454,7 @@ def main(): vae.requires_grad_(False) text_encoder.requires_grad_(False) - # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": From 6fd8aa197f716035183c6bfc91e300c1bdb46c55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:22:36 +0300 Subject: [PATCH 10/11] Discard changes to examples/text_to_image/train_text_to_image_lora_sdxl.py --- examples/text_to_image/train_text_to_image_lora_sdxl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_lora_sdxl.py b/examples/text_to_image/train_text_to_image_lora_sdxl.py index be178d36dcde..79bc66288338 100644 --- a/examples/text_to_image/train_text_to_image_lora_sdxl.py +++ b/examples/text_to_image/train_text_to_image_lora_sdxl.py @@ -370,7 +370,7 @@ def parse_args(input_args=None): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -585,7 +585,7 @@ def main(args): text_encoder_two.requires_grad_(False) unet.requires_grad_(False) - # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision + # For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": @@ -648,7 +648,7 @@ def unwrap_model(model): def save_model_hook(models, weights, output_dir): if accelerator.is_main_process: # there are only two options here. Either are just the unet attn processor layers - # or there are the unet and text encoder attn layers + # or there are the unet and text encoder atten layers unet_lora_layers_to_save = None text_encoder_one_lora_layers_to_save = None text_encoder_two_lora_layers_to_save = None From 0271c3941c62b1275ae26d731e1d3a0b1336cd00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=2E=20Tolga=20Cang=C3=B6z?= <46008593+standardAI@users.noreply.github.com> Date: Wed, 21 Feb 2024 17:25:15 +0300 Subject: [PATCH 11/11] Update train_text_to_image_sdxl.py --- .../text_to_image/train_text_to_image_sdxl.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/examples/text_to_image/train_text_to_image_sdxl.py b/examples/text_to_image/train_text_to_image_sdxl.py index 78021b5afed4..b54a132e9ec6 100644 --- a/examples/text_to_image/train_text_to_image_sdxl.py +++ b/examples/text_to_image/train_text_to_image_sdxl.py @@ -74,10 +74,9 @@ def save_model_card( vae_path: str = None, ): img_str = "" - if images is not None: - for i, image in enumerate(images): - image.save(os.path.join(repo_folder, f"image_{i}.png")) - img_str += f"![img_{i}](./image_{i}.png)\n" + for i, image in enumerate(images): + image.save(os.path.join(repo_folder, f"image_{i}.png")) + img_str += f"![img_{i}](./image_{i}.png)\n" model_description = f""" # Text-to-image finetuning - {repo_id} @@ -420,7 +419,7 @@ def parse_args(input_args=None): "--prediction_type", type=str, default=None, - help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.", + help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.", ) parser.add_argument( "--hub_model_id", @@ -684,7 +683,7 @@ def main(args): # Set unet as trainable. unet.train() - # For mixed precision training we cast all non-trainable weights to half-precision + # For mixed precision training we cast all non-trainable weigths to half-precision # as these weights are only used for inference, keeping weights in full precision is not required. weight_dtype = torch.float32 if accelerator.mixed_precision == "fp16": @@ -739,7 +738,7 @@ def load_model_hook(models, input_dir): ema_unet.to(accelerator.device) del load_model - for _ in range(len(models)): + for i in range(len(models)): # pop models so that they are not loaded again model = models.pop() @@ -966,7 +965,7 @@ def collate_fn(examples): if accelerator.is_main_process: accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args)) - # Function for unwrapping if torch.compile() was used in accelerate. + # Function for unwraping if torch.compile() was used in accelerate. def unwrap_model(model): model = accelerator.unwrap_model(model) model = model._orig_mod if is_compiled_module(model) else model