From a85b8543cdbd64e9966f5e1e09239b3a0051dc28 Mon Sep 17 00:00:00 2001
From: William Berman
Date: Wed, 15 Feb 2023 14:30:45 -0800
Subject: [PATCH 1/2] add total number checkpoints to training scripts

---
 examples/dreambooth/train_dreambooth.py | 15 ++++++++++++++-
 examples/dreambooth/train_dreambooth_lora.py | 15 ++++++++++++++-
 .../train_dreambooth_inpaint.py | 15 ++++++++++++++-
 .../train_dreambooth_inpaint_lora.py | 15 ++++++++++++++-
 .../train_multi_subject_dreambooth.py | 15 ++++++++++++++-
 .../text_to_image/train_text_to_image.py | 15 ++++++++++++++-
 .../textual_inversion/textual_inversion.py | 15 ++++++++++++++-
 .../train_unconditional.py | 14 ++++++++++++++
 examples/text_to_image/train_text_to_image.py | 15 ++++++++++++++-
 .../text_to_image/train_text_to_image_lora.py | 15 ++++++++++++++-
 examples/textual_inversion/textual_inversion.py | 15 ++++++++++++++-
 .../train_unconditional.py | 14 ++++++++++++++
 12 files changed, 168 insertions(+), 10 deletions(-)

diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index ce8f0a52b8a1..177f3b66b70a 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -30,7 +30,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from packaging import version
 from PIL import Image
@@ -195,6 +195,16 @@ def parse_args(input_args=None):
             "instructions."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -488,11 +498,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
     logging_dir = Path(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index aa38761f4f8f..52951577ad34 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -29,7 +29,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from PIL import Image
 from torch.utils.data import Dataset
@@ -242,6 +242,16 @@ def parse_args(input_args=None):
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -526,11 +536,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
     logging_dir = Path(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     if args.report_to == "wandb":
diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
index 789440e750f1..c8dc819b2631 100644
--- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
+++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py
@@ -13,7 +13,7 @@
 import torch.utils.checkpoint
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from PIL import Image, ImageDraw
 from torch.utils.data import Dataset
@@ -258,6 +258,16 @@ def parse_args():
             " using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -406,11 +416,14 @@ def main():
     args = parse_args()
     logging_dir = Path(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with="tensorboard",
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py
index 5d6f249d8469..2c6f1bc46581 100644
--- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py
+++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py
@@ -12,7 +12,7 @@
 import torch.utils.checkpoint
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from PIL import Image, ImageDraw
 from torch.utils.data import Dataset
@@ -254,6 +254,16 @@ def parse_args():
             " using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -405,11 +415,14 @@ def main():
     args = parse_args()
     logging_dir = Path(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with="tensorboard",
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
diff --git a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
index 3865deb2e3a9..85bacc7aa2e4 100644
--- a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
+++ b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py
@@ -15,7 +15,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from PIL import Image
 from torch.utils.data import Dataset
@@ -170,6 +170,16 @@ def parse_args(input_args=None):
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -466,11 +476,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = Path(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + project_config=accelerator_project_config, ) # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index 4bca25167b0e..c21e152004fa 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -29,7 +29,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami from onnxruntime.training.ortmodule import ORTModule @@ -274,6 +274,16 @@ def parse_args(): " training using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -322,11 +332,14 @@ def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + accelerator_project_config=accelerator_project_config, ) # Make one log on every process with the configuration for debugging. diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index f54e2d3e3f53..100d742b9aca 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -30,7 +30,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, create_repo, whoami from onnxruntime.training.ortmodule import ORTModule @@ -290,6 +290,16 @@ def parse_args(): " training using `--resume_from_checkpoint`." 
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -467,11 +477,14 @@ def main():
     args = parse_args()
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     if args.report_to == "wandb":
diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
index cd120460c10f..429f3eac4faf 100644
--- a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
+++ b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
@@ -11,6 +11,7 @@
 import torch.nn.functional as F
 from accelerate import Accelerator
 from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration
 from datasets import load_dataset
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from onnxruntime.training.ortmodule import ORTModule
@@ -231,6 +232,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -265,11 +276,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.logger,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     if args.logger == "tensorboard":
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 39089a85680f..f915bdb9cfae 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -30,7 +30,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from packaging import version
@@ -275,6 +275,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -333,11 +343,14 @@ def main():
         )
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     # Make one log on every process with the configuration for debugging.
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index abc535594d8c..69c978aca518 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -30,7 +30,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from torchvision import transforms
@@ -310,6 +310,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -354,11 +364,14 @@ def main():
     args = parse_args()
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
     if args.report_to == "wandb":
         if not is_wandb_available():
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index c61c2ae44c8a..9117ca5336db 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -29,7 +29,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 
 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
@@ -288,6 +288,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -465,11 +475,14 @@ def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + project_config=accelerator_project_config, ) if args.report_to == "wandb": diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index f76594a78c32..4b8c8fccdbc7 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -12,6 +12,7 @@ import torch.nn.functional as F from accelerate import Accelerator from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration from datasets import load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami from packaging import version @@ -239,6 +240,16 @@ def parse_args(): " training using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -273,11 +284,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.logger, logging_dir=logging_dir, + project_config=accelerator_project_config, ) if args.logger == "tensorboard": From c9a72bf7bba75f609b228126f3f05e9e8ed5f2ed Mon Sep 17 00:00:00 2001 From: Will Berman Date: Wed, 15 Feb 2023 23:20:16 -0800 Subject: [PATCH 2/2] Update examples/dreambooth/train_dreambooth.py Co-authored-by: Sayak Paul --- examples/dreambooth/train_dreambooth.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py index 177f3b66b70a..955f6ec9547e 100644 --- a/examples/dreambooth/train_dreambooth.py +++ b/examples/dreambooth/train_dreambooth.py @@ -202,7 +202,7 @@ def parse_args(input_args=None): help=( "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" - " for more docs" + " for more details" ), ) parser.add_argument(