diff --git a/examples/dreambooth/train_dreambooth.py b/examples/dreambooth/train_dreambooth.py
index ce8f0a52b8a1..955f6ec9547e 100644
--- a/examples/dreambooth/train_dreambooth.py
+++ b/examples/dreambooth/train_dreambooth.py
@@ -30,7 +30,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from packaging import version
 from PIL import Image
@@ -195,6 +195,16 @@ def parse_args(input_args=None):
             "instructions."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more details"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -488,11 +498,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
     logging_dir = Path(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate
diff --git a/examples/dreambooth/train_dreambooth_lora.py b/examples/dreambooth/train_dreambooth_lora.py
index aa38761f4f8f..52951577ad34 100644
--- a/examples/dreambooth/train_dreambooth_lora.py
+++ b/examples/dreambooth/train_dreambooth_lora.py
@@ -29,7 +29,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from PIL import Image
 from torch.utils.data import Dataset
@@ -242,6 +242,16 @@ def parse_args(input_args=None):
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -526,11 +536,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = Path(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + project_config=accelerator_project_config, ) if args.report_to == "wandb": diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py index 789440e750f1..c8dc819b2631 100644 --- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py +++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint.py @@ -13,7 +13,7 @@ import torch.utils.checkpoint from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, create_repo, whoami from PIL import Image, ImageDraw from torch.utils.data import Dataset @@ -258,6 +258,16 @@ def parse_args(): " using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -406,11 +416,14 @@ def main(): args = parse_args() logging_dir = Path(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with="tensorboard", logging_dir=logging_dir, + accelerator_project_config=accelerator_project_config, ) # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate diff --git a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py index 5d6f249d8469..2c6f1bc46581 100644 --- a/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py +++ b/examples/research_projects/dreambooth_inpaint/train_dreambooth_inpaint_lora.py @@ -12,7 +12,7 @@ import torch.utils.checkpoint from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, create_repo, whoami from PIL import Image, ImageDraw from torch.utils.data import Dataset @@ -254,6 +254,16 @@ def parse_args(): " using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. 
Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -405,11 +415,14 @@ def main(): args = parse_args() logging_dir = Path(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with="tensorboard", logging_dir=logging_dir, + accelerator_project_config=accelerator_project_config, ) # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate diff --git a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py index 3865deb2e3a9..85bacc7aa2e4 100644 --- a/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py +++ b/examples/research_projects/multi_subject_dreambooth/train_multi_subject_dreambooth.py @@ -15,7 +15,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, create_repo, whoami from PIL import Image from torch.utils.data import Dataset @@ -170,6 +170,16 @@ def parse_args(input_args=None): " training using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." 
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -466,11 +476,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = Path(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + project_config=accelerator_project_config, ) # Currently, it's not possible to do gradient accumulation when training two models with accelerate.accumulate diff --git a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py index 4bca25167b0e..c21e152004fa 100644 --- a/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py +++ b/examples/research_projects/onnxruntime/text_to_image/train_text_to_image.py @@ -29,7 +29,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from datasets import load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami from onnxruntime.training.ortmodule import ORTModule @@ -274,6 +274,16 @@ def parse_args(): " training using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -322,11 +332,14 @@ def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + accelerator_project_config=accelerator_project_config, ) # Make one log on every process with the configuration for debugging. diff --git a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py index f54e2d3e3f53..100d742b9aca 100644 --- a/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py +++ b/examples/research_projects/onnxruntime/textual_inversion/textual_inversion.py @@ -30,7 +30,7 @@ import transformers from accelerate import Accelerator from accelerate.logging import get_logger -from accelerate.utils import set_seed +from accelerate.utils import ProjectConfiguration, set_seed from huggingface_hub import HfFolder, Repository, create_repo, whoami from onnxruntime.training.ortmodule import ORTModule @@ -290,6 +290,16 @@ def parse_args(): " training using `--resume_from_checkpoint`." 
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -467,11 +477,14 @@ def main():
     args = parse_args()
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     if args.report_to == "wandb":
diff --git a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
index cd120460c10f..429f3eac4faf 100644
--- a/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
+++ b/examples/research_projects/onnxruntime/unconditional_image_generation/train_unconditional.py
@@ -11,6 +11,7 @@
 import torch.nn.functional as F
 from accelerate import Accelerator
 from accelerate.logging import get_logger
+from accelerate.utils import ProjectConfiguration
 from datasets import load_dataset
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from onnxruntime.training.ortmodule import ORTModule
@@ -231,6 +232,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -265,11 +276,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def main(args):
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.logger,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     if args.logger == "tensorboard":
diff --git a/examples/text_to_image/train_text_to_image.py b/examples/text_to_image/train_text_to_image.py
index 39089a85680f..f915bdb9cfae 100644
--- a/examples/text_to_image/train_text_to_image.py
+++ b/examples/text_to_image/train_text_to_image.py
@@ -30,7 +30,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from packaging import version
@@ -275,6 +275,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -333,11 +343,14 @@ def main():
         )
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
 
     # Make one log on every process with the configuration for debugging.
diff --git a/examples/text_to_image/train_text_to_image_lora.py b/examples/text_to_image/train_text_to_image_lora.py
index abc535594d8c..69c978aca518 100644
--- a/examples/text_to_image/train_text_to_image_lora.py
+++ b/examples/text_to_image/train_text_to_image_lora.py
@@ -30,7 +30,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 from torchvision import transforms
@@ -310,6 +310,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+            " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state"
+            " for more docs"
+        ),
+    )
     parser.add_argument(
         "--resume_from_checkpoint",
         type=str,
@@ -354,11 +364,14 @@ def main():
     args = parse_args()
     logging_dir = os.path.join(args.output_dir, args.logging_dir)
 
+    accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit)
+
     accelerator = Accelerator(
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         mixed_precision=args.mixed_precision,
         log_with=args.report_to,
         logging_dir=logging_dir,
+        project_config=accelerator_project_config,
     )
     if args.report_to == "wandb":
         if not is_wandb_available():
diff --git a/examples/textual_inversion/textual_inversion.py b/examples/textual_inversion/textual_inversion.py
index c61c2ae44c8a..9117ca5336db 100644
--- a/examples/textual_inversion/textual_inversion.py
+++ b/examples/textual_inversion/textual_inversion.py
@@ -29,7 +29,7 @@
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.utils import set_seed
+from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import HfFolder, Repository, create_repo, whoami
 
 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
@@ -288,6 +288,16 @@ def parse_args():
             " training using `--resume_from_checkpoint`."
         ),
     )
+    parser.add_argument(
+        "--checkpointing_steps_total_limit",
+        type=int,
+        default=None,
+        help=(
+            "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`."
+ " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -465,11 +475,14 @@ def main(): args = parse_args() logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.report_to, logging_dir=logging_dir, + project_config=accelerator_project_config, ) if args.report_to == "wandb": diff --git a/examples/unconditional_image_generation/train_unconditional.py b/examples/unconditional_image_generation/train_unconditional.py index f76594a78c32..4b8c8fccdbc7 100644 --- a/examples/unconditional_image_generation/train_unconditional.py +++ b/examples/unconditional_image_generation/train_unconditional.py @@ -12,6 +12,7 @@ import torch.nn.functional as F from accelerate import Accelerator from accelerate.logging import get_logger +from accelerate.utils import ProjectConfiguration from datasets import load_dataset from huggingface_hub import HfFolder, Repository, create_repo, whoami from packaging import version @@ -239,6 +240,16 @@ def parse_args(): " training using `--resume_from_checkpoint`." ), ) + parser.add_argument( + "--checkpointing_steps_total_limit", + type=int, + default=None, + help=( + "Max number of checkpoints to store. Passed as `total_limit` to the `Accelerator` `ProjectConfiguration`." + " See Accelerator::save_state https://huggingface.co/docs/accelerate/package_reference/accelerator#accelerate.Accelerator.save_state" + " for more docs" + ), + ) parser.add_argument( "--resume_from_checkpoint", type=str, @@ -273,11 +284,14 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: def main(args): logging_dir = os.path.join(args.output_dir, args.logging_dir) + accelerator_project_config = ProjectConfiguration(total_limit=args.checkpointing_steps_total_limit) + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, log_with=args.logger, logging_dir=logging_dir, + project_config=accelerator_project_config, ) if args.logger == "tensorboard":