From eea69a6c8d8e39435bb372941b516494fcc901b1 Mon Sep 17 00:00:00 2001
From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
Date: Wed, 22 Nov 2023 15:53:24 +0530
Subject: [PATCH 01/12] add code changes

1. Refactor FSDP
2. Add `--save_only_model` option: When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state.
3. Bump up the minimum `accelerate` version to `0.21.0`
---
 setup.py                                |   2 +-
 src/transformers/modeling_utils.py      |  12 +-
 src/transformers/trainer.py             | 225 ++++++++++---------------
 src/transformers/trainer_utils.py       |   2 +
 src/transformers/training_args.py       |  30 +++-
 src/transformers/utils/import_utils.py  |   2 +-
 6 files changed, 122 insertions(+), 151 deletions(-)

diff --git a/setup.py b/setup.py
index deccac468a8a6..258d8a77209c4 100644
--- a/setup.py
+++ b/setup.py
@@ -96,7 +96,7 @@
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
     "Pillow<10.0.0",
-    "accelerate>=0.20.3",
+    "accelerate>=0.21.0",
     "av==9.2.0",  # Latest version of PyAV (10.0.0) has issues with audio stream.
     "beautifulsoup4",
     "codecarbon==1.2.0",
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index cee472036b274..d4241aa3badce 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -132,7 +132,7 @@ def is_fsdp_enabled():
     )


-def is_fsdp_enabled_and_dist_rank_0():
+def is_fsdp_enabled_and_local_dist_rank_0():
     return is_fsdp_enabled() and int(os.environ.get("LOCAL_RANK", -1)) == 0


@@ -473,14 +473,12 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike]):
         )
         return safe_load_file(checkpoint_file)
     try:
-        if (
-            (is_deepspeed_zero3_enabled() or is_fsdp_enabled())
-            and torch.distributed.is_initialized()
-            and torch.distributed.get_rank() > 0
-        ):
+        if (
+            is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0
+        ) or (is_fsdp_enabled() and not is_fsdp_enabled_and_local_dist_rank_0()):
             map_location = "meta"
         else:
             map_location = "cpu"
         return torch.load(checkpoint_file, map_location=map_location)
     except Exception as e:
         try:
@@ -3900,7 +3898,7 @@ def _find_mismatched_keys(
                 ignore_mismatched_sizes,
             )
             if low_cpu_mem_usage:
-                if not is_fsdp_enabled() or is_fsdp_enabled_and_local_dist_rank_0():
+                if not is_fsdp_enabled() or is_fsdp_enabled_and_local_dist_rank_0():
                     new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
                         model_to_load,
                         state_dict,
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 25100dad69bca..4cbc07db04dfd 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -193,15 +193,15 @@
 if is_accelerate_available():
     from accelerate import Accelerator, skip_first_batches
     from accelerate import __version__ as accelerate_version
-    from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin
-
-    if version.parse(accelerate_version) > version.parse("0.20.3"):
-        from accelerate.utils import (
-            load_fsdp_model,
-            load_fsdp_optimizer,
-            save_fsdp_model,
-            save_fsdp_optimizer,
-        )
+    from accelerate.utils import (
+        DistributedDataParallelKwargs,
+        GradientAccumulationPlugin,
+        load_fsdp_model,
+        load_fsdp_optimizer,
+        save_fsdp_model,
+        save_fsdp_optimizer,
+    )
+
     DATA_SAMPLERS = [RandomSampler]
     if version.parse(accelerate_version) > version.parse("0.23.0"):
         from accelerate.data_loader import SeedableRandomSampler
@@ -226,6 +226,7 @@
 OPTIMIZER_NAME_BIN = "optimizer.bin"
 SCHEDULER_NAME =
"scheduler.pt" SCALER_NAME = "scaler.pt" +FSDP_MODEL_NAME = "pytorch_model_fsdp" class Trainer: @@ -415,7 +416,7 @@ def __init__( " model, please make sure that you have installed `bitsandbytes>=0.37.0`. " ) - self.fsdp = None + self.is_fsdp_xla_enabled = args.fsdp_config["xla"] if len(args.fsdp) > 0: if self.is_deepspeed_enabled: raise ValueError( @@ -424,32 +425,6 @@ def __init__( if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED: raise ValueError("Using fsdp only works in distributed training.") - # dep_version_check("torch>=1.12.0") - # Would have to update setup.py with torch>=1.12.0 - # which isn't ideally given that it will force people not using FSDP to also use torch>=1.12.0 - # below is the current alternative. - if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.12.0"): - raise ValueError("FSDP requires PyTorch >= 1.12.0") - - from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, ShardingStrategy - - if FSDPOption.FULL_SHARD in args.fsdp: - self.fsdp = ShardingStrategy.FULL_SHARD - elif FSDPOption.SHARD_GRAD_OP in args.fsdp: - self.fsdp = ShardingStrategy.SHARD_GRAD_OP - elif FSDPOption.NO_SHARD in args.fsdp: - self.fsdp = ShardingStrategy.NO_SHARD - - self.backward_prefetch = BackwardPrefetch.BACKWARD_PRE - if "backward_prefetch" in self.args.fsdp_config and "backward_post" in self.args.fsdp_config.get( - "backward_prefetch", [] - ): - self.backward_prefetch = BackwardPrefetch.BACKWARD_POST - - self.limit_all_gathers = False - if self.args.fsdp_config.get("limit_all_gathers", False): - self.limit_all_gathers = True - # one place to sort out whether to place the model on device or not # postpone switching model to cuda when: # 1. MP - since we are trying to fit a much bigger than 1 gpu model @@ -462,7 +437,7 @@ def __init__( self.is_model_parallel or self.is_deepspeed_enabled or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train) - or (self.fsdp is not None) + or self.is_fsdp_xla_enabled or self.is_fsdp_enabled ): self.place_model_on_device = False @@ -513,7 +488,7 @@ def __init__( " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and" " `model.to(xm.xla_device())` is performed before the optimizer creation in your script." 
) - if (self.is_deepspeed_enabled or (self.fsdp is not None)) and ( + if (self.is_deepspeed_enabled or self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and ( self.optimizer is not None or self.lr_scheduler is not None ): raise RuntimeError( @@ -1367,7 +1342,7 @@ def _wrap_model(self, model, training=True, dataloader=None): # Distributed training (should be after apex fp16 initialization) # Distributed training using PyTorch FSDP - if self.fsdp is not None and self.args.fsdp_config["xla"]: + if self.is_fsdp_xla_enabled: try: from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP from torch_xla.distributed.fsdp import checkpoint_module @@ -1626,7 +1601,7 @@ def _inner_training_loop( else: debug_overflow = DebugUnderflowOverflow(self.model) # noqa - delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled # We need to reset the scheduler, as its parameters may be different on subsequent calls if self._created_lr_scheduler: @@ -1676,8 +1651,6 @@ def _inner_training_loop( use_accelerator_prepare = True if model is self.model else False if delay_optimizer_creation: - if use_accelerator_prepare: - self.model = self.accelerator.prepare(self.model) self.create_optimizer_and_scheduler(num_training_steps=max_steps) # prepare using `accelerator` prepare @@ -1895,9 +1868,7 @@ def _inner_training_loop( ): # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered # in accelerate. So, explicitly enable sync gradients to True in that case. - if is_last_step_and_steps_less_than_grad_acc or ( - version.parse(accelerate_version) <= version.parse("0.20.3") - ): + if is_last_step_and_steps_less_than_grad_acc: self.accelerator.gradient_state._set_sync_gradients(True) # Gradient clipping @@ -2051,7 +2022,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None): safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME) safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME) is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and any( - WEIGHTS_NAME.split(".")[0] in folder_name + FSDP_MODEL_NAME in folder_name for folder_name in os.listdir(resume_from_checkpoint) if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) ) @@ -2360,56 +2331,12 @@ def _save_checkpoint(self, model, trial, metrics=None): run_dir = self._get_output_dir(trial=trial) output_dir = os.path.join(run_dir, checkpoint_folder) self.save_model(output_dir, _internal_call=True) - if self.is_deepspeed_enabled: - # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed - # config `stage3_gather_16bit_weights_on_model_save` is True - self.model_wrapped.save_checkpoint(output_dir) - # Save optimizer and scheduler - if self.fsdp or self.is_fsdp_enabled: - if self.is_fsdp_enabled: - save_fsdp_optimizer( - self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir - ) - else: - # FSDP has a different interface for saving optimizer states. - # Needs to be called on all ranks to gather all states. - # full_optim_state_dict will be deprecated after Pytorch 2.2! 
- full_osd = self.model.__class__.full_optim_state_dict(self.model, self.optimizer) - torch.save(full_osd, os.path.join(output_dir, OPTIMIZER_NAME)) - - if is_torch_tpu_available(): - xm.rendezvous("saving_optimizer_states") - xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) - with warnings.catch_warnings(record=True) as caught_warnings: - xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - reissue_pt_warnings(caught_warnings) - elif is_sagemaker_mp_enabled(): - opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False) - smp.barrier() - if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state: - smp.save( - opt_state_dict, - os.path.join(output_dir, OPTIMIZER_NAME), - partial=True, - v3=smp.state.cfg.shard_optimizer_state, - ) - elif self.args.should_save and not self.is_deepspeed_enabled and not (self.fsdp or self.is_fsdp_enabled): - # deepspeed.save_checkpoint above saves model/optim/sched - torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) - - # Save SCHEDULER & SCALER - is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance( - self.lr_scheduler, DeepSpeedSchedulerWrapper - ) - if ( - self.args.should_save - and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler) - and not is_torch_tpu_available() - ): - with warnings.catch_warnings(record=True) as caught_warnings: - torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) - reissue_pt_warnings(caught_warnings) + if not self.args.save_only_model: + # Save optimizer and scheduler + self._save_optimizer_and_scheduler(output_dir) + # Save RNG state + self._save_rng_state(output_dir) # Determine the new best metric / best model checkpoint if metrics is not None and self.args.metric_for_best_model is not None: @@ -2431,6 +2358,14 @@ def _save_checkpoint(self, model, trial, metrics=None): if self.args.should_save: self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + if self.args.push_to_hub: + self._push_from_checkpoint(output_dir) + + # Maybe delete some older checkpoints. + if self.args.should_save: + self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + + def _save_rng_state(self, output_dir): # Save RNG state in non-distributed training rng_states = { "python": random.getstate(), @@ -2462,12 +2397,49 @@ def _save_checkpoint(self, model, trial, metrics=None): else: torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.process_index}.pth")) - if self.args.push_to_hub: - self._push_from_checkpoint(output_dir) + def _save_optimizer_and_scheduler(self, output_dir): + if is_torch_tpu_available(): + xm.rendezvous("saving_optimizer_states") + xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + with warnings.catch_warnings(record=True) as caught_warnings: + xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + reissue_pt_warnings(caught_warnings) + elif is_sagemaker_mp_enabled(): + opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False) + smp.barrier() + if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state: + smp.save( + opt_state_dict, + os.path.join(output_dir, OPTIMIZER_NAME), + partial=True, + v3=smp.state.cfg.shard_optimizer_state, + ) + elif self.is_deepspeed_enabled: + # under zero3 model file itself doesn't get saved since it's bogus! 
Unless deepspeed + # config `stage3_gather_16bit_weights_on_model_save` is True + self.model_wrapped.save_checkpoint(output_dir) + elif self.is_fsdp_enabled: + # save fsdp specific ckpt for resuming from ckpt + save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir) + save_fsdp_optimizer( + self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir + ) + elif self.args.should_save: + # deepspeed.save_checkpoint above saves model/optim/sched + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) - # Maybe delete some older checkpoints. - if self.args.should_save: - self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + # Save SCHEDULER & SCALER + is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance( + self.lr_scheduler, DeepSpeedSchedulerWrapper + ) + if ( + self.args.should_save + and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler) + and not is_torch_tpu_available() + ): + with warnings.catch_warnings(record=True) as caught_warnings: + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + reissue_pt_warnings(caught_warnings) def _load_optimizer_and_scheduler(self, checkpoint): """If optimizer and scheduler states exist, load them.""" @@ -2535,23 +2507,14 @@ def opt_load_hook(mod, opt): # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more # likely to get OOM on CPU (since we load num_gpu times the optimizer state map_location = self.args.device if self.args.world_size > 1 else "cpu" - if self.fsdp or self.is_fsdp_enabled: - if self.is_fsdp_enabled: - load_fsdp_optimizer( - self.accelerator.state.fsdp_plugin, - self.accelerator, - self.optimizer, - self.model, - checkpoint, - ) - else: - full_osd = None - # In FSDP, we need to load the full optimizer state dict on rank 0 and then shard it - if self.args.process_index == 0: - full_osd = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME)) - # call scatter_full_optim_state_dict on all ranks - sharded_osd = self.model.__class__.scatter_full_optim_state_dict(full_osd, self.model) - self.optimizer.load_state_dict(sharded_osd) + if self.is_fsdp_enabled: + load_fsdp_optimizer( + self.accelerator.state.fsdp_plugin, + self.accelerator, + self.optimizer, + self.model, + checkpoint, + ) else: self.optimizer.load_state_dict( torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location) @@ -2826,19 +2789,14 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa if IS_SAGEMAKER_MP_POST_1_10: # 'user_content.pt' indicates model state_dict saved with smp >= 1.10 Path(os.path.join(output_dir, "user_content.pt")).touch() - elif self.fsdp is not None or self.is_fsdp_enabled: - state_dict = self.model.state_dict() if not self.is_fsdp_enabled else {} - if self.args.should_save: - self._save(output_dir, state_dict=state_dict) - if self.is_fsdp_enabled: - # remove the dummy state_dict - remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]) - save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir) - + elif self.is_fsdp_enabled: + if ("FULL_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)) and ( + version.parse(accelerate_version) > version.parse("0.24.1") + ): + state_dict = self.accelerator.get_state_dict(self.model) + if self.args.should_save: + self._save(output_dir, state_dict=state_dict) elif 
self.is_deepspeed_enabled: - # this takes care of everything as long as we aren't under zero3 - if version.parse(accelerate_version) <= version.parse("0.20.3"): - raise ValueError("Install Accelerate from main branch") try: state_dict = self.accelerator.get_state_dict(self.deepspeed) if self.args.should_save: @@ -3247,11 +3205,7 @@ def evaluation_loop( self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. - if ( - args.eval_accumulation_steps is not None - and (step + 1) % args.eval_accumulation_steps == 0 - and (self.accelerator.sync_gradients or version.parse(accelerate_version) > version.parse("0.20.3")) - ): + if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: if losses_host is not None: losses = nested_numpify(losses_host) all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0) @@ -3934,8 +3888,7 @@ def _add_sm_patterns_to_gitignore(self) -> None: def create_accelerator_and_postprocess(self): grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps} - if version.parse(accelerate_version) > version.parse("0.20.3"): - grad_acc_kwargs["sync_with_dataloader"] = False + grad_acc_kwargs["sync_with_dataloader"] = False gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs) # create accelerator object diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index e6f26d0df5196..dbd868d112024 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -727,6 +727,8 @@ class FSDPOption(ExplicitEnum): FULL_SHARD = "full_shard" SHARD_GRAD_OP = "shard_grad_op" NO_SHARD = "no_shard" + HYBRID_SHARD = "hybrid_shard" + HYBRID_SHARD_ZERO2 = "hybrid_shard_zero2" OFFLOAD = "offload" AUTO_WRAP = "auto_wrap" diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b368d86e0ed8e..c249adfbb15d7 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -304,6 +304,11 @@ class TrainingArguments: This should not be activated when the different nodes use the same storage as the files will be saved with the same names for each node. + save_only_model (`bool`, *optional*, defaults to `False`): + When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state. + Note that when this is true, you won't be able to resume training from checkpoint. + This enables you to save storage by not storing the optimizer, scheduler & rng state. + You can only load the model using `from_pretrained` with this option set to `True`. use_cpu (`bool`, *optional*, defaults to `False`): Whether or not to use cpu. If set to False, we will use cuda or mps device if available. seed (`int`, *optional*, defaults to 42): @@ -418,6 +423,8 @@ class TrainingArguments: - `"full_shard"`: Shard parameters, gradients and optimizer states. - `"shard_grad_op"`: Shard optimizer states and gradients. + - `"hybrid_shard"`: Apply ``FULL_SHARD`` within a node, and replicate parameters across nodes. + - `"hybrid_shard_zero2"`: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across nodes. - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and `"shard_grad_op"`). - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. 
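Editorial aside, not part of the patch: the training_args.py hunks above and below document the new `hybrid_shard`/`hybrid_shard_zero2` sharding strategies and the new `save_only_model` flag introduced by this change. As an illustration only, here is a minimal, hypothetical sketch of how these options would be combined once the patch lands; the model/Trainer wiring is omitted, and the FSDP options assume a multi-GPU launch via `torchrun` or `accelerate launch`.

from transformers import TrainingArguments

# Hypothetical configuration exercising the options introduced in this patch.
args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=8,
    save_strategy="steps",
    save_steps=500,
    save_only_model=True,           # checkpoints keep only model weights, no optimizer/scheduler/RNG state
    fsdp="hybrid_shard auto_wrap",  # FULL_SHARD within each node, parameter replication across nodes
)
# Trainer(model=model, args=args, train_dataset=train_dataset).train()
# Note: with save_only_model=True, resuming from these checkpoints is not supported;
# reload the saved weights with `from_pretrained` instead.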
@@ -452,7 +459,7 @@ class TrainingArguments:
                 FSDP's limit_all_gathers (useful only when `fsdp` field is passed).
                 If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight
                 all-gathers.
-            - use_orig_params (`bool`, *optional*, defaults to `False`)
+            - use_orig_params (`bool`, *optional*, defaults to `True`)
                 If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
                 frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please
                 refer to this
                 [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019
             - sync_module_states (`bool`, *optional*, defaults to `True`)
                 If `"True"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to
                 ensure they are the same across all ranks after initialization
+            - activation_checkpointing (`bool`, *optional*, defaults to `False`):
+                If True, activation checkpointing is a technique to reduce memory usage by clearing activations of
+                certain layers and recomputing them during a backward pass. Effectively, this trades extra
+                computation time for reduced memory usage.
             - xla (`bool`, *optional*, defaults to `False`):
                 Whether to use PyTorch/XLA Fully Sharded Data Parallel Training. This is an experimental feature
                 and its API may evolve in the future.
             - xla_fsdp_settings (`dict`, *optional*)
                 The value is a dictionary which stores the XLA FSDP wrapping parameters.
                 For a complete list of options, please see [here](
                 https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py).
             - xla_fsdp_grad_ckpt (`bool`, *optional*, defaults to `False`):
                 Will use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be
                 used when the xla flag is set to true, and an auto wrapping policy is specified through
                 fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.
-            - activation_checkpointing (`bool`, *optional*, defaults to `False`):
-                If True, activation checkpointing is a technique to reduce memory usage by clearing activations of
-                certain layers and recomputing them during a backward pass. Effectively, this trades extra
-                computation time for reduced memory usage.
         deepspeed (`str` or `dict`, *optional*):
             Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may
@@ -835,6 +842,17 @@ class TrainingArguments:
             )
         },
     )
+    save_only_model: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state. "
+                "Note that when this is true, you won't be able to resume training from checkpoint. "
+                "This enables you to save storage by not storing the optimizer, scheduler & rng state. "
+                "You can only load the model using `from_pretrained` with this option set to `True`."
+            )
+        },
+    )
     no_cuda: bool = field(
         default=False,
         metadata={"help": "This argument is deprecated.
It will be removed in version 5.0 of 🤗 Transformers."}, @@ -1670,7 +1688,7 @@ def __post_init__(self): os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() os.environ[f"{prefix}FORWARD_PREFETCH"] = self.fsdp_config.get("forward_prefect", "false") os.environ[f"{prefix}SYNC_MODULE_STATES"] = self.fsdp_config.get("sync_module_states", "true") - os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "false") + os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "true") if self.tpu_metrics_debug: warnings.warn( diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index c4862b197c97e..beb6c4779573e 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -652,7 +652,7 @@ def is_protobuf_available(): return importlib.util.find_spec("google.protobuf") is not None -def is_accelerate_available(min_version: str = None): +def is_accelerate_available(min_version: str = "0.21.0"): if min_version is not None: return _accelerate_available and version.parse(_accelerate_version) >= version.parse(min_version) return _accelerate_available From 4d2a7940660fe3388e6c40c3e981a1d3b175aa42 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 16:27:14 +0530 Subject: [PATCH 02/12] quality --- src/transformers/dependency_versions_table.py | 2 +- src/transformers/trainer.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 93e21ab2d3e56..ecb1db5dba8ab 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -3,7 +3,7 @@ # 2. run `make deps_table_update`` deps = { "Pillow": "Pillow<10.0.0", - "accelerate": "accelerate>=0.20.3", + "accelerate": "accelerate>=0.21.0", "av": "av==9.2.0", "beautifulsoup4": "beautifulsoup4", "codecarbon": "codecarbon==1.2.0", diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4cbc07db04dfd..ee0897b47469d 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -99,7 +99,6 @@ BestRun, EvalLoopOutput, EvalPrediction, - FSDPOption, HPSearchBackend, HubStrategy, IntervalStrategy, From 149330a6abc078827be274db84c8a2d26a76eba1 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:46:39 +0530 Subject: [PATCH 03/12] fix quality? --- src/transformers/training_args.py | 514 +++++++++++++++--------------- 1 file changed, 257 insertions(+), 257 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index c249adfbb15d7..d5914ee36e11d 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -163,498 +163,498 @@ class TrainingArguments: TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. - Using [`HfArgumentParser`] we can turn this class into + Using [*HfArgumentParser*] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - output_dir (`str`): + output_dir (*str*): The output directory where the model predictions and checkpoints will be written. 
- overwrite_output_dir (`bool`, *optional*, defaults to `False`): - If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir` + overwrite_output_dir (*bool*, *optional*, defaults to *False*): + If *True*, overwrite the content of the output directory. Use this to continue training if *output_dir* points to a checkpoint directory. - do_train (`bool`, *optional*, defaults to `False`): - Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used + do_train (*bool*, *optional*, defaults to *False*): + Whether to run training or not. This argument is not directly used by [*Trainer*], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - do_eval (`bool`, *optional*): - Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is - different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your + do_eval (*bool*, *optional*): + Whether to run evaluation on the validation set or not. Will be set to *True* if *evaluation_strategy* is + different from *"no"*. This argument is not directly used by [*Trainer*], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - do_predict (`bool`, *optional*, defaults to `False`): - Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's + do_predict (*bool*, *optional*, defaults to *False*): + Whether to run predictions on the test set or not. This argument is not directly used by [*Trainer*], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + evaluation_strategy (*str* or [*~trainer_utils.IntervalStrategy*], *optional*, defaults to *"no"*): The evaluation strategy to adopt during training. Possible values are: - - `"no"`: No evaluation is done during training. - - `"steps"`: Evaluation is done (and logged) every `eval_steps`. - - `"epoch"`: Evaluation is done at the end of each epoch. + - *"no"*: No evaluation is done during training. + - *"steps"*: Evaluation is done (and logged) every *eval_steps*. + - *"epoch"*: Evaluation is done at the end of each epoch. - prediction_loss_only (`bool`, *optional*, defaults to `False`): + prediction_loss_only (*bool*, *optional*, defaults to *False*): When performing evaluation and generating predictions, only returns the loss. - per_device_train_batch_size (`int`, *optional*, defaults to 8): + per_device_train_batch_size (*int*, *optional*, defaults to 8): The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for training. - per_device_eval_batch_size (`int`, *optional*, defaults to 8): + per_device_eval_batch_size (*int*, *optional*, defaults to 8): The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for evaluation. - gradient_accumulation_steps (`int`, *optional*, defaults to 1): + gradient_accumulation_steps (*int*, *optional*, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass. When using gradient accumulation, one step is counted as one step with backward pass. 
Therefore, logging, - evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples. + evaluation, save will be conducted every *gradient_accumulation_steps * xxx_step* training examples. - eval_accumulation_steps (`int`, *optional*): + eval_accumulation_steps (*int*, *optional*): Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If left unset, the whole predictions are accumulated on GPU/NPU/TPU before being moved to the CPU (faster but requires more memory). - eval_delay (`float`, *optional*): + eval_delay (*float*, *optional*): Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy. - learning_rate (`float`, *optional*, defaults to 5e-5): - The initial learning rate for [`AdamW`] optimizer. - weight_decay (`float`, *optional*, defaults to 0): - The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`] + learning_rate (*float*, *optional*, defaults to 5e-5): + The initial learning rate for [*AdamW*] optimizer. + weight_decay (*float*, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [*AdamW*] optimizer. - adam_beta1 (`float`, *optional*, defaults to 0.9): - The beta1 hyperparameter for the [`AdamW`] optimizer. - adam_beta2 (`float`, *optional*, defaults to 0.999): - The beta2 hyperparameter for the [`AdamW`] optimizer. - adam_epsilon (`float`, *optional*, defaults to 1e-8): - The epsilon hyperparameter for the [`AdamW`] optimizer. - max_grad_norm (`float`, *optional*, defaults to 1.0): + adam_beta1 (*float*, *optional*, defaults to 0.9): + The beta1 hyperparameter for the [*AdamW*] optimizer. + adam_beta2 (*float*, *optional*, defaults to 0.999): + The beta2 hyperparameter for the [*AdamW*] optimizer. + adam_epsilon (*float*, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the [*AdamW*] optimizer. + max_grad_norm (*float*, *optional*, defaults to 1.0): Maximum gradient norm (for gradient clipping). - num_train_epochs(`float`, *optional*, defaults to 3.0): + num_train_epochs(*float*, *optional*, defaults to 3.0): Total number of training epochs to perform (if not an integer, will perform the decimal part percents of the last epoch before stopping training). - max_steps (`int`, *optional*, defaults to -1): - If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + max_steps (*int*, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides *num_train_epochs*. In case of using a finite iterable dataset the training may stop before reaching the set number of steps when all data is exhausted - lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`): - The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values. + lr_scheduler_type (*str* or [*SchedulerType*], *optional*, defaults to *"linear"*): + The scheduler type to use. See the documentation of [*SchedulerType*] for all possible values. lr_scheduler_kwargs ('dict', *optional*, defaults to {}): The extra arguments for the lr_scheduler. See the documentation of each scheduler for possible values. - warmup_ratio (`float`, *optional*, defaults to 0.0): - Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. 
- warmup_steps (`int`, *optional*, defaults to 0): - Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`. - log_level (`str`, *optional*, defaults to `passive`): + warmup_ratio (*float*, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to *learning_rate*. + warmup_steps (*int*, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to *learning_rate*. Overrides any effect of *warmup_ratio*. + log_level (*str*, *optional*, defaults to *passive*): Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the - current log level for the Transformers library (which will be `"warning"` by default). - log_level_replica (`str`, *optional*, defaults to `"warning"`): - Logger log level to use on replicas. Same choices as `log_level`" - log_on_each_node (`bool`, *optional*, defaults to `True`): - In multinode distributed training, whether to log using `log_level` once per node, or only on the main + current log level for the Transformers library (which will be *"warning"* by default). + log_level_replica (*str*, *optional*, defaults to *"warning"*): + Logger log level to use on replicas. Same choices as *log_level*" + log_on_each_node (*bool*, *optional*, defaults to *True*): + In multinode distributed training, whether to log using *log_level* once per node, or only on the main node. - logging_dir (`str`, *optional*): + logging_dir (*str*, *optional*): [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***. - logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + logging_strategy (*str* or [*~trainer_utils.IntervalStrategy*], *optional*, defaults to *"steps"*): The logging strategy to adopt during training. Possible values are: - - `"no"`: No logging is done during training. - - `"epoch"`: Logging is done at the end of each epoch. - - `"steps"`: Logging is done every `logging_steps`. + - *"no"*: No logging is done during training. + - *"epoch"*: Logging is done at the end of each epoch. + - *"steps"*: Logging is done every *logging_steps*. - logging_first_step (`bool`, *optional*, defaults to `False`): - Whether to log and evaluate the first `global_step` or not. - logging_steps (`int` or `float`, *optional*, defaults to 500): - Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in - range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. - logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): - Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` - or `inf` is filtered and the average loss of the current logging window is taken instead. + logging_first_step (*bool*, *optional*, defaults to *False*): + Whether to log and evaluate the first *global_step* or not. + logging_steps (*int* or *float*, *optional*, defaults to 500): + Number of update steps between two logs if *logging_strategy="steps"*. Should be an integer or a float in + range *[0,1)*. If smaller than 1, will be interpreted as ratio of total training steps. + logging_nan_inf_filter (*bool*, *optional*, defaults to *True*): + Whether to filter *nan* and *inf* losses for logging. 
If set to *True* the loss of every step that is *nan* + or *inf* is filtered and the average loss of the current logging window is taken instead. - `logging_nan_inf_filter` only influences the logging of loss values, it does not change the behavior the + *logging_nan_inf_filter* only influences the logging of loss values, it does not change the behavior the gradient is computed or applied to the model. - save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + save_strategy (*str* or [*~trainer_utils.IntervalStrategy*], *optional*, defaults to *"steps"*): The checkpoint save strategy to adopt during training. Possible values are: - - `"no"`: No save is done during training. - - `"epoch"`: Save is done at the end of each epoch. - - `"steps"`: Save is done every `save_steps`. - save_steps (`int` or `float`, *optional*, defaults to 500): - Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a - float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. - save_total_limit (`int`, *optional*): + - *"no"*: No save is done during training. + - *"epoch"*: Save is done at the end of each epoch. + - *"steps"*: Save is done every *save_steps*. + save_steps (*int* or *float*, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if *save_strategy="steps"*. Should be an integer or a + float in range *[0,1)*. If smaller than 1, will be interpreted as ratio of total training steps. + save_total_limit (*int*, *optional*): If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in - `output_dir`. When `load_best_model_at_end` is enabled, the "best" checkpoint according to - `metric_for_best_model` will always be retained in addition to the most recent ones. For example, for - `save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained - alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two + *output_dir*. When *load_best_model_at_end* is enabled, the "best" checkpoint according to + *metric_for_best_model* will always be retained in addition to the most recent ones. For example, for + *save_total_limit=5* and *load_best_model_at_end*, the four last checkpoints will always be retained + alongside the best model. When *save_total_limit=1* and *load_best_model_at_end*, it is possible that two checkpoints are saved: the last one and the best one (if they are different). - save_safetensors (`bool`, *optional*, defaults to `True`): + save_safetensors (*bool*, *optional*, defaults to *True*): Use [safetensors](https://huggingface.co/docs/safetensors) saving and loading for state dicts instead of - default `torch.load` and `torch.save`. - save_on_each_node (`bool`, *optional*, defaults to `False`): + default *torch.load* and *torch.save*. + save_on_each_node (*bool*, *optional*, defaults to *False*): When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one. This should not be activated when the different nodes use the same storage as the files will be saved with the same names for each node. - save_only_model (`bool`, *optional*, defaults to `False`): + save_only_model (*bool*, *optional*, defaults to *False*): When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state. 
Note that when this is true, you won't be able to resume training from checkpoint. This enables you to save storage by not storing the optimizer, scheduler & rng state. - You can only load the model using `from_pretrained` with this option set to `True`. - use_cpu (`bool`, *optional*, defaults to `False`): + You can only load the model using *from_pretrained* with this option set to *True*. + use_cpu (*bool*, *optional*, defaults to *False*): Whether or not to use cpu. If set to False, we will use cuda or mps device if available. - seed (`int`, *optional*, defaults to 42): + seed (*int*, *optional*, defaults to 42): Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the - [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. - data_seed (`int`, *optional*): + [*~Trainer.model_init*] function to instantiate the model if it has some randomly initialized parameters. + data_seed (*int*, *optional*): Random seed to be used with data samplers. If not set, random generators for data sampling will use the - same seed as `seed`. This can be used to ensure reproducibility of data sampling, independent of the model + same seed as *seed*. This can be used to ensure reproducibility of data sampling, independent of the model seed. - jit_mode_eval (`bool`, *optional*, defaults to `False`): + jit_mode_eval (*bool*, *optional*, defaults to *False*): Whether or not to use PyTorch jit trace for inference. - use_ipex (`bool`, *optional*, defaults to `False`): + use_ipex (*bool*, *optional*, defaults to *False*): Use Intel extension for PyTorch when it is available. [IPEX installation](https://github.com/intel/intel-extension-for-pytorch). - bf16 (`bool`, *optional*, defaults to `False`): + bf16 (*bool*, *optional*, defaults to *False*): Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change. - fp16 (`bool`, *optional*, defaults to `False`): + fp16 (*bool*, *optional*, defaults to *False*): Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. - fp16_opt_level (`str`, *optional*, defaults to 'O1'): - For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on + fp16_opt_level (*str*, *optional*, defaults to 'O1'): + For *fp16* training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on the [Apex documentation](https://nvidia.github.io/apex/amp). - fp16_backend (`str`, *optional*, defaults to `"auto"`): - This argument is deprecated. Use `half_precision_backend` instead. - half_precision_backend (`str`, *optional*, defaults to `"auto"`): - The backend to use for mixed precision training. Must be one of `"auto", "apex", "cpu_amp"`. `"auto"` will + fp16_backend (*str*, *optional*, defaults to *"auto"*): + This argument is deprecated. Use *half_precision_backend* instead. + half_precision_backend (*str*, *optional*, defaults to *"auto"*): + The backend to use for mixed precision training. Must be one of *"auto", "apex", "cpu_amp"*. *"auto"* will use CPU/CUDA AMP or APEX depending on the PyTorch version detected, while the other choices will force the requested backend. - bf16_full_eval (`bool`, *optional*, defaults to `False`): + bf16_full_eval (*bool*, *optional*, defaults to *False*): Whether to use full bfloat16 evaluation instead of 32-bit. 
This will be faster and save memory but can harm metric values. This is an experimental API and it may change. - fp16_full_eval (`bool`, *optional*, defaults to `False`): + fp16_full_eval (*bool*, *optional*, defaults to *False*): Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm metric values. - tf32 (`bool`, *optional*): + tf32 (*bool*, *optional*): Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends - on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to + on PyTorch's version default of *torch.backends.cuda.matmul.allow_tf32*. For more details please refer to the [TF32](https://huggingface.co/docs/transformers/performance#tf32) documentation. This is an experimental API and it may change. - local_rank (`int`, *optional*, defaults to -1): + local_rank (*int*, *optional*, defaults to -1): Rank of the process during distributed training. - ddp_backend (`str`, *optional*): - The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`, `"hccl"`. - tpu_num_cores (`int`, *optional*): + ddp_backend (*str*, *optional*): + The backend to use for distributed training. Must be one of *"nccl"*, *"mpi"*, *"ccl"*, *"gloo"*, *"hccl"*. + tpu_num_cores (*int*, *optional*): When training on TPU, the number of TPU cores (automatically passed by launcher script). - dataloader_drop_last (`bool`, *optional*, defaults to `False`): + dataloader_drop_last (*bool*, *optional*, defaults to *False*): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. - eval_steps (`int` or `float`, *optional*): - Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same - value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, + eval_steps (*int* or *float*, *optional*): + Number of update steps between two evaluations if *evaluation_strategy="steps"*. Will default to the same + value as *logging_steps* if not set. Should be an integer or a float in range *[0,1)*. If smaller than 1, will be interpreted as ratio of total training steps. - dataloader_num_workers (`int`, *optional*, defaults to 0): + dataloader_num_workers (*int*, *optional*, defaults to 0): Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process. - past_index (`int`, *optional*, defaults to -1): + past_index (*int*, *optional*, defaults to -1): Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can make use of - the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will + the past hidden states for their predictions. If this argument is set to a positive int, the *Trainer* will use the corresponding output (usually index 2) as the past state and feed it to the model at the next - training step under the keyword argument `mems`. - run_name (`str`, *optional*): + training step under the keyword argument *mems*. + run_name (*str*, *optional*): A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) and [mlflow](https://www.mlflow.org/) logging. 
- disable_tqdm (`bool`, *optional*): + disable_tqdm (*bool*, *optional*): Whether or not to disable the tqdm progress bars and table of metrics produced by - [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is - set to warn or lower (default), `False` otherwise. - remove_unused_columns (`bool`, *optional*, defaults to `True`): + [*~notebook.NotebookTrainingTracker*] in Jupyter Notebooks. Will default to *True* if the logging level is + set to warn or lower (default), *False* otherwise. + remove_unused_columns (*bool*, *optional*, defaults to *True*): Whether or not to automatically remove the columns unused by the model forward method. - (Note that this behavior is not implemented for [`TFTrainer`] yet.) - label_names (`List[str]`, *optional*): + (Note that this behavior is not implemented for [*TFTrainer*] yet.) + label_names (*List[str]*, *optional*): The list of keys in your dictionary of inputs that correspond to the labels. Will eventually default to the list of argument names accepted by the model that contain the word "label", - except if the model used is one of the `XxxForQuestionAnswering` in which case it will also include the - `["start_positions", "end_positions"]` keys. - load_best_model_at_end (`bool`, *optional*, defaults to `False`): + except if the model used is one of the *XxxForQuestionAnswering* in which case it will also include the + *["start_positions", "end_positions"]* keys. + load_best_model_at_end (*bool*, *optional*, defaults to *False*): Whether or not to load the best model found during training at the end of training. When this option is enabled, the best checkpoint will always be saved. See - [`save_total_limit`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit) + [*save_total_limit*](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit) for more. - When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in - the case it is "steps", `save_steps` must be a round multiple of `eval_steps`. + When set to *True*, the parameters *save_strategy* needs to be the same as *evaluation_strategy*, and in + the case it is "steps", *save_steps* must be a round multiple of *eval_steps*. - metric_for_best_model (`str`, *optional*): - Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different - models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. Will - default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss). + metric_for_best_model (*str*, *optional*): + Use in conjunction with *load_best_model_at_end* to specify the metric to use to compare two different + models. Must be the name of a metric returned by the evaluation with or without the prefix *"eval_"*. Will + default to *"loss"* if unspecified and *load_best_model_at_end=True* (to use the evaluation loss). - If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if + If you set this value, *greater_is_better* will default to *True*. Don't forget to set it to *False* if your metric is better when lower. 
- greater_is_better (`bool`, *optional*): - Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better models + greater_is_better (*bool*, *optional*): + Use in conjunction with *load_best_model_at_end* and *metric_for_best_model* to specify if better models should have a greater metric or not. Will default to: - - `True` if `metric_for_best_model` is set to a value that isn't `"loss"` or `"eval_loss"`. - - `False` if `metric_for_best_model` is not set, or set to `"loss"` or `"eval_loss"`. - ignore_data_skip (`bool`, *optional*, defaults to `False`): + - *True* if *metric_for_best_model* is set to a value that isn't *"loss"* or *"eval_loss"*. + - *False* if *metric_for_best_model* is not set, or set to *"loss"* or *"eval_loss"*. + ignore_data_skip (*bool*, *optional*, defaults to *False*): When resuming training, whether or not to skip the epochs and batches to get the data loading at the same - stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step + stage as in the previous training. If set to *True*, the training will begin faster (as that skipping step can take a long time) but will not yield the same results as the interrupted training would have. - fsdp (`bool`, `str` or list of [`~trainer_utils.FSDPOption`], *optional*, defaults to `''`): + fsdp (*bool*, *str* or list of [*~trainer_utils.FSDPOption*], *optional*, defaults to *''*): Use PyTorch Distributed Parallel Training (in distributed training only). A list of options along the following: - - `"full_shard"`: Shard parameters, gradients and optimizer states. - - `"shard_grad_op"`: Shard optimizer states and gradients. - - `"hybrid_shard"`: Apply ``FULL_SHARD`` within a node, and replicate parameters across nodes. - - `"hybrid_shard_zero2"`: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across nodes. - - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and - `"shard_grad_op"`). - - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. - fsdp_config (`str` or `dict`, *optional*): + - *"full_shard"*: Shard parameters, gradients and optimizer states. + - *"shard_grad_op"*: Shard optimizer states and gradients. + - *"hybrid_shard"*: Apply `FULL_SHARD` within a node, and replicate parameters across nodes. + - *"hybrid_shard_zero2"*: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes. + - *"offload"*: Offload parameters and gradients to CPUs (only compatible with *"full_shard"* and + *"shard_grad_op"*). + - *"auto_wrap"*: Automatically recursively wrap layers with FSDP using *default_auto_wrap_policy*. + fsdp_config (*str* or *dict*, *optional*): Config to be used with fsdp (Pytorch Distributed Parallel Training). The value is either a location of - deepspeed json config file (e.g., `ds_config.json`) or an already loaded json file as `dict`. + deepspeed json config file (e.g., *ds_config.json*) or an already loaded json file as *dict*. A List of config and its options: - - min_num_params (`int`, *optional*, defaults to `0`): - FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp` field is + - min_num_params (*int*, *optional*, defaults to *0*): + FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when *fsdp* field is passed). 
- - transformer_layer_cls_to_wrap (`List[str]`, *optional*): - List of transformer layer class names (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, - `T5Block` .... (useful only when `fsdp` flag is passed). - - backward_prefetch (`str`, *optional*) + - transformer_layer_cls_to_wrap (*List[str]*, *optional*): + List of transformer layer class names (case-sensitive) to wrap, e.g, *BertLayer*, *GPTJBlock*, + *T5Block* .... (useful only when *fsdp* flag is passed). + - backward_prefetch (*str*, *optional*) FSDP's backward prefetch mode. Controls when to prefetch next set of parameters (useful only when - `fsdp` field is passed). + *fsdp* field is passed). A list of options along the following: - - `"backward_pre"` : Prefetches the next set of parameters before the current set of parameter's + - *"backward_pre"* : Prefetches the next set of parameters before the current set of parameter's gradient computation. - - `"backward_post"` : This prefetches the next set of parameters after the current set of + - *"backward_post"* : This prefetches the next set of parameters after the current set of parameter’s gradient computation. - - forward_prefetch (`bool`, *optional*, defaults to `False`) - FSDP's forward prefetch mode (useful only when `fsdp` field is passed). - If `"True"`, then FSDP explicitly prefetches the next upcoming all-gather while executing in the + - forward_prefetch (*bool*, *optional*, defaults to *False*) + FSDP's forward prefetch mode (useful only when *fsdp* field is passed). + If *"True"*, then FSDP explicitly prefetches the next upcoming all-gather while executing in the forward pass. - - limit_all_gathers (`bool`, *optional*, defaults to `False`) - FSDP's limit_all_gathers (useful only when `fsdp` field is passed). - If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight + - limit_all_gathers (*bool*, *optional*, defaults to *False*) + FSDP's limit_all_gathers (useful only when *fsdp* field is passed). + If *"True"*, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight all-gathers. - - use_orig_params (`bool`, *optional*, defaults to `True`) - If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed + - use_orig_params (*bool*, *optional*, defaults to *True*) + If *"True"*, allows non-uniform *requires_grad* during init, which means support for interspersed frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019 - - sync_module_states (`bool`, *optional*, defaults to `True`) - If `"True"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to + - sync_module_states (*bool*, *optional*, defaults to *True*) + If *"True"*, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to ensure they are the same across all ranks after initialization - - activation_checkpointing (`bool`, *optional*, defaults to `False`): + - activation_checkpointing (*bool*, *optional*, defaults to *False*): If True, activation checkpointing is a technique to reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass. Effectively, this trades extra computation time for reduced memory usage. 
- - xla (`bool`, *optional*, defaults to `False`): + - xla (*bool*, *optional*, defaults to *False*): Whether to use PyTorch/XLA Fully Sharded Data Parallel Training. This is an experimental feature and its API may evolve in the future. - - xla_fsdp_settings (`dict`, *optional*) + - xla_fsdp_settings (*dict*, *optional*) The value is a dictionary which stores the XLA FSDP wrapping parameters. For a complete list of options, please see [here]( https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py). - - xla_fsdp_grad_ckpt (`bool`, *optional*, defaults to `False`): + - xla_fsdp_grad_ckpt (*bool*, *optional*, defaults to *False*): Will use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap. - deepspeed (`str` or `dict`, *optional*): + deepspeed (*str* or *dict*, *optional*): Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may evolve in the future. The value is either the location of DeepSpeed json config file (e.g., - `ds_config.json`) or an already loaded json file as a `dict`" - label_smoothing_factor (`float`, *optional*, defaults to 0.0): + *ds_config.json*) or an already loaded json file as a *dict*" + label_smoothing_factor (*float*, *optional*, defaults to 0.0): The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded - labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor + - label_smoothing_factor/num_labels` respectively. - debug (`str` or list of [`~debug_utils.DebugOption`], *optional*, defaults to `""`): + labels are changed from 0s and 1s to *label_smoothing_factor/num_labels* and *1 - label_smoothing_factor + + label_smoothing_factor/num_labels* respectively. + debug (*str* or list of [*~debug_utils.DebugOption*], *optional*, defaults to *""*): Enable one or more debug features. This is an experimental feature. Possible options are: - - `"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that led to + - *"underflow_overflow"*: detects overflow in model's input/outputs and reports the last frames that led to the event - - `"tpu_metrics_debug"`: print debug metrics on TPU + - *"tpu_metrics_debug"*: print debug metrics on TPU The options should be separated by whitespaces. - optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`): + optim (*str* or [*training_args.OptimizerNames*], *optional*, defaults to *"adamw_torch"*): The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor. - optim_args (`str`, *optional*): + optim_args (*str*, *optional*): Optional arguments that are supplied to AnyPrecisionAdamW. - group_by_length (`bool`, *optional*, defaults to `False`): + group_by_length (*bool*, *optional*, defaults to *False*): Whether or not to group together samples of roughly the same length in the training dataset (to minimize padding applied and be more efficient). Only useful if applying dynamic padding. - length_column_name (`str`, *optional*, defaults to `"length"`): + length_column_name (*str*, *optional*, defaults to *"length"*): Column name for precomputed lengths. 
If the column exists, grouping by length will use these values rather - than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an - instance of `Dataset`. - report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): - The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, - `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`, - `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` for no + than computing them on train startup. Ignored unless *group_by_length* is *True* and the dataset is an + instance of *Dataset*. + report_to (*str* or *List[str]*, *optional*, defaults to *"all"*): + The list of integrations to report the results and logs to. Supported platforms are *"azure_ml"*, + *"clearml"*, *"codecarbon"*, *"comet_ml"*, *"dagshub"*, *"dvclive"*, *"flyte"*, *"mlflow"*, *"neptune"*, + *"tensorboard"*, and *"wandb"*. Use *"all"* to report to all integrations installed, *"none"* for no integrations. - ddp_find_unused_parameters (`bool`, *optional*): - When using distributed training, the value of the flag `find_unused_parameters` passed to - `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. - ddp_bucket_cap_mb (`int`, *optional*): - When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. - ddp_broadcast_buffers (`bool`, *optional*): - When using distributed training, the value of the flag `broadcast_buffers` passed to - `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. - dataloader_pin_memory (`bool`, *optional*, defaults to `True`): - Whether you want to pin memory in data loaders or not. Will default to `True`. - skip_memory_metrics (`bool`, *optional*, defaults to `True`): + ddp_find_unused_parameters (*bool*, *optional*): + When using distributed training, the value of the flag *find_unused_parameters* passed to + *DistributedDataParallel*. Will default to *False* if gradient checkpointing is used, *True* otherwise. + ddp_bucket_cap_mb (*int*, *optional*): + When using distributed training, the value of the flag *bucket_cap_mb* passed to *DistributedDataParallel*. + ddp_broadcast_buffers (*bool*, *optional*): + When using distributed training, the value of the flag *broadcast_buffers* passed to + *DistributedDataParallel*. Will default to *False* if gradient checkpointing is used, *True* otherwise. + dataloader_pin_memory (*bool*, *optional*, defaults to *True*): + Whether you want to pin memory in data loaders or not. Will default to *True*. + skip_memory_metrics (*bool*, *optional*, defaults to *True*): Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows down the training and evaluation speed. - push_to_hub (`bool`, *optional*, defaults to `False`): + push_to_hub (*bool*, *optional*, defaults to *False*): Whether or not to push the model to the Hub every time the model is saved. If this is activated, - `output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content - will be pushed each time a save is triggered (depending on your `save_strategy`). Calling - [`~Trainer.save_model`] will also trigger a push. 
+ *output_dir* will begin a git directory synced with the repo (determined by *hub_model_id*) and the content + will be pushed each time a save is triggered (depending on your *save_strategy*). Calling + [*~Trainer.save_model*] will also trigger a push. - If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be + If *output_dir* exists, it needs to be a local clone of the repository to which the [*Trainer*] will be pushed. - resume_from_checkpoint (`str`, *optional*): + resume_from_checkpoint (*str*, *optional*): The path to a folder with a valid checkpoint for your model. This argument is not directly used by - [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example + [*Trainer*], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - hub_model_id (`str`, *optional*): + hub_model_id (*str*, *optional*): The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in which case the model will be pushed in your namespace. Otherwise it should be the whole repository name, - for instance `"user_name/model"`, which allows you to push to an organization you are a member of with - `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the - name of `output_dir`. + for instance *"user_name/model"*, which allows you to push to an organization you are a member of with + *"organization_name/model"*. Will default to *user_name/output_dir_name* with *output_dir_name* being the + name of *output_dir*. - Will default to the name of `output_dir`. - hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): + Will default to the name of *output_dir*. + hub_strategy (*str* or [*~trainer_utils.HubStrategy*], *optional*, defaults to *"every_save"*): Defines the scope of what is pushed to the Hub and when. Possible values are: - - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a - draft of a model card when the [`~Trainer.save_model`] method is called. - - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and + - *"end"*: push the model, its configuration, the tokenizer (if passed along to the [*Trainer*]) and a + draft of a model card when the [*~Trainer.save_model*] method is called. + - *"every_save"*: push the model, its configuration, the tokenizer (if passed along to the [*Trainer*]) and a draft of a model card each time there is a model save. The pushes are asynchronous to not block training, and in case the save are very frequent, a new push is only attempted if the previous one is finished. A last push is made with the final model at the end of training. - - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder named + - *"checkpoint"*: like *"every_save"* but the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to resume training easily with - `trainer.train(resume_from_checkpoint="last-checkpoint")`. - - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the output + *trainer.train(resume_from_checkpoint="last-checkpoint")*. 
+ - *"all_checkpoints"*: like *"checkpoint"* but all checkpoints are pushed like they appear in the output folder (so you will get one checkpoint folder per folder in your final repository) - hub_token (`str`, *optional*): + hub_token (*str*, *optional*): The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with - `huggingface-cli login`. - hub_private_repo (`bool`, *optional*, defaults to `False`): + *huggingface-cli login*. + hub_private_repo (*bool*, *optional*, defaults to *False*): If True, the Hub repo will be set to private. - hub_always_push (`bool`, *optional*, defaults to `False`): - Unless this is `True`, the `Trainer` will skip pushing a checkpoint when the previous push is not finished. - gradient_checkpointing (`bool`, *optional*, defaults to `False`): + hub_always_push (*bool*, *optional*, defaults to *False*): + Unless this is *True*, the *Trainer* will skip pushing a checkpoint when the previous push is not finished. + gradient_checkpointing (*bool*, *optional*, defaults to *False*): If True, use gradient checkpointing to save memory at the expense of slower backward pass. - gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): - Key word arguments to be passed to the `gradient_checkpointing_enable` method. - include_inputs_for_metrics (`bool`, *optional*, defaults to `False`): - Whether or not the inputs will be passed to the `compute_metrics` function. This is intended for metrics + gradient_checkpointing_kwargs (*dict*, *optional*, defaults to *None*): + Key word arguments to be passed to the *gradient_checkpointing_enable* method. + include_inputs_for_metrics (*bool*, *optional*, defaults to *False*): + Whether or not the inputs will be passed to the *compute_metrics* function. This is intended for metrics that need inputs, predictions and references for scoring calculation in Metric class. - auto_find_batch_size (`bool`, *optional*, defaults to `False`) + auto_find_batch_size (*bool*, *optional*, defaults to *False*) Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding - CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`) - full_determinism (`bool`, *optional*, defaults to `False`) - If `True`, [`enable_full_determinism`] is called instead of [`set_seed`] to ensure reproducible results in + CUDA Out-of-Memory errors. Requires accelerate to be installed (*pip install accelerate*) + full_determinism (*bool*, *optional*, defaults to *False*) + If *True*, [*enable_full_determinism*] is called instead of [*set_seed*] to ensure reproducible results in distributed training. Important: this will negatively impact the performance, so only use it for debugging. - torchdynamo (`str`, *optional*): - If set, the backend compiler for TorchDynamo. Possible choices are `"eager"`, `"aot_eager"`, `"inductor"`, - `"nvfuser"`, `"aot_nvfuser"`, `"aot_cudagraphs"`, `"ofi"`, `"fx2trt"`, `"onnxrt"` and `"ipex"`. - ray_scope (`str`, *optional*, defaults to `"last"`): - The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray will + torchdynamo (*str*, *optional*): + If set, the backend compiler for TorchDynamo. Possible choices are *"eager"*, *"aot_eager"*, *"inductor"*, + *"nvfuser"*, *"aot_nvfuser"*, *"aot_cudagraphs"*, *"ofi"*, *"fx2trt"*, *"onnxrt"* and *"ipex"*. + ray_scope (*str*, *optional*, defaults to *"last"*): + The scope to use when doing hyperparameter search with Ray. 
By default, *"last"* will be used. Ray will then use the last checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the [Ray documentation]( - https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for + https://docs.ray.io/en/latest/tune/api_docs/analysis#ray.tune.ExperimentAnalysis.get_best_trial) for more options. - ddp_timeout (`int`, *optional*, defaults to 1800): - The timeout for `torch.distributed.init_process_group` calls, used to avoid GPU socket timeouts when + ddp_timeout (*int*, *optional*, defaults to 1800): + The timeout for *torch.distributed.init_process_group* calls, used to avoid GPU socket timeouts when performing slow operations in distributed runs. Please refer to the [PyTorch documentation] - (https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more + (https://pytorch.org/docs/stable/distributed#torch.distributed.init_process_group) for more information. - use_mps_device (`bool`, *optional*, defaults to `False`): - This argument is deprecated. `mps` device will be used if it is available similar to `cuda` device. - torch_compile (`bool`, *optional*, defaults to `False`): + use_mps_device (*bool*, *optional*, defaults to *False*): + This argument is deprecated. *mps* device will be used if it is available similar to *cuda* device. + torch_compile (*bool*, *optional*, defaults to *False*): Whether or not to compile the model using PyTorch 2.0 - [`torch.compile`](https://pytorch.org/get-started/pytorch-2.0/). + [*torch.compile*](https://pytorch.org/get-started/pytorch-2.0/). - This will use the best defaults for the [`torch.compile` - API](https://pytorch.org/docs/stable/generated/torch.compile.html?highlight=torch+compile#torch.compile). - You can customize the defaults with the arguments `torch_compile_backend` and `torch_compile_mode` but we + This will use the best defaults for the [*torch.compile* + API](https://pytorch.org/docs/stable/generated/torch.compile?highlight=torch+compile#torch.compile). + You can customize the defaults with the arguments *torch_compile_backend* and *torch_compile_mode* but we don't guarantee any of them will work as the support is progressively rolled out in PyTorch. This flag and the whole compile API are experimental and subject to change in future releases. - torch_compile_backend (`str`, *optional*): - The backend to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`. + torch_compile_backend (*str*, *optional*): + The backend to use in *torch.compile*. If set to any value, *torch_compile* will be set to *True*. Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions. This flag is experimental and subject to change in future releases. - torch_compile_mode (`str`, *optional*): - The mode to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`. + torch_compile_mode (*str*, *optional*): + The mode to use in *torch.compile*. If set to any value, *torch_compile* will be set to *True*. Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions. This flag is experimental and subject to change in future releases. - split_batches (`bool`, *optional*): + split_batches (*bool*, *optional*): Whether or not the accelerator should split the batches yielded by the dataloaders across the devices during distributed training.
If - set to `True`, the actual batch size used will be the same on any kind of distributed processes, but it + set to *True*, the actual batch size used will be the same on any kind of distributed processes, but it must be a round multiple of the number of processes you are using (such as GPUs). - include_tokens_per_second (`bool`, *optional*): + include_tokens_per_second (*bool*, *optional*): Whether or not to compute the number of tokens per second per device for training speed metrics. This will iterate over the entire training dataloader once beforehand, and will slow down the entire process. - include_num_input_tokens_seen (`bool`, *optional*): + include_num_input_tokens_seen (*bool*, *optional*): Whether or not to track the number of input tokens seen throughout training. May be slower in distributed training as gather operations must be called. - neftune_noise_alpha (`Optional[float]`): - If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance + neftune_noise_alpha (*Optional[float]*): + If not *None*, this will activate NEFTune noise embeddings. This can drastically improve model performance for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the - [original code](https://github.com/neelsjain/NEFTune). Support transformers `PreTrainedModel` and also - `PeftModel` from peft. + [original code](https://github.com/neelsjain/NEFTune). Support transformers *PreTrainedModel* and also + *PeftModel* from peft. """ framework = "pt" From 3dc64c9657c78d10234a5f6d12f101dba2cd25b5 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 18:01:30 +0530 Subject: [PATCH 04/12] Revert "fix quality?" This reverts commit 149330a6abc078827be274db84c8a2d26a76eba1. --- src/transformers/training_args.py | 514 +++++++++++++++--------------- 1 file changed, 257 insertions(+), 257 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index d5914ee36e11d..c249adfbb15d7 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -163,498 +163,498 @@ class TrainingArguments: TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**. - Using [*HfArgumentParser*] we can turn this class into + Using [`HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line. Parameters: - output_dir (*str*): + output_dir (`str`): The output directory where the model predictions and checkpoints will be written. - overwrite_output_dir (*bool*, *optional*, defaults to *False*): - If *True*, overwrite the content of the output directory. Use this to continue training if *output_dir* + overwrite_output_dir (`bool`, *optional*, defaults to `False`): + If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir` points to a checkpoint directory. - do_train (*bool*, *optional*, defaults to *False*): - Whether to run training or not. This argument is not directly used by [*Trainer*], it's intended to be used + do_train (`bool`, *optional*, defaults to `False`): + Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. 
See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - do_eval (*bool*, *optional*): - Whether to run evaluation on the validation set or not. Will be set to *True* if *evaluation_strategy* is - different from *"no"*. This argument is not directly used by [*Trainer*], it's intended to be used by your + do_eval (`bool`, *optional*): + Whether to run evaluation on the validation set or not. Will be set to `True` if `evaluation_strategy` is + different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - do_predict (*bool*, *optional*, defaults to *False*): - Whether to run predictions on the test set or not. This argument is not directly used by [*Trainer*], it's + do_predict (`bool`, *optional*, defaults to `False`): + Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. - evaluation_strategy (*str* or [*~trainer_utils.IntervalStrategy*], *optional*, defaults to *"no"*): + evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): The evaluation strategy to adopt during training. Possible values are: - - *"no"*: No evaluation is done during training. - - *"steps"*: Evaluation is done (and logged) every *eval_steps*. - - *"epoch"*: Evaluation is done at the end of each epoch. + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `eval_steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. - prediction_loss_only (*bool*, *optional*, defaults to *False*): + prediction_loss_only (`bool`, *optional*, defaults to `False`): When performing evaluation and generating predictions, only returns the loss. - per_device_train_batch_size (*int*, *optional*, defaults to 8): + per_device_train_batch_size (`int`, *optional*, defaults to 8): The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for training. - per_device_eval_batch_size (*int*, *optional*, defaults to 8): + per_device_eval_batch_size (`int`, *optional*, defaults to 8): The batch size per GPU/XPU/TPU/MPS/NPU core/CPU for evaluation. - gradient_accumulation_steps (*int*, *optional*, defaults to 1): + gradient_accumulation_steps (`int`, *optional*, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass. When using gradient accumulation, one step is counted as one step with backward pass. Therefore, logging, - evaluation, save will be conducted every *gradient_accumulation_steps * xxx_step* training examples. + evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples. - eval_accumulation_steps (*int*, *optional*): + eval_accumulation_steps (`int`, *optional*): Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If left unset, the whole predictions are accumulated on GPU/NPU/TPU before being moved to the CPU (faster but requires more memory). - eval_delay (*float*, *optional*): + eval_delay (`float`, *optional*): Number of epochs or steps to wait for before the first evaluation can be performed, depending on the evaluation_strategy. 
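For illustration, a minimal sketch of how the batch-size and evaluation arguments documented above combine; the output directory and step counts are placeholder values, not part of this patch.

```python
from transformers import TrainingArguments

# Placeholder values for illustration only.
args = TrainingArguments(
    output_dir="out",
    per_device_train_batch_size=8,   # micro-batch per GPU/XPU/TPU/MPS/NPU core
    gradient_accumulation_steps=4,   # effective batch = 8 * 4 * number of devices
    evaluation_strategy="steps",     # evaluate (and log) every `eval_steps`
    eval_steps=500,
    eval_delay=1000,                 # wait 1000 steps before the first evaluation
    eval_accumulation_steps=8,       # move prediction tensors to CPU every 8 steps
)
```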
- learning_rate (*float*, *optional*, defaults to 5e-5): - The initial learning rate for [*AdamW*] optimizer. - weight_decay (*float*, *optional*, defaults to 0): - The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [*AdamW*] + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate for [`AdamW`] optimizer. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`] optimizer. - adam_beta1 (*float*, *optional*, defaults to 0.9): - The beta1 hyperparameter for the [*AdamW*] optimizer. - adam_beta2 (*float*, *optional*, defaults to 0.999): - The beta2 hyperparameter for the [*AdamW*] optimizer. - adam_epsilon (*float*, *optional*, defaults to 1e-8): - The epsilon hyperparameter for the [*AdamW*] optimizer. - max_grad_norm (*float*, *optional*, defaults to 1.0): + adam_beta1 (`float`, *optional*, defaults to 0.9): + The beta1 hyperparameter for the [`AdamW`] optimizer. + adam_beta2 (`float`, *optional*, defaults to 0.999): + The beta2 hyperparameter for the [`AdamW`] optimizer. + adam_epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the [`AdamW`] optimizer. + max_grad_norm (`float`, *optional*, defaults to 1.0): Maximum gradient norm (for gradient clipping). - num_train_epochs(*float*, *optional*, defaults to 3.0): + num_train_epochs(`float`, *optional*, defaults to 3.0): Total number of training epochs to perform (if not an integer, will perform the decimal part percents of the last epoch before stopping training). - max_steps (*int*, *optional*, defaults to -1): - If set to a positive number, the total number of training steps to perform. Overrides *num_train_epochs*. + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching the set number of steps when all data is exhausted - lr_scheduler_type (*str* or [*SchedulerType*], *optional*, defaults to *"linear"*): - The scheduler type to use. See the documentation of [*SchedulerType*] for all possible values. + lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`): + The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values. lr_scheduler_kwargs ('dict', *optional*, defaults to {}): The extra arguments for the lr_scheduler. See the documentation of each scheduler for possible values. - warmup_ratio (*float*, *optional*, defaults to 0.0): - Ratio of total training steps used for a linear warmup from 0 to *learning_rate*. - warmup_steps (*int*, *optional*, defaults to 0): - Number of steps used for a linear warmup from 0 to *learning_rate*. Overrides any effect of *warmup_ratio*. - log_level (*str*, *optional*, defaults to *passive*): + warmup_ratio (`float`, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + warmup_steps (`int`, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`. + log_level (`str`, *optional*, defaults to `passive`): Logger log level to use on the main process. 
Possible choices are the log levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the - current log level for the Transformers library (which will be *"warning"* by default). - log_level_replica (*str*, *optional*, defaults to *"warning"*): - Logger log level to use on replicas. Same choices as *log_level*" - log_on_each_node (*bool*, *optional*, defaults to *True*): - In multinode distributed training, whether to log using *log_level* once per node, or only on the main + current log level for the Transformers library (which will be `"warning"` by default). + log_level_replica (`str`, *optional*, defaults to `"warning"`): + Logger log level to use on replicas. Same choices as `log_level`" + log_on_each_node (`bool`, *optional*, defaults to `True`): + In multinode distributed training, whether to log using `log_level` once per node, or only on the main node. - logging_dir (*str*, *optional*): + logging_dir (`str`, *optional*): [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***. - logging_strategy (*str* or [*~trainer_utils.IntervalStrategy*], *optional*, defaults to *"steps"*): + logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): The logging strategy to adopt during training. Possible values are: - - *"no"*: No logging is done during training. - - *"epoch"*: Logging is done at the end of each epoch. - - *"steps"*: Logging is done every *logging_steps*. + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. - logging_first_step (*bool*, *optional*, defaults to *False*): - Whether to log and evaluate the first *global_step* or not. - logging_steps (*int* or *float*, *optional*, defaults to 500): - Number of update steps between two logs if *logging_strategy="steps"*. Should be an integer or a float in - range *[0,1)*. If smaller than 1, will be interpreted as ratio of total training steps. - logging_nan_inf_filter (*bool*, *optional*, defaults to *True*): - Whether to filter *nan* and *inf* losses for logging. If set to *True* the loss of every step that is *nan* - or *inf* is filtered and the average loss of the current logging window is taken instead. + logging_first_step (`bool`, *optional*, defaults to `False`): + Whether to log and evaluate the first `global_step` or not. + logging_steps (`int` or `float`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in + range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. + logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): + Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` + or `inf` is filtered and the average loss of the current logging window is taken instead. - *logging_nan_inf_filter* only influences the logging of loss values, it does not change the behavior the + `logging_nan_inf_filter` only influences the logging of loss values, it does not change the behavior the gradient is computed or applied to the model. 
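As a hedged sketch of the logging arguments described above (all values are placeholders):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    logging_strategy="steps",
    logging_steps=0.01,           # a float in [0,1) is read as a ratio of total training steps
    logging_first_step=True,
    logging_nan_inf_filter=True,  # log the window average instead of nan/inf losses
    log_level="info",
    log_level_replica="warning",
)
```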
- save_strategy (*str* or [*~trainer_utils.IntervalStrategy*], *optional*, defaults to *"steps"*): + save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): The checkpoint save strategy to adopt during training. Possible values are: - - *"no"*: No save is done during training. - - *"epoch"*: Save is done at the end of each epoch. - - *"steps"*: Save is done every *save_steps*. - save_steps (*int* or *float*, *optional*, defaults to 500): - Number of updates steps before two checkpoint saves if *save_strategy="steps"*. Should be an integer or a - float in range *[0,1)*. If smaller than 1, will be interpreted as ratio of total training steps. - save_total_limit (*int*, *optional*): + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + save_steps (`int` or `float`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. + save_total_limit (`int`, *optional*): If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in - *output_dir*. When *load_best_model_at_end* is enabled, the "best" checkpoint according to - *metric_for_best_model* will always be retained in addition to the most recent ones. For example, for - *save_total_limit=5* and *load_best_model_at_end*, the four last checkpoints will always be retained - alongside the best model. When *save_total_limit=1* and *load_best_model_at_end*, it is possible that two + `output_dir`. When `load_best_model_at_end` is enabled, the "best" checkpoint according to + `metric_for_best_model` will always be retained in addition to the most recent ones. For example, for + `save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained + alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two checkpoints are saved: the last one and the best one (if they are different). - save_safetensors (*bool*, *optional*, defaults to *True*): + save_safetensors (`bool`, *optional*, defaults to `True`): Use [safetensors](https://huggingface.co/docs/safetensors) saving and loading for state dicts instead of - default *torch.load* and *torch.save*. - save_on_each_node (*bool*, *optional*, defaults to *False*): + default `torch.load` and `torch.save`. + save_on_each_node (`bool`, *optional*, defaults to `False`): When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one. This should not be activated when the different nodes use the same storage as the files will be saved with the same names for each node. - save_only_model (*bool*, *optional*, defaults to *False*): + save_only_model (`bool`, *optional*, defaults to `False`): When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state. Note that when this is true, you won't be able to resume training from checkpoint. This enables you to save storage by not storing the optimizer, scheduler & rng state. - You can only load the model using *from_pretrained* with this option set to *True*. - use_cpu (*bool*, *optional*, defaults to *False*): + You can only load the model using `from_pretrained` with this option set to `True`. 
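The `save_only_model` flag introduced by this patch series pairs with the save strategy above; a minimal sketch with placeholder values:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,    # keep only the two most recent checkpoints
    save_only_model=True,  # skip optimizer/scheduler/rng state; resuming is not possible
)

# A checkpoint written this way can only be reloaded via `from_pretrained`,
# e.g. (model class and path are placeholders):
# model = AutoModel.from_pretrained("out/checkpoint-500")
```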
+ use_cpu (`bool`, *optional*, defaults to `False`): Whether or not to use cpu. If set to False, we will use cuda or mps device if available. - seed (*int*, *optional*, defaults to 42): + seed (`int`, *optional*, defaults to 42): Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the - [*~Trainer.model_init*] function to instantiate the model if it has some randomly initialized parameters. - data_seed (*int*, *optional*): + [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. + data_seed (`int`, *optional*): Random seed to be used with data samplers. If not set, random generators for data sampling will use the - same seed as *seed*. This can be used to ensure reproducibility of data sampling, independent of the model + same seed as `seed`. This can be used to ensure reproducibility of data sampling, independent of the model seed. - jit_mode_eval (*bool*, *optional*, defaults to *False*): + jit_mode_eval (`bool`, *optional*, defaults to `False`): Whether or not to use PyTorch jit trace for inference. - use_ipex (*bool*, *optional*, defaults to *False*): + use_ipex (`bool`, *optional*, defaults to `False`): Use Intel extension for PyTorch when it is available. [IPEX installation](https://github.com/intel/intel-extension-for-pytorch). - bf16 (*bool*, *optional*, defaults to *False*): + bf16 (`bool`, *optional*, defaults to `False`): Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change. - fp16 (*bool*, *optional*, defaults to *False*): + fp16 (`bool`, *optional*, defaults to `False`): Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. - fp16_opt_level (*str*, *optional*, defaults to 'O1'): - For *fp16* training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on + fp16_opt_level (`str`, *optional*, defaults to 'O1'): + For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on the [Apex documentation](https://nvidia.github.io/apex/amp). - fp16_backend (*str*, *optional*, defaults to *"auto"*): - This argument is deprecated. Use *half_precision_backend* instead. - half_precision_backend (*str*, *optional*, defaults to *"auto"*): - The backend to use for mixed precision training. Must be one of *"auto", "apex", "cpu_amp"*. *"auto"* will + fp16_backend (`str`, *optional*, defaults to `"auto"`): + This argument is deprecated. Use `half_precision_backend` instead. + half_precision_backend (`str`, *optional*, defaults to `"auto"`): + The backend to use for mixed precision training. Must be one of `"auto", "apex", "cpu_amp"`. `"auto"` will use CPU/CUDA AMP or APEX depending on the PyTorch version detected, while the other choices will force the requested backend. - bf16_full_eval (*bool*, *optional*, defaults to *False*): + bf16_full_eval (`bool`, *optional*, defaults to `False`): Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm metric values. This is an experimental API and it may change. - fp16_full_eval (*bool*, *optional*, defaults to *False*): + fp16_full_eval (`bool`, *optional*, defaults to `False`): Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm metric values. 
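A hedged sketch of the seeding and mixed-precision arguments above (placeholder values; `bf16` assumes supported hardware):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    seed=42,
    data_seed=1234,       # sampler seed decoupled from the model-init seed
    bf16=True,            # needs Ampere or newer GPU, Ascend NPU, or CPU (use_cpu)
    bf16_full_eval=True,  # run evaluation in bfloat16 as well
)
```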
- tf32 (*bool*, *optional*): + tf32 (`bool`, *optional*): Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends - on PyTorch's version default of *torch.backends.cuda.matmul.allow_tf32*. For more details please refer to + on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to the [TF32](https://huggingface.co/docs/transformers/performance#tf32) documentation. This is an experimental API and it may change. - local_rank (*int*, *optional*, defaults to -1): + local_rank (`int`, *optional*, defaults to -1): Rank of the process during distributed training. - ddp_backend (*str*, *optional*): - The backend to use for distributed training. Must be one of *"nccl"*, *"mpi"*, *"ccl"*, *"gloo"*, *"hccl"*. - tpu_num_cores (*int*, *optional*): + ddp_backend (`str`, *optional*): + The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`, `"hccl"`. + tpu_num_cores (`int`, *optional*): When training on TPU, the number of TPU cores (automatically passed by launcher script). - dataloader_drop_last (*bool*, *optional*, defaults to *False*): + dataloader_drop_last (`bool`, *optional*, defaults to `False`): Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) or not. - eval_steps (*int* or *float*, *optional*): - Number of update steps between two evaluations if *evaluation_strategy="steps"*. Will default to the same - value as *logging_steps* if not set. Should be an integer or a float in range *[0,1)*. If smaller than 1, + eval_steps (`int` or `float`, *optional*): + Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same + value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. - dataloader_num_workers (*int*, *optional*, defaults to 0): + dataloader_num_workers (`int`, *optional*, defaults to 0): Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process. - past_index (*int*, *optional*, defaults to -1): + past_index (`int`, *optional*, defaults to -1): Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can make use of - the past hidden states for their predictions. If this argument is set to a positive int, the *Trainer* will + the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model at the next - training step under the keyword argument *mems*. - run_name (*str*, *optional*): + training step under the keyword argument `mems`. + run_name (`str`, *optional*): A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) and [mlflow](https://www.mlflow.org/) logging. - disable_tqdm (*bool*, *optional*): + disable_tqdm (`bool`, *optional*): Whether or not to disable the tqdm progress bars and table of metrics produced by - [*~notebook.NotebookTrainingTracker*] in Jupyter Notebooks. Will default to *True* if the logging level is - set to warn or lower (default), *False* otherwise. - remove_unused_columns (*bool*, *optional*, defaults to *True*): + [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is + set to warn or lower (default), `False` otherwise. 
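For illustration only (placeholder values; `tf32` assumes a TF32-capable GPU), the TF32 and dataloader arguments above can be combined as:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    tf32=True,                  # toggles torch.backends.cuda.matmul.allow_tf32
    dataloader_num_workers=4,   # subprocesses used for data loading
    dataloader_drop_last=True,  # drop the last incomplete batch
    run_name="my-experiment",   # picked up by wandb/mlflow when those integrations are active
)
```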
+ remove_unused_columns (`bool`, *optional*, defaults to `True`): Whether or not to automatically remove the columns unused by the model forward method. - (Note that this behavior is not implemented for [*TFTrainer*] yet.) - label_names (*List[str]*, *optional*): + (Note that this behavior is not implemented for [`TFTrainer`] yet.) + label_names (`List[str]`, *optional*): The list of keys in your dictionary of inputs that correspond to the labels. Will eventually default to the list of argument names accepted by the model that contain the word "label", - except if the model used is one of the *XxxForQuestionAnswering* in which case it will also include the - *["start_positions", "end_positions"]* keys. - load_best_model_at_end (*bool*, *optional*, defaults to *False*): + except if the model used is one of the `XxxForQuestionAnswering` in which case it will also include the + `["start_positions", "end_positions"]` keys. + load_best_model_at_end (`bool`, *optional*, defaults to `False`): Whether or not to load the best model found during training at the end of training. When this option is enabled, the best checkpoint will always be saved. See - [*save_total_limit*](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit) + [`save_total_limit`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit) for more. - When set to *True*, the parameters *save_strategy* needs to be the same as *evaluation_strategy*, and in - the case it is "steps", *save_steps* must be a round multiple of *eval_steps*. + When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in + the case it is "steps", `save_steps` must be a round multiple of `eval_steps`. - metric_for_best_model (*str*, *optional*): - Use in conjunction with *load_best_model_at_end* to specify the metric to use to compare two different - models. Must be the name of a metric returned by the evaluation with or without the prefix *"eval_"*. Will - default to *"loss"* if unspecified and *load_best_model_at_end=True* (to use the evaluation loss). + metric_for_best_model (`str`, *optional*): + Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different + models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. Will + default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation loss). - If you set this value, *greater_is_better* will default to *True*. Don't forget to set it to *False* if + If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to `False` if your metric is better when lower. - greater_is_better (*bool*, *optional*): - Use in conjunction with *load_best_model_at_end* and *metric_for_best_model* to specify if better models + greater_is_better (`bool`, *optional*): + Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better models should have a greater metric or not. Will default to: - - *True* if *metric_for_best_model* is set to a value that isn't *"loss"* or *"eval_loss"*. - - *False* if *metric_for_best_model* is not set, or set to *"loss"* or *"eval_loss"*. - ignore_data_skip (*bool*, *optional*, defaults to *False*): + - `True` if `metric_for_best_model` is set to a value that isn't `"loss"` or `"eval_loss"`. 
+ - `False` if `metric_for_best_model` is not set, or set to `"loss"` or `"eval_loss"`. + ignore_data_skip (`bool`, *optional*, defaults to `False`): When resuming training, whether or not to skip the epochs and batches to get the data loading at the same - stage as in the previous training. If set to *True*, the training will begin faster (as that skipping step + stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step can take a long time) but will not yield the same results as the interrupted training would have. - fsdp (*bool*, *str* or list of [*~trainer_utils.FSDPOption*], *optional*, defaults to *''*): + fsdp (`bool`, `str` or list of [`~trainer_utils.FSDPOption`], *optional*, defaults to `''`): Use PyTorch Distributed Parallel Training (in distributed training only). A list of options along the following: - - *"full_shard"*: Shard parameters, gradients and optimizer states. - - *"shard_grad_op"*: Shard optimizer states and gradients. - - *"hybrid_shard"*: Apply `FULL_SHARD` within a node, and replicate parameters across nodes. - - *"hybrid_shard_zero2"*: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes. - - *"offload"*: Offload parameters and gradients to CPUs (only compatible with *"full_shard"* and - *"shard_grad_op"*). - - *"auto_wrap"*: Automatically recursively wrap layers with FSDP using *default_auto_wrap_policy*. - fsdp_config (*str* or *dict*, *optional*): + - `"full_shard"`: Shard parameters, gradients and optimizer states. + - `"shard_grad_op"`: Shard optimizer states and gradients. + - `"hybrid_shard"`: Apply ``FULL_SHARD`` within a node, and replicate parameters across nodes. + - `"hybrid_shard_zero2"`: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across nodes. + - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and + `"shard_grad_op"`). + - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. + fsdp_config (`str` or `dict`, *optional*): Config to be used with fsdp (Pytorch Distributed Parallel Training). The value is either a location of - deepspeed json config file (e.g., *ds_config.json*) or an already loaded json file as *dict*. + deepspeed json config file (e.g., `ds_config.json`) or an already loaded json file as `dict`. A List of config and its options: - - min_num_params (*int*, *optional*, defaults to *0*): - FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when *fsdp* field is + - min_num_params (`int`, *optional*, defaults to `0`): + FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp` field is passed). - - transformer_layer_cls_to_wrap (*List[str]*, *optional*): - List of transformer layer class names (case-sensitive) to wrap, e.g, *BertLayer*, *GPTJBlock*, - *T5Block* .... (useful only when *fsdp* flag is passed). - - backward_prefetch (*str*, *optional*) + - transformer_layer_cls_to_wrap (`List[str]`, *optional*): + List of transformer layer class names (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, + `T5Block` .... (useful only when `fsdp` flag is passed). + - backward_prefetch (`str`, *optional*) FSDP's backward prefetch mode. Controls when to prefetch next set of parameters (useful only when - *fsdp* field is passed). + `fsdp` field is passed). 
A list of options along the following: - - *"backward_pre"* : Prefetches the next set of parameters before the current set of parameter's + - `"backward_pre"` : Prefetches the next set of parameters before the current set of parameter's gradient computation. - - *"backward_post"* : This prefetches the next set of parameters after the current set of + - `"backward_post"` : This prefetches the next set of parameters after the current set of parameter's gradient computation. - - forward_prefetch (*bool*, *optional*, defaults to *False*) - FSDP's forward prefetch mode (useful only when *fsdp* field is passed). - If *"True"*, then FSDP explicitly prefetches the next upcoming all-gather while executing in the + - forward_prefetch (`bool`, *optional*, defaults to `False`) + FSDP's forward prefetch mode (useful only when `fsdp` field is passed). + If `"True"`, then FSDP explicitly prefetches the next upcoming all-gather while executing in the forward pass. - - limit_all_gathers (*bool*, *optional*, defaults to *False*) - FSDP's limit_all_gathers (useful only when *fsdp* field is passed). - If *"True"*, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight + - limit_all_gathers (`bool`, *optional*, defaults to `False`) + FSDP's limit_all_gathers (useful only when `fsdp` field is passed). + If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight all-gathers. - - use_orig_params (*bool*, *optional*, defaults to *True*) - If *"True"*, allows non-uniform *requires_grad* during init, which means support for interspersed + - use_orig_params (`bool`, *optional*, defaults to `True`) + If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please refer to this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019) - - sync_module_states (*bool*, *optional*, defaults to *True*) - If *"True"*, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to + - sync_module_states (`bool`, *optional*, defaults to `True`) + If `"True"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to ensure they are the same across all ranks after initialization - - activation_checkpointing (*bool*, *optional*, defaults to *False*): + - activation_checkpointing (`bool`, *optional*, defaults to `False`): If True, activation checkpointing is a technique to reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass. Effectively, this trades extra computation time for reduced memory usage. - - xla (*bool*, *optional*, defaults to *False*): + - xla (`bool`, *optional*, defaults to `False`): Whether to use PyTorch/XLA Fully Sharded Data Parallel Training. This is an experimental feature and its API may evolve in the future. - - xla_fsdp_settings (*dict*, *optional*) + - xla_fsdp_settings (`dict`, *optional*) The value is a dictionary which stores the XLA FSDP wrapping parameters. For a complete list of options, please see [here]( https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py). - - xla_fsdp_grad_ckpt (*bool*, *optional*, defaults to *False*): + - xla_fsdp_grad_ckpt (`bool`, *optional*, defaults to `False`): Will use gradient checkpointing over each nested XLA FSDP wrapped layer.
This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap. - deepspeed (*str* or *dict*, *optional*): + deepspeed (`str` or `dict`, *optional*): Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may evolve in the future. The value is either the location of DeepSpeed json config file (e.g., - *ds_config.json*) or an already loaded json file as a *dict*" - label_smoothing_factor (*float*, *optional*, defaults to 0.0): + `ds_config.json`) or an already loaded json file as a `dict`" + label_smoothing_factor (`float`, *optional*, defaults to 0.0): The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded - labels are changed from 0s and 1s to *label_smoothing_factor/num_labels* and *1 - label_smoothing_factor + - label_smoothing_factor/num_labels* respectively. - debug (*str* or list of [*~debug_utils.DebugOption*], *optional*, defaults to *""*): + labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor + + label_smoothing_factor/num_labels` respectively. + debug (`str` or list of [`~debug_utils.DebugOption`], *optional*, defaults to `""`): Enable one or more debug features. This is an experimental feature. Possible options are: - - *"underflow_overflow"*: detects overflow in model's input/outputs and reports the last frames that led to + - `"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that led to the event - - *"tpu_metrics_debug"*: print debug metrics on TPU + - `"tpu_metrics_debug"`: print debug metrics on TPU The options should be separated by whitespaces. - optim (*str* or [*training_args.OptimizerNames*], *optional*, defaults to *"adamw_torch"*): + optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`): The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor. - optim_args (*str*, *optional*): + optim_args (`str`, *optional*): Optional arguments that are supplied to AnyPrecisionAdamW. - group_by_length (*bool*, *optional*, defaults to *False*): + group_by_length (`bool`, *optional*, defaults to `False`): Whether or not to group together samples of roughly the same length in the training dataset (to minimize padding applied and be more efficient). Only useful if applying dynamic padding. - length_column_name (*str*, *optional*, defaults to *"length"*): + length_column_name (`str`, *optional*, defaults to `"length"`): Column name for precomputed lengths. If the column exists, grouping by length will use these values rather - than computing them on train startup. Ignored unless *group_by_length* is *True* and the dataset is an - instance of *Dataset*. - report_to (*str* or *List[str]*, *optional*, defaults to *"all"*): - The list of integrations to report the results and logs to. Supported platforms are *"azure_ml"*, - *"clearml"*, *"codecarbon"*, *"comet_ml"*, *"dagshub"*, *"dvclive"*, *"flyte"*, *"mlflow"*, *"neptune"*, - *"tensorboard"*, and *"wandb"*. Use *"all"* to report to all integrations installed, *"none"* for no + than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an + instance of `Dataset`. + report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): + The list of integrations to report the results and logs to. 
Supported platforms are `"azure_ml"`, + `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`, + `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` for no integrations. - ddp_find_unused_parameters (*bool*, *optional*): - When using distributed training, the value of the flag *find_unused_parameters* passed to - *DistributedDataParallel*. Will default to *False* if gradient checkpointing is used, *True* otherwise. - ddp_bucket_cap_mb (*int*, *optional*): - When using distributed training, the value of the flag *bucket_cap_mb* passed to *DistributedDataParallel*. - ddp_broadcast_buffers (*bool*, *optional*): - When using distributed training, the value of the flag *broadcast_buffers* passed to - *DistributedDataParallel*. Will default to *False* if gradient checkpointing is used, *True* otherwise. - dataloader_pin_memory (*bool*, *optional*, defaults to *True*): - Whether you want to pin memory in data loaders or not. Will default to *True*. - skip_memory_metrics (*bool*, *optional*, defaults to *True*): + ddp_find_unused_parameters (`bool`, *optional*): + When using distributed training, the value of the flag `find_unused_parameters` passed to + `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. + ddp_bucket_cap_mb (`int`, *optional*): + When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. + ddp_broadcast_buffers (`bool`, *optional*): + When using distributed training, the value of the flag `broadcast_buffers` passed to + `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. + dataloader_pin_memory (`bool`, *optional*, defaults to `True`): + Whether you want to pin memory in data loaders or not. Will default to `True`. + skip_memory_metrics (`bool`, *optional*, defaults to `True`): Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows down the training and evaluation speed. - push_to_hub (*bool*, *optional*, defaults to *False*): + push_to_hub (`bool`, *optional*, defaults to `False`): Whether or not to push the model to the Hub every time the model is saved. If this is activated, - *output_dir* will begin a git directory synced with the repo (determined by *hub_model_id*) and the content - will be pushed each time a save is triggered (depending on your *save_strategy*). Calling - [*~Trainer.save_model*] will also trigger a push. + `output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content + will be pushed each time a save is triggered (depending on your `save_strategy`). Calling + [`~Trainer.save_model`] will also trigger a push. - If *output_dir* exists, it needs to be a local clone of the repository to which the [*Trainer*] will be + If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be pushed. - resume_from_checkpoint (*str*, *optional*): + resume_from_checkpoint (`str`, *optional*): The path to a folder with a valid checkpoint for your model. This argument is not directly used by - [*Trainer*], it's intended to be used by your training/evaluation scripts instead. See the [example + [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. 
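A minimal sketch of resuming from a checkpoint as described above; `model` and `train_dataset` stand in for a real model and dataset, and the checkpoint path is a placeholder:

```python
from transformers import Trainer, TrainingArguments

args = TrainingArguments(output_dir="out", report_to=["tensorboard"])

trainer = Trainer(model=model, args=args, train_dataset=train_dataset)

# Resume from an explicit checkpoint folder, or pass `resume_from_checkpoint=True`
# to pick up the most recent checkpoint found in `output_dir`.
trainer.train(resume_from_checkpoint="out/checkpoint-500")
```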
- hub_model_id (*str*, *optional*): + hub_model_id (`str`, *optional*): The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in which case the model will be pushed in your namespace. Otherwise it should be the whole repository name, - for instance *"user_name/model"*, which allows you to push to an organization you are a member of with - *"organization_name/model"*. Will default to *user_name/output_dir_name* with *output_dir_name* being the - name of *output_dir*. + for instance `"user_name/model"`, which allows you to push to an organization you are a member of with + `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the + name of `output_dir`. - Will default to the name of *output_dir*. - hub_strategy (*str* or [*~trainer_utils.HubStrategy*], *optional*, defaults to *"every_save"*): + Will default to the name of `output_dir`. + hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): Defines the scope of what is pushed to the Hub and when. Possible values are: - - *"end"*: push the model, its configuration, the tokenizer (if passed along to the [*Trainer*]) and a - draft of a model card when the [*~Trainer.save_model*] method is called. - - *"every_save"*: push the model, its configuration, the tokenizer (if passed along to the [*Trainer*]) and + - `"end"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a + draft of a model card when the [`~Trainer.save_model`] method is called. + - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the [`Trainer`]) and a draft of a model card each time there is a model save. The pushes are asynchronous to not block training, and in case the save are very frequent, a new push is only attempted if the previous one is finished. A last push is made with the final model at the end of training. - - *"checkpoint"*: like *"every_save"* but the latest checkpoint is also pushed in a subfolder named + - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to resume training easily with - *trainer.train(resume_from_checkpoint="last-checkpoint")*. - - *"all_checkpoints"*: like *"checkpoint"* but all checkpoints are pushed like they appear in the output + `trainer.train(resume_from_checkpoint="last-checkpoint")`. + - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the output folder (so you will get one checkpoint folder per folder in your final repository) - hub_token (*str*, *optional*): + hub_token (`str`, *optional*): The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with - *huggingface-cli login*. - hub_private_repo (*bool*, *optional*, defaults to *False*): + `huggingface-cli login`. + hub_private_repo (`bool`, *optional*, defaults to `False`): If True, the Hub repo will be set to private. - hub_always_push (*bool*, *optional*, defaults to *False*): - Unless this is *True*, the *Trainer* will skip pushing a checkpoint when the previous push is not finished. - gradient_checkpointing (*bool*, *optional*, defaults to *False*): + hub_always_push (`bool`, *optional*, defaults to `False`): + Unless this is `True`, the `Trainer` will skip pushing a checkpoint when the previous push is not finished. 
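A hedged sketch of the Hub-related arguments above; the repository name is a placeholder:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="my-finetune",
    push_to_hub=True,
    hub_model_id="my-org/my-finetune",  # defaults to user_name/output_dir_name if unset
    hub_strategy="checkpoint",          # also push the latest checkpoint under "last-checkpoint"
    hub_private_repo=True,
)
```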
+ gradient_checkpointing (`bool`, *optional*, defaults to `False`): If True, use gradient checkpointing to save memory at the expense of slower backward pass. - gradient_checkpointing_kwargs (*dict*, *optional*, defaults to *None*): - Key word arguments to be passed to the *gradient_checkpointing_enable* method. - include_inputs_for_metrics (*bool*, *optional*, defaults to *False*): - Whether or not the inputs will be passed to the *compute_metrics* function. This is intended for metrics + gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): + Key word arguments to be passed to the `gradient_checkpointing_enable` method. + include_inputs_for_metrics (`bool`, *optional*, defaults to `False`): + Whether or not the inputs will be passed to the `compute_metrics` function. This is intended for metrics that need inputs, predictions and references for scoring calculation in Metric class. - auto_find_batch_size (*bool*, *optional*, defaults to *False*) + auto_find_batch_size (`bool`, *optional*, defaults to `False`) Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding - CUDA Out-of-Memory errors. Requires accelerate to be installed (*pip install accelerate*) - full_determinism (*bool*, *optional*, defaults to *False*) - If *True*, [*enable_full_determinism*] is called instead of [*set_seed*] to ensure reproducible results in + CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`) + full_determinism (`bool`, *optional*, defaults to `False`) + If `True`, [`enable_full_determinism`] is called instead of [`set_seed`] to ensure reproducible results in distributed training. Important: this will negatively impact the performance, so only use it for debugging. - torchdynamo (*str*, *optional*): - If set, the backend compiler for TorchDynamo. Possible choices are *"eager"*, *"aot_eager"*, *"inductor"*, - *"nvfuser"*, *"aot_nvfuser"*, *"aot_cudagraphs"*, *"ofi"*, *"fx2trt"*, *"onnxrt"* and *"ipex"*. - ray_scope (*str*, *optional*, defaults to *"last"*): - The scope to use when doing hyperparameter search with Ray. By default, *"last"* will be used. Ray will + torchdynamo (`str`, *optional*): + If set, the backend compiler for TorchDynamo. Possible choices are `"eager"`, `"aot_eager"`, `"inductor"`, + `"nvfuser"`, `"aot_nvfuser"`, `"aot_cudagraphs"`, `"ofi"`, `"fx2trt"`, `"onnxrt"` and `"ipex"`. + ray_scope (`str`, *optional*, defaults to `"last"`): + The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray will then use the last checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the [Ray documentation]( - https://docs.ray.io/en/latest/tune/api_docs/analysis#ray.tune.ExperimentAnalysis.get_best_trial) for + https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for more options. - ddp_timeout (*int*, *optional*, defaults to 1800): - The timeout for *torch.distributed.init_process_group* calls, used to avoid GPU socket timeouts when + ddp_timeout (`int`, *optional*, defaults to 1800): + The timeout for `torch.distributed.init_process_group` calls, used to avoid GPU socket timeouts when performing slow operations in distributed runnings. 
Please refer the [PyTorch documentation] - (https://pytorch.org/docs/stable/distributed#torch.distributed.init_process_group) for more + (https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more information. - use_mps_device (*bool*, *optional*, defaults to *False*): - This argument is deprecated.*mps* device will be used if it is available similar to *cuda* device. - torch_compile (*bool*, *optional*, defaults to *False*): + use_mps_device (`bool`, *optional*, defaults to `False`): + This argument is deprecated.`mps` device will be used if it is available similar to `cuda` device. + torch_compile (`bool`, *optional*, defaults to `False`): Whether or not to compile the model using PyTorch 2.0 - [*torch.compile*](https://pytorch.org/get-started/pytorch-2.0/). + [`torch.compile`](https://pytorch.org/get-started/pytorch-2.0/). - This will use the best defaults for the [*torch.compile* - API](https://pytorch.org/docs/stable/generated/torch.compile?highlight=torch+compile#torch.compile). - You can customize the defaults with the argument *torch_compile_backend* and *torch_compile_mode* but we + This will use the best defaults for the [`torch.compile` + API](https://pytorch.org/docs/stable/generated/torch.compile.html?highlight=torch+compile#torch.compile). + You can customize the defaults with the argument `torch_compile_backend` and `torch_compile_mode` but we don't guarantee any of them will work as the support is progressively rolled in in PyTorch. This flag and the whole compile API is experimental and subject to change in future releases. - torch_compile_backend (*str*, *optional*): - The backend to use in *torch.compile*. If set to any value, *torch_compile* will be set to *True*. + torch_compile_backend (`str`, *optional*): + The backend to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`. Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions. This flag is experimental and subject to change in future releases. - torch_compile_mode (*str*, *optional*): - The mode to use in *torch.compile*. If set to any value, *torch_compile* will be set to *True*. + torch_compile_mode (`str`, *optional*): + The mode to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`. Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions. This flag is experimental and subject to change in future releases. - split_batches (*bool*, *optional*): + split_batches (`bool`, *optional*): Whether or not the accelerator should split the batches yielded by the dataloaders across the devices during distributed training. If - set to *True*, the actual batch size used will be the same on any kind of distributed processes, but it + set to `True`, the actual batch size used will be the same on any kind of distributed processes, but it must be a round multiple of the number of processes you are using (such as GPUs). - include_tokens_per_second (*bool*, *optional*): + include_tokens_per_second (`bool`, *optional*): Whether or not to compute the number of tokens per second per device for training speed metrics. This will iterate over the entire training dataloader once beforehand, and will slow down the entire process. - include_num_input_tokens_seen (*bool*, *optional*): + include_num_input_tokens_seen (`bool`, *optional*): Whether or not to track the number of input tokens seen throughout training. 
May be slower in distributed training as gather operations must be called. - neftune_noise_alpha (*Optional[float]*): - If not *None*, this will activate NEFTune noise embeddings. This can drastically improve model performance + neftune_noise_alpha (`Optional[float]`): + If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the - [original code](https://github.com/neelsjain/NEFTune). Support transformers *PreTrainedModel* and also - *PeftModel* from peft. + [original code](https://github.com/neelsjain/NEFTune). Support transformers `PreTrainedModel` and also + `PeftModel` from peft. """ framework = "pt" From 9685857f00649c17b717f44cbe3136d89f98e52b Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 18:30:26 +0530 Subject: [PATCH 05/12] fix fsdp doc strings --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index c249adfbb15d7..4f7dd44e61692 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -430,7 +430,7 @@ class TrainingArguments: - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. fsdp_config (`str` or `dict`, *optional*): Config to be used with fsdp (Pytorch Distributed Parallel Training). The value is either a location of - deepspeed json config file (e.g., `ds_config.json`) or an already loaded json file as `dict`. + fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`. A List of config and its options: - min_num_params (`int`, *optional*, defaults to `0`): From 203c38d01c4365ecc988d7197904bed125954e88 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 18:44:07 +0530 Subject: [PATCH 06/12] fix quality --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 4f7dd44e61692..6c2097916a923 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -849,7 +849,7 @@ class TrainingArguments: "When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state." "Note that when this is true, you won't be able to resume training from checkpoint." "This enables you to save storage by not storing the optimizer, scheduler & rng state." - "You can only load the model using `from_pretrained` with this option set to `True`." + "You can only load the model using from_pretrained with this option set to True." 
) }, ) From 7e4f99a8abfcba388eaf7fb5db03657a519c6da7 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 19:18:07 +0530 Subject: [PATCH 07/12] Update src/transformers/training_args.py Co-authored-by: Zach Mueller --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 6c2097916a923..300e7fb54bec1 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -468,7 +468,7 @@ class TrainingArguments: If `"True"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to ensure they are the same across all ranks after initialization - activation_checkpointing (`bool`, *optional*, defaults to `False`): - If True, activation checkpointing is a technique to reduce memory usage by clearing activations of + If `"True"`, activation checkpointing is a technique to reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass. Effectively, this trades extra computation time for reduced memory usage. - xla (`bool`, *optional*, defaults to `False`): From bebc321f7e43798934f647ed4b91d6e84bf930b3 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 19:18:39 +0530 Subject: [PATCH 08/12] =?UTF-8?q?please=20fix=20the=20quality=20issue=20?= =?UTF-8?q?=F0=9F=98=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/transformers/training_args.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 300e7fb54bec1..6146395311087 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -423,8 +423,8 @@ class TrainingArguments: - `"full_shard"`: Shard parameters, gradients and optimizer states. - `"shard_grad_op"`: Shard optimizer states and gradients. - - `"hybrid_shard"`: Apply ``FULL_SHARD`` within a node, and replicate parameters across nodes. - - `"hybrid_shard_zero2"`: Apply ``SHARD_GRAD_OP`` within a node, and replicate parameters across nodes. + - `"hybrid_shard"`: Apply `FULL_SHARD` within a node, and replicate parameters across nodes. + - `"hybrid_shard_zero2"`: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes. - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and `"shard_grad_op"`). - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. 
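As an illustration of the sharding options documented above, here is a minimal sketch (not part of the patch) of how the hybrid strategies and the new `--save_only_model` option could be selected through `TrainingArguments`; the output directory, config file name, and batch size are placeholder assumptions.

```python
from transformers import TrainingArguments

# Minimal sketch, assuming a transformers install with this patch series applied.
# "hybrid_shard" applies FULL_SHARD within a node and replicates parameters
# across nodes; `fsdp_config` may be a path to a json file or an already loaded dict.
training_args = TrainingArguments(
    output_dir="outputs",               # placeholder path
    fsdp="hybrid_shard auto_wrap",      # space-separated FSDP options
    fsdp_config="fsdp_config.json",     # placeholder config file name
    per_device_train_batch_size=8,      # placeholder value
    save_only_model=True,               # skip optimizer/scheduler/rng state when checkpointing
)
```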
From 835e41bae27c08b4c7a275021f4fec3de2dd4130 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 22:24:57 +0530 Subject: [PATCH 09/12] Apply suggestions from code review Co-authored-by: Benjamin Bossan --- src/transformers/modeling_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index ed19fe156e4b7..64a60a7480471 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -473,12 +473,12 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike]): ) return safe_load_file(checkpoint_file) try: - if (is_deepspeed_zero3_enabled()) and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0: + if is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0: map_location = "meta" else: map_location = "cpu" - map_location = "cpu" if is_fsdp_enabled_and_local_dist_rank_0 else "meta" + map_location = "cpu" if is_fsdp_enabled_and_local_dist_rank_0() else "meta" return torch.load(checkpoint_file, map_location=map_location) except Exception as e: try: From 4237570577c9e3172b6dcac9af5bbc1742013ef9 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Wed, 22 Nov 2023 22:51:10 +0530 Subject: [PATCH 10/12] address comment --- src/transformers/modeling_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 64a60a7480471..024428941260a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -473,12 +473,13 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike]): ) return safe_load_file(checkpoint_file) try: - if is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0: + if ( + is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0 + ) or (is_fsdp_enabled() and not is_fsdp_enabled_and_local_dist_rank_0()): map_location = "meta" else: map_location = "cpu" - map_location = "cpu" if is_fsdp_enabled_and_local_dist_rank_0() else "meta" return torch.load(checkpoint_file, map_location=map_location) except Exception as e: try: From bdef4ac70cfd47b8852192f3cb931f6d211c1845 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 24 Nov 2023 09:56:40 +0530 Subject: [PATCH 11/12] simplify conditional check as per the comment --- src/transformers/modeling_utils.py | 34 +++++++++++++++++------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 024428941260a..d60d795a0f93d 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -132,8 +132,12 @@ def is_fsdp_enabled(): ) -def is_fsdp_enabled_and_local_dist_rank_0(): - return is_fsdp_enabled() and int(os.environ.get("LOCAL_RANK", -1)) == 0 +def is_local_dist_rank_0(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and int(os.environ.get("LOCAL_RANK", -1)) == 0 + ) if is_sagemaker_mp_enabled(): @@ -475,7 +479,7 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike]): try: if ( is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0 - ) or 
(is_fsdp_enabled() and not is_fsdp_enabled_and_local_dist_rank_0()): + ) or (is_fsdp_enabled() and not is_local_dist_rank_0()): map_location = "meta" else: map_location = "cpu" @@ -3903,7 +3907,18 @@ def _find_mismatched_keys( ignore_mismatched_sizes, ) if low_cpu_mem_usage: - if not is_fsdp_enabled() or is_fsdp_enabled_and_local_dist_rank_0(): + if is_fsdp_enabled() and not is_local_dist_rank_0(): + for key, param in model_to_load.state_dict().items(): + if param.device == torch.device("meta"): + if not (is_quantized): + set_module_tensor_to_device( + model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype) + ) + else: + set_module_quantized_tensor_to_device( + model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype) + ) + else: new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model( model_to_load, state_dict, @@ -3921,17 +3936,6 @@ def _find_mismatched_keys( keep_in_fp32_modules=keep_in_fp32_modules, ) error_msgs += new_error_msgs - else: - for key, param in model_to_load.state_dict().items(): - if param.device == torch.device("meta"): - if not (is_quantized): - set_module_tensor_to_device( - model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype) - ) - else: - set_module_quantized_tensor_to_device( - model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype) - ) else: error_msgs += _load_state_dict_into_model(model_to_load, state_dict, start_prefix) From c966e06df0645b8a48c73b923f976683b6561602 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Fri, 24 Nov 2023 10:11:30 +0530 Subject: [PATCH 12/12] update documentation --- docs/source/en/main_classes/trainer.md | 33 +++++++++++++++++++------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/docs/source/en/main_classes/trainer.md b/docs/source/en/main_classes/trainer.md index 7f85d6d72ad02..7304de8174dcd 100644 --- a/docs/source/en/main_classes/trainer.md +++ b/docs/source/en/main_classes/trainer.md @@ -426,8 +426,7 @@ To read more about it and the benefits, check out the [Fully Sharded Data Parall We have integrated the latest PyTorch's Fully Sharded Data Parallel (FSDP) training feature. All you need to do is enable it through the config. -**Required PyTorch version for FSDP support**: PyTorch Nightly (or 1.12.0 if you read this after it has been released) -as the model saving with FSDP activated is only available with recent fixes. +**Required PyTorch version for FSDP support**: PyTorch >=2.1.0 **Usage**: @@ -440,6 +439,8 @@ as the model saving with FSDP activated is only available with recent fixes. - SHARD_GRAD_OP : Shards optimizer states + gradients across data parallel workers/GPUs. For this, add `--fsdp shard_grad_op` to the command line arguments. - NO_SHARD : No sharding. For this, add `--fsdp no_shard` to the command line arguments. + - HYBRID_SHARD : Applies FULL_SHARD within a node, and replicates parameters across nodes. For this, add `--fsdp hybrid_shard` to the command line arguments. + - HYBRID_SHARD_ZERO2 : Applies SHARD_GRAD_OP within a node, and replicates parameters across nodes. For this, add `--fsdp hybrid_shard_zero2` to the command line arguments. - To offload the parameters and gradients to the CPU, add `--fsdp "full_shard offload"` or `--fsdp "shard_grad_op offload"` to the command line arguments. - To automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`, @@ -449,18 +450,18 @@ as the model saving with FSDP activated is only available with recent fixes. - Remaining FSDP config is passed via `--fsdp_config `.
It is either a location of FSDP json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`. - If auto wrapping is enabled, you can either use transformer based auto wrap policy or size based auto wrap policy. - - For transformer based auto wrap policy, it is recommended to specify `fsdp_transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available. + - For transformer based auto wrap policy, it is recommended to specify `transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available. This specifies the list of transformer layer class names (case-sensitive) to wrap, e.g., [`BertLayer`], [`GPTJBlock`], [`T5Block`] .... This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units. Using this policy, wrapping happens for each block containing Multi-Head Attention followed by a couple of MLP layers. Remaining layers including the shared embeddings are conveniently wrapped in the same outermost FSDP unit. Therefore, use this for transformer based models. - - For size based auto wrap policy, please add `fsdp_min_num_params` in the config file. + - For size based auto wrap policy, please add `min_num_params` in the config file. It specifies FSDP's minimum number of parameters for auto wrapping. - - `fsdp_backward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. + - `backward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. `backward_pre` and `backward_post` are available options. For more information, refer to `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch` - - `fsdp_forward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. + - `forward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. If `"True"`, FSDP explicitly prefetches the next upcoming all-gather while executing in the forward pass. - `limit_all_gathers` can be specified in the config file. If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight all-gathers. @@ -468,6 +469,20 @@ as the model saving with FSDP activated is only available with recent fixes. If `"True"`, FSDP activation checkpointing is a technique to reduce memory usage by clearing activations of certain layers and recomputing them during a backward pass. Effectively, this trades extra computation time for reduced memory usage. + - `use_orig_params` can be specified in the config file. + If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. This also enables having different optimizer param groups. This should be `True` when creating the optimizer object before preparing/wrapping the model with FSDP. + Please refer to this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019). + +**Saving and loading** +Saving entire intermediate checkpoints using the `FULL_STATE_DICT` state_dict_type with CPU offloading on rank 0 takes a lot of time and often results in NCCL Timeout errors due to indefinite hanging during broadcasting.
However, at the end of training, we want the whole model state dict instead of the sharded state dict, which is only compatible with FSDP. Use the `SHARDED_STATE_DICT` (default) state_dict_type to save the intermediate checkpoints and optimizer states in this format, as recommended by the PyTorch team. + +Saving the final checkpoint in the transformers format with the default `safetensors` format requires the changes below. +```python +if trainer.is_fsdp_enabled: + trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT") + +trainer.save_model(script_args.output_dir) +``` **A few caveats to be aware of** - it is incompatible with `generate`, and thus incompatible with `--predict_with_generate` @@ -492,15 +507,15 @@ Pass `--fsdp "full shard"` along with following changes to be made in `--fsdp_co https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py). - `xla_fsdp_grad_ckpt`. When `True`, uses gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through - `fsdp_min_num_params` or `fsdp_transformer_layer_cls_to_wrap`. + `min_num_params` or `transformer_layer_cls_to_wrap`. - You can either use transformer based auto wrap policy or size based auto wrap policy. - - For transformer based auto wrap policy, it is recommended to specify `fsdp_transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available. + - For transformer based auto wrap policy, it is recommended to specify `transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available. This specifies the list of transformer layer class names (case-sensitive) to wrap, e.g., [`BertLayer`], [`GPTJBlock`], [`T5Block`] .... This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units. Using this policy, wrapping happens for each block containing Multi-Head Attention followed by a couple of MLP layers. Remaining layers including the shared embeddings are conveniently wrapped in the same outermost FSDP unit. Therefore, use this for transformer based models. - - For size based auto wrap policy, please add `fsdp_min_num_params` in the config file. + - For size based auto wrap policy, please add `min_num_params` in the config file. It specifies FSDP's minimum number of parameters for auto wrapping.
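Tying together the config-file keys described above, a minimal illustrative `fsdp_config` could look like the sketch below (not part of the patch); the layer class name and parameter threshold are placeholder assumptions, and normally only one of the two auto wrap policies would be set.

```python
# Illustrative sketch of an fsdp_config dict (the same keys could live in a
# fsdp_config.json file passed via --fsdp_config). Key names follow the
# un-prefixed names used in this patch series; values are placeholder assumptions.
fsdp_config = {
    "transformer_layer_cls_to_wrap": ["BertLayer"],  # transformer based auto wrap policy
    # "min_num_params": 100_000,                     # alternative: size based auto wrap policy
    "backward_prefetch": "backward_pre",
    "forward_prefetch": False,
    "limit_all_gathers": True,
    "activation_checkpointing": False,
    "use_orig_params": True,
}
```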