Renamed evaluation_strategy to eval_strategy #538

Merged: 2 commits, Sep 12, 2024
README.md (1 addition, 1 deletion):

@@ -69,7 +69,7 @@ model = SetFitModel.from_pretrained(
 args = TrainingArguments(
     batch_size=16,
     num_epochs=4,
-    evaluation_strategy="epoch",
+    eval_strategy="epoch",
     save_strategy="epoch",
     load_best_model_at_end=True,
 )
docs/source/en/how_to/absa.mdx (1 addition, 1 deletion):

@@ -87,7 +87,7 @@ args = TrainingArguments(
     num_epochs=5,
     use_amp=True,
     batch_size=128,
-    evaluation_strategy="steps",
+    eval_strategy="steps",
     eval_steps=50,
     save_steps=50,
     load_best_model_at_end=True,
docs/source/en/how_to/v1.0.0_migration_guide.mdx (6 additions, 6 deletions):

@@ -42,7 +42,7 @@ This list contains new functionality that can be used starting from v1.0.0.
 * [`AbsaTrainer`] and [`AbsaModel`] have been introduced for applying [SetFit for Aspect Based Sentiment Analysis](absa).
 * [`Trainer`] now supports a `callbacks` argument for a list of [`transformers` `TrainerCallback` instances](https://huggingface.co/docs/transformers/main/en/main_classes/callback).
 * By default, all installed callbacks integrated with `transformers` are supported, including [`TensorBoardCallback`](https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.integrations.TensorBoardCallback), [`WandbCallback`](https://huggingface.co/docs/transformers/main/en/main_classes/callback#transformers.integrations.WandbCallback) to log training logs to [TensorBoard](https://www.tensorflow.org/tensorboard) and [W&B](https://wandb.ai), respectively.
-* The [`Trainer`] will now print `embedding_loss` in the terminal, as well as `eval_embedding_loss` if `evaluation_strategy` is set to `"epoch"` or `"steps"` in [`TrainingArguments`].
+* The [`Trainer`] will now print `embedding_loss` in the terminal, as well as `eval_embedding_loss` if `eval_strategy` is set to `"epoch"` or `"steps"` in [`TrainingArguments`].
 * [`Trainer.evaluate`] now works with string labels.
 * An updated contrastive pair sampler increases the variety of training pairs.
 * [`TrainingArguments`] supports various new arguments:
@@ -65,14 +65,14 @@ This list contains new functionality that can be used starting from v1.0.0.

   * `logging_first_step`: Whether to log and evaluate the first `global_step` or not.
   * `logging_steps`: Number of update steps between two logs if `logging_strategy="steps"`.
-  * `evaluation_strategy`: The evaluation strategy to adopt during training. Possible values are:
+  * `eval_strategy`: The evaluation strategy to adopt during training. Possible values are:

     - `"no"`: No evaluation is done during training.
     - `"steps"`: Evaluation is done (and logged) every `eval_steps`.
     - `"epoch"`: Evaluation is done at the end of each epoch.

-  * `eval_steps`: Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same as `logging_steps` if not set.
-  * `eval_delay`: Number of epochs or steps to wait for before the first evaluation can be performed, depending on the `evaluation_strategy`.
+  * `eval_steps`: Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same as `logging_steps` if not set.
+  * `eval_delay`: Number of epochs or steps to wait for before the first evaluation can be performed, depending on the `eval_strategy`.
   * `eval_max_steps`: If set to a positive number, the total number of evaluation steps to perform. The evaluation may stop before reaching the set number of steps when all data is exhausted.
   * `save_strategy`: The checkpoint save strategy to adopt during training. Possible values are:
@@ -81,12 +81,12 @@ This list contains new functionality that can be used starting from v1.0.0.
     - `"steps"`: Save is done every `save_steps`.

   * `save_steps`: Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
-  * `save_total_limit`: If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`.
+  * `save_total_limit`: If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. Note, the best model is always preserved if the `eval_strategy` is not `"no"`.
   * `load_best_model_at_end`: Whether or not to load the best model found during training at the end of training.

   <Tip>

-  When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in
+  When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in
   the case it is "steps", `save_steps` must be a round multiple of `eval_steps`.

   </Tip>
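The strategy rules listed in the migration guide above interact with one another (implied step-based evaluation, the `logging_steps` fallback, and the `load_best_model_at_end` compatibility checks). As a standalone illustration, this toy dataclass (the `EvalArgs` name is hypothetical, not setfit's API) models just those behaviours:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class EvalArgs:
    """Toy stand-in modelling only the eval/save strategy rules
    described in the migration guide; not setfit's real class."""

    eval_strategy: str = "no"  # "no", "steps", or "epoch"
    eval_steps: Optional[int] = None
    save_strategy: str = "epoch"
    save_steps: int = 500
    logging_steps: int = 50
    load_best_model_at_end: bool = False

    def __post_init__(self) -> None:
        # Passing eval_steps implies step-based evaluation.
        if self.eval_steps is not None and self.eval_strategy == "no":
            self.eval_strategy = "steps"
        # Under "steps", eval_steps falls back to logging_steps.
        if self.eval_strategy == "steps" and not self.eval_steps:
            if self.logging_steps > 0:
                self.eval_steps = self.logging_steps
            else:
                raise ValueError(
                    'eval_strategy="steps" requires non-zero eval_steps or logging_steps'
                )
        if self.load_best_model_at_end:
            if self.eval_strategy != self.save_strategy:
                raise ValueError("save and eval strategy must match")
            if self.eval_strategy == "steps" and self.save_steps % self.eval_steps != 0:
                raise ValueError("save_steps must be a round multiple of eval_steps")
```

For example, `EvalArgs(eval_steps=5)` ends up with `eval_strategy == "steps"`, and a bare `EvalArgs(eval_strategy="steps")` inherits `eval_steps` from `logging_steps`.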
scripts/setfit/distillation_baseline.py (1 addition, 1 deletion):

@@ -82,7 +82,7 @@ def standard_model_distillation(self, train_raw_student, x_test, y_test, num_cla
     per_device_train_batch_size=self.batch_size,
     per_device_eval_batch_size=self.batch_size,
     num_train_epochs=self.num_epochs,
-    evaluation_strategy="no",
+    eval_strategy="no",
     save_strategy="no",
     load_best_model_at_end=False,
     weight_decay=0.01,
scripts/setfit/run_fewshot.py (3 additions, 3 deletions):

@@ -59,7 +59,7 @@ def parse_args():
     parser.add_argument("--override_results", default=False, action="store_true")
     parser.add_argument("--keep_body_frozen", default=False, action="store_true")
     parser.add_argument("--add_data_augmentation", default=False)
-    parser.add_argument("--evaluation_strategy", default=False)
+    parser.add_argument("--eval_strategy", default=False)

     args = parser.parse_args()

@@ -149,8 +149,8 @@ def main():
         num_epochs=args.num_epochs,
         num_iterations=args.num_iterations,
     )
-    if not args.evaluation_strategy:
-        trainer.args.evaluation_strategy = "no"
+    if not args.eval_strategy:
+        trainer.args.eval_strategy = "no"
    if args.classifier == "pytorch":
        trainer.freeze()
    trainer.train()
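The two hunks above pair an argparse flag defaulting to `False` with a later falsy-value fallback to the `"no"` strategy. As a self-contained sketch of that pattern (the `parse_eval_args` helper is hypothetical, not part of the repository):

```python
import argparse


def parse_eval_args(argv: list) -> argparse.Namespace:
    """Sketch of the flag-plus-fallback pattern from run_fewshot.py."""
    parser = argparse.ArgumentParser()
    # Default is False (falsy), so an omitted flag can be detected later.
    parser.add_argument("--eval_strategy", default=False)
    args = parser.parse_args(argv)
    # Any falsy value (flag omitted) resolves to no mid-training evaluation.
    if not args.eval_strategy:
        args.eval_strategy = "no"
    return args
```

With no arguments the strategy resolves to `"no"`; passing `--eval_strategy steps` keeps the caller's choice.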
scripts/transformers/run_fewshot.py (1 addition, 1 deletion):

@@ -94,7 +94,7 @@ def compute_metrics(pred):
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
-    evaluation_strategy="epoch",
+    eval_strategy="epoch",
     logging_steps=100,
     save_strategy="no",
     fp16=True,
scripts/transformers/run_fewshot_multilingual.py (1 addition, 1 deletion):

@@ -119,7 +119,7 @@ def compute_metrics(pred):
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
-    evaluation_strategy="epoch",
+    eval_strategy="epoch",
     logging_steps=100,
     save_strategy="no",
     fp16=True,
scripts/transformers/run_full.py (1 addition, 1 deletion):

@@ -85,7 +85,7 @@ def compute_metrics(pred):
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.001,
-    evaluation_strategy="epoch",
+    eval_strategy="epoch",
     logging_steps=100,
     metric_for_best_model=metric,
     load_best_model_at_end=True,
scripts/transformers/run_full_multilingual.py (1 addition, 1 deletion):

@@ -104,7 +104,7 @@ def compute_metrics(pred):
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
-    evaluation_strategy="epoch",
+    eval_strategy="epoch",
     logging_steps=100,
     metric_for_best_model="eval_loss",
     load_best_model_at_end=True,
src/setfit/model_card.py (1 addition, 1 deletion):

@@ -80,7 +80,7 @@ def on_train_begin(
     "logging_strategy",
     "logging_first_step",
     "logging_steps",
-    "evaluation_strategy",
+    "eval_strategy",
     "eval_steps",
     "eval_delay",
     "save_strategy",
src/setfit/trainer.py (1 addition, 1 deletion):

@@ -443,7 +443,7 @@ def train_embeddings(
         train_dataloader, loss_func, batch_size, num_unique_pairs = self.get_dataloader(
             x_train, y_train, args=args, max_pairs=train_max_pairs
         )
-        if x_eval is not None and args.evaluation_strategy != IntervalStrategy.NO:
+        if x_eval is not None and args.eval_strategy != IntervalStrategy.NO:
             eval_max_pairs = -1 if args.eval_max_steps == -1 else args.eval_max_steps * args.embedding_batch_size
             eval_dataloader, _, _, _ = self.get_dataloader(x_eval, y_eval, args=args, max_pairs=eval_max_pairs)
         else:
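The comparison in this hunk works because `IntervalStrategy` is a string-backed enum, so the field compares cleanly whether it holds a coerced enum member or a raw string. A minimal stand-in (reduced to the three members relevant here; the real class lives in `transformers.trainer_utils`) demonstrates the behaviour:

```python
from enum import Enum


class IntervalStrategy(str, Enum):
    """Minimal stand-in for transformers' string-backed IntervalStrategy."""
    NO = "no"
    STEPS = "steps"
    EPOCH = "epoch"


# Value lookup coerces a plain string to the matching member, which is
# what TrainingArguments.__post_init__ does before Trainer compares it.
strategy = IntervalStrategy("steps")
```

Because the enum mixes in `str`, `IntervalStrategy.NO == "no"` also holds, so the `!= IntervalStrategy.NO` guard behaves the same for un-coerced string values.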
src/setfit/training_args.py (22 additions, 15 deletions):

@@ -124,19 +124,19 @@ class TrainingArguments:
             Whether to log and evaluate the first `global_step` or not.
         logging_steps (`int`, defaults to 50):
             Number of update steps between two logs if `logging_strategy="steps"`.
-        evaluation_strategy (`str` or [`~transformers.trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
+        eval_strategy (`str` or [`~transformers.trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
             The evaluation strategy to adopt during training. Possible values are:

                 - `"no"`: No evaluation is done during training.
                 - `"steps"`: Evaluation is done (and logged) every `eval_steps`.
                 - `"epoch"`: Evaluation is done at the end of each epoch.

         eval_steps (`int`, *optional*):
-            Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the same
+            Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same
             value as `logging_steps` if not set.
         eval_delay (`float`, *optional*):
             Number of epochs or steps to wait for before the first evaluation can be performed, depending on the
-            evaluation_strategy.
+            eval_strategy.
         eval_max_steps (`int`, defaults to `-1`):
             If set to a positive number, the total number of evaluation steps to perform. The evaluation may stop
             before reaching the set number of steps when all data is exhausted.
@@ -151,13 +151,13 @@ class TrainingArguments:
             Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
         save_total_limit (`int`, *optional*, defaults to `1`):
             If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
-            `output_dir`. Note, the best model is always preserved if the `evaluation_strategy` is not `"no"`.
+            `output_dir`. Note, the best model is always preserved if the `eval_strategy` is not `"no"`.
         load_best_model_at_end (`bool`, *optional*, defaults to `False`):
             Whether or not to load the best model found during training at the end of training.

             <Tip>

-            When set to `True`, the parameters `save_strategy` needs to be the same as `evaluation_strategy`, and in
+            When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in
             the case it is "steps", `save_steps` must be a round multiple of `eval_steps`.

             </Tip>
@@ -208,7 +208,8 @@ class TrainingArguments:
     logging_first_step: bool = True
     logging_steps: int = 50

-    evaluation_strategy: str = "no"
+    eval_strategy: str = "no"
+    evaluation_strategy: str = field(default="no", repr=False, init=False)  # Softly deprecated
     eval_steps: Optional[int] = None
     eval_delay: int = 0
     eval_max_steps: int = -1
@@ -251,30 +252,36 @@ def __post_init__(self) -> None:
             self.logging_dir = default_logdir()

         self.logging_strategy = IntervalStrategy(self.logging_strategy)
-        self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy)
+        if self.evaluation_strategy and not self.eval_strategy:
+            logger.warning(
+                "The `evaluation_strategy` argument is deprecated and will be removed in a future version. "
+                "Please use `eval_strategy` instead."
+            )
+            self.eval_strategy = self.evaluation_strategy
+        self.eval_strategy = IntervalStrategy(self.eval_strategy)

-        if self.eval_steps is not None and self.evaluation_strategy == IntervalStrategy.NO:
-            logger.info('Using `evaluation_strategy="steps"` as `eval_steps` is defined.')
-            self.evaluation_strategy = IntervalStrategy.STEPS
+        if self.eval_steps is not None and self.eval_strategy == IntervalStrategy.NO:
+            logger.info('Using `eval_strategy="steps"` as `eval_steps` is defined.')
+            self.eval_strategy = IntervalStrategy.STEPS

         # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero
-        if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
+        if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0):
             if self.logging_steps > 0:
                 self.eval_steps = self.logging_steps
             else:
                 raise ValueError(
-                    f"evaluation strategy {self.evaluation_strategy} requires either non-zero `eval_steps` or"
+                    f"evaluation strategy {self.eval_strategy} requires either non-zero `eval_steps` or"
                     " `logging_steps`"
                 )

         # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
         if self.load_best_model_at_end:
-            if self.evaluation_strategy != self.save_strategy:
+            if self.eval_strategy != self.save_strategy:
                 raise ValueError(
                     "`load_best_model_at_end` requires the save and eval strategy to match, but found\n- Evaluation "
-                    f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
+                    f"strategy: {self.eval_strategy}\n- Save strategy: {self.save_strategy}"
                 )
-            if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
+            if self.eval_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
                 raise ValueError(
                     "`load_best_model_at_end` requires the saving steps to be a round multiple of the evaluation "
                     f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
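The soft-deprecation in this file keeps the old field on the class while routing its value to the new name. The following standalone sketch shows the same pattern with plain dataclasses (the `ArgsSketch` name is hypothetical; setfit's real class carries many more fields). Note one deliberate deviation: it uses `None` as the "not set" sentinel, since with string defaults like `"no"` a truthiness check such as `not self.eval_strategy` can never fire for a non-empty value.

```python
import logging
from dataclasses import dataclass
from typing import Optional

logger = logging.getLogger(__name__)


@dataclass
class ArgsSketch:
    """Standalone sketch of a rename with a soft-deprecated alias."""

    # New canonical name; None means "not set by the caller".
    eval_strategy: Optional[str] = None
    # Old name, accepted temporarily so existing call sites keep working.
    evaluation_strategy: Optional[str] = None

    def __post_init__(self) -> None:
        # If only the old spelling was provided, warn and forward it.
        if self.evaluation_strategy is not None and self.eval_strategy is None:
            logger.warning(
                "`evaluation_strategy` is deprecated; use `eval_strategy` instead."
            )
            self.eval_strategy = self.evaluation_strategy
        # Fall back to the documented default.
        if self.eval_strategy is None:
            self.eval_strategy = "no"
```

Callers using the old keyword still get the right behaviour (`ArgsSketch(evaluation_strategy="steps")` resolves `eval_strategy` to `"steps"`), while new code passes `eval_strategy` directly.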
tests/span/test_model_card.py (1 addition, 1 deletion):

@@ -25,7 +25,7 @@ def test_model_card(absa_dataset: Dataset, tmp_path: Path) -> None:
     eval_steps=1,
     logging_steps=1,
     max_steps=2,
-    evaluation_strategy="steps",
+    eval_strategy="steps",
 )
 trainer = AbsaTrainer(
     model=model,
tests/test_model_card.py (1 addition, 1 deletion):

@@ -35,7 +35,7 @@ def test_model_card(tmp_path: Path) -> None:
     eval_steps=1,
     logging_steps=1,
     max_steps=2,
-    evaluation_strategy="steps",
+    eval_strategy="steps",
 )
 trainer = Trainer(
     model=model,
tests/test_trainer.py (1 addition, 1 deletion):

@@ -590,7 +590,7 @@ def test_train_load_best(model: SetFitModel, tmp_path: Path, caplog: LogCaptureF
     output_dir=tmp_path,
     save_steps=5,
     eval_steps=5,
-    evaluation_strategy="steps",
+    eval_strategy="steps",
     load_best_model_at_end=True,
     num_epochs=5,
 )
tests/test_training_args.py (6 additions, 6 deletions):

@@ -72,29 +72,29 @@ def test_report_to(self):

     def test_eval_steps_without_eval_strat(self):
         args = TrainingArguments(eval_steps=5)
-        self.assertEqual(args.evaluation_strategy, IntervalStrategy.STEPS)
+        self.assertEqual(args.eval_strategy, IntervalStrategy.STEPS)

     def test_eval_strat_steps_without_eval_steps(self):
-        args = TrainingArguments(evaluation_strategy="steps")
+        args = TrainingArguments(eval_strategy="steps")
         self.assertEqual(args.eval_steps, args.logging_steps)
         with self.assertRaises(ValueError):
-            TrainingArguments(evaluation_strategy="steps", logging_steps=0, logging_strategy="no")
+            TrainingArguments(eval_strategy="steps", logging_steps=0, logging_strategy="no")

     def test_load_best_model(self):
         with self.assertRaises(ValueError):
-            TrainingArguments(load_best_model_at_end=True, evaluation_strategy="steps", save_strategy="epoch")
+            TrainingArguments(load_best_model_at_end=True, eval_strategy="steps", save_strategy="epoch")
         with self.assertRaises(ValueError):
             TrainingArguments(
                 load_best_model_at_end=True,
-                evaluation_strategy="steps",
+                eval_strategy="steps",
                 save_strategy="steps",
                 eval_steps=100,
                 save_steps=50,
             )
         # No error: save_steps is a round multiple of eval_steps
         TrainingArguments(
             load_best_model_at_end=True,
-            evaluation_strategy="steps",
+            eval_strategy="steps",
             save_strategy="steps",
             eval_steps=50,
             save_steps=100,