
Add model checkpointing to push_to_hub and PushToHubCallback #14492

Merged: 19 commits, Nov 29, 2021
Changes from 8 commits
11 changes: 11 additions & 0 deletions src/transformers/keras_callbacks.py
@@ -1,4 +1,5 @@
import logging
import os
from pathlib import Path
from time import sleep
from typing import Optional, Union
@@ -23,6 +24,7 @@ def __init__(
        tokenizer: Optional[PreTrainedTokenizerBase] = None,
        hub_model_id: Optional[str] = None,
        hub_token: Optional[str] = None,
        checkpoint: Optional[bool] = False,
    ):
        """
        output_dir (:obj:`str`):
@@ -48,8 +50,13 @@ def __init__(
        hub_token (:obj:`str`, `optional`):
            The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
            :obj:`huggingface-cli login`.
        checkpoint (:obj:`bool`, `optional`):
            Whether to save full training checkpoints (including epoch and optimizer state) to allow training to be
            resumed. Only usable when :obj:`save_strategy` is :obj:`"epoch"`.
        """
        super().__init__()
        if isinstance(save_strategy, str):
            save_strategy = IntervalStrategy(save_strategy.lower())
        if checkpoint and save_strategy != IntervalStrategy.EPOCH:
            raise ValueError("Cannot save checkpoints when save_strategy is not 'epoch'!")
        self.save_strategy = save_strategy
@@ -65,6 +72,7 @@ def __init__(
        self.repo = Repository(str(output_dir), clone_from=hub_model_id)
        self.tokenizer = tokenizer
        self.last_job = None
        self.checkpoint = checkpoint

    def on_train_batch_end(self, batch, logs=None):
        if self.save_strategy == IntervalStrategy.STEPS and (batch + 1) % self.save_steps == 0:
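A note on the `on_train_batch_end` condition: Python's `%` binds tighter than `+`, so the intended every-`save_steps` schedule requires the parenthesized form `(batch + 1) % self.save_steps == 0`. A standalone sketch of that schedule (plain Python; `should_push` is a hypothetical helper, not part of the PR):

```python
def should_push(batch, save_steps):
    # Push at the end of every save_steps-th batch (batch is 0-indexed).
    return (batch + 1) % save_steps == 0

# With save_steps=2, pushes fire after batches 1, 3, 5 (i.e. steps 2, 4, 6).
triggered = [b for b in range(6) if should_push(b, save_steps=2)]
print(triggered)  # prints [1, 3, 5]
```

Without the parentheses, `batch + 1 % save_steps` evaluates as `batch + (1 % save_steps)`, which is never zero for positive batch numbers.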
@@ -84,6 +92,9 @@ def on_epoch_end(self, epoch, logs=None):
            self.model.save_pretrained(self.output_dir)
            if self.tokenizer is not None:
                self.tokenizer.save_pretrained(self.output_dir)
            if self.checkpoint:
                checkpoint_dir = os.path.join(self.output_dir, "checkpoint")
                self.model._save_checkpoint(checkpoint_dir, epoch)
            _, self.last_job = self.repo.push_to_hub(
                commit_message=f"Training in progress epoch {epoch}", blocking=False
            )
34 changes: 34 additions & 0 deletions src/transformers/modeling_tf_utils.py
@@ -18,6 +18,7 @@
import functools
import inspect
import os
import pickle
import re
import warnings
from typing import Dict, List, Optional, Union
@@ -753,6 +754,39 @@ def get_input_embeddings(self) -> tf.keras.layers.Layer:
        else:
            raise NotImplementedError

    def _save_checkpoint(self, checkpoint_dir, epoch):
        if not os.path.isdir(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        # We avoid tf.train.checkpoint or saving weights in TF format, even though that includes optimizer
        # state for us, because it requires special handling for objects like custom losses, which we use
        # internally and which users are likely to use too
        weights_path = os.path.join(checkpoint_dir, "weights.h5")
        self.save_weights(weights_path)
        extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()}
        extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle")
        with open(extra_data_path, "wb") as f:
            pickle.dump(extra_data, f)

    def load_repo_checkpoint(self, repo_path_or_name, organization=None):
        if getattr(self, "optimizer", None) is None:
            raise RuntimeError(
                "Checkpoint loading failed as no optimizer is attached to the model. "
                "This is most likely caused by the model not being compiled."
            )
        repo = self._create_or_get_repo(repo_path_or_name, organization=organization)
Collaborator: I think we said to use Repository here to avoid creating a new repo?

Member (author): I tried this, but it resulted in some code duplication in order to make everything work - I kept it as-is but added a comment to explain instead. If you'd still prefer to avoid the call, let me know!

Collaborator: Let's see what @LysandreJik thinks then.

Member (@LysandreJik, Nov 24, 2021): Are you sure it won't actually create a remote repository if it doesn't exist? If you're certain of it, it's fine for me to go with self._create_or_get_repo, but I'm quite unsure that it won't create a remote repo.

Also, which method are you using? Is it this method here?

    def _create_or_get_repo(
        cls,
        repo_path_or_name: Optional[str] = None,
        repo_url: Optional[str] = None,
        organization: Optional[str] = None,
        private: bool = None,
        use_auth_token: Optional[Union[bool, str]] = None,
    ) -> Repository:
        if repo_path_or_name is None and repo_url is None:
            raise ValueError("You need to specify a `repo_path_or_name` or a `repo_url`.")
        if use_auth_token is None and repo_url is None:
            use_auth_token = True
        if repo_path_or_name is None:
            repo_path_or_name = repo_url.split("/")[-1]
        if repo_url is None and not os.path.exists(repo_path_or_name):
            repo_name = Path(repo_path_or_name).name
            repo_url = cls._get_repo_url_from_name(
                repo_name, organization=organization, private=private, use_auth_token=use_auth_token
            )
        # Create a working directory if it does not exist.
        if not os.path.exists(repo_path_or_name):
            os.makedirs(repo_path_or_name)
        repo = Repository(repo_path_or_name, clone_from=repo_url, use_auth_token=use_auth_token)
        repo.git_pull()
        return repo

Because if so, don't call it with positional arguments like you do here, as you're passing the organization parameter as repo_url.
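To illustrate the positional-argument hazard flagged in this comment, here is a tiny standalone sketch; `create_or_get_repo` is a hypothetical stand-in that only mirrors the first three parameters of `_create_or_get_repo`:

```python
# Hypothetical stand-in with the same parameter order as _create_or_get_repo.
def create_or_get_repo(repo_path_or_name=None, repo_url=None, organization=None):
    return {"repo_path_or_name": repo_path_or_name, "repo_url": repo_url, "organization": organization}

# Positional call as in the diff: the organization value lands in the repo_url slot.
bad = create_or_get_repo("my-model", "my-org")
print(bad["repo_url"], bad["organization"])  # prints: my-org None

# Keyword call puts it where intended.
good = create_or_get_repo("my-model", organization="my-org")
print(good["repo_url"], good["organization"])  # prints: None my-org
```

This is exactly why keyword arguments are the safer choice for signatures with several optional parameters of the same type.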

Member (author): Good spot on the organization positional arg, fixed!

Also, the _create_or_get_repo method just does this, after sanity-checking the arguments:

repo = Repository(repo_path_or_name, clone_from=repo_url, use_auth_token=use_auth_token)
repo.git_pull()
return repo

So I think as long as we don't push it afterwards, we won't create a new remote repo if one doesn't exist already.

Collaborator: I think this discussion is real proof that we need to just put the code used here and not try to use some magic of the Mixin. The library always prefers an explicit approach at the cost of duplicate code, and I think this is an instance where we should just apply the code needed.

The Mixin might also change in the future, following the RFC on huggingface_hub, so those private methods might actually introduce some behavior we don't want later.

Member: And the Repository(repo_path_or_name, clone_from=repo_url, use_auth_token=use_auth_token) line actually creates the repo if you try to clone it and it doesn't exist:

https://github.com/huggingface/huggingface_hub/blob/10c69146969ad7f9e1add075c1ef4ec15e42e85f/src/huggingface_hub/repository.py#L542-L549

Member: The push event is unrelated to the creation of the repository: no push to a remote repository can be done if the repository does not exist. The repository is created at creation time, not at update (push) time.

Member (author): Ah, I'm sorry, I didn't realize that's how it worked! And yes, you're right, this is probably a sign that we need to be explicit. I'll try to figure out some code that just checks without accidentally creating a new repo.

        checkpoint_dir = os.path.join(repo.local_dir, "checkpoint")
        weights_file = os.path.join(checkpoint_dir, "weights.h5")
        if not os.path.isfile(weights_file):
            raise FileNotFoundError(f"Could not find checkpoint file weights.h5 in repo {repo_path_or_name}!")
        extra_data_file = os.path.join(checkpoint_dir, "extra_data.pickle")
        if not os.path.isfile(extra_data_file):
            raise FileNotFoundError(f"Could not find checkpoint file extra_data.pickle in repo {repo_path_or_name}!")
        self.load_weights(weights_file)
        with open(extra_data_file, "rb") as f:
            extra_data = pickle.load(f)
        self.optimizer.set_weights(extra_data["optimizer_state"])
        return {"epoch": extra_data["epoch"]}

    def compile(
        self,
        optimizer="rmsprop",
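The checkpoint format introduced in modeling_tf_utils.py above is just a weights file plus a pickled dict of extra data. A minimal framework-free sketch of the same save/load round-trip (hypothetical helper names; a plain pickle of the weights stands in for `self.save_weights` and `self.load_weights`):

```python
import os
import pickle
import tempfile

def save_checkpoint(checkpoint_dir, weights, epoch, optimizer_state):
    # Same layout as _save_checkpoint: weights.h5 plus extra_data.pickle.
    os.makedirs(checkpoint_dir, exist_ok=True)
    with open(os.path.join(checkpoint_dir, "weights.h5"), "wb") as f:
        pickle.dump(weights, f)  # stand-in for self.save_weights(weights_path)
    extra_data = {"epoch": epoch, "optimizer_state": optimizer_state}
    with open(os.path.join(checkpoint_dir, "extra_data.pickle"), "wb") as f:
        pickle.dump(extra_data, f)

def load_checkpoint(checkpoint_dir):
    # Mirrors load_repo_checkpoint: fail loudly if either file is missing.
    weights_file = os.path.join(checkpoint_dir, "weights.h5")
    extra_data_file = os.path.join(checkpoint_dir, "extra_data.pickle")
    for path in (weights_file, extra_data_file):
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Could not find checkpoint file {path}!")
    with open(weights_file, "rb") as f:
        weights = pickle.load(f)  # stand-in for self.load_weights(weights_file)
    with open(extra_data_file, "rb") as f:
        extra_data = pickle.load(f)
    return weights, extra_data

with tempfile.TemporaryDirectory() as tmp:
    ckpt = os.path.join(tmp, "checkpoint")
    save_checkpoint(ckpt, weights=[1.0, 2.0], epoch=3, optimizer_state=[0.5])
    weights, extra = load_checkpoint(ckpt)
    print(weights, extra["epoch"])  # prints: [1.0, 2.0] 3
```

As in the PR, the training epoch and optimizer state travel alongside the weights so that training can resume where it left off; the real implementation stores TF optimizer slot variables from `self.optimizer.get_weights()` in place of the toy list used here.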