Move saving checkpoints from model to trainer #262

Merged · 7 commits · Jun 21, 2024
Changes from 5 commits
34 changes: 21 additions & 13 deletions docs/src/dev-docs/new-architecture.rst
@@ -16,13 +16,13 @@ to these lines
hypers = {}
dataset_info = DatasetInfo()

if "continue_from":
model = Model.load_checkpoint("path")
if continue_from is not None:
trainer = Trainer.load_checkpoint(continue_from, hypers["training"])
model = Model.load_checkpoint(continue_from)
model = model.restart(dataset_info)
else:
model = Model(hypers["architecture"], dataset_info)

trainer = Trainer(hypers["training"])
model = Model(hypers["model"], dataset_info)
trainer = Trainer(hypers["training"])

trainer.train(
model=model,
@@ -56,9 +56,8 @@ In order to follow this, a new architecture has to define two classes
val_datasets passed to the Trainer, as well as the dataset_info passed to the
model.

The ``ModelInterface`` is the main model class and must implement a
``save_checkpoint()``, ``load_checkpoint()`` as well as a ``restart()`` and
``export()`` method.
The ``ModelInterface`` is the main model class and must implement the
``load_checkpoint()``, ``restart()`` and ``export()`` methods.
Contributor:
load_checkpoint is also a trainer method now.

Collaborator (Author):
Yes, and it's documented in the TrainerInterface just below.


.. code-block:: python

@@ -71,9 +70,6 @@ The ``ModelInterface`` is the main model class and must implement a
self.hypers = model_hypers
self.dataset_info = dataset_info

def save_checkpoint(self, path: Union[str, Path]):
pass

@classmethod
def load_checkpoint(cls, path: Union[str, Path]) -> "ModelInterface":
pass
Comment on lines 73 to 75

Contributor:
Suggested change (remove these lines):

    @classmethod
    def load_checkpoint(cls, path: Union[str, Path]) -> "ModelInterface":
        pass

Collaborator (Author):
This is still present for all architectures (we need a model.load_checkpoint for export, where there is no Trainer).

Contributor:
But then I think we should also provide a save_checkpoint for the model, for consistency. That basically means this PR adds a save_checkpoint() and a load_checkpoint() for the trainer.

Collaborator (Author):
I don't think so. We should only have what's necessary. If people want to have it (and call it inside Trainer.save_checkpoint) then it's up to them.

Contributor:
Yeah, makes sense.
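
For reference, a minimal sketch of that optional pattern (purely illustrative: the class names are made up, and metatrain itself only requires the trainer-level methods):

    from pathlib import Path
    from typing import Union

    import torch


    class MyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.hypers = {}

        def save_checkpoint(self, path: Union[str, Path]) -> None:
            # Optional model-level checkpointing, not required by metatrain
            torch.save(
                {"model_hypers": self.hypers, "model_state_dict": self.state_dict()},
                path,
            )


    class MyTrainer:
        def save_checkpoint(self, model: MyModel, path: Union[str, Path]) -> None:
            # The required trainer-level entry point simply delegates to the model
            model.save_checkpoint(path)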

@@ -105,8 +101,8 @@ a helper function :py:func:`metatrain.utils.export.export` to export a torch
model to an :py:class:`MetatensorAtomisticModel
<metatensor.torch.atomistic.MetatensorAtomisticModel>`.

The ``TrainerInterface`` class should have the following signature with a required
methods for ``train()``.
The ``TrainerInterface`` class should have the following signature with required
methods for ``train()``, ``save_checkpoint()`` and ``load_checkpoint()``.

.. code-block:: python

@@ -123,6 +119,18 @@ methods for ``train()``.
checkpoint_dir: str,
) -> None: ...

def save_checkpoint(self, path: Union[str, Path]) -> None: ...

@classmethod
def load_checkpoint(
cls, path: Union[str, Path], train_hypers: Dict
) -> "TrainerInterface":
pass

The format of checkpoints is not defined by `metatrain` and can be any format that
can be loaded by the trainer (to restart training) and by the model (to export the
checkpoint).
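
For illustration, one possible layout is a single ``torch.save`` dictionary holding
everything needed to restart training and to rebuild the model (a sketch only; the
helper and keys below are illustrative, not part of the required interface):

.. code-block:: python

    import torch

    def save_checkpoint(trainer, model, path):
        # Everything the trainer needs to resume (hypers, optimizer state, epoch)
        # plus everything the model needs to be rebuilt for export.
        torch.save(
            {
                "model_hypers": model.hypers,
                "model_state_dict": model.state_dict(),
                "train_hypers": trainer.hypers,
                "optimizer_state_dict": trainer.optimizer_state_dict,
                "epoch": trainer.epoch,
            },
            path,
        )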

The names of the ``ModelInterface`` and the ``TrainerInterface`` are free to choose but
should be linked to constants in the ``__init__.py`` of each architecture. On top of
these two constants the ``__init__.py`` must contain constants for the original
5 changes: 4 additions & 1 deletion src/metatrain/cli/export.py
@@ -80,6 +80,9 @@ def export_model(model: Any, output: Union[Path, str] = "exported-model.pt") ->
torch.jit.save(model, path)
else:
extensions_path = "extensions/"
logger.info(f"Exporting model to {path} and extensions to {extensions_path}")
logger.info(
f"Exporting model to `{path}` and extensions to `{extensions_path}`"
Contributor:
I think I usually prefer quotes for paths and backticks for variables. But feel free.

Suggested change:
    f"Exporting model to `{path}` and extensions to `{extensions_path}`"
    f"Exporting model to {path!r} and extensions to {extensions_path!r}"

Collaborator (Author):
Are you sure that !r isn't going to print some other stuff if path and/or extensions_path are Path objects?

Contributor:
Ahh, probably you are right. But then I would go for single quotes instead of backticks.
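
A quick check of what each option would print (a small sketch; assumes a POSIX system, where the repr of a pathlib.Path includes the class name):

    from pathlib import Path

    path = Path("exported-model.pt")

    # !r uses repr(), which for pathlib objects includes the class name:
    print(f"Exporting model to {path!r}")  # Exporting model to PosixPath('exported-model.pt')

    # Plain formatting with explicit quotes keeps just the path text:
    print(f"Exporting model to '{path}'")  # Exporting model to 'exported-model.pt'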

)
mts_atomistic_model = model.export()
mts_atomistic_model.export(path, collect_extensions=extensions_path)
logger.info("Model exported successfully")
11 changes: 6 additions & 5 deletions src/metatrain/cli/train.py
@@ -371,20 +371,21 @@ def train_model(
try:
if continue_from is not None:
logger.info(f"Loading checkpoint from `{continue_from}`")
trainer = Trainer.load_checkpoint(continue_from, hypers["training"])
model = Model.load_checkpoint(continue_from)
model = model.restart(dataset_info)
else:
model = Model(hypers["model"], dataset_info)
trainer = Trainer(hypers["training"])
except Exception as e:
raise ArchitectureError(e)

###########################
# TRAIN MODEL #############
###########################

logger.info("Start training")
logger.info("Calling trainer")
try:
trainer = Trainer(hypers["training"])
trainer.train(
model=model,
devices=devices,
@@ -405,18 +406,18 @@ def train_model(
output_checked = check_suffix(filename=output, suffix=".pt")
logger.info(
"Training finished, saving final checkpoint "
f"to {str(Path(output_checked).stem)}.ckpt"
f"to `{str(Path(output_checked).stem)}.ckpt`"
)
try:
model.save_checkpoint(f"{Path(output_checked).stem}.ckpt")
trainer.save_checkpoint(model, f"{Path(output_checked).stem}.ckpt")
except Exception as e:
raise ArchitectureError(e)

mts_atomistic_model = model.export()
extensions_path = "extensions/"

logger.info(
f"Exporting model to {output_checked} and extensions to {extensions_path}"
f"Exporting model to `{output_checked}` and extensions to `{extensions_path}`"
)
mts_atomistic_model.export(str(output_checked), collect_extensions=extensions_path)

25 changes: 6 additions & 19 deletions src/metatrain/experimental/alchemical_model/model.py
@@ -15,7 +15,6 @@
from ...utils.data.dataset import DatasetInfo
from ...utils.dtype import dtype_to_str
from ...utils.export import export
from ...utils.io import check_suffix
from .utils import systems_to_torch_alchemical_batch


@@ -126,29 +125,17 @@ def forward(
)
return total_energies

def save_checkpoint(self, path: Union[str, Path]):
torch.save(
{
"model_hypers": {
"model_hypers": self.hypers,
"dataset_info": self.dataset_info,
},
"model_state_dict": self.state_dict(),
},
check_suffix(path, ".ckpt"),
)

@classmethod
def load_checkpoint(cls, path: Union[str, Path]) -> "AlchemicalModel":

# Load the model and the metadata
model_dict = torch.load(path)
# Load the checkpoint
checkpoint = torch.load(path)
model_hypers = checkpoint["model_hypers"]
model_state_dict = checkpoint["model_state_dict"]

# Create the model
model = cls(**model_dict["model_hypers"])

# Load the model weights
model.load_state_dict(model_dict["model_state_dict"])
model = cls(**model_hypers)
model.load_state_dict(model_state_dict)

return model

61 changes: 58 additions & 3 deletions src/metatrain/experimental/alchemical_model/trainer.py
@@ -16,6 +16,7 @@
)
from ...utils.evaluate_model import evaluate_model
from ...utils.external_naming import to_external_name
from ...utils.io import check_suffix
from ...utils.logging import MetricLogger
from ...utils.loss import TensorMapDictLoss
from ...utils.metrics import RMSEAccumulator
@@ -35,6 +36,9 @@
class Trainer:
def __init__(self, train_hypers):
self.hypers = train_hypers
self.optimizer_state_dict = None
self.scheduler_state_dict = None
self.epoch = None

def train(
self,
@@ -178,6 +182,8 @@ def train(
optimizer = torch.optim.Adam(
model.parameters(), lr=self.hypers["learning_rate"]
)
if self.optimizer_state_dict is not None:
optimizer.load_state_dict(self.optimizer_state_dict)

# Create a scheduler:
lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
@@ -186,6 +192,8 @@ def train(
factor=self.hypers["scheduler_factor"],
patience=self.hypers["scheduler_patience"],
)
if self.scheduler_state_dict is not None:
lr_scheduler.load_state_dict(self.scheduler_state_dict)

# counters for early stopping:
best_val_loss = float("inf")
@@ -194,9 +202,11 @@ def train(
# per-atom targets:
per_structure_targets = self.hypers["per_structure_targets"]

start_epoch = 0 if self.epoch is None else self.epoch + 1

# Train the model:
logger.info("Starting training")
for epoch in range(self.hypers["num_epochs"]):
for epoch in range(start_epoch, start_epoch + self.hypers["num_epochs"]):
train_rmse_calculator = RMSEAccumulator()
val_rmse_calculator = RMSEAccumulator()

@@ -279,7 +289,7 @@ def train(
**finalized_val_info,
}

if epoch == 0:
if epoch == start_epoch:
metric_logger = MetricLogger(
logobj=logger,
dataset_info=model.dataset_info,
@@ -293,7 +303,12 @@ def train(
)

if epoch % self.hypers["checkpoint_interval"] == 0:
model.save_checkpoint(Path(checkpoint_dir) / f"model_{epoch}.ckpt")
self.optimizer_state_dict = optimizer.state_dict()
self.scheduler_state_dict = lr_scheduler.state_dict()
self.epoch = epoch
self.save_checkpoint(
model, Path(checkpoint_dir) / f"model_{epoch}.ckpt"
)

# early stopping criterion:
if val_loss < best_val_loss:
@@ -308,3 +323,43 @@ def train(
"without improvement."
)
break

def save_checkpoint(self, model, path: Union[str, Path]):
checkpoint = {
"model_hypers": {
"model_hypers": model.hypers,
"dataset_info": model.dataset_info,
},
"model_state_dict": model.state_dict(),
"train_hypers": self.hypers,
"epoch": self.epoch,
"optimizer_state_dict": self.optimizer_state_dict,
"scheduler_state_dict": self.scheduler_state_dict,
}
torch.save(
checkpoint,
check_suffix(path, ".ckpt"),
)

@classmethod
def load_checkpoint(cls, path: Union[str, Path], train_hypers) -> "Trainer":

# Load the checkpoint
checkpoint = torch.load(path)
model_hypers = checkpoint["model_hypers"]
model_state_dict = checkpoint["model_state_dict"]
epoch = checkpoint["epoch"]
optimizer_state_dict = checkpoint["optimizer_state_dict"]
scheduler_state_dict = checkpoint["scheduler_state_dict"]

# Create the trainer
trainer = cls(train_hypers)
trainer.optimizer_state_dict = optimizer_state_dict
trainer.scheduler_state_dict = scheduler_state_dict
trainer.epoch = epoch

# Create the model
model = AlchemicalModel(**model_hypers)
model.load_state_dict(model_state_dict)

return trainer
10 changes: 0 additions & 10 deletions src/metatrain/experimental/gap/model.py
@@ -1,4 +1,3 @@
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Type, Union

import metatensor.torch
@@ -197,15 +196,6 @@ def forward(
out_tensor = self.apply_composition_weights(systems, energies)
return {output_key: out_tensor}

def save_checkpoint(self, path: Union[str, Path]):
# GAP will not save checkpoints, as it does not allow
# restarting training
return

@classmethod
def load_checkpoint(cls, path: Union[str, Path]) -> "GAP":
raise ValueError("GAP does not allow restarting training")

def export(self) -> MetatensorAtomisticModel:
capabilities = ModelCapabilities(
outputs=self.outputs,
10 changes: 10 additions & 0 deletions src/metatrain/experimental/gap/trainer.py
@@ -1,4 +1,5 @@
import logging
from pathlib import Path
from typing import List, Union

import metatensor
@@ -140,3 +141,12 @@ def train(
model._subset_of_regressors_torch = (
model._subset_of_regressors.export_torch_script_model()
)

def save_checkpoint(self, model, checkpoint_dir: str):
# GAP won't save a checkpoint since it
# doesn't support restarting training
return

@classmethod
def load_checkpoint(cls, path: Union[str, Path], hypers_train) -> "GAP":
raise ValueError("GAP does not allow restarting training")
43 changes: 24 additions & 19 deletions src/metatrain/experimental/pet/model.py
@@ -12,13 +12,14 @@
NeighborListOptions,
System,
)
from pet.hypers import Hypers
from pet.pet import PET as RawPET
from pet.pet import SelfContributionsWrapper

from metatrain.utils.data import DatasetInfo

from ...utils.dtype import dtype_to_str
from ...utils.export import export
from ...utils.io import check_suffix
from .utils import systems_to_batch_dict


@@ -110,29 +111,33 @@ def forward(
output_quantities[output_name] = output_tmap
return output_quantities

def save_checkpoint(self, path: Union[str, Path]):
torch.save(
{
"model_hypers": {
"model_hypers": self.hypers,
"dataset_info": self.dataset_info,
},
"model_state_dict": self.state_dict(),
},
check_suffix(path, ".ckpt"),
)

@classmethod
def load_checkpoint(cls, path: Union[str, Path]) -> "PET":

# Load the model and the metadata
model_dict = torch.load(path)
checkpoint = torch.load(path)
hypers = checkpoint["hypers"]
dataset_info = checkpoint["dataset_info"]
model = cls(
model_hypers=hypers["ARCHITECTURAL_HYPERS"], dataset_info=dataset_info
)

checkpoint = torch.load(path)
state_dict = checkpoint["checkpoint"]["model_state_dict"]

ARCHITECTURAL_HYPERS = Hypers(model.hypers)
raw_pet = RawPET(ARCHITECTURAL_HYPERS, 0.0, len(model.atomic_types))

new_state_dict = {}
for name, value in state_dict.items():
name = name.replace("model.pet_model.", "")
new_state_dict[name] = value

raw_pet.load_state_dict(new_state_dict)

# Create the model
model = cls(**model_dict["model_hypers"])
self_contributions = checkpoint["self_contributions"]
wrapper = SelfContributionsWrapper(raw_pet, self_contributions)

# Load the model weights
model.load_state_dict(model_dict["model_state_dict"])
model.set_trained_model(wrapper)

return model
