From f5d67f120e57e19ccce4a0af758f32535fe2bee0 Mon Sep 17 00:00:00 2001
From: dberenbaum <dave@iterative.ai>
Date: Thu, 9 Nov 2023 17:59:56 -0500
Subject: [PATCH] dvclive tracker

---
 docs/source/usage_guides/tracking.md          |  3 +-
 .../deepspeed_with_config_support.py          |  2 +-
 .../by_feature/megatron_lm_gpt_pretraining.py |  2 +-
 src/accelerate/test_utils/testing.py          |  8 +++
 src/accelerate/tracking.py                    | 70 +++++++++++++++++++
 src/accelerate/utils/__init__.py              |  1 +
 src/accelerate/utils/dataclasses.py           |  2 +
 src/accelerate/utils/imports.py               |  4 ++
 tests/test_tracking.py                        | 54 +++++++++++++-
 9 files changed, 142 insertions(+), 4 deletions(-)

diff --git a/docs/source/usage_guides/tracking.md b/docs/source/usage_guides/tracking.md
index 141fea6924b..dba4b084d5d 100644
--- a/docs/source/usage_guides/tracking.md
+++ b/docs/source/usage_guides/tracking.md
@@ -20,7 +20,7 @@ There are a large number of experiment tracking API's available, however getting
 
 ## Integrated Trackers
 
-Currently `Accelerate` supports six trackers out-of-the-box:
+Currently `Accelerate` supports seven trackers out-of-the-box:
 
 - TensorBoard
 - WandB
@@ -28,6 +28,7 @@ Currently `Accelerate` supports six trackers out-of-the-box:
 - Aim
 - MLFlow
 - ClearML
+- DVCLive
 
 To use any of them, pass in the selected type(s) to the `log_with` parameter in [`Accelerate`]:
 ```python
diff --git a/examples/by_feature/deepspeed_with_config_support.py b/examples/by_feature/deepspeed_with_config_support.py
index 15e810c4a2e..b5f122f3ad1 100755
--- a/examples/by_feature/deepspeed_with_config_support.py
+++ b/examples/by_feature/deepspeed_with_config_support.py
@@ -220,7 +220,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/examples/by_feature/megatron_lm_gpt_pretraining.py b/examples/by_feature/megatron_lm_gpt_pretraining.py
index 3c048b2600e..b0e1b33700f 100644
--- a/examples/by_feature/megatron_lm_gpt_pretraining.py
+++ b/examples/by_feature/megatron_lm_gpt_pretraining.py
@@ -216,7 +216,7 @@ def parse_args():
         default="all",
         help=(
             'The integration to report the results and logs to. Supported platforms are `"tensorboard"`,'
-            ' `"wandb"` and `"comet_ml"`. Use `"all"` (default) to report to all integrations.'
+            ' `"wandb"`, `"comet_ml"`, and `"dvclive"`. Use `"all"` (default) to report to all integrations.'
             "Only applicable when `--with_tracking` is passed."
         ),
     )
diff --git a/src/accelerate/test_utils/testing.py b/src/accelerate/test_utils/testing.py
index d6d1e2f2f0a..8a8b82f4e34 100644
--- a/src/accelerate/test_utils/testing.py
+++ b/src/accelerate/test_utils/testing.py
@@ -35,6 +35,7 @@
     is_comet_ml_available,
     is_datasets_available,
     is_deepspeed_available,
+    is_dvclive_available,
     is_mps_available,
     is_pandas_available,
     is_tensorboard_available,
@@ -231,6 +232,13 @@ def require_clearml(test_case):
     return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case)
 
 
+def require_dvclive(test_case):
+    """
+    Decorator marking a test that requires dvclive. The test is skipped unless `is_dvclive_available()` is true.
+    """
+    return unittest.skipUnless(is_dvclive_available(), "test requires dvclive")(test_case)
+
+
 def require_pandas(test_case):
     """
     Decorator marking a test that requires pandas installed. These tests are skipped when pandas isn't installed
diff --git a/src/accelerate/tracking.py b/src/accelerate/tracking.py
index 4f536d57812..cfacbfb1161 100644
--- a/src/accelerate/tracking.py
+++ b/src/accelerate/tracking.py
@@ -30,6 +30,7 @@
     is_aim_available,
     is_clearml_available,
     is_comet_ml_available,
+    is_dvclive_available,
     is_mlflow_available,
     is_tensorboard_available,
     is_wandb_available,
@@ -57,6 +58,9 @@
 if is_clearml_available():
     _available_trackers.append(LoggerType.CLEARML)
 
+if is_dvclive_available():
+    _available_trackers.append(LoggerType.DVCLIVE)
+
 logger = get_logger(__name__)
 
 
@@ -837,6 +841,70 @@ def _get_title_series(name):
         return name, "train"
 
 
+class DVCLiveTracker(GeneralTracker):
+    """
+    A `Tracker` class that supports `dvclive`. Should be initialized at the start of your script.
+
+    Args:
+        run_name (`str`, *optional*):
+            Ignored for dvclive. See `kwargs` instead.
+        live (`dvclive.Live`, *optional*):
+            An existing `dvclive.Live` instance to use. If provided, `kwargs` are ignored.
+        kwargs:
+            Additional key word arguments passed along to `dvclive.Live()` when `live` is not provided.
+    """
+
+    name = "dvclive"
+    requires_logging_directory = False
+
+    @on_main_process
+    def __init__(self, run_name: Optional[str] = None, live: Optional[Any] = None, **kwargs):
+        from dvclive import Live
+
+        super().__init__()
+        self.live = live if live is not None else Live(**kwargs)
+
+    @property
+    def tracker(self):
+        return self.live
+
+    @on_main_process
+    def store_init_configuration(self, values: dict):
+        """
+        Logs `values` as hyperparameters for the run. Should be run at the beginning of your experiment.
+
+        Args:
+            values (Dictionary `str` to `bool`, `str`, `float`, `int`, or a List or Dict of those types):
+                Values to be stored as initial hyperparameters, as key-value pairs in a yaml file.
+        """
+        self.live.log_params(values)
+
+    @on_main_process
+    def log(self, values: dict, step: Optional[int] = None, **kwargs):
+        """
+        Logs `values` to the current run.
+
+        Args:
+            values (Dictionary `str` to `str`, `float`, or `int`):
+                Values to be logged as key-value pairs. The values need to have type `str`, `float`, or `int`.
+            step (`int`, *optional*):
+                The run step. If included (`0` is a valid step), the log will be affiliated with this step.
+            kwargs:
+                Additional key word arguments passed along to `dvclive.Live.log_metric()`.
+        """
+        if step is not None:  # compare against `None`: a truthiness check would silently drop step 0
+            self.live.step = step
+        for k, v in values.items():
+            self.live.log_metric(k, v, **kwargs)
+
+    @on_main_process
+    def finish(self):
+        """
+        Closes `dvclive.Live()`.
+        """
+        self.live.end()
+
+
 LOGGER_TYPE_TO_CLASS = {
     "aim": AimTracker,
     "comet_ml": CometMLTracker,
@@ -844,6 +912,7 @@ def _get_title_series(name):
     "tensorboard": TensorBoardTracker,
     "wandb": WandBTracker,
     "clearml": ClearMLTracker,
+    "dvclive": DVCLiveTracker,
 }
 
 
@@ -866,6 +935,7 @@ def filter_trackers(
             - `"wandb"`
             - `"comet_ml"`
             - `"mlflow"`
+            - `"dvclive"`
             If `"all"` is selected, will pick up all available trackers in the environment and initialize them. Can
             also accept implementations of `GeneralTracker` for custom trackers, and can be combined with `"all"`.
         logging_dir (`str`, `os.PathLike`, *optional*):
diff --git a/src/accelerate/utils/__init__.py b/src/accelerate/utils/__init__.py
index 96e3fe61035..e3f4c8997d6 100644
--- a/src/accelerate/utils/__init__.py
+++ b/src/accelerate/utils/__init__.py
@@ -52,6 +52,7 @@
     is_cuda_available,
     is_datasets_available,
     is_deepspeed_available,
+    is_dvclive_available,
     is_fp8_available,
     is_ipex_available,
     is_megatron_lm_available,
diff --git a/src/accelerate/utils/dataclasses.py b/src/accelerate/utils/dataclasses.py
index 72f3c9aeb2d..e0e41568b0c 100644
--- a/src/accelerate/utils/dataclasses.py
+++ b/src/accelerate/utils/dataclasses.py
@@ -340,6 +340,7 @@ class LoggerType(BaseEnum):
         - **TENSORBOARD** -- TensorBoard as an experiment tracker
         - **WANDB** -- wandb as an experiment tracker
         - **COMETML** -- comet_ml as an experiment tracker
+        - **DVCLIVE** -- dvclive as an experiment tracker
     """
 
     ALL = "all"
@@ -349,6 +350,7 @@ class LoggerType(BaseEnum):
     COMETML = "comet_ml"
     MLFLOW = "mlflow"
     CLEARML = "clearml"
+    DVCLIVE = "dvclive"
 
 
 class PrecisionType(BaseEnum):
diff --git a/src/accelerate/utils/imports.py b/src/accelerate/utils/imports.py
index 9a60233c96c..27389eab107 100644
--- a/src/accelerate/utils/imports.py
+++ b/src/accelerate/utils/imports.py
@@ -297,3 +297,7 @@ def is_xpu_available(check_device=False):
         except RuntimeError:
             return False
     return hasattr(torch, "xpu") and torch.xpu.is_available()
+
+
+def is_dvclive_available():  # gates the optional DVCLive tracker and the `require_dvclive` test decorator
+    return _is_package_available("dvclive")
diff --git a/tests/test_tracking.py b/tests/test_tracking.py
index 545b51fefd4..73a5049c81e 100644
--- a/tests/test_tracking.py
+++ b/tests/test_tracking.py
@@ -35,13 +35,19 @@
     TempDirTestCase,
     require_clearml,
     require_comet_ml,
+    require_dvclive,
     require_pandas,
     require_tensorboard,
     require_wandb,
     skip,
 )
 from accelerate.tracking import CometMLTracker, GeneralTracker
-from accelerate.utils import ProjectConfiguration, is_comet_ml_available, is_tensorboard_available
+from accelerate.utils import (
+    ProjectConfiguration,
+    is_comet_ml_available,
+    is_dvclive_available,
+    is_tensorboard_available,
+)
 
 
 if is_comet_ml_available():
@@ -52,6 +58,11 @@
 
     import tensorboard.compat.proto.event_pb2 as event_pb2
 
+if is_dvclive_available():
+    from dvclive.plots.metric import Metric
+    from dvclive.serialize import load_yaml
+    from dvclive.utils import parse_metrics
+
 logger = logging.getLogger(__name__)
 
 
@@ -473,3 +484,44 @@ def test_log(self):
                     "some_string": "",
                 }
                 self.assertDictEqual(data, truth)
+
+
+@require_dvclive
+class DVCLiveTrackingTest(unittest.TestCase):
+    def test_init_trackers(self):
+        with mock.patch("dvclive.live.get_dvc_repo") as repo_mock:
+            repo_mock.return_value = None
+            project_name = "test_project_with_config"
+            with tempfile.TemporaryDirectory() as dirpath:
+                accelerator = Accelerator(log_with="dvclive")
+                config = {
+                    "num_iterations": 12,
+                    "learning_rate": 1e-2,
+                    "some_boolean": False,
+                    "some_string": "some_value",
+                }
+                init_kwargs = {"dvclive": {"dir": dirpath, "save_dvc_exp": False, "dvcyaml": None}}
+                accelerator.init_trackers(project_name, config, init_kwargs)
+                accelerator.end_training()
+                live = accelerator.trackers[0].live
+                params = load_yaml(live.params_file)
+                assert params == config
+
+    def test_log(self):
+        with mock.patch("dvclive.live.get_dvc_repo") as repo_mock:
+            repo_mock.return_value = None
+            with tempfile.TemporaryDirectory() as dirpath:
+                accelerator = Accelerator(log_with="dvclive", project_dir=dirpath)
+                init_kwargs = {"dvclive": {"dir": dirpath, "save_dvc_exp": False, "dvcyaml": None}}
+                accelerator.init_trackers("test_project_with_log", init_kwargs=init_kwargs)
+                values = {"total_loss": 0.1, "iteration": 1, "my_text": "some_value"}
+                accelerator.log(values, step=0)
+                accelerator.end_training()
+                live = accelerator.trackers[0].live
+                logs, latest = parse_metrics(live)
+                # dvclive may add its own `step` entry to the summary; compare only the metrics that were logged
+                assert {k: v for k, v in latest.items() if k != "step"} == values
+                scalars = os.path.join(live.plots_dir, Metric.subfolder)
+                assert os.path.join(scalars, "total_loss.tsv") in logs
+                assert os.path.join(scalars, "iteration.tsv") in logs
+                assert os.path.join(scalars, "my_text.tsv") in logs