From 3bcbb9158ab1966072a62fa78c07c6ac93fe97df Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Tue, 17 Mar 2020 13:38:30 +0700 Subject: [PATCH 1/4] dvc: add Repo._skip_graph_checks flag Will skip modified graph checks on add, run and import. The aim is to avoid collecting stages and building graph on these commands. The skip_checks flag is aimed to be private, but let people use dvc on big repos without monkey patching or writing yaml files themselves. --- dvc/repo/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index e7db956463..31143f87ca 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -174,7 +174,8 @@ def _ignore(self): def check_modified_graph(self, new_stages): """Generate graph including the new stage to check for errors""" - self._collect_graph(self.stages + new_stages) + if not getattr(self, "_skip_graph_checks", False): + self._collect_graph(self.stages + new_stages) def collect(self, target, with_deps=False, recursive=False, graph=None): import networkx as nx From e43743cc78cfffcf9a6787c84242a2161fd519c1 Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Mon, 23 Mar 2020 12:21:15 +0700 Subject: [PATCH 2/4] dvc: explain skip_graphs_checks flag in a comment --- dvc/repo/__init__.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 31143f87ca..623e038c19 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -174,6 +174,15 @@ def _ignore(self): def check_modified_graph(self, new_stages): """Generate graph including the new stage to check for errors""" + # Building graph might be costly for ones with many stages, + # we provide this non-documented hack to skip this for add and run. + # Can be used as: + # + # repo = Repo(...) + # repo._skip_graph_checks = True + # repo.add(...) + # + # A user should care about not duplicating outs and not adding cycles. if not getattr(self, "_skip_graph_checks", False): self._collect_graph(self.stages + new_stages) From 080548038e7863cf078b6482513d17f71f6af0ae Mon Sep 17 00:00:00 2001 From: Alexander Schepanovski Date: Mon, 23 Mar 2020 12:21:52 +0700 Subject: [PATCH 3/4] test: that skip_graph_checks works --- tests/unit/repo/test_repo.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/unit/repo/test_repo.py b/tests/unit/repo/test_repo.py index 4638ff0113..e957eeea27 100644 --- a/tests/unit/repo/test_repo.py +++ b/tests/unit/repo/test_repo.py @@ -82,3 +82,18 @@ def test_collect_optimization(tmp_dir, dvc, mocker): # Should read stage directly instead of collecting the whole graph dvc.collect(stage.path) dvc.collect_granular(stage.path) + + +def test_skip_graph_checks(tmp_dir, dvc, mocker, run_copy): + (stage,) = tmp_dir.dvc_gen("foo", "foo text") + + # Error out on graph collection and raise a skip graph checks flag + mocker.patch( + "dvc.repo.Repo._collect_graph", + side_effect=Exception("Should not collect"), + ) + dvc._skip_graph_checks = True + + tmp_dir.gen("foo", "foo text") + dvc.add("foo") + run_copy("foo", "bar") From 1c63a3cc38c8e79546aa55dc850854d523fae4b7 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Tue, 24 Mar 2020 00:50:31 +0200 Subject: [PATCH 4/4] repo: improve hack description and test --- dvc/repo/__init__.py | 11 +++++++---- tests/unit/repo/test_repo.py | 28 ++++++++++++++++++++-------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 623e038c19..e7f2b4dfd3 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -174,15 +174,18 @@ def _ignore(self): def check_modified_graph(self, new_stages): """Generate graph including the new stage to check for errors""" - # Building graph might be costly for ones with many stages, - # we provide this non-documented hack to skip this for add and run. - # Can be used as: + # Building graph might be costly for the ones with many DVC-files, + # so we provide this undocumented hack to skip it. See [1] for + # more details. The hack can be used as: # # repo = Repo(...) # repo._skip_graph_checks = True # repo.add(...) # - # A user should care about not duplicating outs and not adding cycles. + # A user should care about not duplicating outs and not adding cycles, + # otherwise DVC might have an undefined behaviour. + # + # [1] https://github.com/iterative/dvc/issues/2671 if not getattr(self, "_skip_graph_checks", False): self._collect_graph(self.stages + new_stages) diff --git a/tests/unit/repo/test_repo.py b/tests/unit/repo/test_repo.py index e957eeea27..985c113ef1 100644 --- a/tests/unit/repo/test_repo.py +++ b/tests/unit/repo/test_repo.py @@ -85,15 +85,27 @@ def test_collect_optimization(tmp_dir, dvc, mocker): def test_skip_graph_checks(tmp_dir, dvc, mocker, run_copy): - (stage,) = tmp_dir.dvc_gen("foo", "foo text") - - # Error out on graph collection and raise a skip graph checks flag - mocker.patch( - "dvc.repo.Repo._collect_graph", - side_effect=Exception("Should not collect"), - ) - dvc._skip_graph_checks = True + # See https://github.com/iterative/dvc/issues/2671 for more info + mock_collect_graph = mocker.patch("dvc.repo.Repo._collect_graph") + # sanity check tmp_dir.gen("foo", "foo text") dvc.add("foo") run_copy("foo", "bar") + assert mock_collect_graph.called + + # check that our hack can be enabled + mock_collect_graph.reset_mock() + dvc._skip_graph_checks = True + tmp_dir.gen("baz", "baz text") + dvc.add("baz") + run_copy("baz", "qux") + assert not mock_collect_graph.called + + # check that our hack can be disabled + mock_collect_graph.reset_mock() + dvc._skip_graph_checks = False + tmp_dir.gen("quux", "quux text") + dvc.add("quux") + run_copy("quux", "quuz") + assert mock_collect_graph.called