From f949b319762468c38a8611b7a3b3e128d6a4e52d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 17:54:20 +0000 Subject: [PATCH 01/15] status: implement support for imported files Fixes #2959 --- dvc/repo/status.py | 4 ++-- dvc/stage.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dvc/repo/status.py b/dvc/repo/status.py index 2e9a7a9483..8f16725653 100644 --- a/dvc/repo/status.py +++ b/dvc/repo/status.py @@ -19,7 +19,7 @@ def _local_status(self, targets=None, with_deps=False): stages = self.collect(None, with_deps=with_deps) for stage in stages: - if stage.locked: + if stage.locked and not stage.is_repo_import: logger.warning( "DVC-file '{path}' is locked. Its dependencies are" " not going to be shown in the status output.".format( @@ -27,7 +27,7 @@ def _local_status(self, targets=None, with_deps=False): ) ) - status.update(stage.status()) + status.update(stage.status(for_status_command=True)) return status diff --git a/dvc/stage.py b/dvc/stage.py index 449cb261fd..85832720bc 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -1006,10 +1006,12 @@ def _status(entries): return ret @rwlocked(read=["deps", "outs"]) - def status(self): + def status(self, for_status_command=False): ret = [] - if not self.locked: + show_import = self.is_repo_import and for_status_command + + if not self.locked or show_import: deps_status = self._status(self.deps) if deps_status: ret.append({"changed deps": deps_status}) From c9b0593220bab3358d76f7f11f682947a1d1b26c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 18:51:56 +0000 Subject: [PATCH 02/15] status: enable status of files imported from remote git repos --- dvc/dependency/repo.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 57244bc4fd..8b1840b7cf 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -1,6 +1,7 @@ import copy import os from contextlib import contextmanager +import filecmp from funcy import merge @@ -10,6 +11,7 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import PathMissingError +from dvc.exceptions import NoOutputInExternalRepoError from dvc.utils.fs import fs_copy @@ -46,14 +48,29 @@ def _make_repo(self, **overrides): yield repo def status(self): - with self._make_repo() as repo: - current = repo.find_out_by_relpath(self.def_path).info + try: + with self._make_repo() as repo: + current = repo.find_out_by_relpath(self.def_path).info - with self._make_repo(rev_lock=None) as repo: - updated = repo.find_out_by_relpath(self.def_path).info + with self._make_repo(rev_lock=None) as repo: + updated = repo.find_out_by_relpath(self.def_path).info + + if current != updated: + return {str(self): "update available"} + + except NoOutputInExternalRepoError: + url = self.def_repo["url"] + old_clone = cached_clone(url, rev=self.def_repo.get("rev_lock")) + new_clone = cached_clone(url) + old_file = os.path.join(old_clone, self.def_path) + new_file = os.path.join(new_clone, self.def_path) + + if not os.path.exists(old_file) or not os.path.exists(new_file): + raise - if current != updated: - return {str(self): "update available"} + files_differ = not filecmp.cmp(old_file, new_file) + if files_differ: + return {str(self): "update available"} return {} From 8f464c2a071c8b9105957483a91a259f466ee0a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 19:00:46 +0000 Subject: [PATCH 03/15] status: don't clone more than once for plain files --- dvc/dependency/repo.py | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 8b1840b7cf..63553b4a4f 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -11,7 +11,6 @@ from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError from dvc.exceptions import PathMissingError -from dvc.exceptions import NoOutputInExternalRepoError from dvc.utils.fs import fs_copy @@ -48,29 +47,23 @@ def _make_repo(self, **overrides): yield repo def status(self): - try: - with self._make_repo() as repo: + with self._make_repo() as repo, self._make_repo( + rev_lock=None + ) as updated_repo: + try: current = repo.find_out_by_relpath(self.def_path).info - - with self._make_repo(rev_lock=None) as repo: - updated = repo.find_out_by_relpath(self.def_path).info - - if current != updated: - return {str(self): "update available"} - - except NoOutputInExternalRepoError: - url = self.def_repo["url"] - old_clone = cached_clone(url, rev=self.def_repo.get("rev_lock")) - new_clone = cached_clone(url) - old_file = os.path.join(old_clone, self.def_path) - new_file = os.path.join(new_clone, self.def_path) - - if not os.path.exists(old_file) or not os.path.exists(new_file): - raise - - files_differ = not filecmp.cmp(old_file, new_file) - if files_differ: - return {str(self): "update available"} + updated = updated_repo.find_out_by_relpath(self.def_path).info + + has_changed = current != updated + except OutputNotFoundError: + current_path = os.path.join(repo.root_dir, self.def_path) + updated_path = os.path.join( + updated_repo.root_dir, self.def_path + ) + has_changed = not filecmp.cmp(current_path, updated_path) + + if has_changed: + return {str(self): "update available"} return {} From cfbdccdbede9d3c8f4f0418565ed37efc4cc562b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 19:49:21 +0000 Subject: [PATCH 04/15] update test --- tests/func/test_update.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/func/test_update.py b/tests/func/test_update.py index 108e32b82e..96f70d8f3e 100644 --- a/tests/func/test_update.py +++ b/tests/func/test_update.py @@ -19,8 +19,13 @@ def test_update_import(tmp_dir, dvc, erepo_dir): # cli call, so we need to clean the caches to see the changes. clean_repos() - assert dvc.status([stage.path]) == {} + status, = dvc.status([stage.path])["version.dvc"] + changed_dep, = list(status["changed deps"].items()) + assert changed_dep[0].startswith("version ") + assert changed_dep[1] == "update available" + dvc.update(stage.path) + assert dvc.status([stage.path]) == {} assert imported.is_file() From e5d35cb3ce20ab1ea7d0b18ffec8df3417a0f375 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 20:32:46 +0000 Subject: [PATCH 05/15] reduce cognitive complexity of _local_status --- dvc/repo/status.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/dvc/repo/status.py b/dvc/repo/status.py index 8f16725653..85299d8ff2 100644 --- a/dvc/repo/status.py +++ b/dvc/repo/status.py @@ -10,14 +10,9 @@ logger = logging.getLogger(__name__) -def _local_status(self, targets=None, with_deps=False): +def _joint_status(stages): status = {} - if targets: - stages = cat(self.collect(t, with_deps=with_deps) for t in targets) - else: - stages = self.collect(None, with_deps=with_deps) - for stage in stages: if stage.locked and not stage.is_repo_import: logger.warning( @@ -32,6 +27,15 @@ def _local_status(self, targets=None, with_deps=False): return status +def _local_status(self, targets=None, with_deps=False): + if targets: + stages = cat(self.collect(t, with_deps=with_deps) for t in targets) + else: + stages = self.collect(None, with_deps=with_deps) + + return _joint_status(stages) + + def _cloud_status( self, targets=None, From 3228b476fab96f16f00bedd5eea83ca455b6c633 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 22:58:11 +0000 Subject: [PATCH 06/15] compare checksums instead of file bytes --- dvc/dependency/repo.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 63553b4a4f..ec8eb0e843 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -1,7 +1,6 @@ import copy import os from contextlib import contextmanager -import filecmp from funcy import merge @@ -12,6 +11,7 @@ from dvc.exceptions import OutputNotFoundError from dvc.exceptions import PathMissingError from dvc.utils.fs import fs_copy +from dvc.path_info import PathInfo class DependencyREPO(DependencyLOCAL): @@ -46,6 +46,11 @@ def _make_repo(self, **overrides): with external_repo(**merge(self.def_repo, overrides)) as repo: yield repo + def _get_checksum_in_repo(self, repo): + return repo.cache.local.get_checksum( + PathInfo(os.path.join(repo.root_dir, self.def_path)) + ) + def status(self): with self._make_repo() as repo, self._make_repo( rev_lock=None @@ -56,11 +61,13 @@ def status(self): has_changed = current != updated except OutputNotFoundError: - current_path = os.path.join(repo.root_dir, self.def_path) - updated_path = os.path.join( - updated_repo.root_dir, self.def_path - ) - has_changed = not filecmp.cmp(current_path, updated_path) + # Need to load the state before calculating checksums + repo.state.load() + updated_repo.state.load() + + has_changed = self._get_checksum_in_repo( + repo + ) != self._get_checksum_in_repo(updated_repo) if has_changed: return {str(self): "update available"} From 799d902c98314367eeac238c8cde864c0142b912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Tue, 14 Jan 2020 23:41:48 +0000 Subject: [PATCH 07/15] add test that checks that a file can be upgraded from git tracking to DVC tracking --- tests/func/test_update.py | 51 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/tests/func/test_update.py b/tests/func/test_update.py index 8d6f71cc64..dd467bc306 100644 --- a/tests/func/test_update.py +++ b/tests/func/test_update.py @@ -56,6 +56,57 @@ def test_update_import(tmp_dir, dvc, erepo_dir, cached): } +def test_update_import_after_remote_updates_to_dvc(tmp_dir, dvc, erepo_dir): + old_rev = None + with erepo_dir.branch("branch", new=True), erepo_dir.chdir(): + erepo_dir.scm_gen("version", "branch", commit="add version file") + old_rev = erepo_dir.scm.get_rev() + + stage = dvc.imp(fspath(erepo_dir), "version", "version", rev="branch") + + imported = tmp_dir / "version" + assert imported.is_file() + assert imported.read_text() == "branch" + assert stage.deps[0].def_repo == { + "url": fspath(erepo_dir), + "rev": "branch", + "rev_lock": old_rev, + } + + new_rev = None + with erepo_dir.branch("branch", new=False), erepo_dir.chdir(): + erepo_dir.scm.repo.index.remove("version") + erepo_dir.dvc_gen("version", "updated") + erepo_dir.scm.add(["version", "version.dvc"]) + erepo_dir.scm.commit("upgrade to DVC tracking") + new_rev = erepo_dir.scm.get_rev() + + assert old_rev != new_rev + + # Caching in external repos doesn't see upstream updates within single + # cli call, so we need to clean the caches to see the changes. + clean_repos() + + status, = dvc.status([stage.path])["version.dvc"] + changed_dep, = list(status["changed deps"].items()) + assert changed_dep[0].startswith("version ") + assert changed_dep[1] == "update available" + + dvc.update(stage.path) + + assert dvc.status([stage.path]) == {} + + assert imported.is_file() + assert imported.read_text() == "updated" + + stage = Stage.load(dvc, stage.path) + assert stage.deps[0].def_repo == { + "url": fspath(erepo_dir), + "rev": "branch", + "rev_lock": new_rev, + } + + def test_update_import_url(tmp_dir, dvc, tmp_path_factory): import_src = tmp_path_factory.mktemp("import_url_source") src = import_src / "file" From 289797e7eb4cc648ed4eaa9aa15f0c8c524d914c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Wed, 15 Jan 2020 11:33:26 +0000 Subject: [PATCH 08/15] can now get whether an item changed, but not when an item changes from non-DVC to DVC --- dvc/dependency/repo.py | 44 ++++++++++++++++++++++++++++++++++++--- tests/func/test_status.py | 35 +++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 875544a00f..b273e34e35 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -1,6 +1,7 @@ import copy import os from contextlib import contextmanager +import filecmp from funcy import merge @@ -52,7 +53,7 @@ def _get_checksum_in_repo(self, repo): PathInfo(os.path.join(repo.root_dir, self.def_path)) ) - def status(self): + def _has_changed(self): with self._make_repo() as repo, self._make_repo( rev_lock=None ) as updated_repo: @@ -60,16 +61,53 @@ def status(self): current = repo.find_out_by_relpath(self.def_path).info updated = updated_repo.find_out_by_relpath(self.def_path).info - has_changed = current != updated + return current != updated except OutputNotFoundError: # Need to load the state before calculating checksums repo.state.load() updated_repo.state.load() - has_changed = self._get_checksum_in_repo( + return self._get_checksum_in_repo( repo ) != self._get_checksum_in_repo(updated_repo) + def _has_changed_non_dvc(self): + url = self.def_repo[self.PARAM_URL] + rev = self.def_repo.get(self.PARAM_REV) + rev_lock = self.def_repo.get(self.PARAM_REV_LOCK, rev) + + current_repo = cached_clone(url, rev=rev_lock or rev) + updated_repo = cached_clone(url, rev=rev) + + current_path = os.path.join(current_repo, self.def_path) + updated_path = os.path.join(updated_repo, self.def_path) + + if not os.path.exists(current_path): + raise FileNotFoundError(current_path) + + if not os.path.exists(updated_path): + raise FileNotFoundError(updated_path) + + is_dir = os.path.isdir(current_path) + + assert is_dir == os.path.isdir(updated_path) + + if is_dir: + comparison = filecmp.dircmp(current_path, updated_path) + return not ( + comparison.left_only + or comparison.right_only + or comparison.diff_files + ) + + return not filecmp.cmp(current_path, updated_path, shallow=False) + + def status(self): + try: + has_changed = self._has_changed() + except NotDvcRepoError: + has_changed = self._has_changed_non_dvc() + if has_changed: return {str(self): "update available"} diff --git a/tests/func/test_status.py b/tests/func/test_status.py index e4de038523..b122a9e7aa 100644 --- a/tests/func/test_status.py +++ b/tests/func/test_status.py @@ -1,9 +1,12 @@ import os +import shutil from mock import patch from dvc.main import main +from dvc.compat import fspath from tests.basic_env import TestDvc +from dvc.external_repo import clean_repos class TestStatus(TestDvc): @@ -23,3 +26,35 @@ def test_quiet(self): def test_implied_cloud(self, mock_status): main(["status", "--remote", "something"]) mock_status.assert_called() + + +def test_status_non_dvc_repo_import(tmp_dir, dvc, erepo_dir): + with erepo_dir.branch("branch", new=True), erepo_dir.chdir(): + erepo_dir.scm.repo.index.remove([".dvc"], r=True) + shutil.rmtree(".dvc") + erepo_dir.scm_gen("file", "first version") + erepo_dir.scm.add(["file"]) + erepo_dir.scm.commit("first version") + + dvc.imp(fspath(erepo_dir), "file", "file", rev="branch") + + status = dvc.status(["file.dvc"]) + + assert status == {} + + # Caching in external repos doesn't see upstream updates within single + # cli call, so we need to clean the caches to see the changes. + clean_repos() + + with erepo_dir.branch("branch", new=False), erepo_dir.chdir(): + erepo_dir.scm_gen("file", "second_version", commit="update file") + erepo_dir.scm.add(["file"]) + erepo_dir.scm.commit("first version") + + status, = dvc.status(["file.dvc"])["file.dvc"] + + assert status == { + "changed deps": { + "file ({})".format(fspath(erepo_dir)): "update available" + } + } From 008a531535babc4d0a44716cfc9991c8f2677343 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Wed, 15 Jan 2020 12:39:15 +0000 Subject: [PATCH 09/15] support non-DVC repositories --- dvc/dependency/repo.py | 84 ++++++++++++++++++++++++--------------- tests/func/test_status.py | 34 ++++++++++++++++ tests/func/test_update.py | 45 +++++++++++++++++++++ 3 files changed, 131 insertions(+), 32 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index b273e34e35..424269d606 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -53,35 +53,37 @@ def _get_checksum_in_repo(self, repo): PathInfo(os.path.join(repo.root_dir, self.def_path)) ) - def _has_changed(self): - with self._make_repo() as repo, self._make_repo( - rev_lock=None - ) as updated_repo: - try: - current = repo.find_out_by_relpath(self.def_path).info - updated = updated_repo.find_out_by_relpath(self.def_path).info - - return current != updated - except OutputNotFoundError: - # Need to load the state before calculating checksums - repo.state.load() - updated_repo.state.load() - - return self._get_checksum_in_repo( - repo - ) != self._get_checksum_in_repo(updated_repo) - - def _has_changed_non_dvc(self): - url = self.def_repo[self.PARAM_URL] - rev = self.def_repo.get(self.PARAM_REV) - rev_lock = self.def_repo.get(self.PARAM_REV_LOCK, rev) - - current_repo = cached_clone(url, rev=rev_lock or rev) - updated_repo = cached_clone(url, rev=rev) - - current_path = os.path.join(current_repo, self.def_path) - updated_path = os.path.join(updated_repo, self.def_path) + # fileinfo is a dictionary containing "checksum" and "path" + def _get_fileinfo(self, updated=False): + rev_lock = None + if not updated: + rev_lock = self.def_repo.get(self.PARAM_REV_LOCK) + try: + with self._make_repo(rev_lock=rev_lock) as repo: + path = os.path.join(repo.root_dir, self.def_path) + try: + checksum = repo.find_out_by_relpath(self.def_path).info[ + "md5" + ] + except OutputNotFoundError: + repo.state.load() + checksum = repo.cache.local.get_checksum(PathInfo(path)) + + assert checksum + + return {"checksum": checksum, "path": path, "repo": repo} + + except NotDvcRepoError: + repo_path = cached_clone( + self.def_repo[self.PARAM_URL], + rev=rev_lock or self.def_repo.get(self.PARAM_REV), + ) + path = os.path.join(repo_path, self.def_path) + return {"checksum": None, "path": path, "repo": None} + + @staticmethod + def _paths_changed(current_path, updated_path): if not os.path.exists(current_path): raise FileNotFoundError(current_path) @@ -102,11 +104,29 @@ def _has_changed_non_dvc(self): return not filecmp.cmp(current_path, updated_path, shallow=False) + def _checkout_if_needed(self, fileinfo): + if os.path.exists(fileinfo["path"]): + return + if "repo" not in fileinfo: + return + + fileinfo["repo"].checkout([fileinfo["path"] + ".dvc"], recursive=True) + def status(self): - try: - has_changed = self._has_changed() - except NotDvcRepoError: - has_changed = self._has_changed_non_dvc() + current_fileinfo = self._get_fileinfo(updated=False) + updated_fileinfo = self._get_fileinfo(updated=True) + print("CURRENT", current_fileinfo, "UPDATED", updated_fileinfo) + + if current_fileinfo["checksum"] and updated_fileinfo["checksum"]: + has_changed = ( + current_fileinfo["checksum"] != updated_fileinfo["checksum"] + ) + else: + self._checkout_if_needed(current_fileinfo) + self._checkout_if_needed(updated_fileinfo) + has_changed = DependencyREPO._paths_changed( + current_fileinfo["path"], updated_fileinfo["path"] + ) if has_changed: return {str(self): "update available"} diff --git a/tests/func/test_status.py b/tests/func/test_status.py index b122a9e7aa..1b688803af 100644 --- a/tests/func/test_status.py +++ b/tests/func/test_status.py @@ -3,6 +3,7 @@ from mock import patch +from dvc.repo import Repo from dvc.main import main from dvc.compat import fspath from tests.basic_env import TestDvc @@ -58,3 +59,36 @@ def test_status_non_dvc_repo_import(tmp_dir, dvc, erepo_dir): "file ({})".format(fspath(erepo_dir)): "update available" } } + + +def test_status_before_and_after_dvc_init(tmp_dir, dvc, erepo_dir): + with erepo_dir.chdir(): + erepo_dir.scm.repo.index.remove([".dvc"], r=True) + shutil.rmtree(".dvc") + erepo_dir.scm_gen("file", "first version") + erepo_dir.scm.add(["file"]) + erepo_dir.scm.commit("first version") + old_rev = erepo_dir.scm.get_rev() + + dvc.imp(fspath(erepo_dir), "file", "file") + + assert dvc.status(["file.dvc"]) == {} + + with erepo_dir.chdir(): + Repo.init() + erepo_dir.scm.repo.index.remove(["file"]) + os.remove("file") + erepo_dir.dvc_gen("file", "second version") + erepo_dir.scm.add([".dvc", "file.dvc"]) + erepo_dir.scm.commit("version with dvc") + new_rev = erepo_dir.scm.get_rev() + + assert old_rev != new_rev + + # Caching in external repos doesn't see upstream updates within single + # cli call, so we need to clean the caches to see the changes. + clean_repos() + + status = dvc.status(["file.dvc"]) + + print("STATUS", status) diff --git a/tests/func/test_update.py b/tests/func/test_update.py index dd467bc306..6f483fff0a 100644 --- a/tests/func/test_update.py +++ b/tests/func/test_update.py @@ -1,5 +1,8 @@ import pytest +import os +import shutil +from dvc.repo import Repo from dvc.stage import Stage from dvc.compat import fspath from dvc.external_repo import clean_repos @@ -107,6 +110,48 @@ def test_update_import_after_remote_updates_to_dvc(tmp_dir, dvc, erepo_dir): } +def test_update_before_and_after_dvc_init(tmp_dir, dvc, erepo_dir): + with erepo_dir.chdir(): + erepo_dir.scm.repo.index.remove([".dvc"], r=True) + shutil.rmtree(".dvc") + erepo_dir.scm_gen("file", "first version") + erepo_dir.scm.add(["file"]) + erepo_dir.scm.commit("first version") + old_rev = erepo_dir.scm.get_rev() + + stage = dvc.imp(fspath(erepo_dir), "file", "file") + + with erepo_dir.chdir(): + Repo.init() + erepo_dir.scm.repo.index.remove(["file"]) + os.remove("file") + erepo_dir.dvc_gen("file", "second version") + erepo_dir.scm.add([".dvc", "file.dvc"]) + erepo_dir.scm.commit("version with dvc") + new_rev = erepo_dir.scm.get_rev() + + assert old_rev != new_rev + + # Caching in external repos doesn't see upstream updates within single + # cli call, so we need to clean the caches to see the changes. + clean_repos() + + assert dvc.status([stage.path]) == { + "file.dvc": [ + { + "changed deps": { + "file ({})".format(fspath(erepo_dir)): "update available" + } + } + ] + } + + dvc.update(stage.path) + + assert (tmp_dir / "file").read_text() == "second version" + assert dvc.status([stage.path]) == {} + + def test_update_import_url(tmp_dir, dvc, tmp_path_factory): import_src = tmp_path_factory.mktemp("import_url_source") src = import_src / "file" From 4ada384fcd07c69737608eb0918f1080f668e3d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Wed, 15 Jan 2020 12:43:14 +0000 Subject: [PATCH 10/15] remove trace statement --- dvc/dependency/repo.py | 1 - tests/func/test_status.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index 424269d606..f8ffeb9f6b 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -115,7 +115,6 @@ def _checkout_if_needed(self, fileinfo): def status(self): current_fileinfo = self._get_fileinfo(updated=False) updated_fileinfo = self._get_fileinfo(updated=True) - print("CURRENT", current_fileinfo, "UPDATED", updated_fileinfo) if current_fileinfo["checksum"] and updated_fileinfo["checksum"]: has_changed = ( diff --git a/tests/func/test_status.py b/tests/func/test_status.py index 1b688803af..d8a65b9c73 100644 --- a/tests/func/test_status.py +++ b/tests/func/test_status.py @@ -89,6 +89,10 @@ def test_status_before_and_after_dvc_init(tmp_dir, dvc, erepo_dir): # cli call, so we need to clean the caches to see the changes. clean_repos() - status = dvc.status(["file.dvc"]) + status, = dvc.status(["file.dvc"])["file.dvc"] - print("STATUS", status) + assert status == { + "changed deps": { + "file ({})".format(fspath(erepo_dir)): "update available" + } + } From a0bdaa4c9e8101666dd6066b01eb4285708bd6e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 16 Jan 2020 12:51:20 +0000 Subject: [PATCH 11/15] use checksum facilities --- dvc/dependency/repo.py | 72 +++++++----------------------------------- dvc/repo/status.py | 2 +- dvc/stage.py | 4 +-- 3 files changed, 14 insertions(+), 64 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index f8ffeb9f6b..e137c41a8e 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -1,7 +1,6 @@ import copy import os from contextlib import contextmanager -import filecmp from funcy import merge @@ -48,13 +47,8 @@ def _make_repo(self, **overrides): with external_repo(**merge(self.def_repo, overrides)) as repo: yield repo - def _get_checksum_in_repo(self, repo): - return repo.cache.local.get_checksum( - PathInfo(os.path.join(repo.root_dir, self.def_path)) - ) - # fileinfo is a dictionary containing "checksum" and "path" - def _get_fileinfo(self, updated=False): + def _get_checksum(self, updated=False): rev_lock = None if not updated: rev_lock = self.def_repo.get(self.PARAM_REV_LOCK) @@ -62,17 +56,14 @@ def _get_fileinfo(self, updated=False): try: with self._make_repo(rev_lock=rev_lock) as repo: path = os.path.join(repo.root_dir, self.def_path) + try: - checksum = repo.find_out_by_relpath(self.def_path).info[ - "md5" - ] + return repo.find_out_by_relpath(self.def_path).info["md5"] except OutputNotFoundError: - repo.state.load() - checksum = repo.cache.local.get_checksum(PathInfo(path)) - - assert checksum - - return {"checksum": checksum, "path": path, "repo": repo} + with repo.state: + return self.repo.cache.local.get_checksum( + PathInfo(path) + ) except NotDvcRepoError: repo_path = cached_clone( @@ -80,54 +71,13 @@ def _get_fileinfo(self, updated=False): rev=rev_lock or self.def_repo.get(self.PARAM_REV), ) path = os.path.join(repo_path, self.def_path) - return {"checksum": None, "path": path, "repo": None} - - @staticmethod - def _paths_changed(current_path, updated_path): - if not os.path.exists(current_path): - raise FileNotFoundError(current_path) - - if not os.path.exists(updated_path): - raise FileNotFoundError(updated_path) - - is_dir = os.path.isdir(current_path) - - assert is_dir == os.path.isdir(updated_path) - - if is_dir: - comparison = filecmp.dircmp(current_path, updated_path) - return not ( - comparison.left_only - or comparison.right_only - or comparison.diff_files - ) - - return not filecmp.cmp(current_path, updated_path, shallow=False) - - def _checkout_if_needed(self, fileinfo): - if os.path.exists(fileinfo["path"]): - return - if "repo" not in fileinfo: - return - - fileinfo["repo"].checkout([fileinfo["path"] + ".dvc"], recursive=True) + return self.repo.cache.local.get_checksum(PathInfo(path)) def status(self): - current_fileinfo = self._get_fileinfo(updated=False) - updated_fileinfo = self._get_fileinfo(updated=True) - - if current_fileinfo["checksum"] and updated_fileinfo["checksum"]: - has_changed = ( - current_fileinfo["checksum"] != updated_fileinfo["checksum"] - ) - else: - self._checkout_if_needed(current_fileinfo) - self._checkout_if_needed(updated_fileinfo) - has_changed = DependencyREPO._paths_changed( - current_fileinfo["path"], updated_fileinfo["path"] - ) + current_checksum = self._get_checksum(updated=False) + updated_checksum = self._get_checksum(updated=True) - if has_changed: + if current_checksum != updated_checksum: return {str(self): "update available"} return {} diff --git a/dvc/repo/status.py b/dvc/repo/status.py index 85299d8ff2..3528e6243d 100644 --- a/dvc/repo/status.py +++ b/dvc/repo/status.py @@ -22,7 +22,7 @@ def _joint_status(stages): ) ) - status.update(stage.status(for_status_command=True)) + status.update(stage.status(check_updates=True)) return status diff --git a/dvc/stage.py b/dvc/stage.py index 85832720bc..28076bad6d 100644 --- a/dvc/stage.py +++ b/dvc/stage.py @@ -1006,10 +1006,10 @@ def _status(entries): return ret @rwlocked(read=["deps", "outs"]) - def status(self, for_status_command=False): + def status(self, check_updates=False): ret = [] - show_import = self.is_repo_import and for_status_command + show_import = self.is_repo_import and check_updates if not self.locked or show_import: deps_status = self._status(self.deps) From 5883d3dabfdaadf6d732b9ed03587e4b8802043e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 16 Jan 2020 12:54:05 +0000 Subject: [PATCH 12/15] reorder imports --- tests/func/test_status.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/func/test_status.py b/tests/func/test_status.py index d8a65b9c73..8b8d3d2ba9 100644 --- a/tests/func/test_status.py +++ b/tests/func/test_status.py @@ -6,8 +6,8 @@ from dvc.repo import Repo from dvc.main import main from dvc.compat import fspath -from tests.basic_env import TestDvc from dvc.external_repo import clean_repos +from tests.basic_env import TestDvc class TestStatus(TestDvc): From a9bdaec63c9c1408ee3873f946fa70949d51e923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 16 Jan 2020 13:37:13 +0000 Subject: [PATCH 13/15] rework code path; remove extraneous context generator --- dvc/dependency/repo.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index e137c41a8e..be57e0aba1 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -47,7 +47,6 @@ def _make_repo(self, **overrides): with external_repo(**merge(self.def_repo, overrides)) as repo: yield repo - # fileinfo is a dictionary containing "checksum" and "path" def _get_checksum(self, updated=False): rev_lock = None if not updated: @@ -55,15 +54,11 @@ def _get_checksum(self, updated=False): try: with self._make_repo(rev_lock=rev_lock) as repo: - path = os.path.join(repo.root_dir, self.def_path) - try: return repo.find_out_by_relpath(self.def_path).info["md5"] except OutputNotFoundError: - with repo.state: - return self.repo.cache.local.get_checksum( - PathInfo(path) - ) + # Fall through to after exception handler + path = os.path.join(repo.root_dir, self.def_path) except NotDvcRepoError: repo_path = cached_clone( @@ -71,7 +66,8 @@ def _get_checksum(self, updated=False): rev=rev_lock or self.def_repo.get(self.PARAM_REV), ) path = os.path.join(repo_path, self.def_path) - return self.repo.cache.local.get_checksum(PathInfo(path)) + + return self.repo.cache.local.get_checksum(PathInfo(path)) def status(self): current_checksum = self._get_checksum(updated=False) From cd3ef1850de1e8759a0eaff3c5181602c86206d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 16 Jan 2020 14:44:09 +0000 Subject: [PATCH 14/15] simplify control flow --- dvc/dependency/repo.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index be57e0aba1..aed3369a75 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -54,20 +54,19 @@ def _get_checksum(self, updated=False): try: with self._make_repo(rev_lock=rev_lock) as repo: - try: - return repo.find_out_by_relpath(self.def_path).info["md5"] - except OutputNotFoundError: - # Fall through to after exception handler - path = os.path.join(repo.root_dir, self.def_path) + return repo.find_out_by_relpath(self.def_path).info["md5"] - except NotDvcRepoError: - repo_path = cached_clone( - self.def_repo[self.PARAM_URL], - rev=rev_lock or self.def_repo.get(self.PARAM_REV), - ) - path = os.path.join(repo_path, self.def_path) + except (NotDvcRepoError, OutputNotFoundError): + # Fall through and clone + pass + + repo_path = cached_clone( + self.def_repo[self.PARAM_URL], + rev=rev_lock or self.def_repo.get(self.PARAM_REV), + ) + path = PathInfo(os.path.join(repo_path, self.def_path)) - return self.repo.cache.local.get_checksum(PathInfo(path)) + return self.repo.cache.local.get_checksum(path) def status(self): current_checksum = self._get_checksum(updated=False) From fcb7681683d625aaf9edb62e306482e2f1d654d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Santos?= Date: Thu, 16 Jan 2020 14:49:44 +0000 Subject: [PATCH 15/15] take exception transformation into account --- dvc/dependency/repo.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index aed3369a75..3bcfd3e65d 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -9,6 +9,7 @@ from dvc.external_repo import external_repo from dvc.exceptions import NotDvcRepoError from dvc.exceptions import OutputNotFoundError +from dvc.exceptions import NoOutputInExternalRepoError from dvc.exceptions import PathMissingError from dvc.utils.fs import fs_copy from dvc.path_info import PathInfo @@ -55,8 +56,7 @@ def _get_checksum(self, updated=False): try: with self._make_repo(rev_lock=rev_lock) as repo: return repo.find_out_by_relpath(self.def_path).info["md5"] - - except (NotDvcRepoError, OutputNotFoundError): + except (NotDvcRepoError, NoOutputInExternalRepoError): # Fall through and clone pass