diff --git a/dvc/api.py b/dvc/api.py index a2d648aa17..ca2c702069 100644 --- a/dvc/api.py +++ b/dvc/api.py @@ -2,8 +2,13 @@ from contextlib import _GeneratorContextManager as GCM from contextlib import contextmanager -from dvc.exceptions import DvcException, NotDvcRepoError -from dvc.external_repo import external_repo +from dvc.exceptions import ( + DvcException, + FileMissingError, + NotDvcRepoError, + PathMissingError, +) +from dvc.external_repo import ExternalDVCRepo, ExternalGitRepo, external_repo from dvc.repo import Repo @@ -26,10 +31,14 @@ def get_url(path, repo=None, rev=None, remote=None): directory in the remote storage. """ with _make_repo(repo, rev=rev) as _repo: - if not isinstance(_repo, Repo): - raise UrlNotDvcRepoError(_repo.url) # pylint: disable=no-member - out = _repo.find_out_by_relpath(path) - remote_obj = _repo.cloud.get_remote(remote) + # pylint: disable=no-member + path = os.path.join(_repo.root_dir, path) + is_erepo = isinstance(_repo, (ExternalDVCRepo, ExternalGitRepo)) + r = _repo.in_repo(path) if is_erepo else _repo + if is_erepo and not r: + raise UrlNotDvcRepoError(_repo.url) + out = r.find_out_by_relpath(path) + remote_obj = r.cloud.get_remote(remote) return str(remote_obj.tree.hash_to_path_info(out.checksum)) @@ -74,10 +83,17 @@ def __getattr__(self, name): def _open(path, repo=None, rev=None, remote=None, mode="r", encoding=None): with _make_repo(repo, rev=rev) as _repo: - with _repo.open_by_relpath( - path, remote=remote, mode=mode, encoding=encoding - ) as fd: - yield fd + is_erepo = not isinstance(_repo, Repo) + try: + with _repo.repo_tree.open_by_relpath( + path, remote=remote, mode=mode, encoding=encoding + ) as fd: + yield fd + except FileNotFoundError as exc: + if is_erepo: + # pylint: disable=no-member + raise PathMissingError(path, _repo.url) from exc + raise FileMissingError(path) from exc def read(path, repo=None, rev=None, remote=None, mode="r", encoding=None): @@ -101,5 +117,5 @@ def _make_repo(repo_url=None, rev=None): return except NotDvcRepoError: pass # fallthrough to external_repo - with external_repo(url=repo_url, rev=rev) as repo: + with external_repo(url=repo_url, rev=rev, stream=True) as repo: yield repo diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py index eb3c58b5ce..fcc205b822 100644 --- a/dvc/dependency/repo.py +++ b/dvc/dependency/repo.py @@ -2,7 +2,6 @@ from voluptuous import Required -from dvc.exceptions import OutputNotFoundError from dvc.path_info import PathInfo from .local import LocalDependency @@ -42,30 +41,17 @@ def repo_pair(self): def __str__(self): return "{} ({})".format(self.def_path, self.def_repo[self.PARAM_URL]) - def _make_repo(self, *, locked=True): + def _make_repo(self, *, locked=True, **kwargs): from dvc.external_repo import external_repo d = self.def_repo rev = (d.get("rev_lock") if locked else None) or d.get("rev") - return external_repo(d["url"], rev=rev) + return external_repo(d["url"], rev=rev, **kwargs) def _get_checksum(self, locked=True): - from dvc.repo.tree import RepoTree - - with self._make_repo(locked=locked) as repo: - try: - return repo.find_out_by_relpath(self.def_path).info["md5"] - except OutputNotFoundError: - path = PathInfo(os.path.join(repo.root_dir, self.def_path)) - - # we want stream but not fetch, so DVC out directories are - # walked, but dir contents is not fetched - tree = RepoTree(repo, stream=True) - - # We are polluting our repo cache with some dir listing here - if tree.isdir(path): - return self.repo.cache.local.tree.get_hash(path, tree=tree) - return tree.get_file_hash(path) + with self._make_repo(locked=locked, stream=True) as repo: + path = PathInfo(os.path.join(repo.root_dir, self.def_path)) + return repo.get_checksum(path, self.repo.cache.local) def status(self): current_checksum = self._get_checksum(locked=True) @@ -88,8 +74,7 @@ def download(self, to): self.def_repo[self.PARAM_REV_LOCK] = repo.get_rev() cache = self.repo.cache.local - with repo.use_cache(cache): - _, _, cache_infos = repo.fetch_external([self.def_path]) + _, _, cache_infos = repo.fetch_external([self.def_path], cache) cache.checkout(to.path_info, cache_infos[0]) def update(self, rev=None): diff --git a/dvc/external_repo.py b/dvc/external_repo.py index 15332e7de1..dc43b39067 100644 --- a/dvc/external_repo.py +++ b/dvc/external_repo.py @@ -8,6 +8,7 @@ from funcy import cached_property, retry, wrap_with +from dvc import subrepos from dvc.config import NoRemoteError, NotDvcRepoError from dvc.exceptions import ( FileMissingError, @@ -21,25 +22,41 @@ from dvc.repo.tree import RepoTree from dvc.scm.base import CloneError from dvc.scm.git import Git -from dvc.tree.local import LocalTree +from dvc.tree import LocalTree from dvc.utils.fs import remove logger = logging.getLogger(__name__) +def _is_dvc_main_repo(scm, path, rev): + isdir = scm.get_tree(rev).isdir if scm else os.path.isdir + return isdir(os.path.join(path, Repo.DVC_DIR)) + + @contextmanager -def external_repo(url, rev=None, for_write=False): +def external_repo(url, rev=None, for_write=False, **kwargs): logger.debug("Creating external repo %s@%s", url, rev) path = _cached_clone(url, rev, for_write=for_write) + path = os.path.realpath(path) + if not rev: # Local HEAD points to the tip of whatever branch we first cloned from # (which may not be the default branch), use origin/HEAD here to get # the tip of the default branch rev = "refs/remotes/origin/HEAD" - try: - repo = ExternalRepo(path, url, rev, for_write=for_write) - except NotDvcRepoError: - repo = ExternalGitRepo(path, url, rev) + if for_write: + rev = None + scm = None + else: + scm = Git(path) + + erepo_cls = ( + ExternalDVCRepo + if _is_dvc_main_repo(scm, path, rev) + else ExternalGitRepo + ) + repo_kw = dict(scm=scm, rev=rev, for_write=for_write, url=url, **kwargs) + repo = erepo_cls(path, **repo_kw) try: yield repo @@ -71,53 +88,98 @@ def clean_repos(): _remove(path) -class BaseExternalRepo: - # pylint: disable=no-member +def _fix_local_remote(orig_repo, src_repo, remote_name): + # If a remote URL is relative to the source repo, + # it will have changed upon config load and made + # relative to this new repo. Restore the old one here. + new_remote = orig_repo.config["remote"][remote_name] + old_remote = src_repo.config["remote"][remote_name] + if new_remote["url"] != old_remote["url"]: + new_remote["url"] = old_remote["url"] + + +def _add_upstream(orig_repo, src_repo): + # Fill the empty upstream entry with a new remote pointing to the + # original repo's cache location. + orig_repo.config["remote"]["auto-generated-upstream"] = { + "url": src_repo.cache.local.cache_dir + } + orig_repo.config["core"]["remote"] = "auto-generated-upstream" + - _local_cache = None +class BaseExternalMixin: + def _setup(self): + self._setup_cache_dir() - def __str__(self): - return self.url + for repo in self.repos: + repo.cache.local.cache_dir = self.cache_dir + if os.path.isdir(self.url): + self._fix_upstream(repo) @property - def local_cache(self): - if hasattr(self, "cache"): - return self.cache.local - return self._local_cache + def repos(self): + raise NotImplementedError - @contextmanager - def use_cache(self, cache): - """Use the specified cache in place of default tmpdir cache for - download operations. - """ - if hasattr(self, "cache"): - save_cache = self.cache.local - self.cache.local = cache - self._local_cache = cache + @wrap_with(threading.Lock()) + def _setup_cache_dir(self): + # share same cache_dir among all subrepos + try: + self.cache_dir = CACHE_DIRS[self.url] + except KeyError: + self.cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp( + "dvc-cache" + ) + @wrap_with(threading.Lock()) + @contextmanager + def with_cache(self, cache_dir, link_types=None): + for repo in self.repos: + repo.cache.local.cache_dir = cache_dir + if link_types: + # FIXME: not rolling back, should be fine for now + repo.cache.local.cache_types = link_types yield + for repo in self.repos: + repo.cache.local.cache_dir = self.cache_dir - if hasattr(self, "cache"): - self.cache.local = save_cache - self._local_cache = None + def _fix_upstream(self, orig_repo): + try: + rel_path = os.path.relpath(orig_repo.root_dir, self.root_dir) + src_repo = Repo(PathInfo(self.url) / rel_path) + except NotDvcRepoError: + # If ExternalRepo does not throw NotDvcRepoError and Repo does, + # the self.url might be a bare git repo. + # NOTE: This will fail to resolve remote with relative path, + # same as if it was a remote DVC repo. + return - @cached_property - def repo_tree(self): - return RepoTree(self, fetch=True) + try: + remote_name = orig_repo.config["core"].get("remote") + if remote_name: + _fix_local_remote(orig_repo, src_repo, remote_name) + else: + _add_upstream(orig_repo, src_repo) + finally: + src_repo.close() - def get_rev(self): - if isinstance(self.tree, LocalTree): - return self.scm.get_rev() - return self.tree.rev + def close(self): + if self.scm: + self.scm.close() + + def fetch_external(self, paths: Iterable, cache, **kwargs): + # `RepoTree` will try downloading it to the repo's cache + # instead of `cache_dir`, need to change on all instances + with self.with_cache(cache.cache_dir): + return self._fetch_to_cache(paths, cache, **kwargs) - def fetch_external(self, paths: Iterable, **kwargs): + def _fetch_to_cache(self, paths: Iterable, cache, **kwargs): """Fetch specified external repo paths into cache. - Returns 3-tuple in the form - (downloaded, failed, list(cache_infos)) - where cache_infos can be used as checkout targets for the - fetched paths. - """ + Returns 3-tuple in the form + (downloaded, failed, list(cache_infos)) + where cache_infos can be used as checkout targets for the + fetched paths. + """ download_results = [] failed = 0 @@ -130,7 +192,7 @@ def download_update(result): for path in paths: if not self.repo_tree.exists(path): raise PathMissingError(path, self.url) - save_info = self.local_cache.save( + save_info = cache.save( path, self.repo_tree, None, @@ -141,109 +203,128 @@ def download_update(result): return sum(download_results), failed, save_infos - def get_external(self, path, dest): + def get_external(self, src, dest): """Convenience wrapper for fetch_external and checkout.""" - if self.local_cache: - # fetch DVC and git files to tmpdir cache, then checkout - _, _, save_infos = self.fetch_external([path]) - self.local_cache.checkout(PathInfo(dest), save_infos[0]) + repo = self.in_repo(src) + if repo: + cache = repo.cache.local + _, _, save_infos = self._fetch_to_cache([src], cache) + cache.checkout(PathInfo(dest), save_infos[0]) else: - # git-only erepo with no cache, just copy files directly - # to dest - path = PathInfo(self.root_dir) / path + path = PathInfo(self.root_dir) / src if not self.repo_tree.exists(path): - raise PathMissingError(path, self.url) + raise PathMissingError(src, self.url) self.repo_tree.copytree(path, dest) + def get_rev(self): + if isinstance(self.tree, LocalTree): + return self.scm.get_rev() + return self.tree.rev -class ExternalRepo(Repo, BaseExternalRepo): - def __init__(self, root_dir, url, rev, for_write=False): - if for_write: - super().__init__(root_dir) - else: - root_dir = os.path.realpath(root_dir) - super().__init__(root_dir, scm=Git(root_dir), rev=rev) - self.url = url - self._set_cache_dir() - self._fix_upstream() + def get_checksum(self, path_info, cache): + if self.repo_tree.isdir(path_info): + return cache.tree.get_hash(path_info, tree=self.repo_tree) + return self.repo_tree.get_file_hash(path_info) - @wrap_with(threading.Lock()) - def _set_cache_dir(self): - try: - cache_dir = CACHE_DIRS[self.url] - except KeyError: - cache_dir = CACHE_DIRS[self.url] = tempfile.mkdtemp("dvc-cache") + def in_repo(self, path): + tree = self.repo_tree.in_subtree(PathInfo(self.root_dir) / path) + return tree.repo if tree else None - self.cache.local.cache_dir = cache_dir - self._local_cache = self.cache.local + def _find_subrepos(self): + repo_kw = {} + if self.for_write: + tree = LocalTree(None, {"url": self.root_dir}) + else: + assert self.url and self.scm and self.rev + # .dvc folders are ignored by dvcignore which is required for + # `subrepos.find()`, hence using a separate tree + tree = self.scm.get_tree(self.rev) + repo_kw = dict(scm=self.scm, rev=self.rev) - def _fix_upstream(self): - if not os.path.isdir(self.url): - return + paths = subrepos.find(tree) + return [Repo(path, **repo_kw) for path in paths] - try: - src_repo = Repo(self.url) - except NotDvcRepoError: - # If ExternalRepo does not throw NotDvcRepoError and Repo does, - # the self.url might be a bare git repo. - # NOTE: This will fail to resolve remote with relative path, - # same as if it was a remote DVC repo. - return + @cached_property + def main_tree(self): + # FIXME: Repo.tree has dvcignore embedded on it, which might ignore + # subrepos. Also, there might be unwanted side effects of using `tree` + # that does not ignore subrepos, so we create our own `master` tree, + # that speaks for the whole repository + # --- + # This is blocking implementation of subrepos inside `Repo`, + # as collecting `.dvcignore` twice is not an answer. + # we need very granular controls on dvcignore, such that we have + # per-ops controls and also be able to create a separate instance + # of tree that does have some attributes of dvcignore (en/dis)abled. + kwargs = dict( + use_dvcignore=True, + dvcignore_root=self.root_dir, + ignore_subrepo=False, + ) + if self.for_write: + return LocalTree(None, {"url": self.root_dir}, **kwargs) + return self.scm.get_tree(rev=self.rev, **kwargs) - try: - remote_name = self.config["core"].get("remote") - if remote_name: - self._fix_local_remote(src_repo, remote_name) - else: - self._add_upstream(src_repo) - finally: - src_repo.close() + @cached_property + def repo_tree(self) -> "RepoTree": + return RepoTree( + self.main_tree, subrepos=self.repos, **self._tree_config + ) + + +class ExternalDVCRepo(BaseExternalMixin, Repo): + def __init__( + self, + root_dir, + scm=None, + rev=None, + for_write=False, + url=None, + fetch=True, + **kwargs + ): + super().__init__(root_dir, scm=scm, rev=rev) - def _fix_local_remote(self, src_repo, remote_name): - # If a remote URL is relative to the source repo, - # it will have changed upon config load and made - # relative to this new repo. Restore the old one here. - new_remote = self.config["remote"][remote_name] - old_remote = src_repo.config["remote"][remote_name] - if new_remote["url"] != old_remote["url"]: - new_remote["url"] = old_remote["url"] - - def _add_upstream(self, src_repo): - # Fill the empty upstream entry with a new remote pointing to the - # original repo's cache location. - cache_dir = src_repo.cache.local.cache_dir - self.config["remote"]["auto-generated-upstream"] = {"url": cache_dir} - self.config["core"]["remote"] = "auto-generated-upstream" - - -class ExternalGitRepo(BaseExternalRepo): - def __init__(self, root_dir, url, rev): - self.root_dir = os.path.realpath(root_dir) self.url = url - self.tree = self.scm.get_tree(rev) + self.rev = rev + self.for_write = for_write + self._tree_config = {"fetch": fetch, **kwargs} + self.subrepos = self._find_subrepos() + self._setup() - @cached_property - def scm(self): - return Git(self.root_dir) + @property + def repos(self): + return [self] + self.subrepos + + +class ExternalGitRepo(BaseExternalMixin): + def __init__( + self, + root_dir, + scm=None, + rev=None, + for_write=False, + url=None, + fetch=True, + **kwargs + ): + self.root_dir = root_dir + self.scm = scm + self.url = url + self.rev = rev + self.for_write = for_write + self._tree_config = {"fetch": fetch, **kwargs} - def close(self): - if "scm" in self.__dict__: - self.scm.close() + self.subrepos = self._find_subrepos() + self._setup() - def find_out_by_relpath(self, path): - raise OutputNotFoundError(path, self) + @property + def repos(self): + return self.subrepos - @contextmanager - def open_by_relpath(self, path, mode="r", encoding=None, **kwargs): - """Opens a specified resource as a file object.""" - tree = RepoTree(self) - try: - with tree.open( - path, mode=mode, encoding=encoding, **kwargs - ) as fobj: - yield fobj - except FileNotFoundError: - raise PathMissingError(path, self.url) + @property + def tree(self): + return self.main_tree def _cached_clone(url, rev, for_write=False): diff --git a/dvc/ignore.py b/dvc/ignore.py index 831486209f..11c4cca53b 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -120,7 +120,7 @@ def _is_dvc_repo(root, directory): return os.path.isdir(os.path.join(root, directory, Repo.DVC_DIR)) - def __init__(self, tree, root_dir): + def __init__(self, tree, root_dir, ignore_subrepo=True): from dvc.repo import Repo default_ignore_patterns = [".hg/", ".git/", "{}/".format(Repo.DVC_DIR)] @@ -135,7 +135,8 @@ def __init__(self, tree, root_dir): self.root_dir, use_dvcignore=False ): self._update(root) - self._update_sub_repo(root, dirs) + if ignore_subrepo: + self._update_sub_repo(root, dirs) dirs[:], _ = self(root, dirs, []) def _update(self, dirname): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 6d98682727..f6aeb5f5b9 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -1,14 +1,11 @@ import logging import os -from contextlib import contextmanager from functools import wraps from funcy import cached_property, cat, first from dvc.config import Config from dvc.dvcfile import PIPELINE_FILE, Dvcfile, is_valid_filename -from dvc.exceptions import FileMissingError -from dvc.exceptions import IsADirectoryError as DvcIsADirectoryError from dvc.exceptions import ( NoOutputOrStageError, NotDvcRepoError, @@ -153,22 +150,29 @@ def tree(self, tree): def __repr__(self): return f"{self.__class__.__name__}: '{self.root_dir}'" + @cached_property + def repo_tree(self): + return RepoTree(self.tree, [self], stream=True) + @classmethod def find_root(cls, root=None, tree=None): + # TODO (@skshetry): Verify and refactor this root_dir = os.path.realpath(root or os.curdir) - if tree: - if tree.isdir(os.path.join(root_dir, cls.DVC_DIR)): - return root_dir - raise NotDvcRepoError(f"'{root}' does not contain DVC directory") - - if not os.path.isdir(root_dir): - raise NotDvcRepoError(f"directory '{root}' does not exist") + is_dir = tree.isdir if tree else os.path.isdir while True: dvc_dir = os.path.join(root_dir, cls.DVC_DIR) - if os.path.isdir(dvc_dir): + if is_dir(dvc_dir): return root_dir + if ( + tree + and os.path.dirname(os.path.abspath(tree.tree_root)) + == root_dir + ): + raise NotDvcRepoError( + f"'{root}' does not contain DVC directory" + ) if os.path.ismount(root_dir): break root_dir = os.path.dirname(root_dir) @@ -580,26 +584,6 @@ def is_dvc_internal(self, path): path_parts = os.path.normpath(path).split(os.path.sep) return self.DVC_DIR in path_parts - @contextmanager - def open_by_relpath(self, path, remote=None, mode="r", encoding=None): - """Opens a specified resource as a file descriptor""" - - tree = RepoTree(self, stream=True) - path = os.path.join(self.root_dir, path) - try: - with self.state: - with tree.open( - os.path.join(self.root_dir, path), - mode=mode, - encoding=encoding, - remote=remote, - ) as fobj: - yield fobj - except FileNotFoundError as exc: - raise FileMissingError(path) from exc - except IsADirectoryError as exc: - raise DvcIsADirectoryError(f"'{path}' is a directory") from exc - def close(self): self.scm.close() diff --git a/dvc/repo/fetch.py b/dvc/repo/fetch.py index fe3cd9c6f6..3814589445 100644 --- a/dvc/repo/fetch.py +++ b/dvc/repo/fetch.py @@ -81,10 +81,9 @@ def _fetch_external(self, repo_url, repo_rev, files, jobs): failed, downloaded = 0, 0 try: with external_repo(repo_url, repo_rev) as repo: - with repo.use_cache(self.cache.local): - d, f, _ = repo.fetch_external(files, jobs=jobs) - downloaded += d - failed += f + d, f, _ = repo.fetch_external(files, self.cache.local, jobs=jobs) + downloaded += d + failed += f except CloneError: failed += 1 logger.exception( diff --git a/dvc/repo/get.py b/dvc/repo/get.py index 1141b00772..4ae5c23f6c 100644 --- a/dvc/repo/get.py +++ b/dvc/repo/get.py @@ -37,19 +37,17 @@ def get(url, path, out=None, rev=None): tmp_dir = os.path.join(dpath, "." + str(shortuuid.uuid())) try: with external_repo(url=url, rev=rev) as repo: - if hasattr(repo, "cache"): - repo.cache.local.cache_dir = tmp_dir - - # Try any links possible to avoid data duplication. - # - # Not using symlink, because we need to remove cache after we - # are done, and to make that work we would have to copy data - # over anyway before removing the cache, so we might just copy - # it right away. - # - # Also, we can't use theoretical "move" link type here, because - # the same cache file might be used a few times in a directory. - repo.cache.local.cache_types = ["reflink", "hardlink", "copy"] - repo.get_external(path, out) + # Try any links possible to avoid data duplication. + # + # Not using symlink, because we need to remove cache after we + # are done, and to make that work we would have to copy data + # over anyway before removing the cache, so we might just copy + # it right away. + # + # Also, we can't use theoretical "move" link type here, because + # the same cache file might be used a few times in a directory. + link_types = ["reflink", "hardlink", "copy"] + with repo.with_cache(tmp_dir, link_types=link_types): + repo.get_external(path, out) finally: remove(tmp_dir) diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py index e02d52116a..1f3ae17d60 100644 --- a/dvc/repo/ls.py +++ b/dvc/repo/ls.py @@ -29,7 +29,7 @@ def ls( """ from dvc.external_repo import external_repo - with external_repo(url, rev) as repo: + with external_repo(url, rev, stream=True) as repo: path_info = PathInfo(repo.root_dir) if path: path_info /= path @@ -48,15 +48,10 @@ def ls( def _ls(repo, path_info, recursive=None, dvc_only=False): - from dvc.repo.tree import RepoTree - def onerror(exc): raise exc - # use our own RepoTree instance instead of repo.repo_tree since we want to - # fetch directory listings, but don't want to fetch file contents. - tree = RepoTree(repo, stream=True) - + tree = repo.repo_tree ret = {} try: for root, dirs, files in tree.walk( @@ -76,9 +71,8 @@ def onerror(exc): if not recursive: for dname in dirs: info = PathInfo(root) / dname - if not dvc_only or ( - tree.dvctree and tree.dvctree.exists(info) - ): + subtree = tree.in_subtree(info) + if not dvc_only or (subtree and subtree.exists(info)): dvc = tree.isdvc(info) path = str(info.relative_to(path_info)) ret[path] = { diff --git a/dvc/repo/metrics/show.py b/dvc/repo/metrics/show.py index cdd787f005..e57c71c56d 100644 --- a/dvc/repo/metrics/show.py +++ b/dvc/repo/metrics/show.py @@ -53,7 +53,7 @@ def _extract_metrics(metrics): def _read_metrics(repo, metrics, rev): - tree = RepoTree(repo) + tree = RepoTree(repo.tree, [repo]) res = {} for metric in metrics: diff --git a/dvc/repo/plots/__init__.py b/dvc/repo/plots/__init__.py index 4386ca5613..8ce3585493 100644 --- a/dvc/repo/plots/__init__.py +++ b/dvc/repo/plots/__init__.py @@ -44,7 +44,7 @@ def collect(self, targets=None, revs=None): continue rev = rev or "workspace" - tree = RepoTree(self.repo) + tree = RepoTree(self.repo.tree, [self.repo]) plots = _collect_plots(self.repo, targets, rev) for path_info, props in plots.items(): datafile = relpath(path_info, self.repo.root_dir) diff --git a/dvc/repo/tree.py b/dvc/repo/tree.py index 2b7810c29d..4f380535f3 100644 --- a/dvc/repo/tree.py +++ b/dvc/repo/tree.py @@ -1,5 +1,8 @@ import logging import os +from typing import Optional + +from funcy import cached_property from dvc.dvcfile import is_valid_filename from dvc.exceptions import OutputNotFoundError @@ -245,78 +248,117 @@ class RepoTree(BaseTree): # pylint:disable=abstract-method Any kwargs will be passed to `DvcTree()`. """ - def __init__(self, repo, **kwargs): - super().__init__(repo, {"url": repo.root_dir}) - if hasattr(repo, "dvc_dir"): - self.dvctree = DvcTree(repo, **kwargs) - else: - # git-only erepo's do not need dvctree - self.dvctree = None + def __init__( + self, tree, subrepos=None, **kwargs + ): # pylint: disable=super-init-not-called + subrepos = subrepos or [] + subrepos.sort(key=lambda r: len(r.root_dir), reverse=True) + dvctrees = [ + (os.path.abspath(repo.root_dir), DvcTree(repo, **kwargs)) + for repo in subrepos + ] + self._dvctrees = dict( + sorted(dvctrees, key=lambda v: len(v[0]), reverse=True) + ) + self.tree = tree + self._kwargs = kwargs @property def fetch(self): - if self.dvctree: - return self.dvctree.fetch - return False + return self._kwargs.get("fetch", False) @property def stream(self): - if self.dvctree: - return self.dvctree.stream - return False + return self._kwargs.get("stream", False) + + def _find_subtree(self, path_prefix) -> Optional[DvcTree]: + return self._find_subtree_with_prefix(path_prefix)[1] + + def _find_subtree_with_prefix(self, path): + # dvctrees is already ordered from low to high + path_prefix = os.path.abspath(path) + exact_match = self._dvctrees.get(path_prefix) + if exact_match: + return path_prefix, exact_match + + for pref, tree in self._dvctrees.items(): + if path_prefix.startswith(pref + os.sep): + return pref, tree + return "", None def open( self, path, mode="r", encoding="utf-8", **kwargs ): # pylint: disable=arguments-differ if "b" in mode: encoding = None + else: + encoding = encoding or "utf-8" - if self.dvctree and self.dvctree.exists(path): - return self.dvctree.open( - path, mode=mode, encoding=encoding, **kwargs - ) - return self.repo.tree.open(path, mode=mode, encoding=encoding) + subtree = self._find_subtree(path) + if subtree and subtree.exists(path): + return subtree.open(path, mode=mode, encoding=encoding, **kwargs) + return self.tree.open(path, mode=mode, encoding=encoding) - def exists( - self, path, use_dvcignore=True - ): # pylint: disable=arguments-differ - return self.repo.tree.exists(path) or ( - self.dvctree and self.dvctree.exists(path) - ) + def open_by_relpath(self, path, *args, **kwargs): + return self.open(PathInfo(self.tree.tree_root) / path, *args, **kwargs) + + def exists(self, path): # pylint: disable=arguments-differ + subtree = self._find_subtree(path) + return self.tree.exists(path) or (subtree and subtree.exists(path)) def isdir(self, path): # pylint: disable=arguments-differ - return self.repo.tree.isdir(path) or ( - self.dvctree and self.dvctree.isdir(path) - ) + subtree = self._find_subtree(path) + return self.tree.isdir(path) or (subtree and subtree.isdir(path)) def isdvc(self, path, **kwargs): - return self.dvctree is not None and self.dvctree.isdvc(path, **kwargs) + subtree = self._find_subtree(path) + return subtree is not None and subtree.isdvc(path, **kwargs) def isfile(self, path): # pylint: disable=arguments-differ - return self.repo.tree.isfile(path) or ( - self.dvctree and self.dvctree.isfile(path) - ) + subtree = self._find_subtree(path) + return self.tree.isfile(path) or (subtree and subtree.isfile(path)) def isexec(self, path): - if self.dvctree and self.dvctree.exists(path): - return self.dvctree.isexec(path) - return self.repo.tree.isexec(path) + subtree = self._find_subtree(path) + if subtree and subtree.exists(path): + return subtree.isexec(path) + return self.tree.isexec(path) def stat(self, path): - return self.repo.tree.stat(path) + return self.tree.stat(path) - def _walk_one(self, walk): + def _dvc_walk(self, walk): try: root, dirs, files = next(walk) except StopIteration: return yield root, dirs, files for _ in dirs: - yield from self._walk_one(walk) - - def _walk(self, dvc_walk, repo_walk, dvcfiles=False): + yield from self._dvc_walk(walk) + + def _repo_walk(self, dir_path, walk, **kwargs): + assert os.path.isabs(dir_path) + subtree = self._dvctrees.get(dir_path) + if subtree: + dvc_walk = subtree.walk(dir_path, topdown=True) + yield from self._walk(walk, dvc_walk, **kwargs) + else: + try: + root, dirs, files = next(walk) + except StopIteration: + return + yield root, dirs, files + for dirname in dirs: + yield from self._repo_walk( + os.path.join(root, dirname), walk, **kwargs + ) + + def _walk(self, repo_walk, dvc_walk=None, dvcfiles=False): + assert repo_walk try: - _, dvc_dirs, dvc_fnames = next(dvc_walk) + _, dvc_dirs, dvc_fnames = ( + next(dvc_walk) if dvc_walk else (None, [], []) + ) repo_root, repo_dirs, repo_fnames = next(repo_walk) except StopIteration: return @@ -345,11 +387,15 @@ def _walk(self, dvc_walk, repo_walk, dvcfiles=False): for dirname in dirs: if dirname in shared: - yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) + yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) elif dirname in dvc_set: - yield from self._walk_one(dvc_walk) + yield from self._dvc_walk(dvc_walk) elif dirname in repo_set: - yield from self._walk_one(repo_walk) + yield from self._repo_walk( + os.path.join(repo_root, dirname), + repo_walk, + dvcfiles=dvcfiles, + ) def walk( self, top, topdown=True, onerror=None, dvcfiles=False, **kwargs @@ -379,22 +425,25 @@ def walk( onerror(NotADirectoryError(top)) return - dvc_exists = self.dvctree and self.dvctree.exists(top) - repo_exists = self.repo.tree.exists(top) + subtree = self._find_subtree(top) + dvc_exists = subtree and subtree.exists(top) + repo_exists = self.tree.exists(top) if dvc_exists and not repo_exists: - yield from self.dvctree.walk( + yield from subtree.walk( top, topdown=topdown, onerror=onerror, **kwargs ) - return - if repo_exists and not dvc_exists: - yield from self.repo.tree.walk( - top, topdown=topdown, onerror=onerror + if not dvc_exists and not repo_exists: + repo_walk = self.tree.walk(top, topdown=topdown, onerror=onerror) + yield from self._repo_walk( + os.path.abspath(top), repo_walk, dvcfiles=dvcfiles ) - return - - dvc_walk = self.dvctree.walk(top, topdown=topdown, **kwargs) - repo_walk = self.repo.tree.walk(top, topdown=topdown) - yield from self._walk(dvc_walk, repo_walk, dvcfiles=dvcfiles) + dvc_walk = ( + subtree.walk(top, topdown=topdown, **kwargs) + if dvc_exists + else None + ) + repo_walk = self.tree.walk(top, topdown=topdown) + yield from self._walk(repo_walk, dvc_walk, dvcfiles=dvcfiles) def walk_files(self, top, **kwargs): # pylint: disable=arguments-differ for root, _, files in self.walk(top, **kwargs): @@ -410,11 +459,10 @@ def get_file_hash(self, path_info): """ if not self.exists(path_info): raise FileNotFoundError - if self.dvctree and self.dvctree.exists(path_info): - try: - return self.dvctree.get_file_hash(path_info) - except OutputNotFoundError: - pass + subtree = self._find_subtree(path_info) + subtree_exists = subtree is not None and subtree.exists(path_info) + if subtree_exists: + return subtree.get_file_hash(path_info) return file_md5(path_info, self)[0] def copytree(self, top, dest): @@ -439,6 +487,9 @@ def copytree(self, top, dest): with self.open(src, mode="rb") as fobj: copy_fobj_to_file(fobj, dest_dir / fname) - @property - def hash_jobs(self): # pylint: disable=invalid-overridden-method - return self.repo.tree.hash_jobs + @cached_property + def hash_jobs(self): + return self.tree.hash_jobs + + def in_subtree(self, path): + return self._find_subtree(path) diff --git a/dvc/subrepos.py b/dvc/subrepos.py new file mode 100644 index 0000000000..a51a651e3c --- /dev/null +++ b/dvc/subrepos.py @@ -0,0 +1,13 @@ +import os + +from funcy import collecting + + +@collecting +def find(tree, top=None, skip_top_level=True): + top = top or tree.tree_root + for root, _, _ in tree.walk(top): + if skip_top_level and top == root: + continue + if tree.isdir(os.path.join(root, ".dvc")): + yield root diff --git a/dvc/tree/_debug.py b/dvc/tree/_debug.py new file mode 100644 index 0000000000..1d86ff7aae --- /dev/null +++ b/dvc/tree/_debug.py @@ -0,0 +1,22 @@ +import os + +from funcy import post_processing + + +@post_processing("\r\n".join) +def visualize(tree, top, **kwargs): + """`tree`-like output, useful for debugging/visualizing, needs `walk()`""" + indent = 4 + spacing = " " * indent + tee = "├── " + last = "└── " + for root, _, files in tree.walk(top, **kwargs): + level = root.replace(top, "").count(os.sep) + indent = spacing * level + yield "{}{}/".format(indent, os.path.basename(root)) + sub_indent = spacing * (level + 1) + length = len(files) + for i, f in enumerate(files): + yield "{}{}{}".format( + sub_indent, tee if i + 1 != length else last, f + ) diff --git a/dvc/tree/git.py b/dvc/tree/git.py index edf49964a7..4fd69a4501 100644 --- a/dvc/tree/git.py +++ b/dvc/tree/git.py @@ -24,7 +24,14 @@ def _item_basename(item): class GitTree(BaseTree): # pylint:disable=abstract-method """Proxies the repo file access methods to Git objects""" - def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None): + def __init__( + self, + git, + rev, + use_dvcignore=False, + dvcignore_root=None, + ignore_subrepo=True, + ): """Create GitTree instance Args: @@ -36,6 +43,7 @@ def __init__(self, git, rev, use_dvcignore=False, dvcignore_root=None): self.rev = rev self.use_dvcignore = use_dvcignore self.dvcignore_root = dvcignore_root + self.ignore_subrepo = ignore_subrepo @property def tree_root(self): @@ -46,13 +54,18 @@ def dvcignore(self): from dvc.ignore import DvcIgnoreFilter, DvcIgnoreFilterNoop root = self.dvcignore_root or self.tree_root - cls = DvcIgnoreFilter if self.use_dvcignore else DvcIgnoreFilterNoop - return cls(self, root) + if self.use_dvcignore: + return DvcIgnoreFilter( + self, root, ignore_subrepo=self.ignore_subrepo + ) + return DvcIgnoreFilterNoop(self, root) def open( self, path, mode="r", encoding="utf-8" ): # pylint: disable=arguments-differ assert mode in {"r", "rb"} + if "b" not in mode: + encoding = encoding or "utf-8" relative_path = relpath(path, self.git.working_dir) diff --git a/dvc/tree/local.py b/dvc/tree/local.py index bfce3345f8..dca87d1a5a 100644 --- a/dvc/tree/local.py +++ b/dvc/tree/local.py @@ -36,12 +36,20 @@ class LocalTree(BaseTree): CACHE_MODE = 0o444 SHARED_MODE_MAP = {None: (0o644, 0o755), "group": (0o664, 0o775)} - def __init__(self, repo, config, use_dvcignore=False, dvcignore_root=None): + def __init__( + self, + repo, + config, + use_dvcignore=False, + dvcignore_root=None, + ignore_subrepo=True, + ): super().__init__(repo, config) url = config.get("url") self.path_info = self.PATH_CLS(url) if url else None self.use_dvcignore = use_dvcignore self.dvcignore_root = dvcignore_root + self.ignore_subrepo = ignore_subrepo @property def tree_root(self): @@ -58,8 +66,11 @@ def dvcignore(self): from dvc.ignore import DvcIgnoreFilter, DvcIgnoreFilterNoop root = self.dvcignore_root or self.tree_root - cls = DvcIgnoreFilter if self.use_dvcignore else DvcIgnoreFilterNoop - return cls(self, root) + if self.use_dvcignore: + return DvcIgnoreFilter( + self, root, ignore_subrepo=self.ignore_subrepo + ) + return DvcIgnoreFilterNoop(self, root) @staticmethod def open(path_info, mode="r", encoding=None): diff --git a/tests/dir_helpers.py b/tests/dir_helpers.py index 02ae154fd6..ec192a77ba 100644 --- a/tests/dir_helpers.py +++ b/tests/dir_helpers.py @@ -87,7 +87,7 @@ def __new__(cls, *args, **kwargs): self._init() return self - def init(self, *, scm=False, dvc=False): + def init(self, *, scm=False, dvc=False, subdir=False): from dvc.repo import Repo from dvc.scm.git import Git @@ -100,7 +100,9 @@ def init(self, *, scm=False, dvc=False): git_init(str_path) if dvc: self.dvc = Repo.init( - str_path, no_scm=not scm and not hasattr(self, "scm") + str_path, + no_scm=not scm and not hasattr(self, "scm"), + subdir=subdir, ) if scm: self.scm = self.dvc.scm if hasattr(self, "dvc") else Git(str_path) @@ -123,10 +125,10 @@ def gen(self, struct, text=""): if isinstance(struct, (str, bytes, pathlib.PurePath)): struct = {struct: text} - self._gen(struct) - return struct.keys() + return self._gen(struct) def _gen(self, struct, prefix=None): + paths = [] for name, contents in struct.items(): path = (prefix or self) / name @@ -141,6 +143,8 @@ def _gen(self, struct, prefix=None): path.write_bytes(contents) else: path.write_text(contents, encoding="utf-8") + paths.append(path) + return paths def dvc_gen(self, struct, text="", commit=None): paths = self.gen(struct, text) @@ -249,10 +253,10 @@ class PosixTmpDir(TmpDir, pathlib.PurePosixPath): @pytest.fixture(scope="session") def make_tmp_dir(tmp_path_factory, request): - def make(name, *, scm=False, dvc=False): + def make(name, *, scm=False, dvc=False, **kwargs): path = tmp_path_factory.mktemp(name) if isinstance(name, str) else name new_dir = TmpDir(path) - new_dir.init(scm=scm, dvc=dvc) + new_dir.init(scm=scm, dvc=dvc, **kwargs) request.addfinalizer(new_dir.close) return new_dir diff --git a/tests/func/test_api.py b/tests/func/test_api.py index 6920df5654..4c8b9b2e29 100644 --- a/tests/func/test_api.py +++ b/tests/func/test_api.py @@ -7,6 +7,7 @@ from dvc.exceptions import FileMissingError from dvc.path_info import URLInfo from dvc.utils.fs import remove +from tests.func.test_get import make_subrepo cloud_names = [ "s3", @@ -173,3 +174,87 @@ def test_open_not_cached(dvc): os.remove(metric_file) with pytest.raises(FileMissingError): api.read(metric_file) + + +def test_read_with_subrepos(tmp_dir, scm, local_cloud): + # create a repo in "sub1", "sub2" and nested dvc repo in "sub2/nested" + # each having a dvc-tracked file foo.txt with text of location: + # eg: "sub1-foo.txt", etc. And "foo.txt" and "sub1/bar.txt" are git tracked + tmp_dir.scm_gen("foo.txt", "foo.txt", commit="FOO") + for path in ["sub1", "sub2", os.path.join("sub2", "nested")]: + repo = tmp_dir / path + make_subrepo(repo, scm, config=local_cloud.config) + with repo.chdir(): + text = os.fspath((repo / "foo.txt").relative_to(tmp_dir)).replace( + os.sep, "-" + ) + repo.dvc_gen({"foo.txt": text}, commit=f"commit for path {path}") + repo.dvc.push() + + tmp_dir.scm_gen( + {"sub1": {"bar.txt": "sub1-bar.txt"}}, commit="add sub1/bar.txt" + ) + repo_path = f"file://{tmp_dir}" # forcing it to load as ExternalRepo + + assert api.read("foo.txt", repo=repo_path) == "foo.txt" + assert api.read("sub1/foo.txt", repo=repo_path) == "sub1-foo.txt" + assert api.read("sub1/bar.txt", repo=repo_path) == "sub1-bar.txt" + assert api.read("sub2/foo.txt", repo=repo_path) == "sub2-foo.txt" + + nested_path = os.path.join("sub2", "nested", "foo.txt") + assert api.read(nested_path, repo=repo_path) == "sub2-nested-foo.txt" + + +def test_get_url_subrepo_git_repo(tmp_dir, scm, local_cloud): + remote_url = local_cloud.config["url"] + checksums = [] + paths = ["sub1", "sub2", os.path.join("sub2", "nested")] + for path in paths: + repo = tmp_dir / path + make_subrepo(repo, scm, config=local_cloud.config) + with repo.chdir(): + text = os.fspath((repo / "foo.txt").relative_to(tmp_dir)).replace( + "/", "-" + ) + (stage,) = repo.dvc_gen( + {"foo.txt": text}, commit=f"commit for path {path}" + ) + repo.dvc.push() + checksum = stage.outs[0].info["md5"] + checksums.append(checksum) + + repo_path = f"file://{tmp_dir}" # forcing it to load as ExternalRepo + + for path, c in zip(paths, checksums): + url = api.get_url(os.path.join(path, "foo.txt"), repo=repo_path) + assert url == os.path.join(os.path.relpath(remote_url), c[:2], c[2:]) + + +def test_get_url_subrepo_dvc_on_toplevel(tmp_dir, dvc, scm, local_remote): + remote_url = local_remote.config["url"] + + (stage,) = tmp_dir.dvc_gen("foo.txt", "foo.txt", commit="foo.txt") + checksums = [stage.outs[0].checksum] + tmp_dir.dvc.push() + + paths = ["sub1", "sub2", os.path.join("sub2", "nested")] + for path in paths: + repo = tmp_dir / path + make_subrepo(repo, scm, config=local_remote.config) + with repo.chdir(): + text = os.fspath((repo / "foo.txt").relative_to(tmp_dir)).replace( + "/", "-" + ) + (stage,) = repo.dvc_gen( + {"foo.txt": text}, commit=f"commit for path {path}" + ) + repo.dvc.push() + checksum = stage.outs[0].checksum + checksums.append(checksum) + + paths = [""] + paths + repo_path = f"file://{tmp_dir}" # forcing it to load as ExternalRepo + + for path, c in zip(paths, checksums): + url = api.get_url(os.path.join(path, "foo.txt"), repo=repo_path) + assert url == os.path.join(os.path.relpath(remote_url), c[:2], c[2:]) diff --git a/tests/func/test_data_cloud.py b/tests/func/test_data_cloud.py index 33381c6eff..ff9cbf99f7 100644 --- a/tests/func/test_data_cloud.py +++ b/tests/func/test_data_cloud.py @@ -311,6 +311,37 @@ def test_pull_external_dvc_imports(tmp_dir, dvc, scm, erepo_dir): assert (tmp_dir / "new_dir" / "bar").read_text() == "bar" +def test_pull_subrepo_imports(tmp_dir, dvc, scm, erepo_dir, local_cloud): + from tests.func.test_get import make_subrepo + + dir1 = {"subdir1": {"file1": "foo1"}} + dir2 = {"subdir2": {"file2": "foo2"}} + + with erepo_dir.chdir(): + erepo_dir.dvc_gen("foo", "foo", commit="first") + erepo_dir.dvc_gen({"dir1": dir1}, commit="second") + erepo_dir.scm_gen({"lorem": "lorem"}, commit="third") + + subrepo = erepo_dir / "sub" + make_subrepo(subrepo, erepo_dir.scm, local_cloud.config) + with subrepo.chdir(): + subrepo.dvc_gen({"dir2": dir2}, commit="subrepo") + subrepo.dvc.push() + + dvc.imp(os.fspath(erepo_dir), "foo") + dvc.imp(os.fspath(erepo_dir), "dir1") + dvc.imp(os.fspath(erepo_dir), os.path.join("sub", "dir2")) + + assert dvc.pull()["fetched"] == 0 + + clean(["dir1", "dir2", "foo"], dvc) + + assert dvc.pull(force=True)["fetched"] == 3 + assert (tmp_dir / "foo").read_text() == "foo" + assert (tmp_dir / "dir1").read_text() == dir1 + assert (tmp_dir / "dir2").read_text() == dir2 + + def clean(outs, dvc=None): if dvc: outs = outs + [dvc.cache.local.cache_dir] diff --git a/tests/func/test_external_repo.py b/tests/func/test_external_repo.py index 50b142dd48..bb2a06e79e 100644 --- a/tests/func/test_external_repo.py +++ b/tests/func/test_external_repo.py @@ -1,9 +1,14 @@ import os +import pytest from mock import ANY, patch -from dvc.external_repo import CLONES, external_repo -from dvc.path_info import PathInfo +from dvc.external_repo import ( + CLONES, + ExternalDVCRepo, + ExternalGitRepo, + external_repo, +) from dvc.scm.git import Git from dvc.tree.local import LocalTree from dvc.utils import relpath @@ -20,11 +25,13 @@ def test_external_repo(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock: with external_repo(url) as repo: - with repo.open_by_relpath("file") as fd: + assert isinstance(repo, ExternalDVCRepo) + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "master" with external_repo(url, rev="branch") as repo: - with repo.open_by_relpath("file") as fd: + assert isinstance(repo, ExternalDVCRepo) + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "branch" assert mock.call_count == 1 @@ -33,6 +40,7 @@ def test_external_repo(erepo_dir): def test_source_change(erepo_dir): url = os.fspath(erepo_dir) with external_repo(url) as repo: + assert isinstance(repo, ExternalDVCRepo) old_rev = repo.scm.get_rev() erepo_dir.scm_gen("file", "text", commit="a change") @@ -43,7 +51,7 @@ def test_source_change(erepo_dir): assert old_rev != new_rev -def test_cache_reused(erepo_dir, mocker, local_cloud): +def test_cache_reused(tmp_dir, erepo_dir, mocker, local_cloud): erepo_dir.add_remote(config=local_cloud.config) with erepo_dir.chdir(): erepo_dir.dvc_gen("file", "text", commit="add file") @@ -54,13 +62,15 @@ def test_cache_reused(erepo_dir, mocker, local_cloud): # Use URL to prevent any fishy optimizations url = f"file://{erepo_dir}" with external_repo(url) as repo: - repo.fetch() + assert isinstance(repo, ExternalDVCRepo) + repo.get_external("file", tmp_dir / "file1") assert download_spy.mock.call_count == 1 # Should not download second time erepo_dir.scm.branch("branch") with external_repo(url, "branch") as repo: - repo.fetch() + assert isinstance(repo, ExternalDVCRepo) + repo.get_external("file", tmp_dir / "file2") assert download_spy.mock.call_count == 1 @@ -90,10 +100,7 @@ def test_pull_subdir_file(tmp_dir, erepo_dir): dest = tmp_dir / "file" with external_repo(os.fspath(erepo_dir)) as repo: - _, _, save_infos = repo.fetch_external( - [os.path.join("subdir", "file")] - ) - repo.cache.local.checkout(PathInfo(dest), save_infos[0]) + repo.get_external(os.path.join("subdir", "file"), dest) assert dest.is_file() assert dest.read_text() == "contents" @@ -117,9 +124,10 @@ def test_relative_remote(erepo_dir, tmp_dir): url = os.fspath(erepo_dir) with external_repo(url) as repo: - assert os.path.isabs(repo.config["remote"]["upstream"]["url"]) - assert os.path.isdir(repo.config["remote"]["upstream"]["url"]) - with repo.open_by_relpath("file") as fd: + for subrepo in repo.repos: + assert os.path.isabs(subrepo.config["remote"]["upstream"]["url"]) + assert os.path.isdir(subrepo.config["remote"]["upstream"]["url"]) + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "contents" @@ -133,7 +141,7 @@ def test_shallow_clone_branch(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: with external_repo(url, rev="branch") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "branch" mock_clone.assert_called_with(url, ANY, shallow_branch="branch") @@ -141,7 +149,7 @@ def test_shallow_clone_branch(erepo_dir): assert shallow with external_repo(url) as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "master" assert mock_clone.call_count == 1 @@ -159,7 +167,7 @@ def test_shallow_clone_tag(erepo_dir): with patch.object(Git, "clone", wraps=Git.clone) as mock_clone: with external_repo(url, rev="v1") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "foo" mock_clone.assert_called_with(url, ANY, shallow_branch="v1") @@ -167,9 +175,30 @@ def test_shallow_clone_tag(erepo_dir): assert shallow with external_repo(url, rev="master") as repo: - with repo.open_by_relpath("file") as fd: + with repo.repo_tree.open_by_relpath("file") as fd: assert fd.read() == "bar" assert mock_clone.call_count == 1 _, shallow = CLONES[url] assert not shallow + + +@pytest.mark.parametrize( + "repo_class, erepo", + [ + (ExternalDVCRepo, pytest.lazy_fixture("erepo_dir")), + (ExternalGitRepo, pytest.lazy_fixture("git_dir")), + ], +) +def test_subrepo(erepo, repo_class): + from tests.func.test_get import make_subrepo + + subrepo = erepo / "sub" + nested = erepo / "sub" / "nested" + make_subrepo(subrepo, erepo.scm) + make_subrepo(nested, erepo.scm) + with external_repo(os.fspath(erepo), rev="master") as repo: + assert isinstance(repo, repo_class) + assert { + os.path.relpath(r.root_dir, repo.root_dir) for r in repo.subrepos + } == {"sub", os.path.join("sub", "nested")} diff --git a/tests/func/test_get.py b/tests/func/test_get.py index 72736f8077..59651ba549 100644 --- a/tests/func/test_get.py +++ b/tests/func/test_get.py @@ -9,7 +9,7 @@ from dvc.repo import Repo from dvc.repo.get import GetDVCFileError from dvc.system import System -from dvc.utils.fs import makedirs +from dvc.utils.fs import makedirs, remove def test_get_repo_file(tmp_dir, erepo_dir): @@ -98,22 +98,6 @@ def test_get_a_dvc_file(tmp_dir, erepo_dir): Repo.get(os.fspath(erepo_dir), "some_file.dvc") -# https://github.com/iterative/dvc/pull/2837#discussion_r352123053 -def test_get_full_dvc_path(tmp_dir, erepo_dir, tmp_path_factory): - path = tmp_path_factory.mktemp("ext") - external_data = path / "ext_data" - external_data.write_text("ext_data") - - with erepo_dir.chdir(): - erepo_dir.dvc.add(os.fspath(external_data), external=True) - erepo_dir.scm_add("ext_data.dvc", commit="add external data") - - Repo.get( - os.fspath(erepo_dir), os.fspath(external_data), "ext_data_imported" - ) - assert (tmp_dir / "ext_data_imported").read_text() == "ext_data" - - def test_non_cached_output(tmp_dir, erepo_dir): src = "non_cached_file" dst = src + "_imported" @@ -252,3 +236,48 @@ def test_get_pipeline_tracked_outs( with git_dir.chdir(): Repo.get("file:///{}".format(os.fspath(tmp_dir)), "bar", out="baz") assert (git_dir / "baz").read_text() == "foo" + + +def make_subrepo(dir_, scm, config=None): + dir_.mkdir(parents=True) + with dir_.chdir(): + dir_.scm = scm + dir_.init(dvc=True, subdir=True) + if config: + dir_.add_remote(config=config) + + +@pytest.mark.parametrize( + "erepo", + [pytest.lazy_fixture("erepo_dir"), pytest.lazy_fixture("git_dir")], +) +def test_subrepo(tmp_dir, erepo, local_cloud): + def scm_framework(p): + return {"foo": p + "foo-scm", "scm_dir": {"lorem": p + "scm-lorem"}} + + def output_framework(p): + return {"bar": p + "dvc-bar", "dvc_dir": {"ipsum": p + "dvc-ipsum"}} + + subrepo = ["sub1", "sub2", os.path.join("sub1", "nested")] + for path in subrepo: + repo = erepo / path + make_subrepo(repo, erepo.scm, local_cloud.config) + repo.dvc_gen(output_framework(path), commit="add dvc outputs") + repo.scm_gen(scm_framework(path), commit="add git files") + repo.dvc.push() + + for is_dvc, file in [ + (False, "foo"), + (True, "dvc_dir"), + (False, "scm_dir"), + (True, "bar"), + ]: + for repo in subrepo: + Repo.get( + f"{erepo}", os.path.join(repo, file), + ) + framework = output_framework if is_dvc else scm_framework + assert (tmp_dir / file).read_text() == framework(repo)[ + file + ], f"Failed for repo: {repo} for file: {file}" + remove(tmp_dir / file) diff --git a/tests/func/test_import.py b/tests/func/test_import.py index 4e38f70d89..d55dc0585c 100644 --- a/tests/func/test_import.py +++ b/tests/func/test_import.py @@ -9,8 +9,10 @@ from dvc.config import NoRemoteError from dvc.dvcfile import Dvcfile from dvc.exceptions import DownloadError, PathMissingError +from dvc.scm.base import CloneError from dvc.system import System from dvc.utils.fs import makedirs, remove +from tests.func.test_get import make_subrepo def test_import(tmp_dir, scm, dvc, erepo_dir): @@ -234,7 +236,8 @@ def test_download_error_pulling_imported_stage(tmp_dir, dvc, erepo_dir): remove(dst_cache) with patch( - "dvc.tree.local.LocalTree._download", side_effect=Exception + "dvc.external_repo.external_repo", + side_effect=CloneError(os.fspath(erepo_dir), "somewhere"), ), pytest.raises(DownloadError): dvc.pull(["foo_imported.dvc"]) @@ -345,3 +348,45 @@ def test_local_import(tmp_dir, dvc, scm): tmp_dir.dvc_gen("foo", "foo", commit="init") (tmp_dir / "outdir").mkdir() dvc.imp(".", "foo", out="outdir") + + +@pytest.mark.parametrize( + "output", + [ + "foo", + {"foo": "foo", "bar": "bar"}, + {"subdir": {"foo": "foo", "bar": "bar"}}, + ], + ids=["file", "dir", "nested_dir"], +) +@pytest.mark.parametrize( + "erepo", [pytest.lazy_fixture("erepo_dir"), pytest.lazy_fixture("git_dir")] +) +@pytest.mark.parametrize( + "subrepo_paths", + [ + (os.path.join("sub", "subdir1"), os.path.join("sub", "subdir2")), + (os.path.join("sub"), os.path.join("sub", "subdir1")), + ], + ids=["isolated", "nested"], +) +def test_subrepo_import( + tmp_dir, scm, dvc, output, subrepo_paths, erepo, local_cloud +): + sub_repos = [erepo / path for path in subrepo_paths] + filename = "output" + for repo in sub_repos: + make_subrepo(repo, erepo.scm, local_cloud.config) + repo.dvc_gen({filename: output}, commit="add subrepo") + repo.dvc.push() + + rev = erepo.scm.get_rev() + for i, repo in enumerate(sub_repos): + url = f"file:///{erepo}" + file = str((repo / filename).relative_to(erepo)) + out = f"{filename}-{i}" + + stage = dvc.imp(url, file, out=out) + + assert stage.deps[0].def_repo == {"url": url, "rev_lock": rev} + assert (tmp_dir / out).read_text() == output diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py index 85dd9eb1ba..7602991930 100644 --- a/tests/func/test_ls.py +++ b/tests/func/test_ls.py @@ -1,6 +1,7 @@ import os import shutil import textwrap +from operator import itemgetter import pytest @@ -502,3 +503,109 @@ def _ls(path): {"isdir": False, "isexec": 0, "isout": False, "path": "bar"}, {"isdir": False, "isexec": 0, "isout": False, "path": "foo"}, ] + + +@pytest.mark.parametrize( + "dvc_top_level, erepo", + [ + (True, pytest.lazy_fixture("erepo_dir")), + (False, pytest.lazy_fixture("git_dir")), + ], +) +def test_subrepo(dvc_top_level, erepo): + from tests.func.test_get import make_subrepo + + # setup foo.txt and dvc_dir in each subrepo and bar.txt and scm_dir as scm + # tracked files + dvc_files = {"foo.txt": "foo.txt", "dvc_dir": {"lorem": "lorem"}} + scm_files = {"bar.txt": "bar.txt", "scm_dir": {"ipsum": "ipsum"}} + with erepo.chdir(): + erepo.scm_gen(scm_files, commit="bar.txt") + if dvc_top_level: + erepo.dvc_gen(dvc_files, commit="foo.txt") + for path in ["sub1", "sub2", os.path.join("sub2", "nested")]: + repo = erepo / path + make_subrepo(repo, erepo.scm) + with repo.chdir(): + repo.dvc_gen(dvc_files, commit=f"dvc track for path {path}") + repo.scm_gen(scm_files, commit=f"scm track for path {path}") + + def _ls(path=None): + return Repo.ls(os.fspath(erepo), path) + + git_tracked_outputs = [ + # these outputs will be used if top-level is not a dvc repo, + # but just a plain git repo + {"isdir": False, "isexec": False, "isout": False, "path": "bar.txt"}, + {"isdir": True, "isexec": 0, "isout": False, "path": "scm_dir"}, + ] + + # Repo.ls output is sorted alphabetically by path + common_outputs = sorted( + [ + *git_tracked_outputs, + { + "isdir": False, + "isexec": 0, + "isout": False, + "path": ".gitignore", + }, + {"isdir": True, "isexec": False, "isout": True, "path": "dvc_dir"}, + { + "isdir": False, + "isexec": False, + "isout": True, + "path": "foo.txt", + }, + { + "isdir": False, + "isexec": 0, + "isout": False, + "path": "foo.txt.dvc", + }, + { + "isdir": False, + "isexec": 0, + "isout": False, + "path": "dvc_dir.dvc", + }, + ], + key=itemgetter("path"), + ) + + top_level_outputs = ( + common_outputs if dvc_top_level else git_tracked_outputs + ) + assert _ls() == sorted( + [ + *top_level_outputs, + {"isdir": True, "isexec": False, "isout": False, "path": "sub1"}, + {"isdir": True, "isexec": False, "isout": False, "path": "sub2"}, + ], + key=itemgetter("path"), + ) + + assert _ls("sub1") == common_outputs + assert _ls("sub2") == sorted( + [ + *common_outputs, + {"isdir": True, "isexec": False, "isout": False, "path": "nested"}, + ], + key=itemgetter("path"), + ) + assert _ls(os.path.join("sub2", "nested")) == common_outputs + + # testing for contents of a directory + for path in [os.curdir, "sub1", "sub2", os.path.join("sub2", "nested")]: + if path == os.curdir and dvc_top_level: + assert _ls(os.path.join(path, "dvc_dir")) == [ + { + "isdir": False, + "isexec": False, + "isout": False, + "path": "lorem", + }, + ] + assert _ls(os.path.join(path, "scm_dir")) == [ + {"isdir": False, "isexec": False, "isout": False, "path": "ipsum"}, + ] diff --git a/tests/func/test_tree.py b/tests/func/test_tree.py index 93b671bd75..74585c0e1d 100644 --- a/tests/func/test_tree.py +++ b/tests/func/test_tree.py @@ -184,7 +184,7 @@ def test_repotree_walk_fetch(tmp_dir, dvc, scm, local_remote): dvc.push() remove(dvc.cache.local.cache_dir) - tree = RepoTree(dvc, fetch=True) + tree = RepoTree(dvc.tree, [dvc], fetch=True) with dvc.state: for _, _, _ in tree.walk("dir"): pass @@ -208,7 +208,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud): # # for this test, all file objects are being opened() and copied from tree # into dvc.cache, not fetched or streamed from a remote - tree = RepoTree(erepo_dir.dvc, stream=True) + tree = RepoTree(erepo_dir.dvc.tree, [erepo_dir.dvc], stream=True) expected = [ tree.get_file_hash(PathInfo(erepo_dir / path)) for path in ("dir/bar", "dir/subdir/foo") diff --git a/tests/func/test_update.py b/tests/func/test_update.py index 12d0c9d61c..b46b82e8d4 100644 --- a/tests/func/test_update.py +++ b/tests/func/test_update.py @@ -1,4 +1,5 @@ import os +from copy import deepcopy import pytest @@ -282,3 +283,43 @@ def test_update_recursive(tmp_dir, dvc, erepo_dir): assert stage1.deps[0].def_repo["rev_lock"] == new_rev assert stage2.deps[0].def_repo["rev_lock"] == new_rev assert stage3.deps[0].def_repo["rev_lock"] == new_rev + + +def test_subrepo_update(tmp_dir, dvc, scm, erepo_dir, local_cloud): + from tests.func.test_get import make_subrepo + + dir1 = {"subdir1": {"foo1": "foo1"}} + dir2 = {"subdir2": {"foo2": "foo2"}} + + subrepo = erepo_dir / "sub" + make_subrepo(subrepo, erepo_dir.scm, local_cloud.config) + + with erepo_dir.chdir(): + erepo_dir.dvc_gen({"dir1": dir1}, commit="first") + with subrepo.chdir(): + subrepo.dvc_gen({"dir2": dir2}, commit="second") + subrepo.dvc.push() # FIXME: deleting this is failing the test, investigate + + old_rev = erepo_dir.scm.get_rev() + + dir1_new = deepcopy(dir1) + dir1_new["subdir1"].update({"bar1": "bar1"}) + dir2_new = deepcopy(dir2) + dir2_new["subdir2"].update({"bar2": "bar2"}) + + with erepo_dir.chdir(): + erepo_dir.dvc_gen({"dir1": dir1_new}, commit="third") + with subrepo.chdir(): + subrepo.dvc_gen({"dir2": dir2_new}, commit="third") + subrepo.dvc.push() + + dvc.imp(os.fspath(erepo_dir), "dir1", rev=old_rev) + dvc.imp(os.fspath(erepo_dir), os.path.join("sub", "dir2"), rev=old_rev) + + assert (tmp_dir / "dir1").read_text() == dir1 + assert (tmp_dir / "dir2").read_text() == dir2 + + dvc.update("dir1.dvc", rev="HEAD") + assert (tmp_dir / "dir1").read_text() == dir1_new + dvc.update("dir2.dvc", rev="HEAD") + assert (tmp_dir / "dir2").read_text() == dir2_new diff --git a/tests/unit/repo/test_repo_tree.py b/tests/unit/repo/test_repo_tree.py index 417b2c0b92..10b964ad55 100644 --- a/tests/unit/repo/test_repo_tree.py +++ b/tests/unit/repo/test_repo_tree.py @@ -1,10 +1,12 @@ import os import shutil +from unittest import mock import pytest from dvc.path_info import PathInfo from dvc.repo.tree import RepoTree +from tests.func.test_get import make_subrepo def test_exists(tmp_dir, dvc): @@ -12,7 +14,7 @@ def test_exists(tmp_dir, dvc): dvc.add("foo") (tmp_dir / "foo").unlink() - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.exists("foo") @@ -21,7 +23,7 @@ def test_open(tmp_dir, dvc): dvc.add("foo") (tmp_dir / "foo").unlink() - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with dvc.state: with tree.open("foo", "r") as fobj: assert fobj.read() == "foo" @@ -31,7 +33,7 @@ def test_open_dirty_hash(tmp_dir, dvc): tmp_dir.dvc_gen("file", "file") (tmp_dir / "file").write_text("something") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("file", "r") as fobj: assert fobj.read() == "something" @@ -40,7 +42,7 @@ def test_open_dirty_no_hash(tmp_dir, dvc): tmp_dir.gen("file", "file") (tmp_dir / "file.dvc").write_text("outs:\n- path: file\n") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("file", "r") as fobj: assert fobj.read() == "file" @@ -60,7 +62,7 @@ def test_open_in_history(tmp_dir, scm, dvc): if rev == "workspace": continue - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) with tree.open("foo", "r") as fobj: assert fobj.read() == "foo" @@ -68,7 +70,7 @@ def test_open_in_history(tmp_dir, scm, dvc): def test_isdir_isfile(tmp_dir, dvc): tmp_dir.gen({"datafile": "data", "datadir": {"foo": "foo", "bar": "bar"}}) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdir("datadir") assert not tree.isfile("datadir") assert not tree.isdvc("datadir") @@ -93,7 +95,7 @@ def test_isdir_mixed(tmp_dir, dvc): dvc.add(str(tmp_dir / "dir" / "foo")) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdir("dir") assert not tree.isfile("dir") @@ -123,7 +125,7 @@ def test_walk(tmp_dir, dvc, dvcfiles, extra_expected): ) dvc.add(str(tmp_dir / "dir"), recursive=True) tmp_dir.gen({"dir": {"foo": "foo", "bar": "bar"}}) - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) expected = [ PathInfo("dir") / "subdir1", @@ -150,7 +152,7 @@ def onerror(exc): raise exc tmp_dir.dvc_gen("foo", "foo") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) # path does not exist for _ in tree.walk("dir"): @@ -171,9 +173,177 @@ def test_isdvc(tmp_dir, dvc): tmp_dir.gen({"foo": "foo", "bar": "bar", "dir": {"baz": "baz"}}) dvc.add("foo") dvc.add("dir") - tree = RepoTree(dvc) + tree = RepoTree(dvc.tree, [dvc]) assert tree.isdvc("foo") assert not tree.isdvc("bar") assert tree.isdvc("dir") assert not tree.isdvc("dir/baz") assert tree.isdvc("dir/baz", recursive=True, strict=False) + + +def test_in_subtree(tmp_dir, scm, dvc): + subrepo1 = tmp_dir / "dir" / "repo" + subrepo2 = tmp_dir / "dir" / "repo2" + + for repo in [subrepo1, subrepo2]: + make_subrepo(repo, scm) + + (tmp_dir / "dir" / "repotxt").write_text("file to confuse RepoTree") + subrepo1.dvc_gen({"foo": "foo"}, commit="FOO") + subrepo2.dvc_gen({"bar": "bar"}, commit="BAR") + + # dvc.tree ignores subrepos by default, + # but we just want to test `in_subtree()`, which is purely lexical + tree = RepoTree(dvc.tree, [dvc, subrepo1.dvc, subrepo2.dvc]) + + assert tree.in_subtree(str(tmp_dir / "dir")).repo == dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "re")).repo == dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "repo")).repo == subrepo1.dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "repotxt")).repo == dvc + assert tree.in_subtree(str(tmp_dir / "dir" / "repo2")).repo == subrepo2.dvc + + for repo in [tmp_dir, subrepo1, subrepo2]: + for path in ["", "foo", "something-that-does-not-exist"]: + p = os.path.join(repo, path) + subtree = tree.in_subtree(p) + assert subtree, f"subtree not found for path '{p}'" + assert ( + subtree.repo == repo.dvc + ), f"repo did not match for path '{p}'" + + +def test_subrepos(tmp_dir, scm, dvc): + tmp_dir.scm_gen( + {"dir": {"repo.txt": "file to confuse RepoTree"}}, + commit="dir/repo.txt", + ) + + subrepo1 = tmp_dir / "dir" / "repo" + subrepo2 = tmp_dir / "dir" / "repo2" + + for repo in [subrepo1, subrepo2]: + make_subrepo(repo, scm) + + subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO") + subrepo2.dvc_gen( + {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR" + ) + + # using tree that does not have dvcignore + tree = RepoTree( + scm.get_tree("HEAD"), [dvc, subrepo1.dvc, subrepo2.dvc], fetch=True + ) + + def assert_tree_belongs_to_repo(ret_val): + method = tree._find_subtree + + def f(*args, **kwargs): + r = method(*args, **kwargs) + assert r and r.repo == ret_val + return r + + return f + + with mock.patch.object( + tree, + "_find_subtree", + side_effect=assert_tree_belongs_to_repo(subrepo1.dvc), + ): + assert tree.exists(subrepo1 / "foo") is True + assert tree.exists(subrepo1 / "bar") is False + + assert tree.isfile(subrepo1 / "foo") is True + assert tree.isfile(subrepo1 / "foo") is True + assert tree.isfile(subrepo1 / "dir1" / "bar") is True + assert tree.isfile(subrepo1 / "dir1") is False + + assert tree.isdir(subrepo1 / "dir1") is True + assert tree.isdir(subrepo1 / "dir1" / "bar") is False + assert tree.isdvc(subrepo1 / "foo") is True + + with mock.patch.object( + tree, + "_find_subtree", + side_effect=assert_tree_belongs_to_repo(subrepo2.dvc), + ): + assert tree.exists(subrepo2 / "lorem") is True + assert tree.exists(subrepo2 / "ipsum") is False + + assert tree.isfile(subrepo2 / "lorem") is True + assert tree.isfile(subrepo2 / "lorem") is True + assert tree.isfile(subrepo2 / "dir2" / "ipsum") is True + assert tree.isfile(subrepo2 / "dir2") is False + + assert tree.isdir(subrepo2 / "dir2") is True + assert tree.isdir(subrepo2 / "dir2" / "ipsum") is False + assert tree.isdvc(subrepo2 / "lorem") is True + + +@pytest.mark.parametrize( + "dvcfiles,extra_expected", + [ + (False, []), + ( + True, + [ + PathInfo("dir") / "repo" / "foo.dvc", + PathInfo("dir") / "repo" / "dir1.dvc", + PathInfo("dir") / "repo2" / "lorem.dvc", + PathInfo("dir") / "repo2" / "dir2.dvc", + ], + ), + ], +) +@pytest.mark.parametrize("is_dvc", [True, False]) +def test_subrepo_walk(tmp_dir, dvcfiles, extra_expected, is_dvc): + tmp_dir.init(scm=True, dvc=is_dvc) + scm = tmp_dir.scm + + tmp_dir.scm_gen( + {"dir": {"repo.txt": "file to confuse RepoTree"}}, + commit="dir/repo.txt", + ) + + subrepo1 = tmp_dir / "dir" / "repo" + subrepo2 = tmp_dir / "dir" / "repo2" + + subdirs = [subrepo1, subrepo2] + for dir_ in subdirs: + make_subrepo(dir_, scm) + if is_dvc: + subdirs.append(tmp_dir) + + subrepo1.dvc_gen({"foo": "foo", "dir1": {"bar": "bar"}}, commit="FOO") + subrepo2.dvc_gen( + {"lorem": "lorem", "dir2": {"ipsum": "ipsum"}}, commit="BAR" + ) + + # using tree that does not have dvcignore + tree = RepoTree( + scm.get_tree("HEAD", use_dvcignore=True, ignore_subrepo=False), + [dir_.dvc for dir_ in subdirs], + fetch=True, + ) + expected = [ + PathInfo("dir") / "repo", + PathInfo("dir") / "repo.txt", + PathInfo("dir") / "repo2", + PathInfo("dir") / "repo" / ".gitignore", + PathInfo("dir") / "repo" / "foo", + PathInfo("dir") / "repo" / "dir1", + PathInfo("dir") / "repo" / "dir1" / "bar", + PathInfo("dir") / "repo2" / ".gitignore", + PathInfo("dir") / "repo2" / "lorem", + PathInfo("dir") / "repo2" / "dir2", + PathInfo("dir") / "repo2" / "dir2" / "ipsum", + ] + + actual = [] + for root, dirs, files in tree.walk("dir", dvcfiles=dvcfiles): + for entry in dirs + files: + actual.append(os.path.join(root, entry)) + + # need to change to abspath as scm.get_tree() makes `root` return abspath + expected = [str(tmp_dir / path) for path in expected + extra_expected] + assert set(actual) == set(expected) + assert len(actual) == len(expected)