Skip to content

Commit

Permalink
dvc: introduce DvcTree (#3639)
Browse files Browse the repository at this point in the history
This is useful on its own in the `dvc ls` implementation, but it is also a
prerequisite for `RepoTree`, which will combine git (workspace) files and
dvc files into one tree, allowing things like `dvc metrics` and
`external_repo` to have a universal interface to access whatever
file/dir they want without cumbersome git/dvc path detection.
  • Loading branch information
efiop committed Apr 15, 2020
1 parent 45c2c72 commit c60f364
Show file tree
Hide file tree
Showing 4 changed files with 178 additions and 106 deletions.
3 changes: 3 additions & 0 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ def isdir(self, path):
def isfile(self, path):
    """Report whether ``path`` is a file, as answered by the wrapped tree."""
    wrapped = self.tree
    return wrapped.isfile(path)

def isexec(self, path):
    """Report whether ``path`` is executable, as answered by the wrapped tree."""
    wrapped = self.tree
    return wrapped.isexec(path)

def walk(self, top, topdown=True):
for root, dirs, files in self.tree.walk(top, topdown):
dirs[:], files[:] = self.dvcignore(root, dirs, files)
Expand Down
160 changes: 54 additions & 106 deletions dvc/repo/ls.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
import stat

from dvc.exceptions import PathMissingError, OutputNotFoundError
from dvc.path_info import PathInfo
from dvc.exceptions import PathMissingError


@staticmethod
Expand Down Expand Up @@ -31,124 +29,74 @@ def ls(
"""
from dvc.external_repo import external_repo
from dvc.repo import Repo
from dvc.utils import relpath

with external_repo(url, rev) as repo:
path_info = _get_path_info(repo, path)
fs_nodes = []
path_info = PathInfo(repo.root_dir)
if path:
path_info /= path

ret = {}
if isinstance(repo, Repo):
fs_nodes.extend(_ls_outs_repo(repo, path_info, recursive))
ret = _ls(repo, path_info, recursive, True)

nondvc = {}
if not outs_only:
fs_nodes.extend(_ls_files_repo(path_info, recursive))
nondvc = _ls(repo, path_info, recursive, False)

if path and not fs_nodes:
raise PathMissingError(path, repo, output_only=outs_only)
ret.update(nondvc)

fs_nodes = {n["path_info"]: n for n in fs_nodes}.values()

def get_entry(fs_node):
node_path_info = fs_node["path_info"]
path = (
node_path_info.name
if node_path_info == path_info
else relpath(node_path_info, path_info)
)
return {
"path": path,
"isout": fs_node.get("isout", False),
"isdir": fs_node.get("isdir", False),
"isexec": fs_node.get("isexec", False),
}
if path and not ret:
raise PathMissingError(path, repo, output_only=outs_only)

entries = sorted(map(get_entry, fs_nodes), key=lambda f: f["path"])
return entries
ret_list = []
for path, info in ret.items():
info["path"] = path
ret_list.append(info)
ret_list.sort(key=lambda f: f["path"])
return ret_list


def _ls_files_repo(path_info, recursive=None):
from dvc.compat import fspath
def _ls(repo, path_info, recursive=None, dvc=False):
from dvc.ignore import CleanTree
from dvc.path_info import PathInfo
from dvc.repo.tree import DvcTree
from dvc.scm.tree import WorkingTree

if not os.path.exists(fspath(path_info)):
return []
if dvc:
tree = DvcTree(repo)
else:
tree = CleanTree(WorkingTree(repo.root_dir))

files = []
tree = CleanTree(WorkingTree(path_info))
ret = {}
try:
for dirpath, dirnames, filenames in tree.walk(path_info):
files.extend(PathInfo(dirpath, f) for f in filenames)
for root, dirs, files in tree.walk(path_info.fspath):
for fname in files:
info = PathInfo(root) / fname
path = str(info.relative_to(path_info))
ret[path] = {
"isout": dvc,
"isdir": False,
"isexec": False if dvc else tree.isexec(info.fspath),
}

if not recursive:
files.extend(PathInfo(dirpath, d) for d in dirnames)
for dname in dirs:
info = PathInfo(root) / dname
path = str(info.relative_to(path_info))
ret[path] = {
"isout": tree.isdvc(info.fspath) if dvc else False,
"isdir": True,
"isexec": False if dvc else tree.isexec(info.fspath),
}
break
except NotADirectoryError:
if os.path.isfile(fspath(path_info)):
files = [path_info]

return [_get_fs_node(f) for f in files]


def _ls_outs_repo(repo, path_info, recursive=None):
    """Collect fs nodes for the DVC outputs found at or under ``path_info``.

    Returns an empty list when ``repo.find_outs_by_path`` raises
    ``OutputNotFoundError``. With a truthy ``recursive`` every output gets
    its own node; otherwise each output is collapsed to the first path
    segment below ``path_info`` and duplicate entries are dropped.
    """
    from dvc.compat import fspath
    from dvc.path_info import PathInfo

    try:
        found = repo.find_outs_by_path(fspath(path_info), recursive=True)
    except OutputNotFoundError:
        return []

    if recursive:
        return [_get_fs_node(out.path_info, out) for out in found]

    def _top_level_entry(out):
        """Map an output to a ``(path_info, out_or_None)`` pair.

        The path is the first segment of the output's path relative to
        ``path_info``:
            dir/file -> dir
            dir/subdir/file -> dir
            file -> file
        """
        parts = out.path_info.relpath(path_info).parts
        if not parts:
            return (out.path_info, out)
        # The output is attached only when it is a direct child; a deeper
        # output contributes just its top-level segment, with no output.
        direct_child = len(parts) == 1
        return (PathInfo(path_info, parts[0]), out if direct_child else None)

    # A set drops duplicate (path, out) pairs produced by outputs that share
    # the same top-level segment.
    unique_entries = set(map(_top_level_entry, found))
    return [
        _get_fs_node(entry_path, entry_out)
        for entry_path, entry_out in unique_entries
    ]


def _get_path_info(repo, path=None):
    """Build the ``PathInfo`` for ``path`` inside ``repo``.

    A falsy ``path`` yields a ``PathInfo`` of ``repo.root_dir`` alone.
    """
    from dvc.path_info import PathInfo

    segments = (repo.root_dir, path) if path else (repo.root_dir,)
    return PathInfo(*segments)


def _get_fs_node(path_info, out=None):
from dvc.compat import fspath
return {
path_info.name: {
"isout": dvc,
"isdir": False,
"isexec": False if dvc else tree.isexec(path_info.fspath),
}
}
except FileNotFoundError:
return {}

if out:
isdir = out.is_dir_checksum if out.checksum else False
isexec = False
else:
try:
isdir = os.path.isdir(fspath(path_info))
mode = os.stat(fspath(path_info)).st_mode
isexec = mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
except FileNotFoundError:
isdir = False
isexec = False

return {
"path_info": path_info,
"isout": bool(out),
"isdir": isdir,
"isexec": isexec,
}
return ret
116 changes: 116 additions & 0 deletions dvc/repo/tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import errno

from dvc.scm.tree import BaseTree
from dvc.path_info import PathInfo
from dvc.exceptions import OutputNotFoundError


class DvcTree(BaseTree):
    """Tree interface over the cached outputs tracked by a DVC repo.

    Paths are resolved through ``repo.find_outs_by_path``; outputs whose
    ``use_cache`` is falsy are ignored.
    """

    def __init__(self, repo):
        self.repo = repo

    def _find_outs(self, path, *args, **kwargs):
        """Return the cached outputs matching ``path``.

        Extra arguments are forwarded to ``repo.find_outs_by_path``.

        Raises:
            OutputNotFoundError: if no cached output matches ``path``.
        """
        outs = [
            out
            for out in self.repo.find_outs_by_path(path, *args, **kwargs)
            if out.use_cache
        ]
        if not outs:
            raise OutputNotFoundError(path, self.repo)
        return outs

    def open(self, path, mode="r", encoding="utf-8"):
        """Open the cache file backing the single output at ``path``.

        ``encoding`` is ignored for binary modes.

        Raises:
            FileNotFoundError: no cached output matches ``path``, or
                ``out.changed_cache()`` reports the cache as changed.
            IsADirectoryError: ``path`` resolves to several outputs or to
                a directory output.
        """
        import os

        try:
            outs = self._find_outs(path, strict=False)
        except OutputNotFoundError as exc:
            raise FileNotFoundError from exc

        if len(outs) != 1 or outs[0].isdir():
            # Pass errno *and* message: a single-argument OSError leaves
            # ``exc.errno`` unset, so callers could not test for EISDIR.
            raise IsADirectoryError(
                errno.EISDIR, os.strerror(errno.EISDIR), path
            )

        out = outs[0]
        if out.changed_cache():
            raise FileNotFoundError

        if "b" in mode:
            # Builtin open() raises ValueError if an encoding is supplied
            # together with a binary mode.
            encoding = None
        return open(out.cache_path.fspath, mode=mode, encoding=encoding)

    def exists(self, path):
        """Return True if at least one cached output is at or under ``path``."""
        try:
            self._find_outs(path, strict=False, recursive=True)
        except OutputNotFoundError:
            return False
        return True

    def isdir(self, path):
        """Return True if ``path`` is treated as a directory.

        That is the case when ``path`` resolves to several outputs, to a
        single output located at a different path (presumably below
        ``path``), or to a single output that reports ``isdir()``.
        """
        try:
            outs = self._find_outs(path, strict=False, recursive=True)
        except OutputNotFoundError:
            return False

        if len(outs) != 1 or outs[0].path_info.fspath != path:
            return True

        return outs[0].isdir()

    def isfile(self, path):
        """Return True if ``path`` exists in this tree and is not a directory."""
        return self.exists(path) and not self.isdir(path)

    def _walk(self, root, trie, topdown=True):
        """Yield ``(dirpath, dirnames, filenames)`` for ``root`` and below.

        ``trie`` maps output path parts to outputs. As with ``os.walk``,
        the yielded ``dirnames`` list may be modified in place by the
        caller to prune the directories that are visited next.
        """
        if not topdown:
            # An explicit raise survives ``python -O``, unlike an assert.
            raise NotImplementedError("DvcTree only supports top-down walks")

        dirs = set()
        files = []

        depth = len(root.parts)
        for key, out in trie.iteritems(prefix=root.parts):
            if key == root.parts:
                continue

            name = key[depth]
            if len(key) > depth + 1 or out.isdir():
                dirs.add(name)
            else:
                files.append(name)

        # Recurse over the very list that was yielded, so in-place pruning
        # by the caller takes effect; sorting makes the order stable.
        dirs = sorted(dirs)
        yield root.fspath, dirs, files

        for dname in dirs:
            yield from self._walk(root / dname, trie)

    def walk(self, top, topdown=True):
        """Walk the cached outputs under ``top``, top-down only.

        Raises:
            NotImplementedError: ``topdown`` is falsy.
            FileNotFoundError: nothing cached exists at ``top``.
            NotADirectoryError: ``top`` is not a directory in this tree.
        """
        from pygtrie import Trie

        if not topdown:
            raise NotImplementedError("DvcTree only supports top-down walks")

        if not self.exists(top):
            raise FileNotFoundError

        if not self.isdir(top):
            raise NotADirectoryError

        root = PathInfo(top)
        outs = self._find_outs(top, recursive=True, strict=False)

        trie = Trie()
        for out in outs:
            trie[out.path_info.parts] = out

        yield from self._walk(root, trie, topdown=topdown)

    def isdvc(self, path):
        """Return True if exactly one cached output matches ``path``."""
        try:
            return len(self._find_outs(path)) == 1
        except OutputNotFoundError:
            return False

    def isexec(self, path):
        """Always return False: this tree reports no entry as executable."""
        return False
5 changes: 5 additions & 0 deletions dvc/scm/tree.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import stat

from dvc.compat import fspath

Expand Down Expand Up @@ -75,6 +76,10 @@ def onerror(e):
):
yield os.path.normpath(root), dirs, files

def isexec(self, path):
    """Return True if any execute permission bit is set on ``path``.

    The user, group and other execute bits are all checked. The result is
    a real ``bool`` rather than the raw masked mode bits, so it compares
    equal to ``True``/``False``.

    Raises:
        OSError: propagated from ``os.stat`` (e.g. ``path`` is missing).
    """
    mode = os.stat(path).st_mode
    return bool(mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))


def is_working_tree(tree):
return isinstance(tree, WorkingTree) or isinstance(
Expand Down

0 comments on commit c60f364

Please sign in to comment.