Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dvc: introduce DvcTree #3639

Merged
merged 1 commit into from
Apr 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ def isdir(self, path):
def isfile(self, path):
    """Report whether ``path`` is a file, as answered by the wrapped tree."""
    wrapped = self.tree
    return wrapped.isfile(path)

def isexec(self, path):
    """Report whether ``path`` is executable, as answered by the wrapped tree."""
    wrapped = self.tree
    return wrapped.isexec(path)

def walk(self, top, topdown=True):
for root, dirs, files in self.tree.walk(top, topdown):
dirs[:], files[:] = self.dvcignore(root, dirs, files)
Expand Down
160 changes: 54 additions & 106 deletions dvc/repo/ls.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import os
import stat

from dvc.exceptions import PathMissingError, OutputNotFoundError
from dvc.path_info import PathInfo
from dvc.exceptions import PathMissingError


@staticmethod
Expand Down Expand Up @@ -31,124 +29,74 @@ def ls(
"""
from dvc.external_repo import external_repo
from dvc.repo import Repo
from dvc.utils import relpath

with external_repo(url, rev) as repo:
path_info = _get_path_info(repo, path)
fs_nodes = []
path_info = PathInfo(repo.root_dir)
if path:
path_info /= path

ret = {}
if isinstance(repo, Repo):
fs_nodes.extend(_ls_outs_repo(repo, path_info, recursive))
ret = _ls(repo, path_info, recursive, True)

nondvc = {}
if not outs_only:
fs_nodes.extend(_ls_files_repo(path_info, recursive))
nondvc = _ls(repo, path_info, recursive, False)

if path and not fs_nodes:
raise PathMissingError(path, repo, output_only=outs_only)
ret.update(nondvc)

fs_nodes = {n["path_info"]: n for n in fs_nodes}.values()

def get_entry(fs_node):
node_path_info = fs_node["path_info"]
path = (
node_path_info.name
if node_path_info == path_info
else relpath(node_path_info, path_info)
)
return {
"path": path,
"isout": fs_node.get("isout", False),
"isdir": fs_node.get("isdir", False),
"isexec": fs_node.get("isexec", False),
}
if path and not ret:
raise PathMissingError(path, repo, output_only=outs_only)

entries = sorted(map(get_entry, fs_nodes), key=lambda f: f["path"])
return entries
ret_list = []
for path, info in ret.items():
info["path"] = path
ret_list.append(info)
ret_list.sort(key=lambda f: f["path"])
return ret_list


def _ls_files_repo(path_info, recursive=None):
from dvc.compat import fspath
def _ls(repo, path_info, recursive=None, dvc=False):
from dvc.ignore import CleanTree
from dvc.path_info import PathInfo
from dvc.repo.tree import DvcTree
from dvc.scm.tree import WorkingTree

if not os.path.exists(fspath(path_info)):
return []
if dvc:
tree = DvcTree(repo)
else:
tree = CleanTree(WorkingTree(repo.root_dir))

files = []
tree = CleanTree(WorkingTree(path_info))
ret = {}
try:
for dirpath, dirnames, filenames in tree.walk(path_info):
files.extend(PathInfo(dirpath, f) for f in filenames)
for root, dirs, files in tree.walk(path_info.fspath):
for fname in files:
info = PathInfo(root) / fname
path = str(info.relative_to(path_info))
ret[path] = {
"isout": dvc,
"isdir": False,
"isexec": False if dvc else tree.isexec(info.fspath),
}

if not recursive:
files.extend(PathInfo(dirpath, d) for d in dirnames)
for dname in dirs:
info = PathInfo(root) / dname
path = str(info.relative_to(path_info))
ret[path] = {
"isout": tree.isdvc(info.fspath) if dvc else False,
"isdir": True,
"isexec": False if dvc else tree.isexec(info.fspath),
}
break
except NotADirectoryError:
if os.path.isfile(fspath(path_info)):
files = [path_info]

return [_get_fs_node(f) for f in files]


def _ls_outs_repo(repo, path_info, recursive=None):
from dvc.compat import fspath
from dvc.path_info import PathInfo

try:
outs = repo.find_outs_by_path(fspath(path_info), recursive=True)
except OutputNotFoundError:
return []

if recursive:
return [_get_fs_node(out.path_info, out) for out in outs]

def get_first_segment(out):
"""Returns tuple with path_info and related out

path_info calculated as the first relpath segment
Example:
dir/file -> dir
dir/subdir/file -> dir
file -> file
"""
relpath = out.path_info.relpath(path_info)
if relpath.parts:
out_path_info = PathInfo(path_info, relpath.parts[0])
isout = len(relpath.parts) == 1
return (out_path_info, out if isout else None)
return (out.path_info, out)

return [
_get_fs_node(p, out)
for (p, out) in {get_first_segment(out) for out in outs}
]


def _get_path_info(repo, path=None):
from dvc.path_info import PathInfo

if not path:
return PathInfo(repo.root_dir)
return PathInfo(repo.root_dir, path)


def _get_fs_node(path_info, out=None):
from dvc.compat import fspath
return {
path_info.name: {
"isout": dvc,
"isdir": False,
"isexec": False if dvc else tree.isexec(path_info.fspath),
}
}
except FileNotFoundError:
return {}

if out:
isdir = out.is_dir_checksum if out.checksum else False
isexec = False
else:
try:
isdir = os.path.isdir(fspath(path_info))
mode = os.stat(fspath(path_info)).st_mode
isexec = mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
except FileNotFoundError:
isdir = False
isexec = False

return {
"path_info": path_info,
"isout": bool(out),
"isdir": isdir,
"isexec": isexec,
}
return ret
116 changes: 116 additions & 0 deletions dvc/repo/tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import errno

from dvc.scm.tree import BaseTree
from dvc.path_info import PathInfo
from dvc.exceptions import OutputNotFoundError


class DvcTree(BaseTree):
    """Tree view over the cached outputs tracked by a DVC repo.

    Paths are resolved through ``repo.find_outs_by_path``; only outputs
    whose ``use_cache`` flag is set are visible through this tree.
    """

    def __init__(self, repo):
        self.repo = repo

    def _find_outs(self, path, *args, **kwargs):
        """Return the cached outputs found for ``path``.

        Extra arguments are forwarded to ``repo.find_outs_by_path``.

        Raises:
            OutputNotFoundError: if none of the outputs found uses the
                cache (the repo lookup itself may raise it as well).
        """
        outs = [
            out
            for out in self.repo.find_outs_by_path(path, *args, **kwargs)
            if out.use_cache
        ]
        if not outs:
            raise OutputNotFoundError(path, self.repo)

        return outs

    def open(self, path, mode="r", encoding="utf-8"):
        """Open the cache file backing the single file output at ``path``.

        Raises:
            FileNotFoundError: if no cached output matches ``path`` or
                the output's cache is reported as changed.
            IsADirectoryError: if ``path`` resolves to several outputs
                or to a directory output. This is an ``OSError``
                subclass, so ``except IOError`` handlers still match.
        """
        import os

        try:
            outs = self._find_outs(path, strict=False)
        except OutputNotFoundError as exc:
            raise FileNotFoundError from exc

        if len(outs) != 1 or outs[0].isdir():
            # Two-plus arguments are required for OSError to populate
            # ``errno``; a lone integer would only end up in ``args``.
            raise IsADirectoryError(
                errno.EISDIR, os.strerror(errno.EISDIR), path
            )

        out = outs[0]
        if out.changed_cache():
            raise FileNotFoundError

        # The builtin ``open`` rejects an encoding in binary mode.
        if "b" in mode:
            encoding = None
        return open(out.cache_path.fspath, mode=mode, encoding=encoding)

    def exists(self, path):
        """Return True if at least one cached output is found for ``path``."""
        try:
            self._find_outs(path, strict=False, recursive=True)
        except OutputNotFoundError:
            return False
        return True

    def isdir(self, path):
        """Return True if ``path`` is treated as a directory by this tree.

        That is the case when the lookup yields several outputs, a
        single output whose own path differs from ``path``, or a single
        output that reports itself as a directory.
        """
        # One lookup answers both "does it exist" and "is it a dir".
        try:
            outs = self._find_outs(path, strict=False, recursive=True)
        except OutputNotFoundError:
            return False

        if len(outs) != 1 or outs[0].path_info.fspath != path:
            return True

        return outs[0].isdir()

    def isfile(self, path):
        """Return True if ``path`` exists in this tree and is not a dir."""
        return self.exists(path) and not self.isdir(path)

    def _walk(self, root, trie, topdown=True):
        """Yield ``(root, dirs, files)`` for ``root`` and its subdirs.

        ``trie`` maps output path parts to outputs. Only top-down
        traversal is implemented.
        """
        if not topdown:
            raise NotImplementedError(
                "DvcTree only supports top-down walks"
            )

        dirs = set()
        files = []

        root_len = len(root.parts)
        for key, out in trie.iteritems(prefix=root.parts):
            if key == root.parts:
                continue

            name = key[root_len]
            # Anything with deeper entries below it, or that is itself a
            # directory output, is listed as a directory.
            if len(key) > root_len + 1 or out.isdir():
                dirs.add(name)
            else:
                files.append(name)

        # Recurse over the very list that was yielded so that callers
        # pruning ``dirs`` in place affect the traversal, as with
        # ``os.walk(topdown=True)``.
        dirs = list(dirs)
        yield root.fspath, dirs, files

        for dname in dirs:
            yield from self._walk(root / dname, trie, topdown=topdown)

    def walk(self, top, topdown=True):
        """Walk the cached outputs under ``top`` in top-down order.

        Yields ``(root, dirs, files)`` tuples where ``root`` is a string
        path and ``dirs``/``files`` are lists of names.

        Raises:
            NotImplementedError: if ``topdown`` is false.
            FileNotFoundError: if ``top`` does not exist in this tree.
            NotADirectoryError: if ``top`` is not a directory.
        """
        from pygtrie import Trie

        if not topdown:
            raise NotImplementedError(
                "DvcTree only supports top-down walks"
            )

        if not self.exists(top):
            raise FileNotFoundError

        if not self.isdir(top):
            raise NotADirectoryError

        root = PathInfo(top)
        outs = self._find_outs(top, recursive=True, strict=False)

        trie = Trie()
        for out in outs:
            trie[out.path_info.parts] = out

        yield from self._walk(root, trie, topdown=topdown)

    def isdvc(self, path):
        """Return True if exactly one cached output is found for ``path``."""
        try:
            return len(self._find_outs(path)) == 1
        except OutputNotFoundError:
            return False

    def isexec(self, path):
        """Always return False for paths in this tree."""
        return False
5 changes: 5 additions & 0 deletions dvc/scm/tree.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import stat

from dvc.compat import fspath

Expand Down Expand Up @@ -75,6 +76,10 @@ def onerror(e):
):
yield os.path.normpath(root), dirs, files

def isexec(self, path):
    """Return True if any execute permission bit is set on ``path``.

    The user, group and other execute bits of ``os.stat(path).st_mode``
    are checked. The result is a real ``bool`` rather than the raw
    masked mode integer, so it can be stored directly as a flag.

    Raises:
        OSError: propagated from ``os.stat`` when ``path`` cannot be
            stat'ed.
    """
    mode = os.stat(path).st_mode
    return bool(mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH))


def is_working_tree(tree):
return isinstance(tree, WorkingTree) or isinstance(
Expand Down