Skip to content
Merged
47 changes: 42 additions & 5 deletions dvc/ignore.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
import logging
import os

from funcy import cached_property
from pathspec import PathSpec
from pathspec.patterns import GitWildMatchPattern

from dvc.scm.tree import BaseTree
from dvc.utils import relpath

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -72,13 +74,12 @@ def __eq__(self, other):


class DvcIgnoreFilter(object):
def __init__(self, root_dir, tree):
def __init__(self, tree):
self.tree = tree
self.ignores = {DvcIgnoreDirs([".git", ".hg", ".dvc"])}
self._update(root_dir)
for root, dirs, _ in self.tree.walk(root_dir, dvcignore=self):
for d in dirs:
self._update(os.path.join(root, d))
for root, dirs, files in self.tree.walk(self.tree.tree_root):
self._update(root)
dirs[:], files[:] = self(root, dirs, files)

def _update(self, dirname):
ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE)
Expand All @@ -90,3 +91,39 @@ def __call__(self, root, dirs, files):
dirs, files = ignore(root, dirs, files)

return dirs, files


class CleanTree(BaseTree):
def __init__(self, tree):
self.tree = tree

@cached_property
def dvcignore(self):
return DvcIgnoreFilter(self.tree)

@property
def tree_root(self):
return self.tree.tree_root

def open(self, path, mode="r", encoding="utf-8"):
return self.tree.open(path, mode, encoding)

def exists(self, path):
return self.tree.exists(path)

def isdir(self, path):
return self.tree.isdir(path)

def isfile(self, path):
return self.tree.isfile(path)
Comment on lines +108 to +118
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to check that path is not dvcignored to be entirely safe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that we should check if its ignored, a path can be both file and be ignored at the same time. What is the reasoning for that?

Copy link
Contributor

@Suor Suor Dec 30, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because if file is ignored then it should not be visible, i.e. tree should pretend it doesn't exist. This is not an issue for now because we only open files that we walk, probably.

I see at least one issue though dvc pipeline show ignored.dvc will actually instantiate this ignored stage, and we will get a puzzling KeyError later.

We may postpone this though because this might be solved in various ways.


def walk(self, top, topdown=True):
for root, dirs, files in self.tree.walk(top, topdown):
dirs[:], files[:] = self.dvcignore(root, dirs, files)

yield root, dirs, files

def walk_files(self, top):
for root, _, files in self.walk(top):
for file in files:
yield os.path.join(root, file)
14 changes: 11 additions & 3 deletions dvc/remote/local.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from dvc.exceptions import DownloadError
from dvc.exceptions import DvcException
from dvc.exceptions import UploadError
from dvc.ignore import CleanTree
from dvc.path_info import PathInfo
from dvc.progress import Tqdm
from dvc.remote.base import RemoteBASE
Expand All @@ -21,6 +22,7 @@
from dvc.remote.base import STATUS_MISSING
from dvc.remote.base import STATUS_NEW
from dvc.scheme import Schemes
from dvc.scm.tree import WorkingTree
from dvc.system import System
from dvc.utils import copyfile
from dvc.utils import file_md5
Expand Down Expand Up @@ -83,7 +85,7 @@ def supported(cls, config):

def list_cache_paths(self):
assert self.path_info is not None
return walk_files(self.path_info, None)
return walk_files(self.path_info)

def get(self, md5):
if not md5:
Expand Down Expand Up @@ -138,7 +140,11 @@ def getsize(path_info):
return os.path.getsize(fspath_py35(path_info))

def walk_files(self, path_info):
for fname in walk_files(path_info, self.repo.dvcignore):
assert isinstance(self.repo.tree, CleanTree) and isinstance(
self.repo.tree.tree, WorkingTree
)

for fname in self.repo.tree.walk_files(path_info):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to assert that tree is a working tree or a clean working tree. We want to be sure that we never walk the git tree here.

yield PathInfo(fname)

def get_file_checksum(self, path_info):
Expand Down Expand Up @@ -427,7 +433,9 @@ def _unprotect_file(path):
os.chmod(path, os.stat(path).st_mode | stat.S_IWRITE)

def _unprotect_dir(self, path):
for fname in walk_files(path, self.repo.dvcignore):
assert isinstance(self.repo.tree, CleanTree)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to update this assert the same way as in walk_files().


for fname in self.repo.tree.walk_files(path):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same assert here.

RemoteLOCAL._unprotect_file(fname)

def unprotect(self, path_info):
Expand Down
13 changes: 4 additions & 9 deletions dvc/repo/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from contextlib import contextmanager
from functools import wraps
from itertools import chain

from dvc.ignore import CleanTree
from dvc.utils.compat import FileNotFoundError, fspath_py35, open as _open

from funcy import cached_property
Expand All @@ -15,7 +17,6 @@
NotDvcRepoError,
OutputNotFoundError,
)
from dvc.ignore import DvcIgnoreFilter
from dvc.path_info import PathInfo
from dvc.remote.base import RemoteActionNotImplemented
from dvc.utils import relpath
Expand Down Expand Up @@ -84,7 +85,7 @@ def __init__(self, root_dir=None):

self.scm = SCM(self.root_dir)

self.tree = WorkingTree(self.root_dir)
self.tree = CleanTree(WorkingTree(self.root_dir))

self.tmp_dir = os.path.join(self.dvc_dir, "tmp")
makedirs(self.tmp_dir, exist_ok=True)
Expand Down Expand Up @@ -391,9 +392,7 @@ def stages(self):
stages = []
outs = []

for root, dirs, files in self.tree.walk(
self.root_dir, dvcignore=self.dvcignore
):
for root, dirs, files in self.tree.walk(self.root_dir):
for fname in files:
path = os.path.join(root, fname)
if not Stage.is_valid_filename(path):
Expand Down Expand Up @@ -487,10 +486,6 @@ def _open_cached(self, out, remote=None, mode="r", encoding=None):

return _open(cache_file, mode=mode, encoding=encoding)

@cached_property
def dvcignore(self):
return DvcIgnoreFilter(self.root_dir, self.tree)

def close(self):
self.scm.close()

Expand Down
3 changes: 1 addition & 2 deletions dvc/repo/add.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from dvc.repo.scm_context import scm_context
from dvc.stage import Stage
from dvc.utils import LARGE_DIR_SIZE
from dvc.utils import walk_files

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -68,7 +67,7 @@ def _find_all_targets(repo, target, recursive):
if os.path.isdir(target) and recursive:
return [
fname
for fname in walk_files(target, repo.dvcignore)
for fname in repo.tree.walk_files(target)
if not repo.is_dvc_internal(fname)
if not Stage.is_stage_file(fname)
if not repo.scm.belongs_to_scm(fname)
Expand Down
5 changes: 3 additions & 2 deletions dvc/repo/brancher.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from funcy import group_by

from dvc.ignore import CleanTree
from dvc.scm.tree import WorkingTree


Expand Down Expand Up @@ -34,7 +35,7 @@ def brancher( # noqa: E302

scm = self.scm

self.tree = WorkingTree(self.root_dir)
self.tree = CleanTree(WorkingTree(self.root_dir))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Guess need to wrap tree from down below too, right?

Copy link
Contributor

@efiop efiop Dec 24, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pared ^

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@efiop Do you mean the one obtained with get_tree?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yield "working tree"

if all_commits:
Expand All @@ -58,7 +59,7 @@ def brancher( # noqa: E302
# code which might expect the tree on which exception was raised to
# stay in place. This behavior is a subject to change.
for sha, names in group_by(scm.resolve_rev, revs).items():
self.tree = scm.get_tree(sha)
self.tree = CleanTree(scm.get_tree(sha))
yield ", ".join(names)

self.tree = saved_tree
5 changes: 3 additions & 2 deletions dvc/repo/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from errno import ENOENT

import dvc.logger as logger
from dvc.ignore import CleanTree
from . import locked
from dvc.scm.base import FileNotInCommitError
from dvc.scm.git import DIFF_A_REF
Expand Down Expand Up @@ -137,9 +138,9 @@ def _is_dir(path, a_outs, b_outs):


def _get_diff_outs(self, diff_dct):
self.tree = diff_dct[DIFF_A_TREE]
self.tree = CleanTree(diff_dct[DIFF_A_TREE])
a_outs = {str(out): out for st in self.stages for out in st.outs}
self.tree = diff_dct[DIFF_B_TREE]
self.tree = CleanTree(diff_dct[DIFF_B_TREE])
b_outs = {str(out): out for st in self.stages for out in st.outs}
outs_paths = set(a_outs.keys())
outs_paths.update(b_outs.keys())
Expand Down
2 changes: 1 addition & 1 deletion dvc/scm/git/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _walk(self, tree, topdown=True):
if not topdown:
yield os.path.normpath(tree.abspath), dirs, nondirs

def walk(self, top, topdown=True, dvcignore=None):
def walk(self, top, topdown=True):
"""Directory tree generator.

See `os.walk` for the docs. Differences:
Expand Down
13 changes: 6 additions & 7 deletions dvc/scm/tree.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import os

from dvc.utils import dvc_walk
from dvc.utils.compat import open
from dvc.utils.compat import open, fspath


class BaseTree(object):
Expand All @@ -23,7 +22,7 @@ def isdir(self, path):
def isfile(self, path):
"""Test whether a path is a regular file"""

def walk(self, top, topdown=True, dvcignore=None):
def walk(self, top, topdown=True):
"""Directory tree generator.

See `os.walk` for the docs. Differences:
Expand Down Expand Up @@ -58,20 +57,20 @@ def isfile(self, path):
"""Test whether a path is a regular file"""
return os.path.isfile(path)

def walk(self, top, topdown=True, dvcignore=None):
def walk(self, top, topdown=True):
"""Directory tree generator.

See `os.walk` for the docs. Differences:
- no support for symlinks
- it could raise exceptions, there is no onerror argument
"""

assert dvcignore
top = fspath(top)

def onerror(e):
raise e

for root, dirs, files in dvc_walk(
os.path.abspath(top), dvcignore, topdown=topdown, onerror=onerror
for root, dirs, files in os.walk(
top, topdown=topdown, onerror=onerror
):
yield os.path.normpath(root), dirs, files
10 changes: 4 additions & 6 deletions dvc/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ def save(self, path_info, checksum):
assert os.path.exists(fspath_py35(path_info))

actual_mtime, actual_size = get_mtime_and_size(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should accept a tree, not dvcignore.

path_info, self.repo.dvcignore
path_info, self.repo.tree
)
actual_inode = get_inode(path_info)

Expand Down Expand Up @@ -410,9 +410,7 @@ def get(self, path_info):
if not os.path.exists(path):
return None

actual_mtime, actual_size = get_mtime_and_size(
path, self.repo.dvcignore
)
actual_mtime, actual_size = get_mtime_and_size(path, self.repo.tree)
actual_inode = get_inode(path)

existing_record = self.get_state_record_for_inode(actual_inode)
Expand All @@ -439,7 +437,7 @@ def save_link(self, path_info):
if not os.path.exists(path):
return

mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)
mtime, _ = get_mtime_and_size(path, self.repo.tree)
inode = get_inode(path)
relative_path = relpath(path, self.root_dir)

Expand Down Expand Up @@ -469,7 +467,7 @@ def remove_unused_links(self, used):
continue

actual_inode = get_inode(path)
actual_mtime, _ = get_mtime_and_size(path, self.repo.dvcignore)
actual_mtime, _ = get_mtime_and_size(path, self.repo.tree)

if inode == actual_inode and mtime == actual_mtime:
logger.debug("Removing '{}' as unused link.".format(path))
Expand Down
21 changes: 2 additions & 19 deletions dvc/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,25 +288,8 @@ def to_yaml_string(data):
return stream.getvalue()


def dvc_walk(top, dvcignore, topdown=True, onerror=None, followlinks=False):
"""
Proxy for `os.walk` directory tree generator.
Utilizes DvcIgnoreFilter functionality.
"""
top = fspath_py35(top)

for root, dirs, files in os.walk(
top, topdown=topdown, onerror=onerror, followlinks=followlinks
):

if dvcignore:
dirs[:], files[:] = dvcignore(root, dirs, files)

yield root, dirs, files


def walk_files(directory, dvcignore):
for root, _, files in dvc_walk(directory, dvcignore):
def walk_files(directory):
for root, _, files in os.walk(fspath(directory)):
for f in files:
yield os.path.join(root, f)

Expand Down
9 changes: 6 additions & 3 deletions dvc/utils/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
from dvc.utils import fspath
from dvc.utils import fspath_py35
from dvc.utils import relpath
from dvc.utils import walk_files
from dvc.utils.compat import str


Expand All @@ -35,11 +34,15 @@ def get_inode(path):
return inode


def get_mtime_and_size(path, dvcignore):
def get_mtime_and_size(path, tree):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to assert that tree is a working tree/clean working tree. Passing git tree here will break the abstraction - we walk git, but os.stat() working tree.

from dvc.ignore import CleanTree

assert isinstance(tree, CleanTree)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to update this assert the same way as in walk_files().


if os.path.isdir(fspath_py35(path)):
size = 0
files_mtimes = {}
for file_path in walk_files(path, dvcignore):
for file_path in tree.walk_files(path):
try:
stat = os.stat(file_path)
except OSError as exc:
Expand Down
9 changes: 3 additions & 6 deletions tests/func/test_checkout.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@
from dvc.stage import StageFileBadNameError
from dvc.stage import StageFileDoesNotExistError
from dvc.system import System
from dvc.utils import relpath
from dvc.utils import walk_files
from dvc.utils import relpath, walk_files
from dvc.utils.compat import is_py2
from dvc.utils.stage import dump_stage_file
from dvc.utils.stage import load_stage_file
Expand Down Expand Up @@ -137,7 +136,7 @@ def outs_info(self, stage):
paths = [
path
for output in stage["outs"]
for path in walk_files(output["path"], self.dvc.dvcignore)
for path in self.dvc.tree.walk_files(output["path"])
]

return [
Expand Down Expand Up @@ -520,6 +519,4 @@ def test_partial_checkout(tmp_dir, dvc, target):
tmp_dir.dvc_gen({"dir": {"subdir": {"file": "file"}, "other": "other"}})
shutil.rmtree("dir")
dvc.checkout([target])
assert list(walk_files("dir", None)) == [
os.path.join("dir", "subdir", "file")
]
assert list(walk_files("dir")) == [os.path.join("dir", "subdir", "file")]
Loading