From 858b57b9550ecafad146f7649ac1217b8c72d7b3 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Tue, 23 Jun 2020 22:28:19 +0800 Subject: [PATCH 01/16] Remove Duplicate ignore Match --- dvc/ignore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index c9257e5204..c33f509af6 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -121,9 +121,9 @@ def __init__(self, tree, root_dir): DvcIgnoreDirs([".git", ".hg", ".dvc"]), DvcIgnoreRepo(), } - for root, dirs, files in self.tree.walk(self.root_dir): + for root, dirs, _ in self.tree.walk(self.root_dir): self._update(root) - dirs[:], files[:] = self(root, dirs, files) + dirs[:], _ = self(root, dirs, []) def _update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) From 9ed4b7d7ad4d1f4139861cddc6c212a7c2566b41 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Fri, 26 Jun 2020 15:21:30 +0800 Subject: [PATCH 02/16] Continue Optimize Dvcignore fix #3869 --- dvc/ignore.py | 100 ++++++++++++++++++++++++++++++-------- setup.cfg | 2 +- tests/func/test_ignore.py | 15 +++++- tests/unit/test_ignore.py | 2 +- 4 files changed, 94 insertions(+), 25 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index c33f509af6..6feab2454b 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -6,6 +6,7 @@ from funcy import cached_property from pathspec.patterns import GitWildMatchPattern from pathspec.util import normalize_file +from pygtrie import StringTrie from dvc.path_info import PathInfo from dvc.scm.tree import BaseTree @@ -23,24 +24,29 @@ def __call__(self, root, dirs, files): class DvcIgnorePatterns(DvcIgnore): - def __init__(self, ignore_file_path, tree): - assert os.path.isabs(ignore_file_path) + def __init__(self, pattern_list, dirname): + + self.pattern_list = pattern_list + self.dirname = dirname + self.prefix = self.dirname + os.sep + + regex_pattern_list = map( + GitWildMatchPattern.pattern_to_regex, pattern_list + ) - self.ignore_file_path = ignore_file_path - self.dirname = os.path.normpath(os.path.dirname(ignore_file_path)) + self.ignore_spec = [ + (ignore, re.compile("|".join(item[0] for item in group))) + for ignore, group in groupby(regex_pattern_list, lambda x: x[1]) + if ignore is not None + ] + @classmethod + def from_files(cls, ignore_file_path, tree): + assert os.path.isabs(ignore_file_path) + dirname = os.path.normpath(os.path.dirname(ignore_file_path)) with tree.open(ignore_file_path, encoding="utf-8") as fobj: - path_spec_lines = fobj.readlines() - regex_pattern_list = map( - GitWildMatchPattern.pattern_to_regex, path_spec_lines - ) - self.ignore_spec = [ - (ignore, re.compile("|".join(item[0] for item in group))) - for ignore, group in groupby( - regex_pattern_list, lambda x: x[1] - ) - if ignore is not None - ] + path_spec_lines = list(map(str.strip, fobj.readlines())) + return cls(path_spec_lines, dirname) def __call__(self, root, dirs, files): files = [f for f in files if not self.matches(root, f)] @@ -51,11 +57,10 @@ def __call__(self, root, dirs, files): def matches(self, dirname, basename): # NOTE: `relpath` is too slow, so we have to assume that both # `dirname` and `self.dirname` are relative or absolute together. - prefix = self.dirname + os.sep if dirname == self.dirname: path = basename - elif dirname.startswith(prefix): - rel = dirname[len(prefix) :] + elif dirname.startswith(self.prefix): + rel = dirname[len(self.prefix) :] # NOTE: `os.path.join` is ~x5.5 slower path = f"{rel}{os.sep}{basename}" else: @@ -73,13 +78,61 @@ def ignore(self, path): return result def __hash__(self): - return hash(self.ignore_file_path) + return hash(self.dirname + ":" + "\n".join(self.pattern_list)) def __eq__(self, other): if not isinstance(other, DvcIgnorePatterns): return NotImplemented + return (self.dirname == other.dirname) & ( + self.pattern_list == other.pattern_list + ) + + def __add__(self, other): + if not other: + return self + if not isinstance(other, DvcIgnorePatterns): + return NotImplemented + if self.prefix.startswith(other.prefix): + return DvcIgnorePatterns( + self.dirname, other.pattern_list + self.pattern_list + ) + elif other.prefix.startswith(self.prefix): + return DvcIgnorePatterns( + other.dirname, self.pattern_list + other.pattern_list + ) + else: + return NotImplemented + + __radd__ = __add__ + + +class DvcIgnorePatternsTrie(DvcIgnore): + def __init__(self): + self.trie = StringTrie(separator=os.sep) + + def __new__(cls, *args, **kwargs): + if not hasattr(DvcIgnorePatterns, "_instance"): + if not hasattr(DvcIgnorePatterns, "_instance"): + DvcIgnorePatterns._instance = object.__new__(cls) + return DvcIgnorePatterns._instance - return self.ignore_file_path == other.ignore_file_path + def __call__(self, root, dirs, files): + ignore_pattern = self[root] + if ignore_pattern: + return ignore_pattern(root, dirs, files) + else: + return dirs, files + + def __setitem__(self, root, ignore_pattern): + base_pattern = self[root] + self.trie[root] = base_pattern + ignore_pattern + + def __getitem__(self, root): + ignore_pattern = self.trie.longest_prefix(root) + if ignore_pattern: + return ignore_pattern.value + else: + return None class DvcIgnoreDirs(DvcIgnore): @@ -128,7 +181,12 @@ def __init__(self, tree, root_dir): def _update(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): - self.ignores.add(DvcIgnorePatterns(ignore_file_path, self.tree)) + ignore_pattern = DvcIgnorePatterns.from_files( + ignore_file_path, self.tree + ) + ignore_pattern_trie = DvcIgnorePatternsTrie() + ignore_pattern_trie[dirname] = ignore_pattern + self.ignores.add(ignore_pattern_trie) def __call__(self, root, dirs, files): for ignore in self.ignores: diff --git a/setup.cfg b/setup.cfg index f7cb136c8c..69cf6278b2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,7 +17,7 @@ count=true [isort] include_trailing_comma=true known_first_party=dvc,tests -known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,moto,nanotime,networkx,packaging,paramiko,pathspec,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc +known_third_party=PyInstaller,RangeHTTPServer,boto3,colorama,configobj,distro,dpath,flaky,flufl,funcy,git,google,grandalf,mock,moto,nanotime,networkx,packaging,paramiko,pathspec,pygtrie,pytest,requests,ruamel,setuptools,shortuuid,shtab,tqdm,voluptuous,yaml,zc line_length=79 force_grid_wrap=0 use_parentheses=True diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 03b1e95678..f1713cc146 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -8,6 +8,7 @@ DvcIgnore, DvcIgnoreDirs, DvcIgnorePatterns, + DvcIgnorePatternsTrie, DvcIgnoreRepo, ) from dvc.repo import Repo @@ -98,12 +99,22 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): assert len(dvc.tree.dvcignore.ignores) == 3 assert DvcIgnoreDirs([".git", ".hg", ".dvc"]) in dvc.tree.dvcignore.ignores + ignore_pattern_trie = None + for ignore in dvc.tree.dvcignore.ignores: + if isinstance(ignore, DvcIgnorePatternsTrie): + ignore_pattern_trie = ignore + + print(os.fspath(top_ignore_file)) + print(os.fspath(ignore_file)) + + assert ignore_pattern_trie is not None assert ( - DvcIgnorePatterns( + DvcIgnorePatterns.from_files( os.fspath(top_ignore_file), WorkingTree(dvc.root_dir) ) - in dvc.tree.dvcignore.ignores + == ignore_pattern_trie[os.fspath(ignore_file)] ) + assert any( i for i in dvc.tree.dvcignore.ignores if isinstance(i, DvcIgnoreRepo) ) diff --git a/tests/unit/test_ignore.py b/tests/unit/test_ignore.py index 40f936d091..a1c1c8224c 100644 --- a/tests/unit/test_ignore.py +++ b/tests/unit/test_ignore.py @@ -9,7 +9,7 @@ def mock_dvcignore(dvcignore_path, patterns): tree = MagicMock() with patch.object(tree, "open", mock_open(read_data="\n".join(patterns))): - ignore_patterns = DvcIgnorePatterns(dvcignore_path, tree) + ignore_patterns = DvcIgnorePatterns.from_files(dvcignore_path, tree) return ignore_patterns From 516f59f2f9896f1db7072cdb01f7e335f4f1139e Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Fri, 26 Jun 2020 18:13:39 +0800 Subject: [PATCH 03/16] Add a new test with multi ignore files --- tests/func/test_ignore.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index f1713cc146..c4b899919f 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -184,3 +184,14 @@ def test_ignore_blank_line(tmp_dir, dvc): tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "foo\n\ndir/ignored") assert _files_set("dir", dvc.tree) == {"dir/other"} + + +def test_multi_ignore_file(tmp_dir, dvc, monkeypatch): + tmp_dir.gen({"dir": {"subdir": {"should_ignore": "1", "not_ignore": "1"}}}) + tmp_dir.gen(DvcIgnore.DVCIGNORE_FILE, "dir/subdir/*_ignore") + tmp_dir.gen({"dir": {DvcIgnore.DVCIGNORE_FILE: "!subdir/not_ignore"}}) + + assert _files_set("dir", dvc.tree) == { + "dir/subdir/not_ignore", + "dir/{}".format(DvcIgnore.DVCIGNORE_FILE), + } From ae0dfef22264da558f0ca3cad0448b1b890616d5 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sat, 27 Jun 2020 10:54:02 +0800 Subject: [PATCH 04/16] Solve merging two dvc files --- dvc/ignore.py | 53 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 6feab2454b..1647249567 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -30,9 +30,10 @@ def __init__(self, pattern_list, dirname): self.dirname = dirname self.prefix = self.dirname + os.sep - regex_pattern_list = map( - GitWildMatchPattern.pattern_to_regex, pattern_list + regex_pattern_list = list( + map(GitWildMatchPattern.pattern_to_regex, pattern_list) ) + self.regex_pattern_list = regex_pattern_list self.ignore_spec = [ (ignore, re.compile("|".join(item[0] for item in group))) @@ -45,7 +46,10 @@ def from_files(cls, ignore_file_path, tree): assert os.path.isabs(ignore_file_path) dirname = os.path.normpath(os.path.dirname(ignore_file_path)) with tree.open(ignore_file_path, encoding="utf-8") as fobj: - path_spec_lines = list(map(str.strip, fobj.readlines())) + path_spec_lines = [ + line for line in map(str.strip, fobj.readlines()) if line + ] + return cls(path_spec_lines, dirname) def __call__(self, root, dirs, files): @@ -87,6 +91,22 @@ def __eq__(self, other): self.pattern_list == other.pattern_list ) + def change_dirname(self, new_dirname): + prefix = new_dirname + os.sep + if new_dirname == self.dirname: + return self + if not self.dirname.startswith(prefix): + raise ValueError("change dirname can only change to parent path") + rel = self.dirname[len(prefix) :] + new_pattern_list = [] + for rule in self.pattern_list: + if rule.startswith("!"): + rule = f"!{rel}{os.sep}{rule[1:]}" + else: + rule = f"{rel}{os.sep}{rule}" + new_pattern_list.append(rule) + return DvcIgnorePatterns(new_pattern_list, new_dirname) + def __add__(self, other): if not other: return self @@ -94,21 +114,27 @@ def __add__(self, other): return NotImplemented if self.prefix.startswith(other.prefix): return DvcIgnorePatterns( - self.dirname, other.pattern_list + self.pattern_list + other.pattern_list + + self.change_dirname(other.dirname).pattern_list, + other.dirname, ) - elif other.prefix.startswith(self.prefix): + if other.prefix.startswith(self.prefix): return DvcIgnorePatterns( - other.dirname, self.pattern_list + other.pattern_list + self.pattern_list + + other.change_dirname(self.dirname).pattern_list, + self.dirname, ) - else: - return NotImplemented + return NotImplemented __radd__ = __add__ class DvcIgnorePatternsTrie(DvcIgnore): + trie = None + def __init__(self): - self.trie = StringTrie(separator=os.sep) + if self.trie is None: + self.trie = StringTrie(separator=os.sep) def __new__(cls, *args, **kwargs): if not hasattr(DvcIgnorePatterns, "_instance"): @@ -120,8 +146,7 @@ def __call__(self, root, dirs, files): ignore_pattern = self[root] if ignore_pattern: return ignore_pattern(root, dirs, files) - else: - return dirs, files + return dirs, files def __setitem__(self, root, ignore_pattern): base_pattern = self[root] @@ -131,8 +156,7 @@ def __getitem__(self, root): ignore_pattern = self.trie.longest_prefix(root) if ignore_pattern: return ignore_pattern.value - else: - return None + return None class DvcIgnoreDirs(DvcIgnore): @@ -306,3 +330,6 @@ def stat(self, path): @property def hash_jobs(self): return self.tree.hash_jobs + + def relative_path(self, abspath): + pass From 93d278abc2fabc833a82dbe0ea7dbc5c7ee7fe98 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sat, 27 Jun 2020 11:01:02 +0800 Subject: [PATCH 05/16] Solve Code Climate --- dvc/ignore.py | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 1647249567..5881a66659 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -30,10 +30,9 @@ def __init__(self, pattern_list, dirname): self.dirname = dirname self.prefix = self.dirname + os.sep - regex_pattern_list = list( - map(GitWildMatchPattern.pattern_to_regex, pattern_list) + regex_pattern_list = map( + GitWildMatchPattern.pattern_to_regex, pattern_list ) - self.regex_pattern_list = regex_pattern_list self.ignore_spec = [ (ignore, re.compile("|".join(item[0] for item in group))) @@ -110,20 +109,19 @@ def change_dirname(self, new_dirname): def __add__(self, other): if not other: return self - if not isinstance(other, DvcIgnorePatterns): - return NotImplemented - if self.prefix.startswith(other.prefix): - return DvcIgnorePatterns( - other.pattern_list - + self.change_dirname(other.dirname).pattern_list, - other.dirname, - ) - if other.prefix.startswith(self.prefix): - return DvcIgnorePatterns( - self.pattern_list - + other.change_dirname(self.dirname).pattern_list, - self.dirname, - ) + if isinstance(other, DvcIgnorePatterns): + if self.prefix.startswith(other.prefix): + return DvcIgnorePatterns( + other.pattern_list + + self.change_dirname(other.dirname).pattern_list, + other.dirname, + ) + if other.prefix.startswith(self.prefix): + return DvcIgnorePatterns( + self.pattern_list + + other.change_dirname(self.dirname).pattern_list, + self.dirname, + ) return NotImplemented __radd__ = __add__ From 25bfcd330b6363e4a3d11be72df1e456f9bba145 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sat, 27 Jun 2020 15:14:39 +0800 Subject: [PATCH 06/16] For Windows --- dvc/ignore.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dvc/ignore.py b/dvc/ignore.py index 5881a66659..e271147c03 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -103,6 +103,7 @@ def change_dirname(self, new_dirname): rule = f"!{rel}{os.sep}{rule[1:]}" else: rule = f"{rel}{os.sep}{rule}" + rule = normalize_file(rule) new_pattern_list.append(rule) return DvcIgnorePatterns(new_pattern_list, new_dirname) From 8cf7b0904e30c831cd7671f55d299bae31f29800 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sun, 28 Jun 2020 07:09:38 +0800 Subject: [PATCH 07/16] Complete addition of patterns. Add one test --- dvc/ignore.py | 54 +++++++++++++++++++++++++++++---------- tests/func/test_ignore.py | 20 +++++++++++++++ 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index e271147c03..42c45b8b87 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -90,6 +90,11 @@ def __eq__(self, other): self.pattern_list == other.pattern_list ) + def __bool__(self): + if self.pattern_list: + return True + return False + def change_dirname(self, new_dirname): prefix = new_dirname + os.sep if new_dirname == self.dirname: @@ -107,23 +112,44 @@ def change_dirname(self, new_dirname): new_pattern_list.append(rule) return DvcIgnorePatterns(new_pattern_list, new_dirname) + @staticmethod + def _longest_common_dir(dir1, dir2): + dir1_split = dir1.split(os.sep) + dir2_split = dir2.split(os.sep) + max_match = 0 + + for index, (i, j) in enumerate(zip(dir1_split, dir2_split)): + if i != j: + break + max_match = index + return os.sep.join(dir1_split[: max_match + 1]) + def __add__(self, other): + if not isinstance(other, DvcIgnorePatterns): + return NotImplemented + if not other: - return self - if isinstance(other, DvcIgnorePatterns): - if self.prefix.startswith(other.prefix): - return DvcIgnorePatterns( - other.pattern_list - + self.change_dirname(other.dirname).pattern_list, - other.dirname, + merged = self + elif not self: + merged = other + else: + longest_common_dir = self._longest_common_dir( + self.dirname, other.dirname + ) + self_to_lcd = self.change_dirname(longest_common_dir) + other_to_lcd = other.change_dirname(longest_common_dir) + if len(self.dirname) < len(other.dirname): + merged = DvcIgnorePatterns( + self_to_lcd.pattern_list + other_to_lcd.pattern_list, + longest_common_dir, ) - if other.prefix.startswith(self.prefix): - return DvcIgnorePatterns( - self.pattern_list - + other.change_dirname(self.dirname).pattern_list, - self.dirname, + else: + merged = DvcIgnorePatterns( + other_to_lcd.pattern_list + self_to_lcd.pattern_list, + longest_common_dir, ) - return NotImplemented + + return merged __radd__ = __add__ @@ -155,7 +181,7 @@ def __getitem__(self, root): ignore_pattern = self.trie.longest_prefix(root) if ignore_pattern: return ignore_pattern.value - return None + return DvcIgnorePatterns([], root) class DvcIgnoreDirs(DvcIgnore): diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index c4b899919f..61e7fdc0a7 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -195,3 +195,23 @@ def test_multi_ignore_file(tmp_dir, dvc, monkeypatch): "dir/subdir/not_ignore", "dir/{}".format(DvcIgnore.DVCIGNORE_FILE), } + + +def test_no_ignore_in_parent_dir(tmp_dir, dvc, monkeypatch): + tmp_dir.gen( + { + "dir": { + "subdir": {"should_ignore": "1", "not_ignore": "1"}, + "file1": "a", + } + } + ) + tmp_dir.gen( + {"dir": {"subdir": {DvcIgnore.DVCIGNORE_FILE: "should_ignore"}}} + ) + + assert _files_set("dir", dvc.tree) == { + "dir/file1", + "dir/subdir/not_ignore", + "dir/subdir/{}".format(DvcIgnore.DVCIGNORE_FILE), + } From 03579de3826bf6eec7586aa2a58b04b60b4534b0 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sun, 5 Jul 2020 15:22:18 +0800 Subject: [PATCH 08/16] Systematic test --- dvc/ignore.py | 54 ++++++++++- tests/func/test_ignore.py | 186 +++++++++++++++++++++++++++++++++++--- 2 files changed, 222 insertions(+), 18 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 42c45b8b87..b47829cbd3 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -86,6 +86,7 @@ def __hash__(self): def __eq__(self, other): if not isinstance(other, DvcIgnorePatterns): return NotImplemented + print(self.pattern_list, other.pattern_list) return (self.dirname == other.dirname) & ( self.pattern_list == other.pattern_list ) @@ -95,6 +96,52 @@ def __bool__(self): return True return False + @staticmethod + def _is_include(rule): + if rule.startswith("!"): + return True, rule[1:] + return False, rule + + @staticmethod + def _is_comment(rule): + if rule.startswith("#"): + return True + return False + + @staticmethod + def _remove_slash(rule): + if rule.startswith("\\"): + return rule[1:] + return rule + + @staticmethod + def _match_all_level(rule): + if rule[:-1].find("/") >= 0 and not rule.startswith("**/"): + if rule.startswith("/"): + rule = rule[1:] + return False, rule + if rule.startswith("**/"): + rule = rule[3:] + return True, rule + + def change_rule(self, rule, rel): + rule = rule.strip() + if self._is_comment(rule): + return rule + is_include, rule = self._is_include(rule) + match_all, rule = self._match_all_level(rule) + rule = self._remove_slash(rule) + if not match_all: + rule = f"/{rule}" + else: + rule = f"/**/{rule}" + if is_include: + rule = f"!/{rel}{rule}" + else: + rule = f"/{rel}{rule}" + rule = normalize_file(rule) + return rule + def change_dirname(self, new_dirname): prefix = new_dirname + os.sep if new_dirname == self.dirname: @@ -102,13 +149,10 @@ def change_dirname(self, new_dirname): if not self.dirname.startswith(prefix): raise ValueError("change dirname can only change to parent path") rel = self.dirname[len(prefix) :] + new_pattern_list = [] for rule in self.pattern_list: - if rule.startswith("!"): - rule = f"!{rel}{os.sep}{rule[1:]}" - else: - rule = f"{rel}{os.sep}{rule}" - rule = normalize_file(rule) + rule = self.change_rule(rule, rel) new_pattern_list.append(rule) return DvcIgnorePatterns(new_pattern_list, new_dirname) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 61e7fdc0a7..122ef48a63 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -104,9 +104,6 @@ def test_ignore_collecting_dvcignores(tmp_dir, dvc, dname): if isinstance(ignore, DvcIgnorePatternsTrie): ignore_pattern_trie = ignore - print(os.fspath(top_ignore_file)) - print(os.fspath(ignore_file)) - assert ignore_pattern_trie is not None assert ( DvcIgnorePatterns.from_files( @@ -197,21 +194,184 @@ def test_multi_ignore_file(tmp_dir, dvc, monkeypatch): } -def test_no_ignore_in_parent_dir(tmp_dir, dvc, monkeypatch): +@pytest.mark.parametrize( + "patterns, dirname, changed", + [ + # A line starting with # serves as a comment. + ("#comment", "dir", "#comment"), + # Put a backslash ("\") in front of the first hash for patterns that + # begin with a hash. + ("\\#hash", "dir", "/dir/**/#hash"), + ("\\#hash", "#dir", "/#dir/**/#hash"), + # Trailing spaces are ignored unless they are quoted with + # backslash ("\"). + (" space", "dir", "/dir/**/space"), + ("\\ space", "dir", "/dir/**/ space"), + # An optional prefix "!" which negates the pattern; + ("!include", "dir", "!/dir/**/include"), + # Put a backslash ("\") in front of the first "!" for patterns that + # begin with a literal "!", for example, "\!important!.txt". + ("\\!important!.txt", "dir", "/dir/**/!important!.txt"), + # If there is a separator at the beginning or middle (or both) of the + # pattern, then the pattern is relative to the directory level of the + # particular .gitignore file itself. + ("/separator.txt", "dir", "/dir/separator.txt"), + ("subdir/separator.txt", "dir", "/dir/subdir/separator.txt"), + # Otherwise the pattern may also match at any level below + # the .gitignore level. + ("no_sep", "dir", "/dir/**/no_sep"), + # If there is a separator at the end of the pattern then the pattern + # will only match directories, otherwise the pattern can match both + # files and directories. + ("doc/fortz/", "dir", "/dir/doc/fortz/"), + ("fortz/", "dir", "/dir/**/fortz/"), + # An asterisk "*" matches anything except a slash. + ("*aste*risk*", "dir", "/dir/**/*aste*risk*"), + # The character "?" matches any one character except "/". + ("?fi?le?", "dir", "/dir/**/?fi?le?"), + # The range notation, e.g. [a-zA-Z], can be used to match one of the + # characters in a range. See fnmatch(3) and the FNM_PATHNAME flag + # for a more detailed description. + ("[a-zA-Z]file[a-zA-Z]", "dir", "/dir/**/[a-zA-Z]file[a-zA-Z]"), + # Two consecutive asterisks ("**") in patterns matched against full + # pathname may have special meaning: + # A leading "**" followed by a slash means match in all directories. + # For example, "**/foo" matches file or directory "foo" anywhere, + # the same as pattern "foo". + ("**/foo", "dir", "/dir/**/foo"), + # "**/foo/bar" matches file or directory "bar" anywhere that is + # directly under directory "foo". + ("**/foo/bar", "dir", "/dir/**/foo/bar"), + # A trailing "/**" matches everything inside. + # For example, "abc/**" matches all files inside directory "abc", + # relative to the location of the .gitignore file, with infinite depth. + ("abc/**", "dir", "/dir/abc/**"), + # A slash followed by two consecutive asterisks then a slash matches + # zero or more directories. For example, "a/**/b" + # matches "a/b", "a/x/b", "a/x/y/b" and so on. + ("a/**/b", "dir", "/dir/a/**/b"), + # Other consecutive asterisks are considered regular asterisks and + # will match according to the previous rules. + ("/***.txt", "dir", "/dir/***.txt"), + ("data/***", "dir", "/dir/data/***"), + ("***/file.txt", "dir", "/dir/***/file.txt"), + ("***file", "dir", "/dir/**/***file"), + ("a/***/b", "dir", "/dir/a/***/b"), + ], +) +def test_dvcignore_pattern_change_dir( + tmp_dir, dvc, patterns, dirname, changed +): tmp_dir.gen( { - "dir": { - "subdir": {"should_ignore": "1", "not_ignore": "1"}, - "file1": "a", + "parent": { + dirname: {DvcIgnore.DVCIGNORE_FILE: patterns}, + "subdir": {}, } } ) + ignore_pattern_trie = None + for ignore in dvc.tree.dvcignore.ignores: + if isinstance(ignore, DvcIgnorePatternsTrie): + ignore_pattern_trie = ignore + break + + assert ignore_pattern_trie is not None + ignore_pattern = ignore_pattern_trie[ + os.fspath(tmp_dir / "parent" / dirname) + ] + ignore_pattern_changed = ignore_pattern.change_dirname( + os.fspath(tmp_dir / "parent") + ) + assert ( + DvcIgnorePatterns([changed], os.fspath(tmp_dir / "parent")) + == ignore_pattern_changed + ) + + +def test_dvcignore_pattern_merge(tmp_dir, dvc): tmp_dir.gen( - {"dir": {"subdir": {DvcIgnore.DVCIGNORE_FILE: "should_ignore"}}} + { + "top": { + "first": { + DvcIgnore.DVCIGNORE_FILE: "a\nb\nc", + "middle": { + "second": { + DvcIgnore.DVCIGNORE_FILE: "d\ne\nf", + "bottom": {}, + } + }, + }, + }, + "other": {DvcIgnore.DVCIGNORE_FILE: "1\n2\n3"}, + } ) + ignore_pattern_trie = None + for ignore in dvc.tree.dvcignore.ignores: + if isinstance(ignore, DvcIgnorePatternsTrie): + ignore_pattern_trie = ignore + break - assert _files_set("dir", dvc.tree) == { - "dir/file1", - "dir/subdir/not_ignore", - "dir/subdir/{}".format(DvcIgnore.DVCIGNORE_FILE), - } + assert ignore_pattern_trie is not None + ignore_pattern_top = ignore_pattern_trie[os.fspath(tmp_dir / "top")] + ignore_pattern_other = ignore_pattern_trie[os.fspath(tmp_dir / "other")] + ignore_pattern_first = ignore_pattern_trie[ + os.fspath(tmp_dir / "top" / "first") + ] + ignore_pattern_middle = ignore_pattern_trie[ + os.fspath(tmp_dir / "top" / "first" / "middle") + ] + ignore_pattern_second = ignore_pattern_trie[ + os.fspath(tmp_dir / "top" / "first" / "middle" / "second") + ] + ignore_pattern_bottom = ignore_pattern_trie[ + os.fspath(tmp_dir / "top" / "first" / "middle" / "second" / "bottom") + ] + assert not ignore_pattern_top + assert ( + DvcIgnorePatterns([], os.fspath(tmp_dir / "top")) == ignore_pattern_top + ) + assert ( + DvcIgnorePatterns(["1", "2", "3"], os.fspath(tmp_dir / "other")) + == ignore_pattern_other + ) + assert ( + DvcIgnorePatterns( + ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") + ) + == ignore_pattern_first + ) + assert ( + DvcIgnorePatterns( + ["a", "b", "c"], os.fspath(tmp_dir / "top" / "first") + ) + == ignore_pattern_middle + ) + assert ( + DvcIgnorePatterns( + [ + "a", + "b", + "c", + "/middle/second/**/d", + "/middle/second/**/e", + "/middle/second/**/f", + ], + os.fspath(tmp_dir / "top" / "first"), + ) + == ignore_pattern_second + ) + assert ( + DvcIgnorePatterns( + [ + "a", + "b", + "c", + "/middle/second/**/d", + "/middle/second/**/e", + "/middle/second/**/f", + ], + os.fspath(tmp_dir / "top" / "first"), + ) + == ignore_pattern_bottom + ) From 640bf6f8951e1f59cdf4de9051e1b0caa435cf04 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sun, 5 Jul 2020 15:32:15 +0800 Subject: [PATCH 09/16] Change request --- dvc/ignore.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index b47829cbd3..d1e0f50d0c 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -143,12 +143,11 @@ def change_rule(self, rule, rel): return rule def change_dirname(self, new_dirname): - prefix = new_dirname + os.sep if new_dirname == self.dirname: return self - if not self.dirname.startswith(prefix): + rel = os.path.relpath(self.dirname, new_dirname) + if rel.startswith(".."): raise ValueError("change dirname can only change to parent path") - rel = self.dirname[len(prefix) :] new_pattern_list = [] for rule in self.pattern_list: From bb8a2461656a35cc14ff2099e858c862180098cd Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Mon, 6 Jul 2020 21:01:53 +0800 Subject: [PATCH 10/16] Change request --- dvc/ignore.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index d1e0f50d0c..7421e60998 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -86,27 +86,20 @@ def __hash__(self): def __eq__(self, other): if not isinstance(other, DvcIgnorePatterns): return NotImplemented - print(self.pattern_list, other.pattern_list) return (self.dirname == other.dirname) & ( self.pattern_list == other.pattern_list ) def __bool__(self): - if self.pattern_list: - return True - return False + return bool(self.pattern_list) @staticmethod - def _is_include(rule): - if rule.startswith("!"): - return True, rule[1:] - return False, rule + def _include_rule(rule): + return (True, rule[1:]) if rule.startswith("!") else (False, rule) @staticmethod def _is_comment(rule): - if rule.startswith("#"): - return True - return False + return rule.startswith("#") @staticmethod def _remove_slash(rule): @@ -128,7 +121,7 @@ def change_rule(self, rule, rel): rule = rule.strip() if self._is_comment(rule): return rule - is_include, rule = self._is_include(rule) + is_include, rule = self._include_rule(rule) match_all, rule = self._match_all_level(rule) rule = self._remove_slash(rule) if not match_all: From cf3d2357f864b2da888ea9f0ace00352ad973bbf Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Thu, 9 Jul 2020 19:53:04 +0800 Subject: [PATCH 11/16] Seperate path sepcification math --- dvc/ignore.py | 105 +++---------------------------- dvc/pathspec_math.py | 94 +++++++++++++++++++++++++++ tests/func/test_ignore.py | 95 ---------------------------- tests/unit/test_pathspec_math.py | 72 +++++++++++++++++++++ 4 files changed, 174 insertions(+), 192 deletions(-) create mode 100644 dvc/pathspec_math.py create mode 100644 tests/unit/test_pathspec_math.py diff --git a/dvc/ignore.py b/dvc/ignore.py index 9990fd72a0..239b2cc3d5 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -9,6 +9,7 @@ from pygtrie import StringTrie from dvc.path_info import PathInfo +from dvc.pathspec_math import join_pattern from dvc.scm.tree import BaseTree from dvc.system import System from dvc.utils import relpath @@ -99,102 +100,6 @@ def __eq__(self, other): def __bool__(self): return bool(self.pattern_list) - @staticmethod - def _include_rule(rule): - return (True, rule[1:]) if rule.startswith("!") else (False, rule) - - @staticmethod - def _is_comment(rule): - return rule.startswith("#") - - @staticmethod - def _remove_slash(rule): - if rule.startswith("\\"): - return rule[1:] - return rule - - @staticmethod - def _match_all_level(rule): - if rule[:-1].find("/") >= 0 and not rule.startswith("**/"): - if rule.startswith("/"): - rule = rule[1:] - return False, rule - if rule.startswith("**/"): - rule = rule[3:] - return True, rule - - def change_rule(self, rule, rel): - rule = rule.strip() - if self._is_comment(rule): - return rule - is_include, rule = self._include_rule(rule) - match_all, rule = self._match_all_level(rule) - rule = self._remove_slash(rule) - if not match_all: - rule = f"/{rule}" - else: - rule = f"/**/{rule}" - if is_include: - rule = f"!/{rel}{rule}" - else: - rule = f"/{rel}{rule}" - rule = normalize_file(rule) - return rule - - def change_dirname(self, new_dirname): - if new_dirname == self.dirname: - return self - rel = os.path.relpath(self.dirname, new_dirname) - if rel.startswith(".."): - raise ValueError("change dirname can only change to parent path") - - new_pattern_list = [] - for rule in self.pattern_list: - rule = self.change_rule(rule, rel) - new_pattern_list.append(rule) - return DvcIgnorePatterns(new_pattern_list, new_dirname) - - @staticmethod - def _longest_common_dir(dir1, dir2): - dir1_split = dir1.split(os.sep) - dir2_split = dir2.split(os.sep) - max_match = 0 - - for index, (i, j) in enumerate(zip(dir1_split, dir2_split)): - if i != j: - break - max_match = index - return os.sep.join(dir1_split[: max_match + 1]) - - def __add__(self, other): - if not isinstance(other, DvcIgnorePatterns): - return NotImplemented - - if not other: - merged = self - elif not self: - merged = other - else: - longest_common_dir = self._longest_common_dir( - self.dirname, other.dirname - ) - self_to_lcd = self.change_dirname(longest_common_dir) - other_to_lcd = other.change_dirname(longest_common_dir) - if len(self.dirname) < len(other.dirname): - merged = DvcIgnorePatterns( - self_to_lcd.pattern_list + other_to_lcd.pattern_list, - longest_common_dir, - ) - else: - merged = DvcIgnorePatterns( - other_to_lcd.pattern_list + self_to_lcd.pattern_list, - longest_common_dir, - ) - - return merged - - __radd__ = __add__ - class DvcIgnorePatternsTrie(DvcIgnore): trie = None @@ -217,7 +122,13 @@ def __call__(self, root, dirs, files): def __setitem__(self, root, ignore_pattern): base_pattern = self[root] - self.trie[root] = base_pattern + ignore_pattern + common_dirname, merged_pattern = join_pattern( + base_pattern.dirname, + base_pattern.pattern_list, + ignore_pattern.dirname, + ignore_pattern.pattern_list, + ) + self.trie[root] = DvcIgnorePatterns(merged_pattern, common_dirname) def __getitem__(self, root): ignore_pattern = self.trie.longest_prefix(root) diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py new file mode 100644 index 0000000000..3084298361 --- /dev/null +++ b/dvc/pathspec_math.py @@ -0,0 +1,94 @@ +# Path Specification Pattern Math +# Including changing base dir of path specification patterns and merging +# of two path specification patterns with different base +# All the operations follow the documents of `gitignore` +import os + +from pathspec.util import normalize_file + + +def _include_rule(rule): + return (True, rule[1:]) if rule.startswith("!") else (False, rule) + + +def _is_comment(rule): + return rule.startswith("#") + + +def _remove_slash(rule): + if rule.startswith("\\"): + return rule[1:] + return rule + + +def _match_all_level(rule): + if rule[:-1].find("/") >= 0 and not rule.startswith("**/"): + if rule.startswith("/"): + rule = rule[1:] + return False, rule + if rule.startswith("**/"): + rule = rule[3:] + return True, rule + + +def change_rule(rule, rel): + rule = rule.strip() + if _is_comment(rule): + return rule + is_include, rule = _include_rule(rule) + match_all, rule = _match_all_level(rule) + rule = _remove_slash(rule) + if not match_all: + rule = f"/{rule}" + else: + rule = f"/**/{rule}" + if is_include: + rule = f"!/{rel}{rule}" + else: + rule = f"/{rel}{rule}" + rule = normalize_file(rule) + return rule + + +def change_dirname(dirname, pattern_list, new_dirname): + if new_dirname == dirname: + return pattern_list + rel = os.path.relpath(dirname, new_dirname) + if rel.startswith(".."): + raise ValueError("change dirname can only change to parent path") + + new_pattern_list = [] + for rule in pattern_list: + rule = change_rule(rule, rel) + new_pattern_list.append(rule) + return new_pattern_list + + +def _longest_common_dir(dir1, dir2): + dir1_split = dir1.split(os.sep) + dir2_split = dir2.split(os.sep) + max_match = 0 + + for index, (i, j) in enumerate(zip(dir1_split, dir2_split)): + if i != j: + break + max_match = index + return os.sep.join(dir1_split[: max_match + 1]) + + +def join_pattern(prefix_a, pattern_a, prefix_b, pattern_b): + if not pattern_a: + return prefix_b, pattern_b + elif not pattern_b: + return prefix_a, pattern_a + + longest_common_dir = _longest_common_dir(prefix_a, prefix_b) + new_pattern_a = change_dirname(prefix_a, pattern_a, longest_common_dir) + new_pattern_b = change_dirname(prefix_b, pattern_b, longest_common_dir) + + if len(prefix_a) < len(prefix_b): + merged_pattern = new_pattern_a + new_pattern_b + else: + merged_pattern = new_pattern_b + new_pattern_a + + return longest_common_dir, merged_pattern diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 9f01d3971c..36abcbf7ff 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -257,101 +257,6 @@ def test_multi_ignore_file(tmp_dir, dvc, monkeypatch): } -@pytest.mark.parametrize( - "patterns, dirname, changed", - [ - # A line starting with # serves as a comment. - ("#comment", "dir", "#comment"), - # Put a backslash ("\") in front of the first hash for patterns that - # begin with a hash. - ("\\#hash", "dir", "/dir/**/#hash"), - ("\\#hash", "#dir", "/#dir/**/#hash"), - # Trailing spaces are ignored unless they are quoted with - # backslash ("\"). - (" space", "dir", "/dir/**/space"), - ("\\ space", "dir", "/dir/**/ space"), - # An optional prefix "!" which negates the pattern; - ("!include", "dir", "!/dir/**/include"), - # Put a backslash ("\") in front of the first "!" for patterns that - # begin with a literal "!", for example, "\!important!.txt". - ("\\!important!.txt", "dir", "/dir/**/!important!.txt"), - # If there is a separator at the beginning or middle (or both) of the - # pattern, then the pattern is relative to the directory level of the - # particular .gitignore file itself. - ("/separator.txt", "dir", "/dir/separator.txt"), - ("subdir/separator.txt", "dir", "/dir/subdir/separator.txt"), - # Otherwise the pattern may also match at any level below - # the .gitignore level. - ("no_sep", "dir", "/dir/**/no_sep"), - # If there is a separator at the end of the pattern then the pattern - # will only match directories, otherwise the pattern can match both - # files and directories. - ("doc/fortz/", "dir", "/dir/doc/fortz/"), - ("fortz/", "dir", "/dir/**/fortz/"), - # An asterisk "*" matches anything except a slash. - ("*aste*risk*", "dir", "/dir/**/*aste*risk*"), - # The character "?" matches any one character except "/". - ("?fi?le?", "dir", "/dir/**/?fi?le?"), - # The range notation, e.g. [a-zA-Z], can be used to match one of the - # characters in a range. See fnmatch(3) and the FNM_PATHNAME flag - # for a more detailed description. - ("[a-zA-Z]file[a-zA-Z]", "dir", "/dir/**/[a-zA-Z]file[a-zA-Z]"), - # Two consecutive asterisks ("**") in patterns matched against full - # pathname may have special meaning: - # A leading "**" followed by a slash means match in all directories. - # For example, "**/foo" matches file or directory "foo" anywhere, - # the same as pattern "foo". - ("**/foo", "dir", "/dir/**/foo"), - # "**/foo/bar" matches file or directory "bar" anywhere that is - # directly under directory "foo". - ("**/foo/bar", "dir", "/dir/**/foo/bar"), - # A trailing "/**" matches everything inside. - # For example, "abc/**" matches all files inside directory "abc", - # relative to the location of the .gitignore file, with infinite depth. - ("abc/**", "dir", "/dir/abc/**"), - # A slash followed by two consecutive asterisks then a slash matches - # zero or more directories. For example, "a/**/b" - # matches "a/b", "a/x/b", "a/x/y/b" and so on. - ("a/**/b", "dir", "/dir/a/**/b"), - # Other consecutive asterisks are considered regular asterisks and - # will match according to the previous rules. - ("/***.txt", "dir", "/dir/***.txt"), - ("data/***", "dir", "/dir/data/***"), - ("***/file.txt", "dir", "/dir/***/file.txt"), - ("***file", "dir", "/dir/**/***file"), - ("a/***/b", "dir", "/dir/a/***/b"), - ], -) -def test_dvcignore_pattern_change_dir( - tmp_dir, dvc, patterns, dirname, changed -): - tmp_dir.gen( - { - "parent": { - dirname: {DvcIgnore.DVCIGNORE_FILE: patterns}, - "subdir": {}, - } - } - ) - ignore_pattern_trie = None - for ignore in dvc.tree.dvcignore.ignores: - if isinstance(ignore, DvcIgnorePatternsTrie): - ignore_pattern_trie = ignore - break - - assert ignore_pattern_trie is not None - ignore_pattern = ignore_pattern_trie[ - os.fspath(tmp_dir / "parent" / dirname) - ] - ignore_pattern_changed = ignore_pattern.change_dirname( - os.fspath(tmp_dir / "parent") - ) - assert ( - DvcIgnorePatterns([changed], os.fspath(tmp_dir / "parent")) - == ignore_pattern_changed - ) - - def test_dvcignore_pattern_merge(tmp_dir, dvc): tmp_dir.gen( { diff --git a/tests/unit/test_pathspec_math.py b/tests/unit/test_pathspec_math.py new file mode 100644 index 0000000000..0cc4e29a83 --- /dev/null +++ b/tests/unit/test_pathspec_math.py @@ -0,0 +1,72 @@ +import pytest + +from dvc.pathspec_math import change_dirname + + +@pytest.mark.parametrize( + "patterns, dirname, changed", + [ + # A line starting with # serves as a comment. + ("#comment", "/dir", "#comment"), + # Put a backslash ("\") in front of the first hash for patterns that + # begin with a hash. + ("\\#hash", "/dir", "/dir/**/#hash"), + ("\\#hash", "/#dir", "/#dir/**/#hash"), + # Trailing spaces are ignored unless they are quoted with + # backslash ("\"). + (" space", "/dir", "/dir/**/space"), + ("\\ space", "/dir", "/dir/**/ space"), + # An optional prefix "!" which negates the pattern; + ("!include", "/dir", "!/dir/**/include"), + # Put a backslash ("\") in front of the first "!" for patterns that + # begin with a literal "!", for example, "\!important!.txt". + ("\\!important!.txt", "/dir", "/dir/**/!important!.txt"), + # If there is a separator at the beginning or middle (or both) of the + # pattern, then the pattern is relative to the directory level of the + # particular .gitignore file itself. + ("/separator.txt", "/dir", "/dir/separator.txt"), + ("subdir/separator.txt", "/dir", "/dir/subdir/separator.txt"), + # Otherwise the pattern may also match at any level below + # the .gitignore level. + ("no_sep", "/dir", "/dir/**/no_sep"), + # If there is a separator at the end of the pattern then the pattern + # will only match directories, otherwise the pattern can match both + # files and directories. + ("doc/fortz/", "/dir", "/dir/doc/fortz/"), + ("fortz/", "/dir", "/dir/**/fortz/"), + # An asterisk "*" matches anything except a slash. + ("*aste*risk*", "/dir", "/dir/**/*aste*risk*"), + # The character "?" matches any one character except "/". + ("?fi?le?", "/dir", "/dir/**/?fi?le?"), + # The range notation, e.g. [a-zA-Z], can be used to match one of the + # characters in a range. See fnmatch(3) and the FNM_PATHNAME flag + # for a more detailed description. + ("[a-zA-Z]file[a-zA-Z]", "/dir", "/dir/**/[a-zA-Z]file[a-zA-Z]"), + # Two consecutive asterisks ("**") in patterns matched against full + # pathname may have special meaning: + # A leading "**" followed by a slash means match in all directories. + # For example, "**/foo" matches file or directory "foo" anywhere, + # the same as pattern "foo". + ("**/foo", "/dir", "/dir/**/foo"), + # "**/foo/bar" matches file or directory "bar" anywhere that is + # directly under directory "foo". + ("**/foo/bar", "/dir", "/dir/**/foo/bar"), + # A trailing "/**" matches everything inside. + # For example, "abc/**" matches all files inside directory "abc", + # relative to the location of the .gitignore file, with infinite depth. + ("abc/**", "/dir", "/dir/abc/**"), + # A slash followed by two consecutive asterisks then a slash matches + # zero or more directories. For example, "a/**/b" + # matches "a/b", "a/x/b", "a/x/y/b" and so on. + ("a/**/b", "/dir", "/dir/a/**/b"), + # Other consecutive asterisks are considered regular asterisks and + # will match according to the previous rules. + ("/***.txt", "/dir", "/dir/***.txt"), + ("data/***", "/dir", "/dir/data/***"), + ("***/file.txt", "/dir", "/dir/***/file.txt"), + ("***file", "/dir", "/dir/**/***file"), + ("a/***/b", "/dir", "/dir/a/***/b"), + ], +) +def test_dvcignore_pattern_change_dir(tmp_dir, patterns, dirname, changed): + assert change_dirname(dirname, [patterns], "/") == [changed] From ec723bf08013c64cc9aed20b43e51fd6a8b59392 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sat, 11 Jul 2020 11:46:15 +0800 Subject: [PATCH 12/16] Rename and add comment --- dvc/ignore.py | 4 ++-- dvc/pathspec_math.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 239b2cc3d5..03b12869f3 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -9,7 +9,7 @@ from pygtrie import StringTrie from dvc.path_info import PathInfo -from dvc.pathspec_math import join_pattern +from dvc.pathspec_math import merge_patterns from dvc.scm.tree import BaseTree from dvc.system import System from dvc.utils import relpath @@ -122,7 +122,7 @@ def __call__(self, root, dirs, files): def __setitem__(self, root, ignore_pattern): base_pattern = self[root] - common_dirname, merged_pattern = join_pattern( + common_dirname, merged_pattern = merge_patterns( base_pattern.dirname, base_pattern.pattern_list, ignore_pattern.dirname, diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 3084298361..35a8824f1f 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -76,7 +76,14 @@ def _longest_common_dir(dir1, dir2): return os.sep.join(dir1_split[: max_match + 1]) -def join_pattern(prefix_a, pattern_a, prefix_b, pattern_b): +def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): + """ + Merge two path specification patterns. + + This implementation merge two path specification patterns on different + bases. It returns the longest common parent directory, and the patterns + based on this new base directory. + """ if not pattern_a: return prefix_b, pattern_b elif not pattern_b: From c629d30870044737b3239c729732d67862c03029 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Sun, 12 Jul 2020 09:27:21 +0800 Subject: [PATCH 13/16] rename change_dirname to private --- dvc/pathspec_math.py | 6 +++--- tests/unit/test_pathspec_math.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 35a8824f1f..a961eca9ca 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -50,7 +50,7 @@ def change_rule(rule, rel): return rule -def change_dirname(dirname, pattern_list, new_dirname): +def _change_dirname(dirname, pattern_list, new_dirname): if new_dirname == dirname: return pattern_list rel = os.path.relpath(dirname, new_dirname) @@ -90,8 +90,8 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): return prefix_a, pattern_a longest_common_dir = _longest_common_dir(prefix_a, prefix_b) - new_pattern_a = change_dirname(prefix_a, pattern_a, longest_common_dir) - new_pattern_b = change_dirname(prefix_b, pattern_b, longest_common_dir) + new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir) + new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir) if len(prefix_a) < len(prefix_b): merged_pattern = new_pattern_a + new_pattern_b diff --git a/tests/unit/test_pathspec_math.py b/tests/unit/test_pathspec_math.py index 0cc4e29a83..1cc6837153 100644 --- a/tests/unit/test_pathspec_math.py +++ b/tests/unit/test_pathspec_math.py @@ -1,6 +1,6 @@ import pytest -from dvc.pathspec_math import change_dirname +from dvc.pathspec_math import _change_dirname @pytest.mark.parametrize( @@ -69,4 +69,4 @@ ], ) def test_dvcignore_pattern_change_dir(tmp_dir, patterns, dirname, changed): - assert change_dirname(dirname, [patterns], "/") == [changed] + assert _change_dirname(dirname, [patterns], "/") == [changed] From de227283c6d9ae5c085e50233473fcef8c42c142 Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Mon, 13 Jul 2020 17:54:13 +0800 Subject: [PATCH 14/16] Update dvc/pathspec_math.py list comprehension Co-authored-by: Alexander Schepanovski --- dvc/pathspec_math.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index a961eca9ca..38d680dc77 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -57,11 +57,7 @@ def _change_dirname(dirname, pattern_list, new_dirname): if rel.startswith(".."): raise ValueError("change dirname can only change to parent path") - new_pattern_list = [] - for rule in pattern_list: - rule = change_rule(rule, rel) - new_pattern_list.append(rule) - return new_pattern_list + return [change_rule(rule, rel) for rule in pattern_list] def _longest_common_dir(dir1, dir2): From 012bb4afc6cf33b72ac8fb2759d1895d8978ca7e Mon Sep 17 00:00:00 2001 From: karajan1001 Date: Mon, 13 Jul 2020 18:57:57 +0800 Subject: [PATCH 15/16] Change request --- dvc/ignore.py | 22 ++++++++-------------- dvc/pathspec_math.py | 20 ++++---------------- tests/func/test_ignore.py | 2 +- 3 files changed, 13 insertions(+), 31 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 03b12869f3..124bcf08fa 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -108,12 +108,6 @@ def __init__(self): if self.trie is None: self.trie = StringTrie(separator=os.sep) - def __new__(cls, *args, **kwargs): - if not hasattr(DvcIgnorePatterns, "_instance"): - if not hasattr(DvcIgnorePatterns, "_instance"): - DvcIgnorePatterns._instance = object.__new__(cls) - return DvcIgnorePatterns._instance - def __call__(self, root, dirs, files): ignore_pattern = self[root] if ignore_pattern: @@ -176,19 +170,19 @@ def __init__(self, tree, root_dir): DvcIgnoreDirs([".git", ".hg", ".dvc"]), DvcIgnoreRepo(), } + ignore_pattern_trie = DvcIgnorePatternsTrie() for root, dirs, _ in self.tree.walk(self.root_dir): - self._update(root) + ignore_pattern = self._get_ignore_pattern(root) + if ignore_pattern: + ignore_pattern_trie[root] = ignore_pattern + self.ignores.add(ignore_pattern_trie) dirs[:], _ = self(root, dirs, []) - def _update(self, dirname): + def _get_ignore_pattern(self, dirname): ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) if self.tree.exists(ignore_file_path): - ignore_pattern = DvcIgnorePatterns.from_files( - ignore_file_path, self.tree - ) - ignore_pattern_trie = DvcIgnorePatternsTrie() - ignore_pattern_trie[dirname] = ignore_pattern - self.ignores.add(ignore_pattern_trie) + return DvcIgnorePatterns.from_files(ignore_file_path, self.tree) + return None def __call__(self, root, dirs, files): for ignore in self.ignores: diff --git a/dvc/pathspec_math.py b/dvc/pathspec_math.py index 38d680dc77..7377fe4bc4 100644 --- a/dvc/pathspec_math.py +++ b/dvc/pathspec_math.py @@ -7,7 +7,7 @@ from pathspec.util import normalize_file -def _include_rule(rule): +def _not_ignore(rule): return (True, rule[1:]) if rule.startswith("!") else (False, rule) @@ -35,14 +35,14 @@ def change_rule(rule, rel): rule = rule.strip() if _is_comment(rule): return rule - is_include, rule = _include_rule(rule) + not_ignore, rule = _not_ignore(rule) match_all, rule = _match_all_level(rule) rule = _remove_slash(rule) if not match_all: rule = f"/{rule}" else: rule = f"/**/{rule}" - if is_include: + if not_ignore: rule = f"!/{rel}{rule}" else: rule = f"/{rel}{rule}" @@ -60,18 +60,6 @@ def _change_dirname(dirname, pattern_list, new_dirname): return [change_rule(rule, rel) for rule in pattern_list] -def _longest_common_dir(dir1, dir2): - dir1_split = dir1.split(os.sep) - dir2_split = dir2.split(os.sep) - max_match = 0 - - for index, (i, j) in enumerate(zip(dir1_split, dir2_split)): - if i != j: - break - max_match = index - return os.sep.join(dir1_split[: max_match + 1]) - - def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): """ Merge two path specification patterns. @@ -85,7 +73,7 @@ def merge_patterns(prefix_a, pattern_a, prefix_b, pattern_b): elif not pattern_b: return prefix_a, pattern_a - longest_common_dir = _longest_common_dir(prefix_a, prefix_b) + longest_common_dir = os.path.commonpath([prefix_a, prefix_b]) new_pattern_a = _change_dirname(prefix_a, pattern_a, longest_common_dir) new_pattern_b = _change_dirname(prefix_b, pattern_b, longest_common_dir) diff --git a/tests/func/test_ignore.py b/tests/func/test_ignore.py index 36abcbf7ff..c9b9feea14 100644 --- a/tests/func/test_ignore.py +++ b/tests/func/test_ignore.py @@ -257,7 +257,7 @@ def test_multi_ignore_file(tmp_dir, dvc, monkeypatch): } -def test_dvcignore_pattern_merge(tmp_dir, dvc): +def test_pattern_trie_tree(tmp_dir, dvc): tmp_dir.gen( { "top": { From e8bb06ac0416b122c7047eb1a645f8e325cc5dc1 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Tue, 14 Jul 2020 22:28:51 +0000 Subject: [PATCH 16/16] Restyled by black --- dvc/ignore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 1f250e4fd5..2ab78ea00a 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -304,4 +304,4 @@ def hash_jobs(self): return self.tree.hash_jobs def makedirs(self, path, mode=0o777, exist_ok=True): - self.tree.makedirs(path, mode=mode, exist_ok=exist_ok) \ No newline at end of file + self.tree.makedirs(path, mode=mode, exist_ok=exist_ok)