From 88addc340ecb2d611d7e081febcb932b5ef4756d Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Fri, 24 Jul 2020 22:48:24 +0300 Subject: [PATCH] ignore: dynamically collect dvcignore This allows us to avoid collecting dvcignore for the whole repo if we only care about particular paths. As a result, in a repo with 2 datasets (2M + 0.5M files), creating a defunct stage takes ~4sec on 1.2.0, but ~1sec (most of it is actually dvc module initialization) with this PR. This is also a prerequisite for dynamic dvcignore and subrepo collection (https://github.com/iterative/dvc/pull/4247) while walking the tree. Also, it is important to clarify that regular `dvc status` (without arguments) has the same performance after this PR, because when we check a dataset for changes, we call things like `tree.exists()`, which call dvcignore and make it collect dvcignore in the dataset itself, so we still end up collecting dvcignore for the whole repo (including walking into the datasets). This should be solved soon by telling dvcignore that it shouldn't walk into the datasets searching for `.dvcignore`s. 
--- dvc/ignore.py | 50 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/dvc/ignore.py b/dvc/ignore.py index 831486209f..a87e750c47 100644 --- a/dvc/ignore.py +++ b/dvc/ignore.py @@ -1,7 +1,7 @@ import logging import os import re -from itertools import groupby +from itertools import groupby, takewhile from pathspec.patterns import GitWildMatchPattern from pathspec.util import normalize_file @@ -131,20 +131,19 @@ def __init__(self, tree, root_dir): self.ignores_trie_tree[root_dir] = DvcIgnorePatterns( default_ignore_patterns, root_dir ) - for root, dirs, _ in self.tree.walk( - self.root_dir, use_dvcignore=False - ): - self._update(root) - self._update_sub_repo(root, dirs) - dirs[:], _ = self(root, dirs, []) + self._update(self.root_dir) def _update(self, dirname): + old_pattern = self.ignores_trie_tree.longest_prefix(dirname).value + matches = old_pattern.matches(dirname, DvcIgnore.DVCIGNORE_FILE, False) + ignore_file_path = os.path.join(dirname, DvcIgnore.DVCIGNORE_FILE) - if self.tree.exists(ignore_file_path, use_dvcignore=False): + if not matches and self.tree.exists( + ignore_file_path, use_dvcignore=False + ): new_pattern = DvcIgnorePatterns.from_files( ignore_file_path, self.tree ) - old_pattern = self._get_trie_pattern(dirname) if old_pattern: self.ignores_trie_tree[dirname] = DvcIgnorePatterns( *merge_patterns( @@ -156,11 +155,18 @@ def _update(self, dirname): ) else: self.ignores_trie_tree[dirname] = new_pattern + elif old_pattern: + self.ignores_trie_tree[dirname] = old_pattern + + # NOTE: using `walk` + `break` because tree doesn't have `listdir()` + for root, dirs, _ in self.tree.walk(dirname, use_dvcignore=False): + self._update_sub_repo(root, dirs) + break def _update_sub_repo(self, root, dirs): for d in dirs: if self._is_dvc_repo(root, d): - old_pattern = self._get_trie_pattern(root) + old_pattern = self.ignores_trie_tree.longest_prefix(root).value if old_pattern: 
self.ignores_trie_tree[root] = DvcIgnorePatterns( *merge_patterns( @@ -183,8 +189,28 @@ def __call__(self, root, dirs, files): return dirs, files def _get_trie_pattern(self, dirname): - ignore_pattern = self.ignores_trie_tree.longest_prefix(dirname).value - return ignore_pattern + ignore_pattern = self.ignores_trie_tree.get(dirname) + if ignore_pattern: + return ignore_pattern + + prefix = self.ignores_trie_tree.longest_prefix(dirname).key + if not prefix: + # outside of the repo + return None + + dirs = list( + takewhile( + lambda path: path != prefix, + (parent.fspath for parent in PathInfo(dirname).parents), + ) + ) + dirs.reverse() + dirs.append(dirname) + + for parent in dirs: + self._update(parent) + + return self.ignores_trie_tree.get(dirname) def _is_ignored(self, path, is_dir=False): if self._outside_repo(path):