Skip to content

Commit

Permalink
deps/outs: get rid of self.info (#4502)
Browse files Browse the repository at this point in the history
On its own, `self.info` is confusing: the name gives no hint about what kind
of information it holds. Renaming it to `hash_info` makes the intent much
clearer and is also required in order to support alternative hash types.

Related to #4144, #3069, #1676
  • Loading branch information
efiop committed Aug 31, 2020
1 parent 4f84328 commit 5650060
Show file tree
Hide file tree
Showing 16 changed files with 134 additions and 129 deletions.
9 changes: 4 additions & 5 deletions dvc/cache/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,17 +620,16 @@ def merge(self, ancestor_info, our_info, their_info):
assert their_info

if ancestor_info:
ancestor_hash = ancestor_info[self.tree.PARAM_CHECKSUM]
ancestor_hash = ancestor_info.value
ancestor = self.get_dir_cache(ancestor_hash)
else:
ancestor = []

our_hash = our_info[self.tree.PARAM_CHECKSUM]
our_hash = our_info.value
our = self.get_dir_cache(our_hash)

their_hash = their_info[self.tree.PARAM_CHECKSUM]
their_hash = their_info.value
their = self.get_dir_cache(their_hash)

merged = self._merge_dirs(ancestor, our, their)
hash_info = self.tree.save_dir_info(merged)
return {hash_info.name: hash_info.value}
return self.tree.save_dir_info(merged)
45 changes: 24 additions & 21 deletions dvc/dependency/param.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

from dvc.dependency.local import LocalDependency
from dvc.exceptions import DvcException
from dvc.hash_info import HashInfo
from dvc.utils.serialize import LOADERS, ParseError


Expand All @@ -31,7 +32,7 @@ def __init__(self, stage, path, params):
else:
assert isinstance(params, dict)
self.params = list(params.keys())
info = params
info = {self.PARAM_PARAMS: params}

super().__init__(
stage,
Expand All @@ -40,46 +41,48 @@ def __init__(self, stage, path, params):
info=info,
)

def dumpd(self):
ret = super().dumpd()
if not self.hash_info:
ret[self.PARAM_PARAMS] = self.params
return ret

def fill_values(self, values=None):
"""Load params values dynamically."""
if not values:
return
info = {}
for param in self.params:
if param in values:
self.info[param] = values[param]
info[param] = values[param]
self.hash_info = HashInfo(self.PARAM_PARAMS, info)

def save(self):
super().save()
self.info = self.save_info()
def workspace_status(self):
status = super().workspace_status()

def status(self):
status = super().status()

if status[str(self)] == "deleted":
if status.get(str(self)) == "deleted":
return status

status = defaultdict(dict)
info = self.read_params()
info = self.hash_info.value if self.hash_info else {}
actual = self.read_params()
for param in self.params:
if param not in info.keys():
if param not in actual.keys():
st = "deleted"
elif param not in self.info:
elif param not in info:
st = "new"
elif info[param] != self.info[param]:
elif actual[param] != info[param]:
st = "modified"
else:
assert info[param] == self.info[param]
assert actual[param] == info[param]
continue

status[str(self)][param] = st

return status

def dumpd(self):
return {
self.PARAM_PATH: self.def_path,
self.PARAM_PARAMS: self.info or self.params,
}
def status(self):
return self.workspace_status()

def read_params(self):
if not self.exists:
Expand All @@ -102,7 +105,7 @@ def read_params(self):
pass
return ret

def save_info(self):
def get_hash(self):
info = self.read_params()

missing_params = set(self.params) - set(info.keys())
Expand All @@ -113,4 +116,4 @@ def save_info(self):
)
)

return info
return HashInfo(self.PARAM_PARAMS, info)
14 changes: 6 additions & 8 deletions dvc/dependency/repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,12 @@ def _make_repo(self, *, locked=True):
rev = (d.get("rev_lock") if locked else None) or d.get("rev")
return external_repo(d["url"], rev=rev)

def _get_checksum(self, locked=True):
def _get_hash(self, locked=True):
from dvc.tree.repo import RepoTree

with self._make_repo(locked=locked) as repo:
try:
return repo.find_out_by_relpath(self.def_path).info["md5"]
return repo.find_out_by_relpath(self.def_path).hash_info
except OutputNotFoundError:
path = PathInfo(os.path.join(repo.root_dir, self.def_path))

Expand All @@ -64,16 +64,14 @@ def _get_checksum(self, locked=True):

# We are polluting our repo cache with some dir listing here
if tree.isdir(path):
return self.repo.cache.local.tree.get_hash(
path, tree=tree
).value
return self.repo.cache.local.tree.get_hash(path, tree=tree)
return tree.get_file_hash(path)

def workspace_status(self):
current_checksum = self._get_checksum(locked=True)
updated_checksum = self._get_checksum(locked=False)
current = self._get_hash(locked=True)
updated = self._get_hash(locked=False)

if current_checksum != updated_checksum:
if current != updated:
return {str(self): "update available"}

return {}
Expand Down
10 changes: 5 additions & 5 deletions dvc/external_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,23 +138,23 @@ def fetch_external(self, paths: Iterable, **kwargs):
def download_update(result):
download_results.append(result)

save_infos = []
hash_infos = []
for path in paths:
if not self.repo_tree.exists(path):
raise PathMissingError(path, self.url)
hash_info = self.repo_tree.save_info(
hash_info = self.repo_tree.get_hash(
path, download_callback=download_update
)
).to_dict()
self.local_cache.save(
path,
self.repo_tree,
hash_info,
save_link=False,
download_callback=download_update,
)
save_infos.append(hash_info)
hash_infos.append(hash_info)

return sum(download_results), failed, save_infos
return sum(download_results), failed, hash_infos

def get_external(self, path, dest):
"""Convenience wrapper for fetch_external and checkout."""
Expand Down
10 changes: 10 additions & 0 deletions dvc/hash_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,13 @@ class HashInfo:

def __bool__(self):
return bool(self.value)

@classmethod
def from_dict(cls, d):
if not d:
return cls(None, None)
((name, value),) = d.items()
return cls(name, value)

def to_dict(self):
return {self.name: self.value} if self else {}
33 changes: 19 additions & 14 deletions dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
MergeError,
RemoteCacheRequiredError,
)
from dvc.hash_info import HashInfo

from ..tree.base import BaseTree

Expand Down Expand Up @@ -113,7 +114,7 @@ def __init__(
self.stage = stage
self.repo = stage.repo if stage else None
self.def_path = path
self.info = info
self.hash_info = HashInfo.from_dict(info)
if tree:
self.tree = tree
else:
Expand Down Expand Up @@ -174,10 +175,13 @@ def cache_path(self):

@property
def checksum(self):
return self.info.get(self.tree.PARAM_CHECKSUM)
return self.hash_info.value

def get_checksum(self):
return self.tree.get_hash(self.path_info).value
return self.get_hash().value

def get_hash(self):
return self.tree.get_hash(self.path_info)

@property
def is_dir_checksum(self):
Expand All @@ -187,9 +191,6 @@ def is_dir_checksum(self):
def exists(self):
return self.tree.exists(self.path_info)

def save_info(self):
return self.tree.save_info(self.path_info)

def changed_checksum(self):
return self.checksum != self.get_checksum()

Expand Down Expand Up @@ -267,7 +268,7 @@ def save(self):
self.verify_metric()

if not self.use_cache:
self.info = self.save_info()
self.hash_info = self.get_hash()
if not self.IS_DEPENDENCY:
logger.debug(
"Output '%s' doesn't use cache. Skipping saving.", self
Expand All @@ -280,15 +281,17 @@ def save(self):
logger.debug("Output '%s' didn't change. Skipping saving.", self)
return

self.info = self.save_info()
self.hash_info = self.get_hash()

def commit(self):
assert self.info
assert self.hash_info
if self.use_cache:
self.cache.save(self.path_info, self.cache.tree, self.info)
self.cache.save(
self.path_info, self.cache.tree, self.hash_info.to_dict()
)

def dumpd(self):
ret = copy(self.info)
ret = copy(self.hash_info.to_dict())
ret[self.PARAM_PATH] = self.def_path

if self.IS_DEPENDENCY:
Expand Down Expand Up @@ -337,7 +340,7 @@ def checkout(

return self.cache.checkout(
self.path_info,
self.info,
self.hash_info.to_dict(),
force=force,
progress_callback=progress_callback,
relink=relink,
Expand Down Expand Up @@ -535,11 +538,13 @@ def merge(self, ancestor, other):

if ancestor:
self._check_can_merge(ancestor)
ancestor_info = ancestor.info
ancestor_info = ancestor.hash_info
else:
ancestor_info = None

self._check_can_merge(self)
self._check_can_merge(other)

self.info = self.cache.merge(ancestor_info, self.info, other.info)
self.hash_info = self.cache.merge(
ancestor_info, self.hash_info, other.hash_info
)
8 changes: 5 additions & 3 deletions dvc/stage/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from funcy import get_in, lcat, project

from dvc import dependency, output
from dvc.hash_info import HashInfo

from . import PipelineStage, Stage, loads_from
from .exceptions import StageNameUnspecified, StageNotFound
Expand Down Expand Up @@ -42,9 +43,10 @@ def fill_from_lock(stage, lock_data=None):
if isinstance(item, dependency.ParamsDependency):
item.fill_values(get_in(lock_data, [stage.PARAM_PARAMS, path]))
continue
item.info = get_in(checksums, [key, path], {})
item.info = item.info.copy()
item.info.pop("path", None)
info = get_in(checksums, [key, path], {})
info = info.copy()
info.pop("path", None)
item.hash_info = HashInfo.from_dict(info)

@classmethod
def load_stage(cls, dvcfile, name, stage_data, lock_data=None):
Expand Down
7 changes: 6 additions & 1 deletion dvc/stage/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,12 @@ def to_single_stage_lockfile(stage: "Stage") -> dict:
params, deps = split_params_deps(stage)
deps, outs = [
[
OrderedDict([(PARAM_PATH, item.def_path), *item.info.items()])
OrderedDict(
[
(PARAM_PATH, item.def_path),
*item.hash_info.to_dict().items(),
]
)
for item in sort_by_path(items)
]
for items in [deps, stage.outs]
Expand Down
4 changes: 0 additions & 4 deletions dvc/tree/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -294,10 +294,6 @@ def path_to_hash(self, path):

return "".join(parts)

def save_info(self, path_info, **kwargs):
hash_info = self.get_hash(path_info, **kwargs)
return {hash_info.name: hash_info.value}

def _calculate_hashes(self, file_infos):
file_infos = list(file_infos)
with Tqdm(
Expand Down
5 changes: 3 additions & 2 deletions tests/func/test_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
OverlappingOutputPathsError,
RecursiveAddingWhileUsingFilename,
)
from dvc.hash_info import HashInfo
from dvc.main import main
from dvc.output.base import OutputAlreadyTrackedError, OutputIsStageFileError
from dvc.repo import Repo as DvcRepo
Expand All @@ -42,7 +43,7 @@ def test_add(tmp_dir, dvc):
assert len(stage.outs) == 1
assert len(stage.deps) == 0
assert stage.cmd is None
assert stage.outs[0].info["md5"] == md5
assert stage.outs[0].hash_info == HashInfo("md5", md5)
assert stage.md5 is None

assert load_yaml("foo.dvc") == {
Expand Down Expand Up @@ -71,7 +72,7 @@ def test_add_directory(tmp_dir, dvc):
assert len(stage.deps) == 0
assert len(stage.outs) == 1

md5 = stage.outs[0].info["md5"]
md5 = stage.outs[0].hash_info.value

dir_info = dvc.cache.local.load_dir_cache(md5)
for info in dir_info:
Expand Down
2 changes: 1 addition & 1 deletion tests/func/test_run_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_uncached_outs_are_cached(tmp_dir, dvc, run_copy):
name="copy-foo-bar",
)
with dvc.state:
stage.outs[0].info = stage.outs[0].save_info()
stage.outs[0].hash_info = stage.outs[0].get_hash()
assert os.path.exists(relpath(stage.outs[0].cache_path))


Expand Down
2 changes: 1 addition & 1 deletion tests/func/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ def test_repotree_cache_save(tmp_dir, dvc, scm, erepo_dir, local_cloud):
cache = dvc.cache.local
with cache.tree.state:
path_info = PathInfo(erepo_dir / "dir")
hash_info = cache.tree.save_info(path_info)
hash_info = cache.tree.get_hash(path_info).to_dict()
cache.save(path_info, tree, hash_info)

for hash_ in expected:
Expand Down
Loading

0 comments on commit 5650060

Please sign in to comment.