Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

output: dont rely on the default hash type #4466

Merged
merged 1 commit into from
Aug 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions dvc/output/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,18 +172,10 @@ def supported(cls, url):
def cache_path(self):
return self.cache.tree.hash_to_path_info(self.checksum).url

@property
def checksum_type(self):
return self.tree.PARAM_CHECKSUM

@property
def checksum(self):
return self.info.get(self.tree.PARAM_CHECKSUM)

@checksum.setter
def checksum(self, checksum):
self.info[self.tree.PARAM_CHECKSUM] = checksum

def get_checksum(self):
return self.tree.get_hash(self.path_info)[1]

Expand Down
4 changes: 3 additions & 1 deletion dvc/stage/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def fill_from_lock(stage, lock_data=None):
if isinstance(item, dependency.ParamsDependency):
item.fill_values(get_in(lock_data, [stage.PARAM_PARAMS, path]))
continue
item.checksum = get_in(checksums, [key, path, item.checksum_type])
item.info = get_in(checksums, [key, path], {})
item.info = item.info.copy()
item.info.pop("path", None)
Comment on lines +45 to +47
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the record: Discussed with @skshetry that we really need to move serialization/loading to specific deps/outputs (e.g. in a form of custom loaders/dumpers). So this is a temporary workaround.


@classmethod
def load_stage(cls, dvcfile, name, stage_data, lock_data=None):
Expand Down
7 changes: 1 addition & 6 deletions dvc/stage/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,12 +138,7 @@ def to_single_stage_lockfile(stage: "Stage") -> dict:
params, deps = split_params_deps(stage)
deps, outs = [
[
OrderedDict(
[
(PARAM_PATH, item.def_path),
(item.checksum_type, item.checksum),
]
)
OrderedDict([(PARAM_PATH, item.def_path), *item.info.items()])
for item in sort_by_path(items)
]
for items in [deps, stage.outs]
Expand Down
2 changes: 1 addition & 1 deletion tests/func/test_run_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_uncached_outs_are_cached(tmp_dir, dvc, run_copy):
name="copy-foo-bar",
)
with dvc.state:
stage.outs[0].checksum = stage.outs[0].get_checksum()
stage.outs[0].info = stage.outs[0].save_info()
assert os.path.exists(relpath(stage.outs[0].cache_path))


Expand Down
27 changes: 24 additions & 3 deletions tests/unit/stage/test_serialize_pipeline_lock.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,31 @@ def test_lock_outs_order(dvc, typ):
)


def test_dump_appropriate_checksums(dvc):
def test_dump_nondefault_hash(dvc):
stage = create_stage(
PipelineStage, dvc, deps=["s3://dvc-temp/file"], **kwargs
)
stage.deps[0].info = {"etag": "is-it-etag", "md5": "or-md5?"}
stage.deps[0].info = {"md5": "value"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
(
"deps",
[
OrderedDict(
[("path", "s3://dvc-temp/file"), ("md5", "value")]
)
],
),
]
)


def test_dump_multiple_hashes(dvc):
Copy link
Contributor Author

@efiop efiop Aug 25, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't use this actively right now, but this is a result of us treating loading out.info as a leftover of unparsed values, which needs symmetry on dumping as well. Could've put a few asserts to check that there is only one hash in info, though...

stage = create_stage(
PipelineStage, dvc, deps=["s3://dvc-temp/file"], **kwargs
)
stage.deps[0].info = {"etag": "etag-value", "md5": "md5-value"}
assert to_single_stage_lockfile(stage) == OrderedDict(
[
("cmd", "command"),
Expand All @@ -158,7 +178,8 @@ def test_dump_appropriate_checksums(dvc):
OrderedDict(
[
("path", "s3://dvc-temp/file"),
("etag", "is-it-etag"),
("etag", "etag-value"),
("md5", "md5-value"),
]
)
],
Expand Down