Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Disallow duplicate keys in yaml tags #2379

Merged
merged 1 commit into from
May 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions src/datasets/utils/metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import logging
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
Expand Down Expand Up @@ -38,6 +39,21 @@ def load_json_resource(resource: str) -> Tuple[Any, str]:
known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json")


class NoDuplicateSafeLoader(yaml.SafeLoader):
    """A ``yaml.SafeLoader`` that refuses mappings containing duplicate keys.

    Every mapping is built by the parent loader first and then checked; a
    ``TypeError`` is raised as soon as a mapping with a repeated key is found.
    """

    def _check_no_duplicates_on_constructed_node(self, node):
        """Raise ``TypeError`` if the mapping ``node`` repeats a key.

        The keys are read back from ``self.constructed_objects``, so this must
        only be called once the keys of ``node`` have been constructed.

        Args:
            node: mapping node whose ``value`` is a list of
                ``(key_node, value_node)`` pairs.

        Raises:
            TypeError: if at least one key occurs more than once.
        """
        keys = [self.constructed_objects[key_node] for key_node, _ in node.value]
        # Lists are unhashable and cannot be counted; use tuples instead.
        keys = [tuple(key) if isinstance(key, list) else key for key in keys]
        counter = Counter(keys)
        duplicate_keys = [key for key in counter if counter[key] > 1]
        if duplicate_keys:
            raise TypeError(f"Got duplicate yaml keys: {duplicate_keys}")

    def construct_mapping(self, node, deep=False):
        """Build the mapping with the parent loader, then reject duplicate keys.

        Raises:
            TypeError: if ``node`` contains a duplicated key.
        """
        # Construct first: the duplicate check relies on the key objects the
        # parent call registers in ``self.constructed_objects``.
        mapping = super().construct_mapping(node, deep=deep)
        self._check_no_duplicates_on_constructed_node(node)
        return mapping


def yaml_block_from_readme(path: Path) -> Optional[str]:
with path.open() as readme_file:
content = [line.strip() for line in readme_file]
Expand All @@ -54,7 +70,7 @@ def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:
yaml_block = yaml_block_from_readme(path=path)
if yaml_block is None:
return None
metada_dict = yaml.safe_load(yaml_block) or dict()
metada_dict = yaml.load(yaml_block, Loader=NoDuplicateSafeLoader) or dict()
return metada_dict


Expand Down Expand Up @@ -174,7 +190,7 @@ def from_yaml_string(cls, string: str) -> "DatasetMetadata":
Raises:
:obj:`TypeError`: If the dataset's metadata is invalid
"""
metada_dict = yaml.safe_load(string) or dict()
metada_dict = yaml.load(string, Loader=NoDuplicateSafeLoader) or dict()
# flatten the metadata of each config
for key in metada_dict:
if isinstance(metada_dict[key], dict):
Expand Down
57 changes: 55 additions & 2 deletions tests/test_metadata_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

def _dedent(string: str) -> str:
indent_level = min(re.search("^ +", t).end() if t.startswith(" ") else 0 for t in string.splitlines())
return "\n".join([line[indent_level:] for line in string.splitlines()])
return "\n".join([line[indent_level:] for line in string.splitlines() if indent_level < len(line)])


README_YAML = """\
Expand Down Expand Up @@ -126,7 +126,7 @@ def test_yaml_block_from_readme(self):
- en
task_ids:
- sentiment-classification
"""
"""
),
)

Expand Down Expand Up @@ -263,3 +263,56 @@ def test_from_yaml_string(self):
)
with self.assertRaises(TypeError):
DatasetMetadata.from_yaml_string(missing_tag_yaml)

duplicate_yaml_keys = _dedent(
"""\
annotations_creators:
- found
languages:
- en
licenses:
- unknown
multilinguality:
- monolingual
size_categories:
- 10K<n<100K
source_datasets:
- extended|other-yahoo-webscope-l6
task_categories:
- question-answering
task_ids:
- open-domain-qa
task_ids:
- open-domain-qa
"""
)
with self.assertRaises(TypeError):
DatasetMetadata.from_yaml_string(duplicate_yaml_keys)

valid_yaml_string_with_duplicate_configs = _dedent(
"""\
annotations_creators:
- found
language_creators:
- found
languages:
en:
- en
en:
- en
licenses:
- unknown
multilinguality:
- monolingual
size_categories:
- 10K<n<100K
source_datasets:
- extended|other-yahoo-webscope-l6
task_categories:
- question-answering
task_ids:
- open-domain-qa
"""
)
with self.assertRaises(TypeError):
DatasetMetadata.from_yaml_string(valid_yaml_string_with_duplicate_configs)