Metadata validation #2107

Merged · 32 commits into master from theo/config-validator · Apr 26, 2021
Commits
fadc0a0
basic validation
theo-m Mar 22, 2021
7a4b594
ci script and test change
theo-m Mar 23, 2021
c3c97ea
color is better
theo-m Mar 23, 2021
2fe5787
check all option
theo-m Mar 24, 2021
0f68ce4
validate size cats & multiling, point to reference file urls on error
theo-m Mar 24, 2021
2d264e8
add validation to ci and rename files
theo-m Mar 24, 2021
fc46ec3
spurrious change to trigger CI
theo-m Mar 24, 2021
58763d2
add qa reqs
theo-m Mar 24, 2021
115d252
disallow empty lists
theo-m Mar 24, 2021
9ae048e
better error msg: show all invalid values rather than first one
theo-m Mar 24, 2021
299e907
some code shuffling & better error msg for langcodes
theo-m Mar 24, 2021
b4a0665
add pyyaml to qa reqs
theo-m Mar 24, 2021
7eeb647
fix package file loading
theo-m Mar 24, 2021
3a94086
include json resources
theo-m Mar 24, 2021
e4409a9
reflect changes to size cats from https://github.com/huggingface/data…
theo-m Mar 24, 2021
9450b5f
trying another format for package_data
theo-m Mar 24, 2021
58709bf
ci works! fixing the readme like a good citizen 🤗
theo-m Mar 24, 2021
702a8a1
escape validation everywhere it's allowed in the tagging app
theo-m Mar 24, 2021
d3eec3c
code review: more json files, conditional import
theo-m Mar 25, 2021
59d7dde
Merge remote-tracking branch 'origin/master' into theo/config-validator
theo-m Mar 26, 2021
84de013
pointers to integrate readme metadata in class (wip)
theo-m Mar 29, 2021
7fbd51d
no pydantic
theo-m Mar 31, 2021
0aefcae
Merge remote-tracking branch 'origin/master' into theo/config-validator
theo-m Mar 31, 2021
ab82a6c
fix docs?
theo-m Mar 31, 2021
a4953db
Revert "fix docs?"
theo-m Mar 31, 2021
4cfd2e8
Merge remote-tracking branch 'origin/master' into theo/config-validator
theo-m Apr 1, 2021
e63d325
remove pointers to add readme to loader
theo-m Apr 1, 2021
2f2e197
Merge branch 'master' into theo/config-validator
SBrandeis Apr 23, 2021
3102ccf
Get rid of langcodes, some refactor
SBrandeis Apr 23, 2021
a9846fd
Update languages.json
SBrandeis Apr 23, 2021
551ae96
Refactor, add tests
SBrandeis Apr 23, 2021
8afb25a
I said, tests!!
SBrandeis Apr 23, 2021
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -81,6 +81,7 @@ jobs:
      - run: black --check --line-length 119 --target-version py36 tests src benchmarks datasets metrics
      - run: isort --check-only tests src benchmarks datasets metrics
      - run: flake8 tests src benchmarks datasets metrics
+      - run: ./scripts/datasets_metadata_validator.py

  build_doc:
    working_directory: ~/datasets
59 changes: 59 additions & 0 deletions scripts/datasets_metadata_validator.py
@@ -0,0 +1,59 @@
#!/usr/bin/env python

"""This script runs in CI and checks that all changes to dataset README files have valid metadata YAML headers."""

from pathlib import Path
from subprocess import check_output
from typing import List

from datasets.utils.metadata import DatasetMetadata


def get_changed_files(repo_path: Path) -> List[Path]:
    diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
    changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
    return changed_files


if __name__ == "__main__":
    import logging
    from argparse import ArgumentParser

    logging.basicConfig(level=logging.DEBUG)

    ap = ArgumentParser()
    ap.add_argument("--repo_path", type=Path, default=Path.cwd())
    ap.add_argument("--check_all", action="store_true")
    args = ap.parse_args()

    repo_path: Path = args.repo_path
    if args.check_all:
        readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
    else:
        changed_files = get_changed_files(repo_path)
        readmes = [
            f
            for f in changed_files
            if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
        ]

    failed: List[Path] = []
    for readme in sorted(readmes):
        try:
            DatasetMetadata.from_readme(readme)
            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
        except TypeError as e:
            failed.append(readme)
            logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
        except Exception as e:
            failed.append(readme)
            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

    if len(failed) > 0:
        logging.info(f"❌ Failed on {len(failed)} files.")
        exit(1)
    else:
        logging.info("All is well, keep up the good work 🤗!")
        exit(0)
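The same check can be run locally before pushing; a minimal sketch, assuming it is executed from a checkout of the repository (--check_all validates every dataset card instead of only the files changed relative to origin/master):

    python scripts/datasets_metadata_validator.py --repo_path . --check_all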
26 changes: 12 additions & 14 deletions setup.py
@@ -52,16 +52,20 @@
import os
import sys

-from setuptools import find_packages
-from setuptools import setup
+from setuptools import find_packages, setup


DOCLINES = __doc__.split("\n")


# Pin some dependencies for old python versions
_deps = {
"fsspec": "fsspec" if sys.version_info >= (3, 7) else "fsspec<0.8.1", # fsspec>=0.8.1 requires py>=3.7 for async stuff
"s3fs": "s3fs" if sys.version_info >= (3, 7) else "s3fs==0.4.2", # later versions of s3fs have issues downloading directories recursively for py36
"fsspec": "fsspec"
if sys.version_info >= (3, 7)
else "fsspec<0.8.1", # fsspec>=0.8.1 requires py>=3.7 for async stuff
"s3fs": "s3fs"
if sys.version_info >= (3, 7)
else "s3fs==0.4.2", # later versions of s3fs have issues downloading directories recursively for py36
}


@@ -149,6 +153,8 @@
"tldextract>=3.1.0",
"texttable>=1.6.3",
"Werkzeug>=1.0.1",
# metadata validation
"importlib_resources;python_version<'3.7'",
]

if os.name == "nt": # windows
@@ -167,11 +173,7 @@
)


-QUALITY_REQUIRE = [
-    "black",
-    "isort",
-    "flake8==3.7.9",
-]
+QUALITY_REQUIRE = ["black", "flake8==3.7.9", "isort", "pyyaml>=5.3.1"]


EXTRAS_REQUIRE = {
@@ -214,11 +216,7 @@
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={
"datasets": [
"scripts/templates/*",
],
},
package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]},
entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
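The new package_data entry ships the JSON tag registries with the installed package, and the importlib_resources requirement backports stdlib resource loading to Python 3.6. A minimal sketch of reading one of the bundled files, mirroring the pattern metadata.py below uses:

    try:
        import importlib.resources as pkg_resources  # stdlib on Python >= 3.7
    except ImportError:
        import importlib_resources as pkg_resources  # backport for Python < 3.7

    from datasets.utils import resources

    # Read a JSON registry bundled via package_data, e.g. the known language codes.
    languages_json = pkg_resources.read_text(resources, "languages.json")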
259 changes: 259 additions & 0 deletions src/datasets/utils/metadata.py
@@ -0,0 +1,259 @@
import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple


# loading package files: https://stackoverflow.com/a/20885799
try:
    import importlib.resources as pkg_resources
except ImportError:
    # Try backported to PY<37 `importlib_resources`.
    import importlib_resources as pkg_resources

import yaml

from . import resources


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
this_url = f"{BASE_REF_URL}/{__file__}"
logger = logging.getLogger(__name__)


def load_json_resource(resource: str) -> Tuple[Any, str]:
    content = pkg_resources.read_text(resources, resource)
    return json.loads(content), f"{BASE_REF_URL}/resources/{resource}"


# Source of languages.json:
# https://datahub.io/core/language-codes/r/ietf-language-tags.csv
# Language names were obtained with langcodes: https://github.com/LuminosoInsight/langcodes
known_language_codes, known_language_codes_url = load_json_resource("languages.json")
known_licenses, known_licenses_url = load_json_resource("licenses.json")
known_task_ids, known_task_ids_url = load_json_resource("tasks.json")
known_creators, known_creators_url = load_json_resource("creators.json")
known_size_categories, known_size_categories_url = load_json_resource("size_categories.json")
known_multilingualities, known_multilingualities_url = load_json_resource("multilingualities.json")


def yaml_block_from_readme(path: Path) -> Optional[str]:
    with path.open() as readme_file:
        content = [line.strip() for line in readme_file]

    if content[0] == "---" and "---" in content[1:]:
        yamlblock = "\n".join(content[1 : content[1:].index("---") + 1])
        return yamlblock

    return None


def metadata_dict_from_readme(path: Path) -> Optional[Dict[str, List[str]]]:
    """Loads a dataset's metadata from the dataset card (README.md) as a Python dict"""
    yaml_block = yaml_block_from_readme(path=path)
    if yaml_block is None:
        return None
    metadata_dict = yaml.safe_load(yaml_block) or dict()
    return metadata_dict


ValidatorOutput = Tuple[List[str], Optional[str]]


def tagset_validator(values: List[str], reference_values: List[str], name: str, url: str) -> ValidatorOutput:
    invalid_values = [v for v in values if v not in reference_values]
    if len(invalid_values) > 0:
        return [], f"{invalid_values} are not registered tags for '{name}', reference at {url}"
    return values, None


def escape_validation_for_predicate(
    values: List[Any], predicate_fn: Callable[[Any], bool]
) -> Tuple[List[Any], List[Any]]:
    trues, falses = list(), list()
    for v in values:
        if predicate_fn(v):
            trues.append(v)
        else:
            falses.append(v)
    if len(trues) > 0:
        logger.warning(f"The following values will escape validation: {trues}")
    return trues, falses


def validate_metadata_type(metadata_dict: dict):
    basic_typing_errors = {
        name: value
        for name, value in metadata_dict.items()
        if not isinstance(value, list) or len(value) == 0 or not isinstance(value[0], str)
    }
    if len(basic_typing_errors) > 0:
        raise TypeError(f"Found fields that are not non-empty list of strings: {basic_typing_errors}")


@dataclass
class DatasetMetadata:
    annotations_creators: List[str]
    language_creators: List[str]
    languages: List[str]
    licenses: List[str]
    multilinguality: List[str]
    size_categories: List[str]
    source_datasets: List[str]
    task_categories: List[str]
    task_ids: List[str]

    def __post_init__(self):
        validate_metadata_type(metadata_dict=vars(self))

        self.annotations_creators, annotations_creators_errors = self.validate_annotations_creators(
            self.annotations_creators
        )
        self.language_creators, language_creators_errors = self.validate_language_creators(self.language_creators)
        self.languages, languages_errors = self.validate_language_codes(self.languages)
        self.licenses, licenses_errors = self.validate_licences(self.licenses)
        self.multilinguality, multilinguality_errors = self.validate_multilinguality(self.multilinguality)
        self.size_categories, size_categories_errors = self.validate_size_categories(self.size_categories)
        self.source_datasets, source_datasets_errors = self.validate_source_datasets(self.source_datasets)
        self.task_categories, task_categories_errors = self.validate_task_categories(self.task_categories)
        self.task_ids, task_ids_errors = self.validate_task_ids(self.task_ids)

        errors = {
            "annotations_creators": annotations_creators_errors,
            "language_creators": language_creators_errors,
            "licenses": licenses_errors,
            "multilinguality": multilinguality_errors,
            "size_categories": size_categories_errors,
            "source_datasets": source_datasets_errors,
            "task_categories": task_categories_errors,
            "task_ids": task_ids_errors,
            "languages": languages_errors,
        }

        exception_msg_dict = dict()
        for field, errs in errors.items():
            if errs is not None:
                exception_msg_dict[field] = errs
        if len(exception_msg_dict) > 0:
            raise TypeError(
                "Could not validate the metadata, found the following errors:\n"
                + "\n".join(f"* field '{fieldname}':\n\t{err}" for fieldname, err in exception_msg_dict.items())
            )

    @classmethod
    def from_readme(cls, path: Path) -> "DatasetMetadata":
        """Loads and validates the dataset metadata from its dataset card (README.md)

        Args:
            path (:obj:`Path`): Path to the dataset card (its README.md file)

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset card has no metadata (no YAML header)
            :obj:`TypeError`: If the dataset's metadata is invalid
        """
        yaml_string = yaml_block_from_readme(path)
        if yaml_string is not None:
            return cls.from_yaml_string(yaml_string)
        else:
            raise TypeError(f"did not find a yaml block in '{path}'")

    @classmethod
    def from_yaml_string(cls, string: str) -> "DatasetMetadata":
        """Loads and validates the dataset metadata from a YAML string

        Args:
            string (:obj:`str`): The YAML string

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        """
        metadata_dict = yaml.safe_load(string) or dict()
        return cls(**metadata_dict)

    @staticmethod
    def validate_annotations_creators(annotations_creators: List[str]) -> ValidatorOutput:
        return tagset_validator(
            annotations_creators, known_creators["annotations"], "annotations_creators", known_creators_url
        )

    @staticmethod
    def validate_language_creators(language_creators: List[str]) -> ValidatorOutput:
        return tagset_validator(language_creators, known_creators["language"], "language_creators", known_creators_url)

    @staticmethod
    def validate_language_codes(languages: List[str]) -> ValidatorOutput:
        return tagset_validator(
            values=languages,
            reference_values=known_language_codes.keys(),
            name="languages",
            url=known_language_codes_url,
        )

    @staticmethod
    def validate_licences(licenses: List[str]) -> ValidatorOutput:
        others, to_validate = escape_validation_for_predicate(licenses, lambda e: "-other-" in e)
        validated, error = tagset_validator(to_validate, list(known_licenses.keys()), "licenses", known_licenses_url)
        return [*validated, *others], error

    @staticmethod
    def validate_task_categories(task_categories: List[str]) -> ValidatorOutput:
        # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
        # in the near future and we don't want to waste energy in tagging against a moving taxonomy.
        known_set = list(known_task_ids.keys())
        others, to_validate = escape_validation_for_predicate(task_categories, lambda e: e.startswith("other"))
        validated, error = tagset_validator(to_validate, known_set, "task_categories", known_task_ids_url)
        return [*validated, *others], error

    @staticmethod
    def validate_task_ids(task_ids: List[str]) -> ValidatorOutput:
        # TODO: we're currently ignoring all values starting with 'other' as our task taxonomy is bound to change
        # in the near future and we don't want to waste energy in tagging against a moving taxonomy.
        known_set = [tid for _cat, d in known_task_ids.items() for tid in d["options"]]
        others, to_validate = escape_validation_for_predicate(task_ids, lambda e: "-other-" in e)
        validated, error = tagset_validator(to_validate, known_set, "task_ids", known_task_ids_url)
        return [*validated, *others], error

    @staticmethod
    def validate_multilinguality(multilinguality: List[str]) -> ValidatorOutput:
        others, to_validate = escape_validation_for_predicate(multilinguality, lambda e: e.startswith("other"))
        validated, error = tagset_validator(
            to_validate, list(known_multilingualities.keys()), "multilinguality", known_multilingualities_url
        )
        return [*validated, *others], error

    @staticmethod
    def validate_size_categories(size_cats: List[str]) -> ValidatorOutput:
        return tagset_validator(size_cats, known_size_categories, "size_categories", known_size_categories_url)

    @staticmethod
    def validate_source_datasets(sources: List[str]) -> ValidatorOutput:
        invalid_values = []
        for src in sources:
            is_ok = src in ["original", "extended"] or src.startswith("extended|")
            if not is_ok:
                invalid_values.append(src)
        if len(invalid_values) > 0:
            return (
                [],
                f"'source_datasets' has invalid values: {invalid_values}, refer to source code to understand {this_url}",
            )

        return sources, None


if __name__ == "__main__":
    from argparse import ArgumentParser

    ap = ArgumentParser(usage="Validate the yaml metadata block of a README.md file.")
    ap.add_argument("readme_filepath")
    args = ap.parse_args()

    readme_filepath = Path(args.readme_filepath)
    DatasetMetadata.from_readme(readme_filepath)
src/datasets/utils/resources/__init__.py
Empty file (package marker so the bundled JSON resources can be loaded).
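For reference, a sketch of exercising the new validation from Python; the tag values below are illustrative and assumed to be registered in the bundled JSON resources ("original" is accepted explicitly by the source_datasets validator):

    from datasets.utils.metadata import DatasetMetadata

    yaml_block = """\
    annotations_creators: [crowdsourced]
    language_creators: [found]
    languages: [en]
    licenses: [unknown]
    multilinguality: [monolingual]
    size_categories: [10K<n<100K]
    source_datasets: [original]
    task_categories: [question-answering]
    task_ids: [extractive-qa]
    """

    # Raises TypeError listing every invalid value per field;
    # returns the validated metadata otherwise.
    metadata = DatasetMetadata.from_yaml_string(yaml_block)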