-
Notifications
You must be signed in to change notification settings - Fork 2.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Validation For README #2121
Changes from 15 commits
993acd6
c2f0699
014c49d
2040602
79c2ad0
7a1654b
c6d2345
99d2222
51c08ae
4e54669
1d788a9
2d13f70
a1c1f67
ee31e15
ae60ce5
362e464
057d0d9
cd4b69e
35e08d8
a3de91a
8dd3feb
87b0668
1d49a4d
d9f0ac3
414fc2e
0c3425a
933fdf7
cd895a1
a3bdb1f
6e85d4a
10386e7
b4ca9ca
a69c019
f12a105
d45ec9b
309a69e
cdcffe0
ffdfcb6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
#!/usr/bin/env python | ||
|
||
""" This script will run in CI and make sure all new changes to datasets readme files have valid content present. | ||
""" | ||
|
||
from pathlib import Path | ||
from subprocess import check_output | ||
from typing import List | ||
|
||
from datasets.utils.readme import validate_readme | ||
|
||
|
||
def get_changed_files(repo_path: Path) -> List[Path]: | ||
diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path) | ||
changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()] | ||
return changed_files | ||
|
||
|
||
if __name__ == "__main__": | ||
import logging | ||
from argparse import ArgumentParser | ||
|
||
logging.basicConfig(level=logging.DEBUG) | ||
|
||
ap = ArgumentParser() | ||
ap.add_argument("--repo_path", type=Path, default=Path.cwd()) | ||
ap.add_argument("--check_all", action="store_true") | ||
args = ap.parse_args() | ||
|
||
repo_path: Path = args.repo_path | ||
if args.check_all: | ||
readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()] | ||
else: | ||
changed_files = get_changed_files(repo_path) | ||
readmes = [ | ||
f | ||
for f in changed_files | ||
if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" | ||
] | ||
|
||
failed: List[Path] = [] | ||
for readme in sorted(readmes): | ||
try: | ||
DatasetMetadata.from_readme(readme) | ||
logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") | ||
except TypeError as e: | ||
failed.append(readme) | ||
logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") | ||
except Exception as e: | ||
failed.append(readme) | ||
logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") | ||
|
||
if len(failed) > 0: | ||
logging.info(f"❌ Failed on {len(failed)} files.") | ||
exit(1) | ||
else: | ||
logging.info("All is well, keep up the good work 🤗!") | ||
exit(0) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,260 @@ | ||
import logging | ||
from dataclasses import dataclass | ||
from pathlib import Path | ||
from typing import Any, List, Tuple | ||
|
||
import yaml | ||
|
||
|
||
# loading package files: https://stackoverflow.com/a/20885799 | ||
try: | ||
import importlib.resources as pkg_resources | ||
except ImportError: | ||
# Try backported to PY<37 `importlib_resources`. | ||
import importlib_resources as pkg_resources | ||
|
||
from . import resources | ||
|
||
|
||
BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" | ||
this_url = f"{BASE_REF_URL}/{__file__}" | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def load_yaml_resource(resource: str) -> Tuple[Any, str]: | ||
content = pkg_resources.read_text(resources, resource) | ||
return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}" | ||
|
||
|
||
readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml") | ||
|
||
FILLER_TEXT = [ | ||
"[Needs More Information]", | ||
"[More Information Needed]", | ||
"(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", | ||
] | ||
|
||
# Dictionary representation of section/readme, error_list, warning_list | ||
ReadmeValidatorOutput = Tuple[dict, List[str], List[str]] | ||
|
||
|
||
@dataclass | ||
class Section: | ||
name: str | ||
level: str | ||
lines: List[str] = None | ||
|
||
def __post_init__(self): | ||
self.text = "" | ||
self.is_empty = True | ||
self.content = {} | ||
if self.lines is not None: | ||
self.parse() | ||
|
||
def parse(self): | ||
current_sub_level = "" | ||
current_lines = [] | ||
code_start = False | ||
for line in self.lines: | ||
if line.strip(" \n") == "": | ||
continue | ||
elif line.strip(" \n")[:3] == "```": | ||
code_start = not code_start | ||
elif line.split()[0] == self.level + "#" and not code_start: | ||
if current_sub_level != "": | ||
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) | ||
current_lines = [] | ||
else: | ||
if current_lines != []: | ||
self.text += "".join(current_lines).strip() | ||
if self.text != "" and self.text not in FILLER_TEXT: | ||
self.is_empty = False | ||
current_lines = [] | ||
|
||
current_sub_level = " ".join(line.split()[1:]).strip(" \n") | ||
else: | ||
current_lines.append(line) | ||
else: | ||
if current_sub_level != "": | ||
if current_sub_level in self.content: | ||
print( | ||
f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found." | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe you could also have This way in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should I also add Currently I am raising both warnings and errors together. Should I handle them separately? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As you want. |
||
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) | ||
else: | ||
if current_lines != []: | ||
self.text += "".join(current_lines).strip() | ||
if self.text != "" and self.text not in FILLER_TEXT: | ||
self.is_empty = False | ||
|
||
def validate(self, structure: dict) -> ReadmeValidatorOutput: | ||
"""Validates a Section class object recursively using the structure provided as a dictionary. | ||
|
||
Args: | ||
structute (:obj: `dict`): The dictionary representing expected structure. | ||
|
||
Returns: | ||
:obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors. | ||
""" | ||
# Header text validation | ||
error_list = [] | ||
warning_list = [] | ||
if structure["allow_empty"] == False: | ||
# If header text is expected | ||
if self.is_empty: | ||
# If no header text is found, mention it in the error_list | ||
error_list.append( | ||
f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}." | ||
) | ||
|
||
# Subsections Validation | ||
if structure["subsections"] is not None: | ||
# If subsections are expected | ||
if self.content == {}: | ||
# If no subsections are present | ||
values = [subsection["name"] for subsection in structure["subsections"]] | ||
# Mention the expected values in the error_list | ||
error_list.append( | ||
f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}." | ||
) | ||
else: | ||
# If some subsections are present | ||
structure_names = [subsection["name"] for subsection in structure["subsections"]] | ||
for idx, name in enumerate(structure_names): | ||
if name not in self.content: | ||
# If the expected subsection is not present | ||
error_list.append( | ||
f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}." | ||
) | ||
else: | ||
# If the subsection is present, validate subsection, return the result | ||
# and concat the errors from subsection to section error_list | ||
_, subsec_error_list, subsec_warning_list = self.content[name].validate( | ||
structure["subsections"][idx] | ||
) | ||
error_list += subsec_error_list | ||
warning_list += subsec_warning_list | ||
|
||
for name in self.content: | ||
if name not in structure_names: | ||
# If an extra subsection is present | ||
warning_list.append( | ||
f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." | ||
) | ||
if error_list: | ||
# If there are errors, do not return the dictionary as it is invalid | ||
return {}, error_list, warning_list | ||
else: | ||
return self.to_dict(), error_list, warning_list | ||
|
||
def to_dict(self) -> dict: | ||
"""Returns the dictionary representation of a section.""" | ||
return { | ||
"name": self.name, | ||
"text": self.text, | ||
"is_empty": self.is_empty, | ||
"subsections": [value.to_dict() for value in self.content.values()], | ||
} | ||
|
||
|
||
class ReadMe(Section): # Level 0 | ||
def __init__(self, name: str, lines: List[str], structure: dict = None): | ||
super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse | ||
self.structure = structure | ||
self.yaml_tags_line_count = -2 | ||
self.lines = lines | ||
if self.lines is not None: | ||
self.parse() | ||
if self.structure is None: | ||
content, error_list, warning_list = self.validate(readme_structure) | ||
else: | ||
content, error_list, warning_list = self.validate(self.structure) | ||
|
||
if error_list != [] or warning_list != []: | ||
errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) | ||
error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors | ||
raise ValueError(error_string) | ||
|
||
@classmethod | ||
def from_readme(cls, path: Path, structure: dict = None): | ||
with open(path) as f: | ||
lines = f.readlines() | ||
return cls(path, lines, structure) | ||
|
||
@classmethod | ||
def from_string(cls, string: str, structure: dict = None, root_name: str = "root"): | ||
lines = string.split("\n") | ||
return cls(root_name, lines, structure) | ||
|
||
def parse(self): | ||
# Skip Tags | ||
tag_count = 0 | ||
line_count = 0 | ||
|
||
for line in self.lines: | ||
self.yaml_tags_line_count += 1 | ||
if line.strip(" \n") == "---": | ||
tag_count += 1 | ||
if tag_count == 2: | ||
break | ||
line_count += 1 | ||
|
||
self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. | ||
super().parse() | ||
|
||
def __str__(self): | ||
"""Returns the string of dictionary representation of the ReadMe.""" | ||
return str(self.to_dict()) | ||
|
||
def validate(self, readme_structure): | ||
error_list = [] | ||
warning_list = [] | ||
if self.yaml_tags_line_count == 0: | ||
warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.") | ||
elif self.yaml_tags_line_count == -1: | ||
warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.") | ||
|
||
# Check how many first level sections are present. | ||
num_first_level_keys = len(self.content.keys()) | ||
if num_first_level_keys > 1: | ||
# If more than one, add to the error list, continue | ||
error_list.append( | ||
f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README." | ||
) | ||
elif num_first_level_keys < 1: | ||
# If less than one, append error. | ||
error_list.append( | ||
f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README." | ||
) | ||
|
||
else: | ||
# If one exactly | ||
start_key = list(self.content.keys())[0] # Get the key | ||
if start_key.startswith("Dataset Card for"): # Check correct start | ||
|
||
# If the starting is correct, validate all the sections | ||
_, sec_error_list, sec_warning_list = self.content[start_key].validate( | ||
readme_structure["subsections"][0] | ||
) | ||
error_list += sec_error_list | ||
warning_list += sec_warning_list | ||
else: | ||
# If not found, append error | ||
error_list.append( | ||
f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README." | ||
) | ||
if error_list: | ||
# If there are errors, do not return the dictionary as it is invalid | ||
return {}, error_list, warning_list | ||
else: | ||
return self.to_dict(), error_list, warning_list | ||
|
||
|
||
if __name__ == "__main__": | ||
from argparse import ArgumentParser | ||
|
||
ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.") | ||
ap.add_argument("readme_filepath") | ||
args = ap.parse_args() | ||
readme_filepath = Path(args.readme_filepath) | ||
readme = ReadMe.from_readme(readme_filepath) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the future we may have subclasses of this to have more finegrained validation per section
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this class can be extended and we can keep a section-to-class mapping in the future. For now, this should be fine, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it's fine for now