Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Validation For README #2121

Merged
merged 38 commits into from
May 10, 2021
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
993acd6
Add Initial README parser
gchhablani Mar 26, 2021
c2f0699
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani Mar 29, 2021
014c49d
Add basic validation checks
gchhablani Mar 29, 2021
2040602
Minor fix
gchhablani Mar 29, 2021
79c2ad0
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani Apr 1, 2021
7a1654b
Changes from review
gchhablani Apr 1, 2021
c6d2345
Merge branch 'master' of https://github.com/huggingface/datasets into…
gchhablani Apr 8, 2021
99d2222
Make main into a function in readme_parser
gchhablani Apr 8, 2021
51c08ae
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani Apr 23, 2021
4e54669
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani Apr 24, 2021
1d788a9
Move README validator to scripts
gchhablani Apr 25, 2021
2d13f70
Arrange README validation files
gchhablani Apr 26, 2021
a1c1f67
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani Apr 27, 2021
ee31e15
Update readme validator class
gchhablani Apr 27, 2021
ae60ce5
Add from_string tests
gchhablani Apr 27, 2021
362e464
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani Apr 28, 2021
057d0d9
Add PyTest tests
gchhablani Apr 28, 2021
cd4b69e
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani May 2, 2021
35e08d8
Add tests for from_readme
gchhablani May 2, 2021
a3de91a
Add ReadMe validator script
gchhablani May 2, 2021
8dd3feb
Fix style
gchhablani May 2, 2021
87b0668
Remove print statement
gchhablani May 2, 2021
1d49a4d
Add validator to CircleCI
gchhablani May 2, 2021
d9f0ac3
Fix style
gchhablani May 2, 2021
414fc2e
Add YAML files to setup resources
gchhablani May 2, 2021
0c3425a
Make validator executable
gchhablani May 2, 2021
933fdf7
Add no subsections test
gchhablani May 2, 2021
cd895a1
Add incorrect YAML test
gchhablani May 2, 2021
a3bdb1f
Fix style
gchhablani May 2, 2021
6e85d4a
Fix tests
gchhablani May 2, 2021
10386e7
Fix tests
gchhablani May 2, 2021
b4ca9ca
Fix style
gchhablani May 2, 2021
a69c019
Fix escape character issue
gchhablani May 4, 2021
f12a105
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani May 4, 2021
d45ec9b
Add three-level heading validation limit
gchhablani May 7, 2021
309a69e
Merge remote-tracking branch 'upstream/master' into add-readme-parser
gchhablani May 7, 2021
cdcffe0
Add either text or subsection option
gchhablani May 7, 2021
ffdfcb6
Fix style
gchhablani May 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
58 changes: 58 additions & 0 deletions scripts/datasets_readme_validator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python

""" This script will run in CI and make sure all new changes to datasets readme files have valid content present.
"""

from pathlib import Path
from subprocess import check_output
from typing import List

from datasets.utils.readme import validate_readme


def get_changed_files(repo_path: Path) -> List[Path]:
diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
return changed_files


if __name__ == "__main__":
import logging
from argparse import ArgumentParser

logging.basicConfig(level=logging.DEBUG)

ap = ArgumentParser()
ap.add_argument("--repo_path", type=Path, default=Path.cwd())
ap.add_argument("--check_all", action="store_true")
args = ap.parse_args()

repo_path: Path = args.repo_path
if args.check_all:
readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
else:
changed_files = get_changed_files(repo_path)
readmes = [
f
for f in changed_files
if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
]

failed: List[Path] = []
for readme in sorted(readmes):
try:
DatasetMetadata.from_readme(readme)
logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
except TypeError as e:
failed.append(readme)
logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
except Exception as e:
failed.append(readme)
logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")

if len(failed) > 0:
logging.info(f"❌ Failed on {len(failed)} files.")
exit(1)
else:
logging.info("All is well, keep up the good work 🤗!")
exit(0)
260 changes: 260 additions & 0 deletions src/datasets/utils/readme.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List, Tuple

import yaml


# loading package files: https://stackoverflow.com/a/20885799
try:
import importlib.resources as pkg_resources
except ImportError:
# Try backported to PY<37 `importlib_resources`.
import importlib_resources as pkg_resources

from . import resources


BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
this_url = f"{BASE_REF_URL}/{__file__}"
logger = logging.getLogger(__name__)


def load_yaml_resource(resource: str) -> Tuple[Any, str]:
content = pkg_resources.read_text(resources, resource)
return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}"


readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml")

FILLER_TEXT = [
"[Needs More Information]",
"[More Information Needed]",
"(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
]

# Dictionary representation of section/readme, error_list, warning_list
ReadmeValidatorOutput = Tuple[dict, List[str], List[str]]


@dataclass
class Section:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the future we may have subclasses of this to have more finegrained validation per section

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this class can be extended and we can keep a section-to-class mapping in the future. For now, this should be fine, right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes it's fine for now

name: str
level: str
lines: List[str] = None

def __post_init__(self):
self.text = ""
self.is_empty = True
self.content = {}
if self.lines is not None:
self.parse()

def parse(self):
current_sub_level = ""
current_lines = []
code_start = False
for line in self.lines:
if line.strip(" \n") == "":
continue
elif line.strip(" \n")[:3] == "```":
code_start = not code_start
elif line.split()[0] == self.level + "#" and not code_start:
if current_sub_level != "":
self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
current_lines = []
else:
if current_lines != []:
self.text += "".join(current_lines).strip()
if self.text != "" and self.text not in FILLER_TEXT:
self.is_empty = False
current_lines = []

current_sub_level = " ".join(line.split()[1:]).strip(" \n")
else:
current_lines.append(line)
else:
if current_sub_level != "":
if current_sub_level in self.content:
print(
f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found."
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe you could also have self.parsing_error_list and self.parsing_warning_list ?

This way in validate you could get the errors and warnings with section.parsing_error_list and section.parsing_warning_list

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should I also add self.validate_error_list and self.validate_warning_list?

Currently I am raising both warnings and errors together. Should I handle them separately?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you want.
The advantage of having the parsing error and warnings in the attributes is that you can access them from the validate methods

self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
else:
if current_lines != []:
self.text += "".join(current_lines).strip()
if self.text != "" and self.text not in FILLER_TEXT:
self.is_empty = False

def validate(self, structure: dict) -> ReadmeValidatorOutput:
"""Validates a Section class object recursively using the structure provided as a dictionary.

Args:
structute (:obj: `dict`): The dictionary representing expected structure.

Returns:
:obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors.
"""
# Header text validation
error_list = []
warning_list = []
if structure["allow_empty"] == False:
# If header text is expected
if self.is_empty:
# If no header text is found, mention it in the error_list
error_list.append(
f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}."
)

# Subsections Validation
if structure["subsections"] is not None:
# If subsections are expected
if self.content == {}:
# If no subsections are present
values = [subsection["name"] for subsection in structure["subsections"]]
# Mention the expected values in the error_list
error_list.append(
f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}."
)
else:
# If some subsections are present
structure_names = [subsection["name"] for subsection in structure["subsections"]]
for idx, name in enumerate(structure_names):
if name not in self.content:
# If the expected subsection is not present
error_list.append(
f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}."
)
else:
# If the subsection is present, validate subsection, return the result
# and concat the errors from subsection to section error_list
_, subsec_error_list, subsec_warning_list = self.content[name].validate(
structure["subsections"][idx]
)
error_list += subsec_error_list
warning_list += subsec_warning_list

for name in self.content:
if name not in structure_names:
# If an extra subsection is present
warning_list.append(
f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown."
)
if error_list:
# If there are errors, do not return the dictionary as it is invalid
return {}, error_list, warning_list
else:
return self.to_dict(), error_list, warning_list

def to_dict(self) -> dict:
"""Returns the dictionary representation of a section."""
return {
"name": self.name,
"text": self.text,
"is_empty": self.is_empty,
"subsections": [value.to_dict() for value in self.content.values()],
}


class ReadMe(Section): # Level 0
def __init__(self, name: str, lines: List[str], structure: dict = None):
super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse
self.structure = structure
self.yaml_tags_line_count = -2
self.lines = lines
if self.lines is not None:
self.parse()
if self.structure is None:
content, error_list, warning_list = self.validate(readme_structure)
else:
content, error_list, warning_list = self.validate(self.structure)

if error_list != [] or warning_list != []:
errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list)))
error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors
raise ValueError(error_string)

@classmethod
def from_readme(cls, path: Path, structure: dict = None):
with open(path) as f:
lines = f.readlines()
return cls(path, lines, structure)

@classmethod
def from_string(cls, string: str, structure: dict = None, root_name: str = "root"):
lines = string.split("\n")
return cls(root_name, lines, structure)

def parse(self):
# Skip Tags
tag_count = 0
line_count = 0

for line in self.lines:
self.yaml_tags_line_count += 1
if line.strip(" \n") == "---":
tag_count += 1
if tag_count == 2:
break
line_count += 1

self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item.
super().parse()

def __str__(self):
"""Returns the string of dictionary representation of the ReadMe."""
return str(self.to_dict())

def validate(self, readme_structure):
error_list = []
warning_list = []
if self.yaml_tags_line_count == 0:
warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.")
elif self.yaml_tags_line_count == -1:
warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.")

# Check how many first level sections are present.
num_first_level_keys = len(self.content.keys())
if num_first_level_keys > 1:
# If more than one, add to the error list, continue
error_list.append(
f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README."
)
elif num_first_level_keys < 1:
# If less than one, append error.
error_list.append(
f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README."
)

else:
# If one exactly
start_key = list(self.content.keys())[0] # Get the key
if start_key.startswith("Dataset Card for"): # Check correct start

# If the starting is correct, validate all the sections
_, sec_error_list, sec_warning_list = self.content[start_key].validate(
readme_structure["subsections"][0]
)
error_list += sec_error_list
warning_list += sec_warning_list
else:
# If not found, append error
error_list.append(
f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README."
)
if error_list:
# If there are errors, do not return the dictionary as it is invalid
return {}, error_list, warning_list
else:
return self.to_dict(), error_list, warning_list


if __name__ == "__main__":
from argparse import ArgumentParser

ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.")
ap.add_argument("readme_filepath")
args = ap.parse_args()
readme_filepath = Path(args.readme_filepath)
readme = ReadMe.from_readme(readme_filepath)