huggingface · lhoestq · May 10, 2021 · Mar 26, 2021 · Mar 29, 2021 · Mar 29, 2021
diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python
+
+""" This script will run in CI and make sure all new changes to datasets readme files have valid content present.
+"""
+
+from pathlib import Path
+from subprocess import check_output
+from typing import List
+
+from datasets.utils.readme import validate_readme
+
+
+def get_changed_files(repo_path: Path) -> List[Path]:
+    diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path)
+    changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()]
+    return changed_files
+
+
+if __name__ == "__main__":
+    import logging
+    from argparse import ArgumentParser
+
+    logging.basicConfig(level=logging.DEBUG)
+
+    ap = ArgumentParser()
+    ap.add_argument("--repo_path", type=Path, default=Path.cwd())
+    ap.add_argument("--check_all", action="store_true")
+    args = ap.parse_args()
+
+    repo_path: Path = args.repo_path
+    if args.check_all:
+        readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()]
+    else:
+        changed_files = get_changed_files(repo_path)
+        readmes = [
+            f
+            for f in changed_files
+            if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets"
+        ]
+
+    failed: List[Path] = []
+    for readme in sorted(readmes):
+        try:
+            DatasetMetadata.from_readme(readme)
+            logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'")
+        except TypeError as e:
+            failed.append(readme)
+            logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}")
+        except Exception as e:
+            failed.append(readme)
+            logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}")
+
+    if len(failed) > 0:
+        logging.info(f"❌ Failed on {len(failed)} files.")
+        exit(1)
+    else:
+        logging.info("All is well, keep up the good work 🤗!")
+        exit(0)
diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py
@@ -0,0 +1,260 @@
+import logging
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, List, Tuple
+
+import yaml
+
+
+# loading package files: https://stackoverflow.com/a/20885799
+try:
+    import importlib.resources as pkg_resources
+except ImportError:
+    # Try backported to PY<37 `importlib_resources`.
+    import importlib_resources as pkg_resources
+
+from . import resources
+
+
+BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils"
+this_url = f"{BASE_REF_URL}/{__file__}"
+logger = logging.getLogger(__name__)
+
+
+def load_yaml_resource(resource: str) -> Tuple[Any, str]:
+    content = pkg_resources.read_text(resources, resource)
+    return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}"
+
+
+readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml")
+
+FILLER_TEXT = [
+    "[Needs More Information]",
+    "[More Information Needed]",
+    "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)",
+]
+
+# Dictionary representation of section/readme, error_list, warning_list
+ReadmeValidatorOutput = Tuple[dict, List[str], List[str]]
+
+
+@dataclass
+class Section:
+    name: str
+    level: str
+    lines: List[str] = None
+
+    def __post_init__(self):
+        self.text = ""
+        self.is_empty = True
+        self.content = {}
+        if self.lines is not None:
+            self.parse()
+
+    def parse(self):
+        current_sub_level = ""
+        current_lines = []
+        code_start = False
+        for line in self.lines:
+            if line.strip(" \n") == "":
+                continue
+            elif line.strip(" \n")[:3] == "```":
+                code_start = not code_start
+            elif line.split()[0] == self.level + "#" and not code_start:
+                if current_sub_level != "":
+                    self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
+                    current_lines = []
+                else:
+                    if current_lines != []:
+                        self.text += "".join(current_lines).strip()
+                        if self.text != "" and self.text not in FILLER_TEXT:
+                            self.is_empty = False
+                        current_lines = []
+
+                current_sub_level = " ".join(line.split()[1:]).strip(" \n")
+            else:
+                current_lines.append(line)
+        else:
+            if current_sub_level != "":
+                if current_sub_level in self.content:
+                    print(
+                        f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found."
+                    )
+                self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines)
+            else:
+                if current_lines != []:
+                    self.text += "".join(current_lines).strip()
+                    if self.text != "" and self.text not in FILLER_TEXT:
+                        self.is_empty = False
+
+    def validate(self, structure: dict) -> ReadmeValidatorOutput:
+        """Validates a Section class object recursively using the structure provided as a dictionary.
+
+        Args:
+            structute (:obj: `dict`): The dictionary representing expected structure.
+
+        Returns:
+            :obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors.
+        """
+        # Header text validation
+        error_list = []
+        warning_list = []
+        if structure["allow_empty"] == False:
+            # If header text is expected
+            if self.is_empty:
+                # If no header text is found, mention it in the error_list
+                error_list.append(
+                    f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}."
+                )
+
+        # Subsections Validation
+        if structure["subsections"] is not None:
+            # If subsections are expected
+            if self.content == {}:
+                # If no subsections are present
+                values = [subsection["name"] for subsection in structure["subsections"]]
+                # Mention the expected values in the error_list
+                error_list.append(
+                    f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}."
+                )
+            else:
+                # If some subsections are present
+                structure_names = [subsection["name"] for subsection in structure["subsections"]]
+                for idx, name in enumerate(structure_names):
+                    if name not in self.content:
+                        # If the expected subsection is not present
+                        error_list.append(
+                            f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}."
+                        )
+                    else:
+                        # If the subsection is present, validate subsection, return the result
+                        # and concat the errors from subsection to section error_list
+                        _, subsec_error_list, subsec_warning_list = self.content[name].validate(
+                            structure["subsections"][idx]
+                        )
+                        error_list += subsec_error_list
+                        warning_list += subsec_warning_list
+
+                for name in self.content:
+                    if name not in structure_names:
+                        # If an extra subsection is present
+                        warning_list.append(
+                            f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown."
+                        )
+        if error_list:
+            # If there are errors, do not return the dictionary as it is invalid
+            return {}, error_list, warning_list
+        else:
+            return self.to_dict(), error_list, warning_list
+
+    def to_dict(self) -> dict:
+        """Returns the dictionary representation of a section."""
+        return {
+            "name": self.name,
+            "text": self.text,
+            "is_empty": self.is_empty,
+            "subsections": [value.to_dict() for value in self.content.values()],
+        }
+
+
+class ReadMe(Section):  # Level 0
+    def __init__(self, name: str, lines: List[str], structure: dict = None):
+        super().__init__(name=name, level="")  # Not using lines here as we need to use a child class parse
+        self.structure = structure
+        self.yaml_tags_line_count = -2
+        self.lines = lines
+        if self.lines is not None:
+            self.parse()
+        if self.structure is None:
+            content, error_list, warning_list = self.validate(readme_structure)
+        else:
+            content, error_list, warning_list = self.validate(self.structure)
+
+        if error_list != [] or warning_list != []:
+            errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list)))
+            error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors
+            raise ValueError(error_string)
+
+    @classmethod
+    def from_readme(cls, path: Path, structure: dict = None):
+        with open(path) as f:
+            lines = f.readlines()
+        return cls(path, lines, structure)
+
+    @classmethod
+    def from_string(cls, string: str, structure: dict = None, root_name: str = "root"):
+        lines = string.split("\n")
+        return cls(root_name, lines, structure)
+
+    def parse(self):
+        # Skip Tags
+        tag_count = 0
+        line_count = 0
+
+        for line in self.lines:
+            self.yaml_tags_line_count += 1
+            if line.strip(" \n") == "---":
+                tag_count += 1
+                if tag_count == 2:
+                    break
+            line_count += 1
+
+        self.lines = self.lines[line_count + 1 :]  # Get the last + 1 th item.
+        super().parse()
+
+    def __str__(self):
+        """Returns the string of dictionary representation of the ReadMe."""
+        return str(self.to_dict())
+
+    def validate(self, readme_structure):
+        error_list = []
+        warning_list = []
+        if self.yaml_tags_line_count == 0:
+            warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.")
+        elif self.yaml_tags_line_count == -1:
+            warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.")
+
+        # Check how many first level sections are present.
+        num_first_level_keys = len(self.content.keys())
+        if num_first_level_keys > 1:
+            # If more than one, add to the error list, continue
+            error_list.append(
+                f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README."
+            )
+        elif num_first_level_keys < 1:
+            # If less than one, append error.
+            error_list.append(
+                f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README."
+            )
+
+        else:
+            # If one exactly
+            start_key = list(self.content.keys())[0]  # Get the key
+            if start_key.startswith("Dataset Card for"):  # Check correct start
+
+                # If the starting is correct, validate all the sections
+                _, sec_error_list, sec_warning_list = self.content[start_key].validate(
+                    readme_structure["subsections"][0]
+                )
+                error_list += sec_error_list
+                warning_list += sec_warning_list
+            else:
+                # If not found, append error
+                error_list.append(
+                    f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README."
+                )
+        if error_list:
+            # If there are errors, do not return the dictionary as it is invalid
+            return {}, error_list, warning_list
+        else:
+            return self.to_dict(), error_list, warning_list
+
+
+if __name__ == "__main__":
+    from argparse import ArgumentParser
+
+    ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.")
+    ap.add_argument("readme_filepath")
+    args = ap.parse_args()
+    readme_filepath = Path(args.readme_filepath)
+    readme = ReadMe.from_readme(readme_filepath)