From 993acd6cf2acab834f7840e38a33cc04f4e3c504 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 26 Mar 2021 22:29:33 +0530 Subject: [PATCH 01/28] Add Initial README parser --- src/datasets/utils/readme_parser.py | 77 +++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/datasets/utils/readme_parser.py diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py new file mode 100644 index 00000000000..94343007e9f --- /dev/null +++ b/src/datasets/utils/readme_parser.py @@ -0,0 +1,77 @@ +# class_mapping = { +# "Dataset Description": DatasetDescription, +# } + +# key_mapping = { +# "Dataset Desription": 'dataset_desc' +# } +import json + + +class Section: + def __init__(self, name, level, lines=None): + self.name = name + self.level = level + self.attributes = "" + self.content = {} + if lines is not None: + self.parse(lines) + + def parse(self, lines): + current_sub_level = "" + current_lines = [] + code_start = False + for line in lines: + if line.strip(" \n") == "": + continue + elif line.strip(" \n")[:3] == "```": + code_start = not code_start + elif line.split()[0] == self.level + "#" and not code_start: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + current_lines = [] + else: + if current_lines != []: + self.attributes += "".join(current_lines).strip() + current_lines = [] + + current_sub_level = " ".join(line.split()[1:]).strip(" \n") + else: + current_lines.append(line) + else: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + else: + if current_lines != []: + self.attributes += "".join(current_lines).strip() + + def to_dict(self): + return { + "name": self.name, + "attributes": self.attributes, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, file_path): + super().__init__(name=file_path, level="") + self.parse(file_path) + + def parse(self, file_path): + with open(self.name) as f: + # Skip Tags + tag_count = 0 + for line in f: + if line.strip(" \n") == "---": + tag_count += 1 + if tag_count == 2: + break + super().parse(f) + + +if __name__ == "__main__": + readme = ReadMe("./datasets/fashion_mnist/README.md") + # print(readme.attributes) + json_obj = json.dumps(readme.to_dict(), indent=4) + print(json_obj) From 014c49df8ccc9edc1c24f355863667f700d4339b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 30 Mar 2021 00:44:10 +0530 Subject: [PATCH 02/28] Add basic validation checks --- src/datasets/utils/readme_parser.py | 160 ++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 9 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index 94343007e9f..3a04ec44d39 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -5,14 +5,109 @@ # key_mapping = { # "Dataset Desription": 'dataset_desc' # } -import json + +# import json +import yaml +import pprint +yaml_struc = """ +name: "" # Filename +text: false +subsections: + - name: "Dataset Card for X" + text: false + required: true + subsections: + - name: "Table of Contents" + text: true + subsections: null # meaning it should not be checked. 
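+        # (a `text: true` entry means the section must contain some descriptive
+        #  body text; the checks only read the `name`, `text` and `subsections`
+        #  keys of each entry)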
+ - name: "Dataset Description" + text: false + subsections: + - name: "Dataset Summary" + text: true + subsections: null + - name: "Supported Tasks and Leaderboards" + text: false + subsections: null + - name: Languages + text: false + subsections: null + - name: "Dataset Structure" + text: false + subsections: + - name: "Data Instances" + text: true + subsections: null + - name: "Data Fields" + text: true + subsections: null + - name: "Data Splits" + text: true + subsections: null + - name: "Dataset Creation" + text: false + subsections: + - name: "Curation Rationale" + text: false + subsections: null + - name: "Source Data" + text: false + subsections: + - name: "Initial Data Collection and Normalization" + text: false + subsections: null + - name: "Who are the source X producers?" + text: false + subsections: null + - name: "Annotations" + text: false + subsections: + - name: "Annotation process" + text: false + subsections: null + - name: "Who are the annotators?" + text: false + subsections: null + - name: "Personal and Sensitive Information" + text: false + subsections: null + - name: "Considerations for Using the Data" + text: false + subsections: + - name: "Social Impact of Dataset" + text: false + subsections: null + - name: "Discussion of Biases" + text: false + subsections: null + - name: "Other Known Limitations" + text: false + subsections: null + - name: "Additional Information" + text: false + subsections: + - name: "Dataset Curators" + text: false + subsections: null + - name: "Licensing Information" + text: false + subsections: null + - name: "Citation Information" + text: true + subsections: null + - name: "Contributions" + text: true + subsections: null +""" + +filler_text = ["[Needs More Information]", "[More Information Needed]", "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)"] class Section: def __init__(self, name, level, lines=None): self.name = name self.level = level - self.attributes = "" + self.text = "" self.content = {} if lines is not None: self.parse(lines) @@ -32,7 +127,7 @@ def parse(self, lines): current_lines = [] else: if current_lines != []: - self.attributes += "".join(current_lines).strip() + self.text += "".join(current_lines).strip() current_lines = [] current_sub_level = " ".join(line.split()[1:]).strip(" \n") @@ -43,12 +138,12 @@ def parse(self, lines): self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: if current_lines != []: - self.attributes += "".join(current_lines).strip() + self.text += "".join(current_lines).strip() def to_dict(self): return { "name": self.name, - "attributes": self.attributes, + "test": self.text, "subsections": [value.to_dict() for value in self.content.values()], } @@ -67,11 +162,58 @@ def parse(self, file_path): tag_count += 1 if tag_count == 2: break + else: + print("The README doesn't contain proper tags. 
Please ensure you add the correct YAML tags.") + return super().parse(f) + def _validate_section(self, section , structure): + # Text validation + if structure['text'] == True: + if section.text.strip() == '': + print(f"Expected some text for {section.name}") + + if structure['subsections'] is not None: + # If no subsections present + if section.content == {}: + values = [subsection['name'] for subsection in structure['subsections']] + print(f"'{section.name}'' expected the following subsections: {values}, found `None`.") + else: + # Each key validation + structure_names = [subsection['name'] for subsection in structure['subsections']] + for idx, name in enumerate(structure_names): + if name not in section.content: + print(f"'{section.name}' is missing subsection: '{name}'.") + else: + self._validate_section(section.content[name], structure['subsections'][idx]) + + for name in section.content: + if name not in structure_names: + print(f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection.") + + def validate(self, yaml_struc): + structure = yaml.safe_load(yaml_struc) + num_first_level_keys = len(self.content.keys()) + if num_first_level_keys > 1: + print(f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected.") + elif num_first_level_keys < 1: + print(f"The README has no first-level headings.") + + else: + print(self.content.keys()) + start_key = list(self.content.keys())[0] + if start_key.startswith("Dataset Card for"): + self._validate_section(self.content[start_key], structure['subsections'][0]) + else: + print("No first-level hearding starting with `Dataset Card for` found.") + if __name__ == "__main__": - readme = ReadMe("./datasets/fashion_mnist/README.md") - # print(readme.attributes) - json_obj = json.dumps(readme.to_dict(), indent=4) - print(json_obj) + readme = ReadMe("./dummy_readme.md") + print(readme.content["Dataset Card for FashionMNIST"].content["Additional Information"].content) + readme.validate(yaml_struc) + # print(readme.text) + # json_obj = json.dumps(readme.to_dict(), indent=4) + # print(json_obj) + # with open('dump.json', 'w') as f: + # json.dump(readme.to_dict(), f) From 204060249a44d9fb389237647e5d97acfe93daf6 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 30 Mar 2021 00:47:42 +0530 Subject: [PATCH 03/28] Minor fix --- src/datasets/utils/readme_parser.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index 3a04ec44d39..7a842223e9b 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -170,8 +170,8 @@ def parse(self, file_path): def _validate_section(self, section , structure): # Text validation if structure['text'] == True: - if section.text.strip() == '': - print(f"Expected some text for {section.name}") + if section.text.strip() == '' or section.text.strip() in filler_text: + print(f"Expected some text for section '{section.name}'") if structure['subsections'] is not None: # If no subsections present @@ -210,10 +210,4 @@ def validate(self, yaml_struc): if __name__ == "__main__": readme = ReadMe("./dummy_readme.md") - print(readme.content["Dataset Card for FashionMNIST"].content["Additional Information"].content) readme.validate(yaml_struc) - # print(readme.text) - # json_obj = json.dumps(readme.to_dict(), indent=4) - # print(json_obj) - # with open('dump.json', 'w') as f: - # json.dump(readme.to_dict(), f) 
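At this point the parser is still a standalone script: `Section.parse` splits the Markdown body into nested `Section` objects, treating any line whose first token has exactly one more `#` than the current level (and that sits outside a fenced code block) as the start of a subsection, and `ReadMe.validate` walks the resulting tree against the `yaml_struc` template, printing each problem it finds instead of raising. A minimal usage sketch, assuming the module is importable from a repository checkout and reusing the example README path from the first patch (any dataset card with YAML tags at the top works):

    from datasets.utils.readme_parser import ReadMe, yaml_struc

    # Parse the dataset card; the YAML tag block between the two `---` markers
    # at the top of the file is skipped before the headings are parsed.
    readme = ReadMe("./datasets/fashion_mnist/README.md")

    # Compare the parsed section tree against the expected structure and
    # print any empty or missing sections.
    readme.validate(yaml_struc)

The next patch reworks exactly this part: the per-section `text` flag becomes `allow_empty`, and validation collects its messages into an error list and raises a single `ValueError` instead of printing.
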
From 7a1654bb741cbf57033d7389311797f5f7f0fe2c Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Thu, 1 Apr 2021 13:20:00 +0530 Subject: [PATCH 04/28] Changes from review --- src/datasets/utils/readme_parser.py | 131 +++++++++++++++++----------- 1 file changed, 79 insertions(+), 52 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index 7a842223e9b..a7b22076981 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -6,101 +6,107 @@ # "Dataset Desription": 'dataset_desc' # } +import pprint + # import json import yaml -import pprint + + yaml_struc = """ name: "" # Filename -text: false +allow_empty: false subsections: - name: "Dataset Card for X" - text: false - required: true + allow_empty: true subsections: - name: "Table of Contents" - text: true + allow_empty: false subsections: null # meaning it should not be checked. - name: "Dataset Description" - text: false + allow_empty: false subsections: - name: "Dataset Summary" - text: true + allow_empty: false subsections: null - name: "Supported Tasks and Leaderboards" - text: false + allow_empty: true subsections: null - name: Languages - text: false + allow_empty: true subsections: null - name: "Dataset Structure" - text: false + allow_empty: true subsections: - name: "Data Instances" - text: true + allow_empty: false subsections: null - name: "Data Fields" - text: true + allow_empty: false subsections: null - name: "Data Splits" - text: true + allow_empty: false subsections: null - name: "Dataset Creation" - text: false + allow_empty: true subsections: - name: "Curation Rationale" - text: false + allow_empty: true subsections: null - name: "Source Data" - text: false + allow_empty: true subsections: - name: "Initial Data Collection and Normalization" - text: false + allow_empty: true subsections: null - name: "Who are the source X producers?" - text: false + allow_empty: true subsections: null - name: "Annotations" - text: false + allow_empty: true subsections: - name: "Annotation process" - text: false + allow_empty: true subsections: null - name: "Who are the annotators?" 
- text: false + allow_empty: true subsections: null - name: "Personal and Sensitive Information" - text: false + allow_empty: true subsections: null - name: "Considerations for Using the Data" - text: false + allow_empty: true subsections: - name: "Social Impact of Dataset" - text: false + allow_empty: true subsections: null - name: "Discussion of Biases" - text: false + allow_empty: true subsections: null - name: "Other Known Limitations" - text: false + allow_empty: true subsections: null - name: "Additional Information" - text: false + allow_empty: true subsections: - name: "Dataset Curators" - text: false + allow_empty: true subsections: null - name: "Licensing Information" - text: false + allow_empty: true subsections: null - name: "Citation Information" - text: true + allow_empty: false subsections: null - name: "Contributions" - text: true + allow_empty: false subsections: null """ -filler_text = ["[Needs More Information]", "[More Information Needed]", "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)"] +filler_text = [ + "[Needs More Information]", + "[More Information Needed]", + "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", +] class Section: @@ -108,6 +114,7 @@ def __init__(self, name, level, lines=None): self.name = name self.level = level self.text = "" + self.is_empty = True self.content = {} if lines is not None: self.parse(lines) @@ -128,6 +135,8 @@ def parse(self, lines): else: if current_lines != []: self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False current_lines = [] current_sub_level = " ".join(line.split()[1:]).strip(" \n") @@ -139,11 +148,14 @@ def parse(self, lines): else: if current_lines != []: self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False def to_dict(self): return { "name": self.name, - "test": self.text, + "text": self.text, + "is_empty": self.is_empty, "subsections": [value.to_dict() for value in self.content.values()], } @@ -163,51 +175,66 @@ def parse(self, file_path): if tag_count == 2: break else: - print("The README doesn't contain proper tags. Please ensure you add the correct YAML tags.") - return + raise ValueError("The README doesn't contain proper tags. 
Please ensure you add the correct YAML tags.") super().parse(f) - def _validate_section(self, section , structure): + def _validate_section(self, section, structure): # Text validation - if structure['text'] == True: - if section.text.strip() == '' or section.text.strip() in filler_text: - print(f"Expected some text for section '{section.name}'") + error_list = [] + if structure["allow_empty"] == False: + if section.is_empty: + print(section.text) + error_list.append(f"Expected some text for section '{section.name}'") - if structure['subsections'] is not None: + if structure["subsections"] is not None: # If no subsections present if section.content == {}: - values = [subsection['name'] for subsection in structure['subsections']] - print(f"'{section.name}'' expected the following subsections: {values}, found `None`.") + values = [subsection["name"] for subsection in structure["subsections"]] + error_list.append(f"'{section.name}'' expected the following subsections: {values}, found `None`.") else: # Each key validation - structure_names = [subsection['name'] for subsection in structure['subsections']] + structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): if name not in section.content: - print(f"'{section.name}' is missing subsection: '{name}'.") + error_list.append(f"'{section.name}' is missing subsection: '{name}'.") else: - self._validate_section(section.content[name], structure['subsections'][idx]) + error_list += self._validate_section(section.content[name], structure["subsections"][idx]) for name in section.content: if name not in structure_names: - print(f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection.") + error_list.append( + f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." + ) + + return error_list + + def __str__(self): + return str(self.to_dict()) def validate(self, yaml_struc): + error_list = [] structure = yaml.safe_load(yaml_struc) num_first_level_keys = len(self.content.keys()) if num_first_level_keys > 1: - print(f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected.") + error_list.append( + f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." 
+ ) elif num_first_level_keys < 1: - print(f"The README has no first-level headings.") + error_list.append(f"The README has no first-level headings.") else: - print(self.content.keys()) start_key = list(self.content.keys())[0] if start_key.startswith("Dataset Card for"): - self._validate_section(self.content[start_key], structure['subsections'][0]) + error_list += self._validate_section(self.content[start_key], structure["subsections"][0]) else: - print("No first-level hearding starting with `Dataset Card for` found.") + error_list.append("No first-level heading starting with `Dataset Card for` found.") + return error_list if __name__ == "__main__": readme = ReadMe("./dummy_readme.md") - readme.validate(yaml_struc) + error_list = readme.validate(yaml_struc) + if error_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) + error_string = "The following issues were found with the README\n" + errors + raise ValueError(error_string) From 99d22226b1184b2de166e40484740221cf3bfc5f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Thu, 8 Apr 2021 18:53:14 +0530 Subject: [PATCH 05/28] Make main into a function in readme_parser --- src/datasets/utils/readme_parser.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index a7b22076981..dc4c1e54540 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -58,7 +58,7 @@ - name: "Initial Data Collection and Normalization" allow_empty: true subsections: null - - name: "Who are the source X producers?" + - name: "Who are the source language producers?" allow_empty: true subsections: null - name: "Annotations" @@ -163,6 +163,7 @@ def to_dict(self): class ReadMe(Section): # Level 0 def __init__(self, file_path): super().__init__(name=file_path, level="") + self.yaml_tags_line_count = -2 self.parse(file_path) def parse(self, file_path): @@ -170,12 +171,16 @@ def parse(self, file_path): # Skip Tags tag_count = 0 for line in f: + self.yaml_tags_line_count += 1 if line.strip(" \n") == "---": tag_count += 1 + if tag_count == 2: break else: - raise ValueError("The README doesn't contain proper tags. Please ensure you add the correct YAML tags.") + raise ValueError( + "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." 
+ ) super().parse(f) def _validate_section(self, section, structure): @@ -183,14 +188,13 @@ def _validate_section(self, section, structure): error_list = [] if structure["allow_empty"] == False: if section.is_empty: - print(section.text) error_list.append(f"Expected some text for section '{section.name}'") if structure["subsections"] is not None: # If no subsections present if section.content == {}: values = [subsection["name"] for subsection in structure["subsections"]] - error_list.append(f"'{section.name}'' expected the following subsections: {values}, found `None`.") + error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") else: # Each key validation structure_names = [subsection["name"] for subsection in structure["subsections"]] @@ -231,8 +235,12 @@ def validate(self, yaml_struc): return error_list -if __name__ == "__main__": - readme = ReadMe("./dummy_readme.md") +def validate_readme(file_path): + readme = ReadMe(file_path) + if readme.yaml_tags_line_count == 0: + raise Warning("YAML Tags are not present in this README.") + elif readme.yaml_tags_line_count == -1: + raise Warning("Only the start of YAML tags present in this README.") error_list = readme.validate(yaml_struc) if error_list != []: errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) From 1d788a9aa2931b1be706bdfe141a126eb9c970d5 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Mon, 26 Apr 2021 02:04:48 +0530 Subject: [PATCH 06/28] Move README validator to scripts --- .../datasets_readme_validator.py | 55 +++++++++++++++---- 1 file changed, 44 insertions(+), 11 deletions(-) rename src/datasets/utils/readme_parser.py => scripts/datasets_readme_validator.py (84%) diff --git a/src/datasets/utils/readme_parser.py b/scripts/datasets_readme_validator.py similarity index 84% rename from src/datasets/utils/readme_parser.py rename to scripts/datasets_readme_validator.py index dc4c1e54540..b6088155d82 100644 --- a/src/datasets/utils/readme_parser.py +++ b/scripts/datasets_readme_validator.py @@ -1,14 +1,4 @@ -# class_mapping = { -# "Dataset Description": DatasetDescription, -# } - -# key_mapping = { -# "Dataset Desription": 'dataset_desc' -# } - -import pprint - -# import json +import os import yaml @@ -246,3 +236,46 @@ def validate_readme(file_path): errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) error_string = "The following issues were found with the README\n" + errors raise ValueError(error_string) + + +if __name__ == '__main__': + datasets = os.listdir('./datasets') + for dataset in sorted(datasets): + if not dataset.startswith('.'): + file_path = os.path.join("./datasets", dataset, "README.md") + if os.path.exists(file_path): + try: + validate_readme(file_path) + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) + else: + try: + raise FileNotFoundError(f"No such file: {file_path}") + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) + datasets = os.listdir('./datasets') + for dataset in sorted(datasets): + if not dataset.startswith('.'): + file_path = os.path.join("./datasets", dataset, "README.md") + if os.path.exists(file_path): + try: + validate_readme(file_path) + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) + else: + try: + raise FileNotFoundError(f"No such file: {file_path}") + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) From 2d13f70160699864c42769ed3248c13537372f30 Mon Sep 17 00:00:00 2001 
From: Gunjan Chhablani Date: Mon, 26 Apr 2021 13:35:34 +0530 Subject: [PATCH 07/28] Arrange README validation files --- scripts/datasets_readme_validator.py | 333 +++--------------- src/datasets/utils/readme.py | 173 +++++++++ src/datasets/utils/resources/__init__.py | 0 .../utils/resources/readme_structure.yaml | 87 +++++ 4 files changed, 315 insertions(+), 278 deletions(-) create mode 100644 src/datasets/utils/readme.py create mode 100644 src/datasets/utils/resources/__init__.py create mode 100644 src/datasets/utils/resources/readme_structure.yaml diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py index b6088155d82..c56b59f2601 100644 --- a/scripts/datasets_readme_validator.py +++ b/scripts/datasets_readme_validator.py @@ -1,281 +1,58 @@ -import os -import yaml +#!/usr/bin/env python - -yaml_struc = """ -name: "" # Filename -allow_empty: false -subsections: - - name: "Dataset Card for X" - allow_empty: true - subsections: - - name: "Table of Contents" - allow_empty: false - subsections: null # meaning it should not be checked. - - name: "Dataset Description" - allow_empty: false - subsections: - - name: "Dataset Summary" - allow_empty: false - subsections: null - - name: "Supported Tasks and Leaderboards" - allow_empty: true - subsections: null - - name: Languages - allow_empty: true - subsections: null - - name: "Dataset Structure" - allow_empty: true - subsections: - - name: "Data Instances" - allow_empty: false - subsections: null - - name: "Data Fields" - allow_empty: false - subsections: null - - name: "Data Splits" - allow_empty: false - subsections: null - - name: "Dataset Creation" - allow_empty: true - subsections: - - name: "Curation Rationale" - allow_empty: true - subsections: null - - name: "Source Data" - allow_empty: true - subsections: - - name: "Initial Data Collection and Normalization" - allow_empty: true - subsections: null - - name: "Who are the source language producers?" - allow_empty: true - subsections: null - - name: "Annotations" - allow_empty: true - subsections: - - name: "Annotation process" - allow_empty: true - subsections: null - - name: "Who are the annotators?" - allow_empty: true - subsections: null - - name: "Personal and Sensitive Information" - allow_empty: true - subsections: null - - name: "Considerations for Using the Data" - allow_empty: true - subsections: - - name: "Social Impact of Dataset" - allow_empty: true - subsections: null - - name: "Discussion of Biases" - allow_empty: true - subsections: null - - name: "Other Known Limitations" - allow_empty: true - subsections: null - - name: "Additional Information" - allow_empty: true - subsections: - - name: "Dataset Curators" - allow_empty: true - subsections: null - - name: "Licensing Information" - allow_empty: true - subsections: null - - name: "Citation Information" - allow_empty: false - subsections: null - - name: "Contributions" - allow_empty: false - subsections: null +""" This script will run in CI and make sure all new changes to datasets readme files have valid content present. 
""" -filler_text = [ - "[Needs More Information]", - "[More Information Needed]", - "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", -] - - -class Section: - def __init__(self, name, level, lines=None): - self.name = name - self.level = level - self.text = "" - self.is_empty = True - self.content = {} - if lines is not None: - self.parse(lines) - - def parse(self, lines): - current_sub_level = "" - current_lines = [] - code_start = False - for line in lines: - if line.strip(" \n") == "": - continue - elif line.strip(" \n")[:3] == "```": - code_start = not code_start - elif line.split()[0] == self.level + "#" and not code_start: - if current_sub_level != "": - self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) - current_lines = [] - else: - if current_lines != []: - self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: - self.is_empty = False - current_lines = [] - - current_sub_level = " ".join(line.split()[1:]).strip(" \n") - else: - current_lines.append(line) - else: - if current_sub_level != "": - self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) - else: - if current_lines != []: - self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: - self.is_empty = False - - def to_dict(self): - return { - "name": self.name, - "text": self.text, - "is_empty": self.is_empty, - "subsections": [value.to_dict() for value in self.content.values()], - } - - -class ReadMe(Section): # Level 0 - def __init__(self, file_path): - super().__init__(name=file_path, level="") - self.yaml_tags_line_count = -2 - self.parse(file_path) - - def parse(self, file_path): - with open(self.name) as f: - # Skip Tags - tag_count = 0 - for line in f: - self.yaml_tags_line_count += 1 - if line.strip(" \n") == "---": - tag_count += 1 - - if tag_count == 2: - break - else: - raise ValueError( - "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." - ) - super().parse(f) - - def _validate_section(self, section, structure): - # Text validation - error_list = [] - if structure["allow_empty"] == False: - if section.is_empty: - error_list.append(f"Expected some text for section '{section.name}'") - - if structure["subsections"] is not None: - # If no subsections present - if section.content == {}: - values = [subsection["name"] for subsection in structure["subsections"]] - error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") - else: - # Each key validation - structure_names = [subsection["name"] for subsection in structure["subsections"]] - for idx, name in enumerate(structure_names): - if name not in section.content: - error_list.append(f"'{section.name}' is missing subsection: '{name}'.") - else: - error_list += self._validate_section(section.content[name], structure["subsections"][idx]) - - for name in section.content: - if name not in structure_names: - error_list.append( - f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." 
- ) - - return error_list - - def __str__(self): - return str(self.to_dict()) - - def validate(self, yaml_struc): - error_list = [] - structure = yaml.safe_load(yaml_struc) - num_first_level_keys = len(self.content.keys()) - if num_first_level_keys > 1: - error_list.append( - f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." - ) - elif num_first_level_keys < 1: - error_list.append(f"The README has no first-level headings.") - - else: - start_key = list(self.content.keys())[0] - if start_key.startswith("Dataset Card for"): - error_list += self._validate_section(self.content[start_key], structure["subsections"][0]) - else: - error_list.append("No first-level heading starting with `Dataset Card for` found.") - return error_list - - -def validate_readme(file_path): - readme = ReadMe(file_path) - if readme.yaml_tags_line_count == 0: - raise Warning("YAML Tags are not present in this README.") - elif readme.yaml_tags_line_count == -1: - raise Warning("Only the start of YAML tags present in this README.") - error_list = readme.validate(yaml_struc) - if error_list != []: - errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) - error_string = "The following issues were found with the README\n" + errors - raise ValueError(error_string) - - -if __name__ == '__main__': - datasets = os.listdir('./datasets') - for dataset in sorted(datasets): - if not dataset.startswith('.'): - file_path = os.path.join("./datasets", dataset, "README.md") - if os.path.exists(file_path): - try: - validate_readme(file_path) - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) - else: - try: - raise FileNotFoundError(f"No such file: {file_path}") - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) - datasets = os.listdir('./datasets') - for dataset in sorted(datasets): - if not dataset.startswith('.'): - file_path = os.path.join("./datasets", dataset, "README.md") - if os.path.exists(file_path): - try: - validate_readme(file_path) - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) - else: - try: - raise FileNotFoundError(f"No such file: {file_path}") - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) +from pathlib import Path +from subprocess import check_output +from typing import List + +from datasets.utils.readme import validate_readme + + +def get_changed_files(repo_path: Path) -> List[Path]: + diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path) + changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()] + return changed_files + + +if __name__ == "__main__": + import logging + from argparse import ArgumentParser + + logging.basicConfig(level=logging.DEBUG) + + ap = ArgumentParser() + ap.add_argument("--repo_path", type=Path, default=Path.cwd()) + ap.add_argument("--check_all", action="store_true") + args = ap.parse_args() + + repo_path: Path = args.repo_path + if args.check_all: + readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()] + else: + changed_files = get_changed_files(repo_path) + readmes = [ + f + for f in changed_files + if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" + ] + + failed: List[Path] = [] + for readme in sorted(readmes): + try: + DatasetMetadata.from_readme(readme) + logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") + except TypeError as e: + 
failed.append(readme) + logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") + except Exception as e: + failed.append(readme) + logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") + + if len(failed) > 0: + logging.info(f"❌ Failed on {len(failed)} files.") + exit(1) + else: + logging.info("All is well, keep up the good work 🤗!") + exit(0) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py new file mode 100644 index 00000000000..c3500b675ab --- /dev/null +++ b/src/datasets/utils/readme.py @@ -0,0 +1,173 @@ +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +import yaml + + +BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" +this_url = f"{BASE_REF_URL}/{__file__}" +logger = logging.getLogger(__name__) + + +def load_yaml_resource(resource: str) -> Tuple[Any, str]: + with open(resource) as f: + content = yaml.safe_load(f) + return content, f"{BASE_REF_URL}/resources/{resource}" + + +readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml") +filler_text = [ + "[Needs More Information]", + "[More Information Needed]", + "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", +] + + +class Section: + def __init__(self, name, level, lines=None): + self.name = name + self.level = level + self.text = "" + self.is_empty = True + self.content = {} + if lines is not None: + self.parse(lines) + + def parse(self, lines): + current_sub_level = "" + current_lines = [] + code_start = False + for line in lines: + if line.strip(" \n") == "": + continue + elif line.strip(" \n")[:3] == "```": + code_start = not code_start + elif line.split()[0] == self.level + "#" and not code_start: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + current_lines = [] + else: + if current_lines != []: + self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False + current_lines = [] + + current_sub_level = " ".join(line.split()[1:]).strip(" \n") + else: + current_lines.append(line) + else: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + else: + if current_lines != []: + self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False + + def to_dict(self): + return { + "name": self.name, + "text": self.text, + "is_empty": self.is_empty, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, file_path): + super().__init__(name=file_path, level="") + self.yaml_tags_line_count = -2 + self.parse(file_path) + + def parse(self, file_path): + with open(self.name) as f: + # Skip Tags + tag_count = 0 + for line in f: + self.yaml_tags_line_count += 1 + if line.strip(" \n") == "---": + tag_count += 1 + + if tag_count == 2: + break + else: + raise ValueError( + "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." 
+ ) + super().parse(f) + + def _validate_section(self, section, structure): + # Text validation + error_list = [] + if structure["allow_empty"] == False: + if section.is_empty: + error_list.append(f"Expected some text for section '{section.name}'") + + if structure["subsections"] is not None: + # If no subsections present + if section.content == {}: + values = [subsection["name"] for subsection in structure["subsections"]] + error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") + else: + # Each key validation + structure_names = [subsection["name"] for subsection in structure["subsections"]] + for idx, name in enumerate(structure_names): + if name not in section.content: + error_list.append(f"'{section.name}' is missing subsection: '{name}'.") + else: + error_list += self._validate_section(section.content[name], structure["subsections"][idx]) + + for name in section.content: + if name not in structure_names: + error_list.append( + f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." + ) + + return error_list + + def __str__(self): + return str(self.to_dict()) + + def validate(self, readme_structure): + error_list = [] + num_first_level_keys = len(self.content.keys()) + if num_first_level_keys > 1: + error_list.append( + f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." + ) + elif num_first_level_keys < 1: + error_list.append(f"The README has no first-level headings.") + + else: + start_key = list(self.content.keys())[0] + if start_key.startswith("Dataset Card for"): + error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0]) + else: + error_list.append("No first-level heading starting with `Dataset Card for` found.") + return error_list + + +def validate_readme(file_path): + readme = ReadMe(file_path) + if readme.yaml_tags_line_count == 0: + raise Warning("YAML Tags are not present in this README.") + elif readme.yaml_tags_line_count == -1: + raise Warning("Only the start of YAML tags present in this README.") + error_list = readme.validate(readme_structure) + if error_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) + error_string = "The following issues were found with the README\n" + errors + raise ValueError(error_string) + + +if __name__ == "__main__": + from argparse import ArgumentParser + + ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.") + ap.add_argument("readme_filepath") + args = ap.parse_args() + readme_filepath = Path(args.readme_filepath) + validate_readme(readme_filepath) diff --git a/src/datasets/utils/resources/__init__.py b/src/datasets/utils/resources/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml new file mode 100644 index 00000000000..7fa2f663df0 --- /dev/null +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -0,0 +1,87 @@ +name: "" # Filename comes here +allow_empty: false +subsections: + - name: "Dataset Card for X" # First-level markdown heading + allow_empty: true + subsections: + - name: "Table of Contents" + allow_empty: false + subsections: null # meaning it should not be checked. 
+ - name: "Dataset Description" + allow_empty: false + subsections: + - name: "Dataset Summary" + allow_empty: false + subsections: null + - name: "Supported Tasks and Leaderboards" + allow_empty: true + subsections: null + - name: Languages + allow_empty: true + subsections: null + - name: "Dataset Structure" + allow_empty: true + subsections: + - name: "Data Instances" + allow_empty: false + subsections: null + - name: "Data Fields" + allow_empty: false + subsections: null + - name: "Data Splits" + allow_empty: false + subsections: null + - name: "Dataset Creation" + allow_empty: true + subsections: + - name: "Curation Rationale" + allow_empty: true + subsections: null + - name: "Source Data" + allow_empty: true + subsections: + - name: "Initial Data Collection and Normalization" + allow_empty: true + subsections: null + - name: "Who are the source language producers?" + allow_empty: true + subsections: null + - name: "Annotations" + allow_empty: true + subsections: + - name: "Annotation process" + allow_empty: true + subsections: null + - name: "Who are the annotators?" + allow_empty: true + subsections: null + - name: "Personal and Sensitive Information" + allow_empty: true + subsections: null + - name: "Considerations for Using the Data" + allow_empty: true + subsections: + - name: "Social Impact of Dataset" + allow_empty: true + subsections: null + - name: "Discussion of Biases" + allow_empty: true + subsections: null + - name: "Other Known Limitations" + allow_empty: true + subsections: null + - name: "Additional Information" + allow_empty: true + subsections: + - name: "Dataset Curators" + allow_empty: true + subsections: null + - name: "Licensing Information" + allow_empty: true + subsections: null + - name: "Citation Information" + allow_empty: false + subsections: null + - name: "Contributions" + allow_empty: false + subsections: null \ No newline at end of file From ee31e1511ca056dcde579c9f5e0f0c7b232bbb69 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 27 Apr 2021 13:54:52 +0530 Subject: [PATCH 08/28] Update readme validator class --- src/datasets/utils/readme.py | 247 +++++++++++++++++++++++------------ 1 file changed, 167 insertions(+), 80 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index c3500b675ab..1e5d9b9a42e 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -1,45 +1,61 @@ import logging from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, List, Tuple import yaml +# loading package files: https://stackoverflow.com/a/20885799 +try: + import importlib.resources as pkg_resources +except ImportError: + # Try backported to PY<37 `importlib_resources`. 
+ import importlib_resources as pkg_resources + +from .import resources + + BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" this_url = f"{BASE_REF_URL}/{__file__}" logger = logging.getLogger(__name__) def load_yaml_resource(resource: str) -> Tuple[Any, str]: - with open(resource) as f: - content = yaml.safe_load(f) - return content, f"{BASE_REF_URL}/resources/{resource}" + content = pkg_resources.read_text(resources, resource) + return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}" readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml") -filler_text = [ + +FILLER_TEXT = [ "[Needs More Information]", "[More Information Needed]", "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", ] +# Dictionary representation of section/readme, error_list, warning_list +ReadmeValidatorOutput = Tuple[dict, List[str], List[str]] + +@dataclass class Section: - def __init__(self, name, level, lines=None): - self.name = name - self.level = level + name: str + level: str + lines: List[str] = None + + def __post_init__(self): self.text = "" self.is_empty = True self.content = {} - if lines is not None: - self.parse(lines) + if self.lines is not None: + self.parse() - def parse(self, lines): + def parse(self): current_sub_level = "" current_lines = [] code_start = False - for line in lines: + for line in self.lines: if line.strip(" \n") == "": continue elif line.strip(" \n")[:3] == "```": @@ -51,7 +67,7 @@ def parse(self, lines): else: if current_lines != []: self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: + if self.text != "" and self.text not in FILLER_TEXT: self.is_empty = False current_lines = [] @@ -60,107 +76,178 @@ def parse(self, lines): current_lines.append(line) else: if current_sub_level != "": + if current_sub_level in self.content: + print( + f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found." + ) self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: if current_lines != []: self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: + if self.text != "" and self.text not in FILLER_TEXT: self.is_empty = False - def to_dict(self): - return { - "name": self.name, - "text": self.text, - "is_empty": self.is_empty, - "subsections": [value.to_dict() for value in self.content.values()], - } + def validate(self, structure: dict) -> ReadmeValidatorOutput: + """Validates a Section class object recursively using the structure provided as a dictionary. + Args: + structute (:obj: `dict`): The dictionary representing expected structure. -class ReadMe(Section): # Level 0 - def __init__(self, file_path): - super().__init__(name=file_path, level="") - self.yaml_tags_line_count = -2 - self.parse(file_path) - - def parse(self, file_path): - with open(self.name) as f: - # Skip Tags - tag_count = 0 - for line in f: - self.yaml_tags_line_count += 1 - if line.strip(" \n") == "---": - tag_count += 1 - - if tag_count == 2: - break - else: - raise ValueError( - "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." - ) - super().parse(f) - - def _validate_section(self, section, structure): - # Text validation + Returns: + :obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors. 
+ """ + # Header text validation error_list = [] + warning_list = [] if structure["allow_empty"] == False: - if section.is_empty: - error_list.append(f"Expected some text for section '{section.name}'") + # If header text is expected + if self.is_empty: + # If no header text is found, mention it in the error_list + error_list.append( + f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}." + ) + # Subsections Validation if structure["subsections"] is not None: - # If no subsections present - if section.content == {}: + # If subsections are expected + if self.content == {}: + # If no subsections are present values = [subsection["name"] for subsection in structure["subsections"]] - error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") + # Mention the expected values in the error_list + error_list.append( + f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}." + ) else: - # Each key validation + # If some subsections are present structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): - if name not in section.content: - error_list.append(f"'{section.name}' is missing subsection: '{name}'.") + if name not in self.content: + # If the expected subsection is not present + error_list.append( + f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}." + ) else: - error_list += self._validate_section(section.content[name], structure["subsections"][idx]) + # If the subsection is present, validate subsection, return the result + # and concat the errors from subsection to section error_list + _, subsec_error_list, subsec_warning_list = self.content[name].validate( + structure["subsections"][idx] + ) + error_list += subsec_error_list + warning_list += subsec_warning_list - for name in section.content: + for name in self.content: if name not in structure_names: - error_list.append( - f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." + # If an extra subsection is present + warning_list.append( + f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." 
) + if error_list: + # If there are errors, do not return the dictionary as it is invalid + return {}, error_list, warning_list + else: + return self.to_dict(), error_list, warning_list + + def to_dict(self) -> dict: + """Returns the dictionary representation of a section.""" + return { + "name": self.name, + "text": self.text, + "is_empty": self.is_empty, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, name: str, lines: List[str], structure: dict = None): + super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse + self.structure = structure + self.yaml_tags_line_count = -2 + self.lines = lines + if self.lines is not None: + self.parse() + if self.structure is None: + content, error_list, warning_list = self.validate(readme_structure) + else: + content, error_list, warning_list = self.validate(self.structure) + + if error_list != [] or warning_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) + error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors + raise ValueError(error_string) + + @classmethod + def from_readme(cls, path: Path, structure: dict = None): + with open(path) as f: + lines = f.readlines() + return cls(path, lines, structure) - return error_list + @classmethod + def from_string(cls, string: str, structure: dict = None, root_name:str="root"): + lines = string.split("\n") + return cls(root_name, lines, structure) + + def parse(self): + # Skip Tags + tag_count = 0 + line_count = 0 + + for line in self.lines: + self.yaml_tags_line_count += 1 + if line.strip(" \n") == "---": + tag_count += 1 + if tag_count == 2: + break + line_count+=1 + + self.lines = self.lines[line_count+1:] # Get the last + 1 th item. + super().parse() def __str__(self): + """Returns the string of dictionary representation of the ReadMe.""" return str(self.to_dict()) def validate(self, readme_structure): error_list = [] + warning_list = [] + if self.yaml_tags_line_count == 0: + warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.") + elif self.yaml_tags_line_count == -1: + warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.") + + # Check how many first level sections are present. num_first_level_keys = len(self.content.keys()) if num_first_level_keys > 1: + # If more than one, add to the error list, continue error_list.append( - f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." + f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README." ) elif num_first_level_keys < 1: - error_list.append(f"The README has no first-level headings.") + # If less than one, append error. + error_list.append( + f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README." 
+ ) else: - start_key = list(self.content.keys())[0] - if start_key.startswith("Dataset Card for"): - error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0]) - else: - error_list.append("No first-level heading starting with `Dataset Card for` found.") - return error_list - + # If one exactly + start_key = list(self.content.keys())[0] # Get the key + if start_key.startswith("Dataset Card for"): # Check correct start -def validate_readme(file_path): - readme = ReadMe(file_path) - if readme.yaml_tags_line_count == 0: - raise Warning("YAML Tags are not present in this README.") - elif readme.yaml_tags_line_count == -1: - raise Warning("Only the start of YAML tags present in this README.") - error_list = readme.validate(readme_structure) - if error_list != []: - errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) - error_string = "The following issues were found with the README\n" + errors - raise ValueError(error_string) + # If the starting is correct, validate all the sections + _, sec_error_list, sec_warning_list = self.content[start_key].validate( + readme_structure["subsections"][0] + ) + error_list += sec_error_list + warning_list += sec_warning_list + else: + # If not found, append error + error_list.append( + f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README." + ) + if error_list: + # If there are errors, do not return the dictionary as it is invalid + return {}, error_list, warning_list + else: + return self.to_dict(), error_list, warning_list if __name__ == "__main__": @@ -170,4 +257,4 @@ def validate_readme(file_path): ap.add_argument("readme_filepath") args = ap.parse_args() readme_filepath = Path(args.readme_filepath) - validate_readme(readme_filepath) + readme = ReadMe.from_readme(readme_filepath) From ae60ce58e6ab4791f9d745fe574026e5aa6255f0 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 27 Apr 2021 14:09:14 +0530 Subject: [PATCH 09/28] Add from_string tests --- src/datasets/utils/readme.py | 8 +- .../utils/resources/readme_structure.yaml | 50 ++-- tests/test_readme_util.py | 242 ++++++++++++++++++ 3 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 tests/test_readme_util.py diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 1e5d9b9a42e..ffc38b1596d 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -13,7 +13,7 @@ # Try backported to PY<37 `importlib_resources`. import importlib_resources as pkg_resources -from .import resources +from . import resources BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" @@ -182,7 +182,7 @@ def from_readme(cls, path: Path, structure: dict = None): return cls(path, lines, structure) @classmethod - def from_string(cls, string: str, structure: dict = None, root_name:str="root"): + def from_string(cls, string: str, structure: dict = None, root_name: str = "root"): lines = string.split("\n") return cls(root_name, lines, structure) @@ -197,9 +197,9 @@ def parse(self): tag_count += 1 if tag_count == 2: break - line_count+=1 + line_count += 1 - self.lines = self.lines[line_count+1:] # Get the last + 1 th item. + self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. 
super().parse() def __str__(self): diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml index 7fa2f663df0..fc3356f2675 100644 --- a/src/datasets/utils/resources/readme_structure.yaml +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -34,30 +34,30 @@ subsections: - name: "Dataset Creation" allow_empty: true subsections: - - name: "Curation Rationale" - allow_empty: true - subsections: null - - name: "Source Data" - allow_empty: true - subsections: - - name: "Initial Data Collection and Normalization" - allow_empty: true - subsections: null - - name: "Who are the source language producers?" - allow_empty: true - subsections: null - - name: "Annotations" - allow_empty: true - subsections: - - name: "Annotation process" - allow_empty: true - subsections: null - - name: "Who are the annotators?" - allow_empty: true - subsections: null - - name: "Personal and Sensitive Information" - allow_empty: true - subsections: null + - name: "Curation Rationale" + allow_empty: true + subsections: null + - name: "Source Data" + allow_empty: true + subsections: + - name: "Initial Data Collection and Normalization" + allow_empty: true + subsections: null + - name: "Who are the source language producers?" + allow_empty: true + subsections: null + - name: "Annotations" + allow_empty: true + subsections: + - name: "Annotation process" + allow_empty: true + subsections: null + - name: "Who are the annotators?" + allow_empty: true + subsections: null + - name: "Personal and Sensitive Information" + allow_empty: true + subsections: null - name: "Considerations for Using the Data" allow_empty: true subsections: @@ -84,4 +84,4 @@ subsections: subsections: null - name: "Contributions" allow_empty: false - subsections: null \ No newline at end of file + subsections: null diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py new file mode 100644 index 00000000000..84ab24cde45 --- /dev/null +++ b/tests/test_readme_util.py @@ -0,0 +1,242 @@ +import tempfile +import unittest +from pathlib import Path + +import yaml + +from datasets.utils.readme import ReadMe + + +def _dedent(string: str) -> str: + return "\n".join([line.lstrip() for line in string.splitlines()]) + + +EXPECTED_STRUCTURE = yaml.safe_load( + """\ +name: "" +allow_empty: false +subsections: + - name: "Dataset Card for X" # First-level markdown heading + allow_empty: true + subsections: + - name: "Table of Contents" + allow_empty: false + subsections: null # meaning it should not be checked. + - name: "Dataset Description" + allow_empty: false + subsections: + - name: "Dataset Summary" + allow_empty: false + subsections: null + - name: "Supported Tasks and Leaderboards" + allow_empty: true + subsections: null + - name: Languages + allow_empty: true + subsections: null +""" +) + +README_CORRECT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_EMPTY_YAML = """\ +--- +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_INCORRECT_YAML = """\ +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+### Supported Tasks and Leaderboards +### Languages +""" + +README_NO_YAML = """\ +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_MISSING_TEXT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +### Supported Tasks and Leaderboards +### Languages +""" + +README_MISSING_SUBSECTION = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Languages +""" + +README_MISSING_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + + +README_MULTIPLE_WRONG_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +# Dataset Card My Dataset +""" + +README_WRONG_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_EMPTY = "" + +README_MULTIPLE_SAME_HEADING_1 = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+### Supported Tasks and Leaderboards +### Languages +""" + + +class TestReadMeUtils(unittest.TestCase): + def test_from_string(self): + ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_EMPTY_YAML, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_INCORRECT_YAML, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_NO_YAML, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MISSING_TEXT, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MISSING_SUBSECTION, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MISSING_FIRST_LEVEL, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_EMPTY, EXPECTED_STRUCTURE) + + ReadMe.from_string(README_MULTIPLE_SAME_HEADING_1, EXPECTED_STRUCTURE) + # ReadMe.from_string(MISSING_SUBSECTION, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + + +if __name__ == "__main__": + unittest.main() From 057d0d95aab96d3dce45901beb31c68eda233688 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Wed, 28 Apr 2021 21:30:09 +0530 Subject: [PATCH 10/28] Add PyTest tests --- src/datasets/utils/readme.py | 52 +++++++------ tests/test_readme_util.py | 137 +++++++++++++++++++++-------------- 2 files changed, 112 insertions(+), 77 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index ffc38b1596d..a102cda38c9 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -48,6 +48,8 @@ def __post_init__(self): self.text = "" self.is_empty = True self.content = {} + self.parsing_error_list = [] + self.parsing_warning_list = [] if self.lines is not None: self.parse() @@ -77,8 +79,8 @@ def parse(self): else: if current_sub_level != "": if current_sub_level in self.content: - print( - f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found." + self.parsing_error_list.append( + f"Multiple sections with the same heading '{current_sub_level}' have been found. Please keep only one of these sections." ) self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: @@ -103,9 +105,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list - error_list.append( - f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}." 
- ) + error_list.append(f"Expected some header text for section '{self.name}'.") # Subsections Validation if structure["subsections"] is not None: @@ -114,18 +114,14 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If no subsections are present values = [subsection["name"] for subsection in structure["subsections"]] # Mention the expected values in the error_list - error_list.append( - f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}." - ) + error_list.append(f"Section '{self.name}' expected the following subsections: {values}, found `None`.") else: # If some subsections are present structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): if name not in self.content: # If the expected subsection is not present - error_list.append( - f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}." - ) + error_list.append(f"Section '{self.name}' is missing subsection: '{name}'.") else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list @@ -141,6 +137,8 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: warning_list.append( f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." ) + error_list = self.parsing_error_list + error_list + warning_list = self.parsing_warning_list + warning_list if error_list: # If there are errors, do not return the dictionary as it is invalid return {}, error_list, warning_list @@ -162,14 +160,19 @@ def __init__(self, name: str, lines: List[str], structure: dict = None): super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse self.structure = structure self.yaml_tags_line_count = -2 + self.tag_count = 0 self.lines = lines if self.lines is not None: self.parse() + + # Validation if self.structure is None: content, error_list, warning_list = self.validate(readme_structure) else: content, error_list, warning_list = self.validate(self.structure) + error_list = self.parsing_error_list + error_list + warning_list = self.parsing_warning_list + warning_list if error_list != [] or warning_list != []: errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors @@ -188,18 +191,19 @@ def from_string(cls, string: str, structure: dict = None, root_name: str = "root def parse(self): # Skip Tags - tag_count = 0 line_count = 0 for line in self.lines: self.yaml_tags_line_count += 1 if line.strip(" \n") == "---": - tag_count += 1 - if tag_count == 2: + self.tag_count += 1 + if self.tag_count == 2: break line_count += 1 - - self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. + if self.tag_count == 2: + self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. 
+        else:
+            self.lines = self.lines[self.tag_count :]
         super().parse()
 
     def __str__(self):
@@ -210,21 +214,23 @@ def validate(self, readme_structure):
         error_list = []
         warning_list = []
         if self.yaml_tags_line_count == 0:
-            warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.")
-        elif self.yaml_tags_line_count == -1:
-            warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.")
-
+            warning_list.append("Empty YAML markers are present in the README.")
+        elif self.tag_count == 0:
+            warning_list.append("No YAML markers are present in the README.")
+        elif self.tag_count == 1:
+            warning_list.append("Only the start of YAML tags present in the README.")
         # Check how many first level sections are present.
         num_first_level_keys = len(self.content.keys())
+        print(self.content)
         if num_first_level_keys > 1:
             # If more than one, add to the error list, continue
             error_list.append(
-                f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README."
+                f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README."
             )
         elif num_first_level_keys < 1:
             # If less than one, append error.
             error_list.append(
-                f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README."
+                f"The README has no first-level headings. One heading is expected. Skipping further validation for this README."
             )
 
         else:
@@ -241,7 +247,7 @@ def validate(self, readme_structure):
             else:
                 # If not found, append error
                 error_list.append(
-                    f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README."
+                    f"No first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README."
) if error_list: # If there are errors, do not return the dictionary as it is invalid diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 84ab24cde45..ac84d2f64ef 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -1,17 +1,15 @@ import tempfile -import unittest from pathlib import Path +import pytest import yaml from datasets.utils.readme import ReadMe -def _dedent(string: str) -> str: - return "\n".join([line.lstrip() for line in string.splitlines()]) - - -EXPECTED_STRUCTURE = yaml.safe_load( +# @pytest.fixture +# def example_yaml_structure(): +example_yaml_structure = yaml.safe_load( """\ name: "" allow_empty: false @@ -37,6 +35,43 @@ def _dedent(string: str) -> str: """ ) + +CORRECT_DICT = { + "name": "root", + "text": "", + "is_empty": True, + "subsections": [ + { + "name": "Dataset Card for My Dataset", + "text": "", + "is_empty": True, + "subsections": [ + {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + { + "name": "Dataset Description", + "text": "Some text here.", + "is_empty": False, + "subsections": [ + { + "name": "Dataset Summary", + "text": "Some text here.", + "is_empty": False, + "subsections": [], + }, + { + "name": "Supported Tasks and Leaderboards", + "text": "", + "is_empty": True, + "subsections": [], + }, + {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + ], + }, + ], + } + ], +} + README_CORRECT = """\ --- languages: @@ -58,7 +93,6 @@ def _dedent(string: str) -> str: README_EMPTY_YAML = """\ --- --- - # Dataset Card for My Dataset ## Table of Contents Some text here. @@ -70,19 +104,9 @@ def _dedent(string: str) -> str: ### Languages """ -README_INCORRECT_YAML = """\ ---- - -# Dataset Card for My Dataset -## Table of Contents -Some text here. -## Dataset Description -Some text here. -### Dataset Summary -Some text here. -### Supported Tasks and Leaderboards -### Languages -""" +EXPECTED_ERROR_README_EMPTY_YAML = ( + "The following issues were found for the README at `root`:\n-\tEmpty YAML markers are present in the README." +) README_NO_YAML = """\ # Dataset Card for My Dataset @@ -96,6 +120,10 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_NO_YAML = ( + "The following issues were found for the README at `root`:\n-\tNo YAML markers are present in the README." +) + README_MISSING_TEXT = """\ --- languages: @@ -112,6 +140,7 @@ def _dedent(string: str) -> str: ### Supported Tasks and Leaderboards ### Languages """ +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `root`:\n-\tExpected some header text for section 'Dataset Summary'." README_MISSING_SUBSECTION = """\ --- @@ -130,6 +159,8 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `root`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." + README_MISSING_FIRST_LEVEL = """\ --- languages: @@ -146,7 +177,7 @@ def _dedent(string: str) -> str: ### Supported Tasks and Leaderboards ### Languages """ - +EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." 
README_MULTIPLE_WRONG_FIRST_LEVEL = """\ --- @@ -167,6 +198,8 @@ def _dedent(string: str) -> str: # Dataset Card My Dataset """ +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. Only one heading is expected. Skipping further validation for this README." + README_WRONG_FIRST_LEVEL = """\ --- languages: @@ -185,8 +218,12 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." + README_EMPTY = "" +EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." + README_MULTIPLE_SAME_HEADING_1 = """\ --- languages: @@ -206,37 +243,29 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `root`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." + + +def test_readme_from_string_correct(): + + assert ReadMe.from_string(README_CORRECT, example_yaml_structure).to_dict() == CORRECT_DICT + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), + (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), + (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), + (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), + (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), + (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + ], +) +def test_readme_from_string_errors(readme_md, expected_error): -class TestReadMeUtils(unittest.TestCase): - def test_from_string(self): - ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_EMPTY_YAML, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_INCORRECT_YAML, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_NO_YAML, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MISSING_TEXT, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MISSING_SUBSECTION, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MISSING_FIRST_LEVEL, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_EMPTY, EXPECTED_STRUCTURE) - - ReadMe.from_string(README_MULTIPLE_SAME_HEADING_1, EXPECTED_STRUCTURE) - # ReadMe.from_string(MISSING_SUBSECTION, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # 
ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - - -if __name__ == "__main__": - unittest.main() + with pytest.raises(ValueError, match=expected_error): + ReadMe.from_string(readme_md, example_yaml_structure) From 35e08d84584fc6d97f9f61f181a2ccb99f4bda3b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:18:51 +0530 Subject: [PATCH 11/28] Add tests for from_readme --- tests/test_readme_util.py | 56 +++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index ac84d2f64ef..c792d79450a 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -105,7 +105,7 @@ """ EXPECTED_ERROR_README_EMPTY_YAML = ( - "The following issues were found for the README at `root`:\n-\tEmpty YAML markers are present in the README." + "The following issues were found for the README at `{path}`:\n-\tEmpty YAML markers are present in the README." ) README_NO_YAML = """\ @@ -121,7 +121,7 @@ """ EXPECTED_ERROR_README_NO_YAML = ( - "The following issues were found for the README at `root`:\n-\tNo YAML markers are present in the README." + "The following issues were found for the README at `{path}`:\n-\tNo YAML markers are present in the README." ) README_MISSING_TEXT = """\ @@ -140,7 +140,7 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `root`:\n-\tExpected some header text for section 'Dataset Summary'." +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section 'Dataset Summary'." README_MISSING_SUBSECTION = """\ --- @@ -159,7 +159,7 @@ ### Languages """ -EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `root`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." README_MISSING_FIRST_LEVEL = """\ --- @@ -177,7 +177,7 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." +EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." README_MULTIPLE_WRONG_FIRST_LEVEL = """\ --- @@ -198,7 +198,7 @@ # Dataset Card My Dataset """ -EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. Only one heading is expected. Skipping further validation for this README." +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. Only one heading is expected. 
Skipping further validation for this README." README_WRONG_FIRST_LEVEL = """\ --- @@ -218,11 +218,11 @@ ### Languages """ -EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." +EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." README_EMPTY = "" -EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." +EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." README_MULTIPLE_SAME_HEADING_1 = """\ --- @@ -243,7 +243,7 @@ ### Languages """ -EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `root`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." def test_readme_from_string_correct(): @@ -266,6 +266,40 @@ def test_readme_from_string_correct(): ], ) def test_readme_from_string_errors(readme_md, expected_error): - - with pytest.raises(ValueError, match=expected_error): + with pytest.raises(ValueError, match=expected_error.format(path='root')): ReadMe.from_string(readme_md, example_yaml_structure) + + +def test_readme_from_readme_correct(): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(README_CORRECT) + out = ReadMe.from_readme(path, example_yaml_structure).to_dict() + assert out['name']==path + assert out['text']=="" + assert out['is_empty']==True + assert out['subsections']==CORRECT_DICT['subsections'] + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), + (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), + (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), + (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), + (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), + (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + ], +) +def test_readme_from_readme_error(readme_md, expected_error): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(readme_md) + with pytest.raises(ValueError, match=expected_error.format(path=path)): + ReadMe.from_readme(path, example_yaml_structure) \ No newline at end of file From a3de91abd9d76c0c528ee99504bbda0589419c55 Mon Sep 17 00:00:00 2001 
From: Gunjan Chhablani Date: Sun, 2 May 2021 14:40:10 +0530 Subject: [PATCH 12/28] Add ReadMe validator script --- scripts/datasets_readme_validator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py index c56b59f2601..13f1ffe68fb 100644 --- a/scripts/datasets_readme_validator.py +++ b/scripts/datasets_readme_validator.py @@ -1,13 +1,13 @@ #!/usr/bin/env python -""" This script will run in CI and make sure all new changes to datasets readme files have valid content present. +""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content. """ from pathlib import Path from subprocess import check_output from typing import List -from datasets.utils.readme import validate_readme +from datasets.utils.readme import ReadMe def get_changed_files(repo_path: Path) -> List[Path]: @@ -41,11 +41,11 @@ def get_changed_files(repo_path: Path) -> List[Path]: failed: List[Path] = [] for readme in sorted(readmes): try: - DatasetMetadata.from_readme(readme) + ReadMe.from_readme(readme) logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") - except TypeError as e: + except ValueError as e: failed.append(readme) - logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") + logging.warning(f"❌ Validation failed for '{readme.relative_to(repo_path)}':\n{e}") except Exception as e: failed.append(readme) logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") From 8dd3feb9c18a1866c7ce536321774f3aefac044f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:40:44 +0530 Subject: [PATCH 13/28] Fix style --- tests/test_readme_util.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index c792d79450a..539d5989267 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -266,7 +266,7 @@ def test_readme_from_string_correct(): ], ) def test_readme_from_string_errors(readme_md, expected_error): - with pytest.raises(ValueError, match=expected_error.format(path='root')): + with pytest.raises(ValueError, match=expected_error.format(path="root")): ReadMe.from_string(readme_md, example_yaml_structure) @@ -275,11 +275,11 @@ def test_readme_from_readme_correct(): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(README_CORRECT) - out = ReadMe.from_readme(path, example_yaml_structure).to_dict() - assert out['name']==path - assert out['text']=="" - assert out['is_empty']==True - assert out['subsections']==CORRECT_DICT['subsections'] + out = ReadMe.from_readme(path, example_yaml_structure).to_dict() + assert out["name"] == path + assert out["text"] == "" + assert out["is_empty"] == True + assert out["subsections"] == CORRECT_DICT["subsections"] @pytest.mark.parametrize( @@ -302,4 +302,4 @@ def test_readme_from_readme_error(readme_md, expected_error): with open(path, "w+") as readme_file: readme_file.write(readme_md) with pytest.raises(ValueError, match=expected_error.format(path=path)): - ReadMe.from_readme(path, example_yaml_structure) \ No newline at end of file + ReadMe.from_readme(path, example_yaml_structure) From 87b06683fc691df6f026fc47e3f2b4051407513b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:43:14 +0530 Subject: [PATCH 14/28] Remove print statement --- src/datasets/utils/readme.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index a102cda38c9..615cf3c3afe 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -221,7 +221,6 @@ def validate(self, readme_structure): warning_list.append("Only the start of YAML tags present in the README.") # Check how many first level sections are present. num_first_level_keys = len(self.content.keys()) - print(self.content) if num_first_level_keys > 1: # If more than one, add to the error list, continue error_list.append( From 1d49a4da41a86d3edb0354a9630309d0a021864c Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:44:48 +0530 Subject: [PATCH 15/28] Add validator to CircleCI --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2361409c4c9..c1e870b8fd7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -32,7 +32,6 @@ jobs: - run: pip install pyarrow==1.0.0 - run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/ - run_dataset_script_tests_pyarrow_latest_WIN: working_directory: ~/datasets executor: @@ -82,6 +81,7 @@ jobs: - run: isort --check-only tests src benchmarks datasets metrics - run: flake8 tests src benchmarks datasets metrics - run: ./scripts/datasets_metadata_validator.py + - run: ./scripts/datasets_readme_validator.py build_doc: working_directory: ~/datasets @@ -100,8 +100,8 @@ jobs: - image: circleci/python:3.6 steps: - add_ssh_keys: - fingerprints: - - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" + fingerprints: + - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" - checkout - run: sudo pip install .[docs] - run: ./.circleci/deploy.sh From d9f0ac3e57d37f23dcee28bcdbff2ae360fb1b19 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 15:15:24 +0530 Subject: [PATCH 16/28] Fix style --- src/datasets/utils/readme.py | 4 ++-- tests/test_readme_util.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 615cf3c3afe..2a1134dbbec 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -101,7 +101,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # Header text validation error_list = [] warning_list = [] - if structure["allow_empty"] == False: + if structure["allow_empty"] is False: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list @@ -224,7 +224,7 @@ def validate(self, readme_structure): if num_first_level_keys > 1: # If more than one, add to the error list, continue error_list.append( - f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README." + f"The README has several first-level headings: {', '.join(['`'+x+'`' for x in list(self.content.keys())])}. Only one heading is expected. Skipping further validation for this README." ) elif num_first_level_keys < 1: # If less than one, append error. diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 539d5989267..2eccc4af8f6 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -198,7 +198,7 @@ # Dataset Card My Dataset """ -EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. 
Only one heading is expected. Skipping further validation for this README." +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: `Dataset Card for My Dataset`, `Dataset Card My Dataset`. Only one heading is expected. Skipping further validation for this README." README_WRONG_FIRST_LEVEL = """\ --- @@ -278,7 +278,7 @@ def test_readme_from_readme_correct(): out = ReadMe.from_readme(path, example_yaml_structure).to_dict() assert out["name"] == path assert out["text"] == "" - assert out["is_empty"] == True + assert out["is_empty"] assert out["subsections"] == CORRECT_DICT["subsections"] From 414fc2e997d4406c0257f58e207d2c1b70b1feb9 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 15:36:39 +0530 Subject: [PATCH 17/28] Add YAML files to setup resources --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e745428377f..6a52d94d45b 100644 --- a/setup.py +++ b/setup.py @@ -216,7 +216,7 @@ license="Apache 2.0", package_dir={"": "src"}, packages=find_packages("src"), - package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]}, + package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml"]}, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, From 0c3425a56a55d5889b5bbb49adac6f85d756c55f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 15:49:10 +0530 Subject: [PATCH 18/28] Make validator executable --- scripts/datasets_readme_validator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) mode change 100644 => 100755 scripts/datasets_readme_validator.py diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py old mode 100644 new mode 100755 index 13f1ffe68fb..af0cc05445d --- a/scripts/datasets_readme_validator.py +++ b/scripts/datasets_readme_validator.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content. -""" +""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content.""" from pathlib import Path from subprocess import check_output From 933fdf76c1e3ecad4210cb66dd31bcda92da8928 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 16:16:13 +0530 Subject: [PATCH 19/28] Add no subsections test --- src/datasets/utils/readme.py | 10 +++++----- tests/test_readme_util.py | 20 +++++++++++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 2a1134dbbec..9b0a393d845 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -80,7 +80,7 @@ def parse(self): if current_sub_level != "": if current_sub_level in self.content: self.parsing_error_list.append( - f"Multiple sections with the same heading '{current_sub_level}' have been found. Please keep only one of these sections." + f"Multiple sections with the same heading `{current_sub_level}` have been found. Please keep only one of these sections." 
) self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: @@ -105,7 +105,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list - error_list.append(f"Expected some header text for section '{self.name}'.") + error_list.append(f"Expected some header text for section `{self.name}`.") # Subsections Validation if structure["subsections"] is not None: @@ -114,14 +114,14 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If no subsections are present values = [subsection["name"] for subsection in structure["subsections"]] # Mention the expected values in the error_list - error_list.append(f"Section '{self.name}' expected the following subsections: {values}, found `None`.") + error_list.append(f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'.") else: # If some subsections are present structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): if name not in self.content: # If the expected subsection is not present - error_list.append(f"Section '{self.name}' is missing subsection: '{name}'.") + error_list.append(f"Section `{self.name}` is missing subsection: `{name}`.") else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list @@ -135,7 +135,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: if name not in structure_names: # If an extra subsection is present warning_list.append( - f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." + f"`{self.name}` has an extra subsection: `{name}`. Skipping further validation checks for this subsection as expected structure is unknown." ) error_list = self.parsing_error_list + error_list warning_list = self.parsing_warning_list + warning_list diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 2eccc4af8f6..6b1ef94dece 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -140,7 +140,19 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section 'Dataset Summary'." +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section `Dataset Summary`." + + +README_NONE_SUBSECTION = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +""" +EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." README_MISSING_SUBSECTION = """\ --- @@ -159,7 +171,7 @@ ### Languages """ -EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Description` is missing subsection: `Supported Tasks and Leaderboards`." 
README_MISSING_FIRST_LEVEL = """\ --- @@ -243,7 +255,7 @@ ### Languages """ -EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." def test_readme_from_string_correct(): @@ -257,6 +269,7 @@ def test_readme_from_string_correct(): (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION), (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), @@ -288,6 +301,7 @@ def test_readme_from_readme_correct(): (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION), (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), From cd895a1510705ee3c252f21d547b6db44a79ad7b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 16:23:09 +0530 Subject: [PATCH 20/28] Add incorrect YAML test --- tests/test_readme_util.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 6b1ef94dece..50bcc7af565 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -124,6 +124,23 @@ "The following issues were found for the README at `{path}`:\n-\tNo YAML markers are present in the README." ) +README_INCORRECT_YAML = """\ +--- +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +EXPECTED_ERROR_README_INCORRECT_YAML = ( + "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README." 
+)
 
 README_MISSING_TEXT = """\
 ---
@@ -268,6 +285,7 @@ def test_readme_from_string_correct():
     [
         (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML),
         (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML),
+        (README_INCORRECT_YAML, EXPECTED_ERROR_README_INCORRECT_YAML),
         (README_EMPTY, EXPECTED_ERROR_README_EMPTY),
         (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION),
         (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL),
@@ -300,6 +318,7 @@ def test_readme_from_readme_correct():
     [
         (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML),
         (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML),
+        (README_INCORRECT_YAML, EXPECTED_ERROR_README_INCORRECT_YAML),
         (README_EMPTY, EXPECTED_ERROR_README_EMPTY),
         (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION),
         (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL),
From a3bdb1f13a9cd35d4ac060daf8bf0269fd2c1995 Mon Sep 17 00:00:00 2001
From: Gunjan Chhablani <chhablani.gunjan@gmail.com>
Date: Sun, 2 May 2021 16:23:37 +0530
Subject: [PATCH 21/28] Fix style

---
 src/datasets/utils/readme.py | 4 +++-
 tests/test_readme_util.py    | 4 +---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py
index 9b0a393d845..b9814544bfb 100644
--- a/src/datasets/utils/readme.py
+++ b/src/datasets/utils/readme.py
@@ -114,7 +114,9 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput:
                 # If no subsections are present
                 values = [subsection["name"] for subsection in structure["subsections"]]
                 # Mention the expected values in the error_list
-                error_list.append(f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'.")
+                error_list.append(
+                    f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'."
+                )
             else:
                 # If some subsections are present
                 structure_names = [subsection["name"] for subsection in structure["subsections"]]
diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py
index 50bcc7af565..2eed7947db0 100644
--- a/tests/test_readme_util.py
+++ b/tests/test_readme_util.py
@@ -137,9 +137,7 @@
 ### Languages
 """
 
-EXPECTED_ERROR_README_INCORRECT_YAML = (
-    "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README."
README_MISSING_TEXT = """\ --- From 6e85d4a5c641088588c4f246ee32ea06cb444d49 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 19:27:21 +0530 Subject: [PATCH 22/28] Fix tests --- tests/test_readme_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 2eed7947db0..c7a9d112fd6 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -332,5 +332,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - with pytest.raises(ValueError, match=expected_error.format(path=path)): + expected_error = expected_error.format(path=path) + with pytest.raises(ValueError, match=expected_error): ReadMe.from_readme(path, example_yaml_structure) From 10386e719283475598f39fc848c6c0a9cc7b039d Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 19:42:43 +0530 Subject: [PATCH 23/28] Fix tests --- tests/test_readme_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index c7a9d112fd6..bb994496bb9 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -332,6 +332,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - expected_error = expected_error.format(path=path) + expected_error = expected_error.format(path=path).encode('unicode_escape').decode('ascii') with pytest.raises(ValueError, match=expected_error): ReadMe.from_readme(path, example_yaml_structure) From b4ca9ca45495942c9508d0cc065d19f281172157 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 19:44:32 +0530 Subject: [PATCH 24/28] Fix style --- tests/test_readme_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index bb994496bb9..ff2ae4afc37 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -332,6 +332,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - expected_error = expected_error.format(path=path).encode('unicode_escape').decode('ascii') + expected_error = expected_error.format(path=path).encode("unicode_escape").decode("ascii") with pytest.raises(ValueError, match=expected_error): ReadMe.from_readme(path, example_yaml_structure) From a69c019274d7429ed121b960c8ffa2ccb2fafc33 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 4 May 2021 18:30:59 +0530 Subject: [PATCH 25/28] Fix escape character issue --- tests/test_readme_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index ff2ae4afc37..9d06648823b 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -1,3 +1,4 @@ +import re import tempfile from pathlib import Path @@ -332,6 +333,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - expected_error = expected_error.format(path=path).encode("unicode_escape").decode("ascii") - with pytest.raises(ValueError, match=expected_error): + expected_error = expected_error.format(path=path) + with pytest.raises(ValueError, 
match=re.escape(expected_error)): ReadMe.from_readme(path, example_yaml_structure) From d45ec9be5bcf0035d33d2d9e27575f87afa5f379 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 7 May 2021 21:47:03 +0530 Subject: [PATCH 26/28] Add three-level heading validation limit --- src/datasets/utils/readme.py | 13 ++++-- tests/test_readme_util.py | 88 ++++++++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 12 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index b9814544bfb..803759e21e7 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -105,7 +105,9 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list - error_list.append(f"Expected some header text for section `{self.name}`.") + error_list.append( + f"Expected some text in section `{self.name}` but it is empty (text in subsections are ignored)." + ) # Subsections Validation if structure["subsections"] is not None: @@ -127,9 +129,12 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list - _, subsec_error_list, subsec_warning_list = self.content[name].validate( - structure["subsections"][idx] - ) + if self.level == "###": + continue + else: + _, subsec_error_list, subsec_warning_list = self.content[name].validate( + structure["subsections"][idx] + ) error_list += subsec_error_list warning_list += subsec_warning_list diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 9d06648823b..2b4b2acf1de 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -10,6 +10,7 @@ # @pytest.fixture # def example_yaml_structure(): + example_yaml_structure = yaml.safe_load( """\ name: "" @@ -91,6 +92,64 @@ ### Languages """ + +README_CORRECT_FOUR_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +#### Extra Ignored Subsection +### Supported Tasks and Leaderboards +### Languages +""" + +CORRECT_DICT_FOUR_LEVEL = { + "name": "root", + "text": "", + "is_empty": True, + "subsections": [ + { + "name": "Dataset Card for My Dataset", + "text": "", + "is_empty": True, + "subsections": [ + {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + { + "name": "Dataset Description", + "text": "Some text here.", + "is_empty": False, + "subsections": [ + { + "name": "Dataset Summary", + "text": "Some text here.", + "is_empty": False, + "subsections": [ + {"name": "Extra Ignored Subsection", "text": "", "is_empty": True, "subsections": []} + ], + }, + { + "name": "Supported Tasks and Leaderboards", + "text": "", + "is_empty": True, + "subsections": [], + }, + {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + ], + }, + ], + } + ], +} + README_EMPTY_YAML = """\ --- --- @@ -156,7 +215,7 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section `Dataset Summary`." 
+EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." README_NONE_SUBSECTION = """\ @@ -274,9 +333,15 @@ EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." -def test_readme_from_string_correct(): - - assert ReadMe.from_string(README_CORRECT, example_yaml_structure).to_dict() == CORRECT_DICT +@pytest.mark.parametrize( + "readme_md, expected_dict", + [ + (README_CORRECT, CORRECT_DICT), + (README_CORRECT_FOUR_LEVEL, CORRECT_DICT_FOUR_LEVEL), + ], +) +def test_readme_from_string_correct(readme_md, expected_dict): + assert ReadMe.from_string(readme_md, example_yaml_structure).to_dict() == expected_dict @pytest.mark.parametrize( @@ -296,20 +361,27 @@ def test_readme_from_string_correct(): ], ) def test_readme_from_string_errors(readme_md, expected_error): - with pytest.raises(ValueError, match=expected_error.format(path="root")): + with pytest.raises(ValueError, match=re.escape(expected_error.format(path="root"))): ReadMe.from_string(readme_md, example_yaml_structure) -def test_readme_from_readme_correct(): +@pytest.mark.parametrize( + "readme_md, expected_dict", + [ + (README_CORRECT, CORRECT_DICT), + (README_CORRECT_FOUR_LEVEL, CORRECT_DICT_FOUR_LEVEL), + ], +) +def test_readme_from_readme_correct(readme_md, expected_dict): with tempfile.TemporaryDirectory() as tmp_dir: path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: - readme_file.write(README_CORRECT) + readme_file.write(readme_md) out = ReadMe.from_readme(path, example_yaml_structure).to_dict() assert out["name"] == path assert out["text"] == "" assert out["is_empty"] - assert out["subsections"] == CORRECT_DICT["subsections"] + assert out["subsections"] == expected_dict["subsections"] @pytest.mark.parametrize( From cdcffe0818fa5e8ca1e0a8f4a4215bd0e3a8a1bc Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 7 May 2021 23:08:18 +0530 Subject: [PATCH 27/28] Add either text or subsection option --- src/datasets/utils/readme.py | 23 +++-- .../utils/resources/readme_structure.yaml | 39 ++++++-- tests/test_readme_util.py | 89 ++++++++++++++----- 3 files changed, 117 insertions(+), 34 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 803759e21e7..6d4b8cb1a10 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -46,7 +46,7 @@ class Section: def __post_init__(self): self.text = "" - self.is_empty = True + self.is_empty_text = True self.content = {} self.parsing_error_list = [] self.parsing_warning_list = [] @@ -70,7 +70,7 @@ def parse(self): if current_lines != []: self.text += "".join(current_lines).strip() if self.text != "" and self.text not in FILLER_TEXT: - self.is_empty = False + self.is_empty_text = False current_lines = [] current_sub_level = " ".join(line.split()[1:]).strip(" \n") @@ -87,7 +87,7 @@ def parse(self): if current_lines != []: self.text += "".join(current_lines).strip() if self.text != "" and self.text not in FILLER_TEXT: - self.is_empty = False + self.is_empty_text = False def validate(self, structure: dict) -> ReadmeValidatorOutput: """Validates a Section class object recursively using the structure provided as a dictionary. 
@@ -102,13 +102,18 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: error_list = [] warning_list = [] if structure["allow_empty"] is False: - # If header text is expected - if self.is_empty: - # If no header text is found, mention it in the error_list + # If content is expected + if self.is_empty_text and self.content == {}: + # If no content is found, mention it in the error_list + error_list.append(f"Expected some content in section `{self.name}` but it is empty.") + + if structure["allow_empty_text"] is False: + # If some text is expected + if self.is_empty_text: + # If no text is found, mention it in the error_list error_list.append( f"Expected some text in section `{self.name}` but it is empty (text in subsections are ignored)." ) - # Subsections Validation if structure["subsections"] is not None: # If subsections are expected @@ -129,6 +134,8 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list + + # Skip sublevel validation if current level is `###` if self.level == "###": continue else: @@ -157,7 +164,7 @@ def to_dict(self) -> dict: return { "name": self.name, "text": self.text, - "is_empty": self.is_empty, + "is_empty_text": self.is_empty_text, "subsections": [value.to_dict() for value in self.content.values()], } diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml index fc3356f2675..755483d1d4f 100644 --- a/src/datasets/utils/resources/readme_structure.yaml +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -1,87 +1,116 @@ name: "" # Filename comes here allow_empty: false +allow_empty_text: true subsections: - name: "Dataset Card for X" # First-level markdown heading - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Table of Contents" allow_empty: false + allow_empty_text: false subsections: null # meaning it should not be checked. - name: "Dataset Description" allow_empty: false + allow_empty_text: false subsections: - name: "Dataset Summary" allow_empty: false + allow_empty_text: false subsections: null - name: "Supported Tasks and Leaderboards" allow_empty: true + allow_empty_text: true subsections: null - name: Languages allow_empty: true + allow_empty_text: true subsections: null - name: "Dataset Structure" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Data Instances" allow_empty: false + allow_empty_text: false subsections: null - name: "Data Fields" allow_empty: false + allow_empty_text: false subsections: null - name: "Data Splits" allow_empty: false + allow_empty_text: false subsections: null - name: "Dataset Creation" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Curation Rationale" allow_empty: true + allow_empty_text: true subsections: null - name: "Source Data" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Initial Data Collection and Normalization" allow_empty: true + allow_empty_text: true subsections: null - name: "Who are the source language producers?" allow_empty: true + allow_empty_text: true subsections: null - name: "Annotations" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Annotation process" allow_empty: true + allow_empty_text: true subsections: null - name: "Who are the annotators?" 
allow_empty: true + allow_empty_text: true subsections: null - name: "Personal and Sensitive Information" allow_empty: true + allow_empty_text: true subsections: null - name: "Considerations for Using the Data" allow_empty: true + allow_empty_text: true subsections: - name: "Social Impact of Dataset" allow_empty: true + allow_empty_text: true subsections: null - name: "Discussion of Biases" allow_empty: true + allow_empty_text: true subsections: null - name: "Other Known Limitations" allow_empty: true + allow_empty_text: true subsections: null - name: "Additional Information" allow_empty: true + allow_empty_text: true subsections: - name: "Dataset Curators" allow_empty: true + allow_empty_text: true subsections: null - name: "Licensing Information" allow_empty: true + allow_empty_text: true subsections: null - name: "Citation Information" allow_empty: false + allow_empty_text: true subsections: null - name: "Contributions" allow_empty: false + allow_empty_text: false subsections: null diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 2b4b2acf1de..711c6ea55c5 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -15,24 +15,31 @@ """\ name: "" allow_empty: false +allow_empty_text: true subsections: - name: "Dataset Card for X" # First-level markdown heading - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Table of Contents" allow_empty: false - subsections: null # meaning it should not be checked. + allow_empty_text: false + subsections: null - name: "Dataset Description" allow_empty: false + allow_empty_text: false subsections: - name: "Dataset Summary" allow_empty: false + allow_empty_text: false subsections: null - name: "Supported Tasks and Leaderboards" allow_empty: true + allow_empty_text: true subsections: null - name: Languages - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: null """ ) @@ -41,32 +48,32 @@ CORRECT_DICT = { "name": "root", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ { "name": "Dataset Card for My Dataset", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ - {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + {"name": "Table of Contents", "text": "Some text here.", "is_empty_text": False, "subsections": []}, { "name": "Dataset Description", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [ { "name": "Dataset Summary", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [], }, { "name": "Supported Tasks and Leaderboards", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [], }, - {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + {"name": "Languages", "text": "Language Text", "is_empty_text": False, "subsections": []}, ], }, ], @@ -74,6 +81,7 @@ ], } + README_CORRECT = """\ --- languages: @@ -90,6 +98,7 @@ Some text here. 
### Supported Tasks and Leaderboards ### Languages +Language Text """ @@ -110,39 +119,45 @@ #### Extra Ignored Subsection ### Supported Tasks and Leaderboards ### Languages +Language Text """ CORRECT_DICT_FOUR_LEVEL = { "name": "root", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ { "name": "Dataset Card for My Dataset", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ - {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + {"name": "Table of Contents", "text": "Some text here.", "is_empty_text": False, "subsections": []}, { "name": "Dataset Description", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [ { "name": "Dataset Summary", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [ - {"name": "Extra Ignored Subsection", "text": "", "is_empty": True, "subsections": []} + { + "name": "Extra Ignored Subsection", + "text": "", + "is_empty_text": True, + "subsections": [], + } ], }, { "name": "Supported Tasks and Leaderboards", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [], }, - {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + {"name": "Languages", "text": "Language Text", "is_empty_text": False, "subsections": []}, ], }, ], @@ -162,6 +177,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_EMPTY_YAML = ( @@ -178,6 +194,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_NO_YAML = ( @@ -195,6 +212,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_INCORRECT_YAML = "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README." @@ -214,8 +232,9 @@ ### Dataset Summary ### Supported Tasks and Leaderboards ### Languages +Language Text """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Dataset Summary` but it is empty.\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." README_NONE_SUBSECTION = """\ @@ -227,7 +246,7 @@ # Dataset Card for My Dataset """ -EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." +EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Dataset Card for My Dataset` but it is empty.\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." README_MISSING_SUBSECTION = """\ --- @@ -244,10 +263,32 @@ ### Dataset Summary Some text here. ### Languages +Language Text """ EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Description` is missing subsection: `Supported Tasks and Leaderboards`." 
+ +README_MISSING_CONTENT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +EXPECTED_ERROR_README_MISSING_CONTENT = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Languages` but it is empty." + README_MISSING_FIRST_LEVEL = """\ --- languages: @@ -263,6 +304,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." @@ -282,6 +324,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text # Dataset Card My Dataset """ @@ -303,6 +346,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." @@ -328,6 +372,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." @@ -358,6 +403,7 @@ def test_readme_from_string_correct(readme_md, expected_dict): (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), ], ) def test_readme_from_string_errors(readme_md, expected_error): @@ -380,7 +426,7 @@ def test_readme_from_readme_correct(readme_md, expected_dict): out = ReadMe.from_readme(path, example_yaml_structure).to_dict() assert out["name"] == path assert out["text"] == "" - assert out["is_empty"] + assert out["is_empty_text"] assert out["subsections"] == expected_dict["subsections"] @@ -398,6 +444,7 @@ def test_readme_from_readme_correct(readme_md, expected_dict): (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), ], ) def test_readme_from_readme_error(readme_md, expected_error): From ffdfcb642dae76a0b80b000ab76984f8d4dd9f4f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 7 May 2021 23:18:57 +0530 Subject: [PATCH 28/28] Fix style --- src/datasets/utils/readme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 6d4b8cb1a10..45b8713f085 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -106,7 +106,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: if self.is_empty_text and self.content == {}: # If no content is found, mention it in the error_list error_list.append(f"Expected some content in section `{self.name}` but it is 
empty.") - + if structure["allow_empty_text"] is False: # If some text is expected if self.is_empty_text: