From 993acd6cf2acab834f7840e38a33cc04f4e3c504 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 26 Mar 2021 22:29:33 +0530 Subject: [PATCH 01/28] Add Initial README parser --- src/datasets/utils/readme_parser.py | 77 +++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/datasets/utils/readme_parser.py diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py new file mode 100644 index 00000000000..94343007e9f --- /dev/null +++ b/src/datasets/utils/readme_parser.py @@ -0,0 +1,77 @@ +# class_mapping = { +# "Dataset Description": DatasetDescription, +# } + +# key_mapping = { +# "Dataset Desription": 'dataset_desc' +# } +import json + + +class Section: + def __init__(self, name, level, lines=None): + self.name = name + self.level = level + self.attributes = "" + self.content = {} + if lines is not None: + self.parse(lines) + + def parse(self, lines): + current_sub_level = "" + current_lines = [] + code_start = False + for line in lines: + if line.strip(" \n") == "": + continue + elif line.strip(" \n")[:3] == "```": + code_start = not code_start + elif line.split()[0] == self.level + "#" and not code_start: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + current_lines = [] + else: + if current_lines != []: + self.attributes += "".join(current_lines).strip() + current_lines = [] + + current_sub_level = " ".join(line.split()[1:]).strip(" \n") + else: + current_lines.append(line) + else: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + else: + if current_lines != []: + self.attributes += "".join(current_lines).strip() + + def to_dict(self): + return { + "name": self.name, + "attributes": self.attributes, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, file_path): + super().__init__(name=file_path, level="") + self.parse(file_path) + + def parse(self, file_path): + with open(self.name) as f: + # Skip Tags + tag_count = 0 + for line in f: + if line.strip(" \n") == "---": + tag_count += 1 + if tag_count == 2: + break + super().parse(f) + + +if __name__ == "__main__": + readme = ReadMe("./datasets/fashion_mnist/README.md") + # print(readme.attributes) + json_obj = json.dumps(readme.to_dict(), indent=4) + print(json_obj) From 014c49df8ccc9edc1c24f355863667f700d4339b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 30 Mar 2021 00:44:10 +0530 Subject: [PATCH 02/28] Add basic validation checks --- src/datasets/utils/readme_parser.py | 160 ++++++++++++++++++++++++++-- 1 file changed, 151 insertions(+), 9 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index 94343007e9f..3a04ec44d39 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -5,14 +5,109 @@ # key_mapping = { # "Dataset Desription": 'dataset_desc' # } -import json + +# import json +import yaml +import pprint +yaml_struc = """ +name: "" # Filename +text: false +subsections: + - name: "Dataset Card for X" + text: false + required: true + subsections: + - name: "Table of Contents" + text: true + subsections: null # meaning it should not be checked. 
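+        # (a `text: true` entry means the section must contain some descriptive
+        #  body text; the checks only read the `name`, `text` and `subsections`
+        #  keys of each entry)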
+ - name: "Dataset Description" + text: false + subsections: + - name: "Dataset Summary" + text: true + subsections: null + - name: "Supported Tasks and Leaderboards" + text: false + subsections: null + - name: Languages + text: false + subsections: null + - name: "Dataset Structure" + text: false + subsections: + - name: "Data Instances" + text: true + subsections: null + - name: "Data Fields" + text: true + subsections: null + - name: "Data Splits" + text: true + subsections: null + - name: "Dataset Creation" + text: false + subsections: + - name: "Curation Rationale" + text: false + subsections: null + - name: "Source Data" + text: false + subsections: + - name: "Initial Data Collection and Normalization" + text: false + subsections: null + - name: "Who are the source X producers?" + text: false + subsections: null + - name: "Annotations" + text: false + subsections: + - name: "Annotation process" + text: false + subsections: null + - name: "Who are the annotators?" + text: false + subsections: null + - name: "Personal and Sensitive Information" + text: false + subsections: null + - name: "Considerations for Using the Data" + text: false + subsections: + - name: "Social Impact of Dataset" + text: false + subsections: null + - name: "Discussion of Biases" + text: false + subsections: null + - name: "Other Known Limitations" + text: false + subsections: null + - name: "Additional Information" + text: false + subsections: + - name: "Dataset Curators" + text: false + subsections: null + - name: "Licensing Information" + text: false + subsections: null + - name: "Citation Information" + text: true + subsections: null + - name: "Contributions" + text: true + subsections: null +""" + +filler_text = ["[Needs More Information]", "[More Information Needed]", "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)"] class Section: def __init__(self, name, level, lines=None): self.name = name self.level = level - self.attributes = "" + self.text = "" self.content = {} if lines is not None: self.parse(lines) @@ -32,7 +127,7 @@ def parse(self, lines): current_lines = [] else: if current_lines != []: - self.attributes += "".join(current_lines).strip() + self.text += "".join(current_lines).strip() current_lines = [] current_sub_level = " ".join(line.split()[1:]).strip(" \n") @@ -43,12 +138,12 @@ def parse(self, lines): self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: if current_lines != []: - self.attributes += "".join(current_lines).strip() + self.text += "".join(current_lines).strip() def to_dict(self): return { "name": self.name, - "attributes": self.attributes, + "test": self.text, "subsections": [value.to_dict() for value in self.content.values()], } @@ -67,11 +162,58 @@ def parse(self, file_path): tag_count += 1 if tag_count == 2: break + else: + print("The README doesn't contain proper tags. 
Please ensure you add the correct YAML tags.") + return super().parse(f) + def _validate_section(self, section , structure): + # Text validation + if structure['text'] == True: + if section.text.strip() == '': + print(f"Expected some text for {section.name}") + + if structure['subsections'] is not None: + # If no subsections present + if section.content == {}: + values = [subsection['name'] for subsection in structure['subsections']] + print(f"'{section.name}'' expected the following subsections: {values}, found `None`.") + else: + # Each key validation + structure_names = [subsection['name'] for subsection in structure['subsections']] + for idx, name in enumerate(structure_names): + if name not in section.content: + print(f"'{section.name}' is missing subsection: '{name}'.") + else: + self._validate_section(section.content[name], structure['subsections'][idx]) + + for name in section.content: + if name not in structure_names: + print(f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection.") + + def validate(self, yaml_struc): + structure = yaml.safe_load(yaml_struc) + num_first_level_keys = len(self.content.keys()) + if num_first_level_keys > 1: + print(f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected.") + elif num_first_level_keys < 1: + print(f"The README has no first-level headings.") + + else: + print(self.content.keys()) + start_key = list(self.content.keys())[0] + if start_key.startswith("Dataset Card for"): + self._validate_section(self.content[start_key], structure['subsections'][0]) + else: + print("No first-level hearding starting with `Dataset Card for` found.") + if __name__ == "__main__": - readme = ReadMe("./datasets/fashion_mnist/README.md") - # print(readme.attributes) - json_obj = json.dumps(readme.to_dict(), indent=4) - print(json_obj) + readme = ReadMe("./dummy_readme.md") + print(readme.content["Dataset Card for FashionMNIST"].content["Additional Information"].content) + readme.validate(yaml_struc) + # print(readme.text) + # json_obj = json.dumps(readme.to_dict(), indent=4) + # print(json_obj) + # with open('dump.json', 'w') as f: + # json.dump(readme.to_dict(), f) From 204060249a44d9fb389237647e5d97acfe93daf6 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 30 Mar 2021 00:47:42 +0530 Subject: [PATCH 03/28] Minor fix --- src/datasets/utils/readme_parser.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index 3a04ec44d39..7a842223e9b 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -170,8 +170,8 @@ def parse(self, file_path): def _validate_section(self, section , structure): # Text validation if structure['text'] == True: - if section.text.strip() == '': - print(f"Expected some text for {section.name}") + if section.text.strip() == '' or section.text.strip() in filler_text: + print(f"Expected some text for section '{section.name}'") if structure['subsections'] is not None: # If no subsections present @@ -210,10 +210,4 @@ def validate(self, yaml_struc): if __name__ == "__main__": readme = ReadMe("./dummy_readme.md") - print(readme.content["Dataset Card for FashionMNIST"].content["Additional Information"].content) readme.validate(yaml_struc) - # print(readme.text) - # json_obj = json.dumps(readme.to_dict(), indent=4) - # print(json_obj) - # with open('dump.json', 'w') as f: - # json.dump(readme.to_dict(), f) 
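At this point the parser is still a standalone script: `Section.parse` splits the Markdown body into nested `Section` objects, treating any line whose first token has exactly one more `#` than the current level (and that sits outside a fenced code block) as the start of a subsection, and `ReadMe.validate` walks the resulting tree against the `yaml_struc` template, printing each problem it finds instead of raising. A minimal usage sketch, assuming the module is importable from a repository checkout and reusing the example README path from the first patch (any dataset card with YAML tags at the top works):

    from datasets.utils.readme_parser import ReadMe, yaml_struc

    # Parse the dataset card; the YAML tag block between the two `---` markers
    # at the top of the file is skipped before the headings are parsed.
    readme = ReadMe("./datasets/fashion_mnist/README.md")

    # Compare the parsed section tree against the expected structure and
    # print any empty or missing sections.
    readme.validate(yaml_struc)

The next patch reworks exactly this part: the per-section `text` flag becomes `allow_empty`, and validation collects its messages into an error list and raises a single `ValueError` instead of printing.
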
From 7a1654bb741cbf57033d7389311797f5f7f0fe2c Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Thu, 1 Apr 2021 13:20:00 +0530 Subject: [PATCH 04/28] Changes from review --- src/datasets/utils/readme_parser.py | 131 +++++++++++++++++----------- 1 file changed, 79 insertions(+), 52 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index 7a842223e9b..a7b22076981 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -6,101 +6,107 @@ # "Dataset Desription": 'dataset_desc' # } +import pprint + # import json import yaml -import pprint + + yaml_struc = """ name: "" # Filename -text: false +allow_empty: false subsections: - name: "Dataset Card for X" - text: false - required: true + allow_empty: true subsections: - name: "Table of Contents" - text: true + allow_empty: false subsections: null # meaning it should not be checked. - name: "Dataset Description" - text: false + allow_empty: false subsections: - name: "Dataset Summary" - text: true + allow_empty: false subsections: null - name: "Supported Tasks and Leaderboards" - text: false + allow_empty: true subsections: null - name: Languages - text: false + allow_empty: true subsections: null - name: "Dataset Structure" - text: false + allow_empty: true subsections: - name: "Data Instances" - text: true + allow_empty: false subsections: null - name: "Data Fields" - text: true + allow_empty: false subsections: null - name: "Data Splits" - text: true + allow_empty: false subsections: null - name: "Dataset Creation" - text: false + allow_empty: true subsections: - name: "Curation Rationale" - text: false + allow_empty: true subsections: null - name: "Source Data" - text: false + allow_empty: true subsections: - name: "Initial Data Collection and Normalization" - text: false + allow_empty: true subsections: null - name: "Who are the source X producers?" - text: false + allow_empty: true subsections: null - name: "Annotations" - text: false + allow_empty: true subsections: - name: "Annotation process" - text: false + allow_empty: true subsections: null - name: "Who are the annotators?" 
- text: false + allow_empty: true subsections: null - name: "Personal and Sensitive Information" - text: false + allow_empty: true subsections: null - name: "Considerations for Using the Data" - text: false + allow_empty: true subsections: - name: "Social Impact of Dataset" - text: false + allow_empty: true subsections: null - name: "Discussion of Biases" - text: false + allow_empty: true subsections: null - name: "Other Known Limitations" - text: false + allow_empty: true subsections: null - name: "Additional Information" - text: false + allow_empty: true subsections: - name: "Dataset Curators" - text: false + allow_empty: true subsections: null - name: "Licensing Information" - text: false + allow_empty: true subsections: null - name: "Citation Information" - text: true + allow_empty: false subsections: null - name: "Contributions" - text: true + allow_empty: false subsections: null """ -filler_text = ["[Needs More Information]", "[More Information Needed]", "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)"] +filler_text = [ + "[Needs More Information]", + "[More Information Needed]", + "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", +] class Section: @@ -108,6 +114,7 @@ def __init__(self, name, level, lines=None): self.name = name self.level = level self.text = "" + self.is_empty = True self.content = {} if lines is not None: self.parse(lines) @@ -128,6 +135,8 @@ def parse(self, lines): else: if current_lines != []: self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False current_lines = [] current_sub_level = " ".join(line.split()[1:]).strip(" \n") @@ -139,11 +148,14 @@ def parse(self, lines): else: if current_lines != []: self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False def to_dict(self): return { "name": self.name, - "test": self.text, + "text": self.text, + "is_empty": self.is_empty, "subsections": [value.to_dict() for value in self.content.values()], } @@ -163,51 +175,66 @@ def parse(self, file_path): if tag_count == 2: break else: - print("The README doesn't contain proper tags. Please ensure you add the correct YAML tags.") - return + raise ValueError("The README doesn't contain proper tags. 
Please ensure you add the correct YAML tags.") super().parse(f) - def _validate_section(self, section , structure): + def _validate_section(self, section, structure): # Text validation - if structure['text'] == True: - if section.text.strip() == '' or section.text.strip() in filler_text: - print(f"Expected some text for section '{section.name}'") + error_list = [] + if structure["allow_empty"] == False: + if section.is_empty: + print(section.text) + error_list.append(f"Expected some text for section '{section.name}'") - if structure['subsections'] is not None: + if structure["subsections"] is not None: # If no subsections present if section.content == {}: - values = [subsection['name'] for subsection in structure['subsections']] - print(f"'{section.name}'' expected the following subsections: {values}, found `None`.") + values = [subsection["name"] for subsection in structure["subsections"]] + error_list.append(f"'{section.name}'' expected the following subsections: {values}, found `None`.") else: # Each key validation - structure_names = [subsection['name'] for subsection in structure['subsections']] + structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): if name not in section.content: - print(f"'{section.name}' is missing subsection: '{name}'.") + error_list.append(f"'{section.name}' is missing subsection: '{name}'.") else: - self._validate_section(section.content[name], structure['subsections'][idx]) + error_list += self._validate_section(section.content[name], structure["subsections"][idx]) for name in section.content: if name not in structure_names: - print(f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection.") + error_list.append( + f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." + ) + + return error_list + + def __str__(self): + return str(self.to_dict()) def validate(self, yaml_struc): + error_list = [] structure = yaml.safe_load(yaml_struc) num_first_level_keys = len(self.content.keys()) if num_first_level_keys > 1: - print(f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected.") + error_list.append( + f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." 
+ ) elif num_first_level_keys < 1: - print(f"The README has no first-level headings.") + error_list.append(f"The README has no first-level headings.") else: - print(self.content.keys()) start_key = list(self.content.keys())[0] if start_key.startswith("Dataset Card for"): - self._validate_section(self.content[start_key], structure['subsections'][0]) + error_list += self._validate_section(self.content[start_key], structure["subsections"][0]) else: - print("No first-level hearding starting with `Dataset Card for` found.") + error_list.append("No first-level heading starting with `Dataset Card for` found.") + return error_list if __name__ == "__main__": readme = ReadMe("./dummy_readme.md") - readme.validate(yaml_struc) + error_list = readme.validate(yaml_struc) + if error_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) + error_string = "The following issues were found with the README\n" + errors + raise ValueError(error_string) From 99d22226b1184b2de166e40484740221cf3bfc5f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Thu, 8 Apr 2021 18:53:14 +0530 Subject: [PATCH 05/28] Make main into a function in readme_parser --- src/datasets/utils/readme_parser.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/datasets/utils/readme_parser.py b/src/datasets/utils/readme_parser.py index a7b22076981..dc4c1e54540 100644 --- a/src/datasets/utils/readme_parser.py +++ b/src/datasets/utils/readme_parser.py @@ -58,7 +58,7 @@ - name: "Initial Data Collection and Normalization" allow_empty: true subsections: null - - name: "Who are the source X producers?" + - name: "Who are the source language producers?" allow_empty: true subsections: null - name: "Annotations" @@ -163,6 +163,7 @@ def to_dict(self): class ReadMe(Section): # Level 0 def __init__(self, file_path): super().__init__(name=file_path, level="") + self.yaml_tags_line_count = -2 self.parse(file_path) def parse(self, file_path): @@ -170,12 +171,16 @@ def parse(self, file_path): # Skip Tags tag_count = 0 for line in f: + self.yaml_tags_line_count += 1 if line.strip(" \n") == "---": tag_count += 1 + if tag_count == 2: break else: - raise ValueError("The README doesn't contain proper tags. Please ensure you add the correct YAML tags.") + raise ValueError( + "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." 
+ ) super().parse(f) def _validate_section(self, section, structure): @@ -183,14 +188,13 @@ def _validate_section(self, section, structure): error_list = [] if structure["allow_empty"] == False: if section.is_empty: - print(section.text) error_list.append(f"Expected some text for section '{section.name}'") if structure["subsections"] is not None: # If no subsections present if section.content == {}: values = [subsection["name"] for subsection in structure["subsections"]] - error_list.append(f"'{section.name}'' expected the following subsections: {values}, found `None`.") + error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") else: # Each key validation structure_names = [subsection["name"] for subsection in structure["subsections"]] @@ -231,8 +235,12 @@ def validate(self, yaml_struc): return error_list -if __name__ == "__main__": - readme = ReadMe("./dummy_readme.md") +def validate_readme(file_path): + readme = ReadMe(file_path) + if readme.yaml_tags_line_count == 0: + raise Warning("YAML Tags are not present in this README.") + elif readme.yaml_tags_line_count == -1: + raise Warning("Only the start of YAML tags present in this README.") error_list = readme.validate(yaml_struc) if error_list != []: errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) From 1d788a9aa2931b1be706bdfe141a126eb9c970d5 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Mon, 26 Apr 2021 02:04:48 +0530 Subject: [PATCH 06/28] Move README validator to scripts --- .../datasets_readme_validator.py | 55 +++++++++++++++---- 1 file changed, 44 insertions(+), 11 deletions(-) rename src/datasets/utils/readme_parser.py => scripts/datasets_readme_validator.py (84%) diff --git a/src/datasets/utils/readme_parser.py b/scripts/datasets_readme_validator.py similarity index 84% rename from src/datasets/utils/readme_parser.py rename to scripts/datasets_readme_validator.py index dc4c1e54540..b6088155d82 100644 --- a/src/datasets/utils/readme_parser.py +++ b/scripts/datasets_readme_validator.py @@ -1,14 +1,4 @@ -# class_mapping = { -# "Dataset Description": DatasetDescription, -# } - -# key_mapping = { -# "Dataset Desription": 'dataset_desc' -# } - -import pprint - -# import json +import os import yaml @@ -246,3 +236,46 @@ def validate_readme(file_path): errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) error_string = "The following issues were found with the README\n" + errors raise ValueError(error_string) + + +if __name__ == '__main__': + datasets = os.listdir('./datasets') + for dataset in sorted(datasets): + if not dataset.startswith('.'): + file_path = os.path.join("./datasets", dataset, "README.md") + if os.path.exists(file_path): + try: + validate_readme(file_path) + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) + else: + try: + raise FileNotFoundError(f"No such file: {file_path}") + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) + datasets = os.listdir('./datasets') + for dataset in sorted(datasets): + if not dataset.startswith('.'): + file_path = os.path.join("./datasets", dataset, "README.md") + if os.path.exists(file_path): + try: + validate_readme(file_path) + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) + else: + try: + raise FileNotFoundError(f"No such file: {file_path}") + except Exception as e: + print("=" * 30) + print(dataset) + print("=" * 30) + print(e) From 2d13f70160699864c42769ed3248c13537372f30 Mon Sep 17 00:00:00 2001 
From: Gunjan Chhablani Date: Mon, 26 Apr 2021 13:35:34 +0530 Subject: [PATCH 07/28] Arrange README validation files --- scripts/datasets_readme_validator.py | 333 +++--------------- src/datasets/utils/readme.py | 173 +++++++++ src/datasets/utils/resources/__init__.py | 0 .../utils/resources/readme_structure.yaml | 87 +++++ 4 files changed, 315 insertions(+), 278 deletions(-) create mode 100644 src/datasets/utils/readme.py create mode 100644 src/datasets/utils/resources/__init__.py create mode 100644 src/datasets/utils/resources/readme_structure.yaml diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py index b6088155d82..c56b59f2601 100644 --- a/scripts/datasets_readme_validator.py +++ b/scripts/datasets_readme_validator.py @@ -1,281 +1,58 @@ -import os -import yaml +#!/usr/bin/env python - -yaml_struc = """ -name: "" # Filename -allow_empty: false -subsections: - - name: "Dataset Card for X" - allow_empty: true - subsections: - - name: "Table of Contents" - allow_empty: false - subsections: null # meaning it should not be checked. - - name: "Dataset Description" - allow_empty: false - subsections: - - name: "Dataset Summary" - allow_empty: false - subsections: null - - name: "Supported Tasks and Leaderboards" - allow_empty: true - subsections: null - - name: Languages - allow_empty: true - subsections: null - - name: "Dataset Structure" - allow_empty: true - subsections: - - name: "Data Instances" - allow_empty: false - subsections: null - - name: "Data Fields" - allow_empty: false - subsections: null - - name: "Data Splits" - allow_empty: false - subsections: null - - name: "Dataset Creation" - allow_empty: true - subsections: - - name: "Curation Rationale" - allow_empty: true - subsections: null - - name: "Source Data" - allow_empty: true - subsections: - - name: "Initial Data Collection and Normalization" - allow_empty: true - subsections: null - - name: "Who are the source language producers?" - allow_empty: true - subsections: null - - name: "Annotations" - allow_empty: true - subsections: - - name: "Annotation process" - allow_empty: true - subsections: null - - name: "Who are the annotators?" - allow_empty: true - subsections: null - - name: "Personal and Sensitive Information" - allow_empty: true - subsections: null - - name: "Considerations for Using the Data" - allow_empty: true - subsections: - - name: "Social Impact of Dataset" - allow_empty: true - subsections: null - - name: "Discussion of Biases" - allow_empty: true - subsections: null - - name: "Other Known Limitations" - allow_empty: true - subsections: null - - name: "Additional Information" - allow_empty: true - subsections: - - name: "Dataset Curators" - allow_empty: true - subsections: null - - name: "Licensing Information" - allow_empty: true - subsections: null - - name: "Citation Information" - allow_empty: false - subsections: null - - name: "Contributions" - allow_empty: false - subsections: null +""" This script will run in CI and make sure all new changes to datasets readme files have valid content present. 
""" -filler_text = [ - "[Needs More Information]", - "[More Information Needed]", - "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", -] - - -class Section: - def __init__(self, name, level, lines=None): - self.name = name - self.level = level - self.text = "" - self.is_empty = True - self.content = {} - if lines is not None: - self.parse(lines) - - def parse(self, lines): - current_sub_level = "" - current_lines = [] - code_start = False - for line in lines: - if line.strip(" \n") == "": - continue - elif line.strip(" \n")[:3] == "```": - code_start = not code_start - elif line.split()[0] == self.level + "#" and not code_start: - if current_sub_level != "": - self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) - current_lines = [] - else: - if current_lines != []: - self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: - self.is_empty = False - current_lines = [] - - current_sub_level = " ".join(line.split()[1:]).strip(" \n") - else: - current_lines.append(line) - else: - if current_sub_level != "": - self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) - else: - if current_lines != []: - self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: - self.is_empty = False - - def to_dict(self): - return { - "name": self.name, - "text": self.text, - "is_empty": self.is_empty, - "subsections": [value.to_dict() for value in self.content.values()], - } - - -class ReadMe(Section): # Level 0 - def __init__(self, file_path): - super().__init__(name=file_path, level="") - self.yaml_tags_line_count = -2 - self.parse(file_path) - - def parse(self, file_path): - with open(self.name) as f: - # Skip Tags - tag_count = 0 - for line in f: - self.yaml_tags_line_count += 1 - if line.strip(" \n") == "---": - tag_count += 1 - - if tag_count == 2: - break - else: - raise ValueError( - "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." - ) - super().parse(f) - - def _validate_section(self, section, structure): - # Text validation - error_list = [] - if structure["allow_empty"] == False: - if section.is_empty: - error_list.append(f"Expected some text for section '{section.name}'") - - if structure["subsections"] is not None: - # If no subsections present - if section.content == {}: - values = [subsection["name"] for subsection in structure["subsections"]] - error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") - else: - # Each key validation - structure_names = [subsection["name"] for subsection in structure["subsections"]] - for idx, name in enumerate(structure_names): - if name not in section.content: - error_list.append(f"'{section.name}' is missing subsection: '{name}'.") - else: - error_list += self._validate_section(section.content[name], structure["subsections"][idx]) - - for name in section.content: - if name not in structure_names: - error_list.append( - f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." 
- ) - - return error_list - - def __str__(self): - return str(self.to_dict()) - - def validate(self, yaml_struc): - error_list = [] - structure = yaml.safe_load(yaml_struc) - num_first_level_keys = len(self.content.keys()) - if num_first_level_keys > 1: - error_list.append( - f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." - ) - elif num_first_level_keys < 1: - error_list.append(f"The README has no first-level headings.") - - else: - start_key = list(self.content.keys())[0] - if start_key.startswith("Dataset Card for"): - error_list += self._validate_section(self.content[start_key], structure["subsections"][0]) - else: - error_list.append("No first-level heading starting with `Dataset Card for` found.") - return error_list - - -def validate_readme(file_path): - readme = ReadMe(file_path) - if readme.yaml_tags_line_count == 0: - raise Warning("YAML Tags are not present in this README.") - elif readme.yaml_tags_line_count == -1: - raise Warning("Only the start of YAML tags present in this README.") - error_list = readme.validate(yaml_struc) - if error_list != []: - errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) - error_string = "The following issues were found with the README\n" + errors - raise ValueError(error_string) - - -if __name__ == '__main__': - datasets = os.listdir('./datasets') - for dataset in sorted(datasets): - if not dataset.startswith('.'): - file_path = os.path.join("./datasets", dataset, "README.md") - if os.path.exists(file_path): - try: - validate_readme(file_path) - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) - else: - try: - raise FileNotFoundError(f"No such file: {file_path}") - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) - datasets = os.listdir('./datasets') - for dataset in sorted(datasets): - if not dataset.startswith('.'): - file_path = os.path.join("./datasets", dataset, "README.md") - if os.path.exists(file_path): - try: - validate_readme(file_path) - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) - else: - try: - raise FileNotFoundError(f"No such file: {file_path}") - except Exception as e: - print("=" * 30) - print(dataset) - print("=" * 30) - print(e) +from pathlib import Path +from subprocess import check_output +from typing import List + +from datasets.utils.readme import validate_readme + + +def get_changed_files(repo_path: Path) -> List[Path]: + diff_output = check_output(["git", "diff", "--name-only", "HEAD..origin/master"], cwd=repo_path) + changed_files = [Path(repo_path, f) for f in diff_output.decode().splitlines()] + return changed_files + + +if __name__ == "__main__": + import logging + from argparse import ArgumentParser + + logging.basicConfig(level=logging.DEBUG) + + ap = ArgumentParser() + ap.add_argument("--repo_path", type=Path, default=Path.cwd()) + ap.add_argument("--check_all", action="store_true") + args = ap.parse_args() + + repo_path: Path = args.repo_path + if args.check_all: + readmes = [dd / "README.md" for dd in (repo_path / "datasets").iterdir()] + else: + changed_files = get_changed_files(repo_path) + readmes = [ + f + for f in changed_files + if f.exists() and f.name.lower() == "readme.md" and f.parent.parent.name == "datasets" + ] + + failed: List[Path] = [] + for readme in sorted(readmes): + try: + DatasetMetadata.from_readme(readme) + logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") + except TypeError as e: + 
failed.append(readme) + logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") + except Exception as e: + failed.append(readme) + logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") + + if len(failed) > 0: + logging.info(f"❌ Failed on {len(failed)} files.") + exit(1) + else: + logging.info("All is well, keep up the good work 🤗!") + exit(0) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py new file mode 100644 index 00000000000..c3500b675ab --- /dev/null +++ b/src/datasets/utils/readme.py @@ -0,0 +1,173 @@ +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Tuple + +import yaml + + +BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" +this_url = f"{BASE_REF_URL}/{__file__}" +logger = logging.getLogger(__name__) + + +def load_yaml_resource(resource: str) -> Tuple[Any, str]: + with open(resource) as f: + content = yaml.safe_load(f) + return content, f"{BASE_REF_URL}/resources/{resource}" + + +readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml") +filler_text = [ + "[Needs More Information]", + "[More Information Needed]", + "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", +] + + +class Section: + def __init__(self, name, level, lines=None): + self.name = name + self.level = level + self.text = "" + self.is_empty = True + self.content = {} + if lines is not None: + self.parse(lines) + + def parse(self, lines): + current_sub_level = "" + current_lines = [] + code_start = False + for line in lines: + if line.strip(" \n") == "": + continue + elif line.strip(" \n")[:3] == "```": + code_start = not code_start + elif line.split()[0] == self.level + "#" and not code_start: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + current_lines = [] + else: + if current_lines != []: + self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False + current_lines = [] + + current_sub_level = " ".join(line.split()[1:]).strip(" \n") + else: + current_lines.append(line) + else: + if current_sub_level != "": + self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) + else: + if current_lines != []: + self.text += "".join(current_lines).strip() + if self.text != "" and self.text not in filler_text: + self.is_empty = False + + def to_dict(self): + return { + "name": self.name, + "text": self.text, + "is_empty": self.is_empty, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, file_path): + super().__init__(name=file_path, level="") + self.yaml_tags_line_count = -2 + self.parse(file_path) + + def parse(self, file_path): + with open(self.name) as f: + # Skip Tags + tag_count = 0 + for line in f: + self.yaml_tags_line_count += 1 + if line.strip(" \n") == "---": + tag_count += 1 + + if tag_count == 2: + break + else: + raise ValueError( + "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." 
+ ) + super().parse(f) + + def _validate_section(self, section, structure): + # Text validation + error_list = [] + if structure["allow_empty"] == False: + if section.is_empty: + error_list.append(f"Expected some text for section '{section.name}'") + + if structure["subsections"] is not None: + # If no subsections present + if section.content == {}: + values = [subsection["name"] for subsection in structure["subsections"]] + error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") + else: + # Each key validation + structure_names = [subsection["name"] for subsection in structure["subsections"]] + for idx, name in enumerate(structure_names): + if name not in section.content: + error_list.append(f"'{section.name}' is missing subsection: '{name}'.") + else: + error_list += self._validate_section(section.content[name], structure["subsections"][idx]) + + for name in section.content: + if name not in structure_names: + error_list.append( + f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." + ) + + return error_list + + def __str__(self): + return str(self.to_dict()) + + def validate(self, readme_structure): + error_list = [] + num_first_level_keys = len(self.content.keys()) + if num_first_level_keys > 1: + error_list.append( + f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." + ) + elif num_first_level_keys < 1: + error_list.append(f"The README has no first-level headings.") + + else: + start_key = list(self.content.keys())[0] + if start_key.startswith("Dataset Card for"): + error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0]) + else: + error_list.append("No first-level heading starting with `Dataset Card for` found.") + return error_list + + +def validate_readme(file_path): + readme = ReadMe(file_path) + if readme.yaml_tags_line_count == 0: + raise Warning("YAML Tags are not present in this README.") + elif readme.yaml_tags_line_count == -1: + raise Warning("Only the start of YAML tags present in this README.") + error_list = readme.validate(readme_structure) + if error_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) + error_string = "The following issues were found with the README\n" + errors + raise ValueError(error_string) + + +if __name__ == "__main__": + from argparse import ArgumentParser + + ap = ArgumentParser(usage="Validate the content (excluding YAML tags) of a README.md file.") + ap.add_argument("readme_filepath") + args = ap.parse_args() + readme_filepath = Path(args.readme_filepath) + validate_readme(readme_filepath) diff --git a/src/datasets/utils/resources/__init__.py b/src/datasets/utils/resources/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml new file mode 100644 index 00000000000..7fa2f663df0 --- /dev/null +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -0,0 +1,87 @@ +name: "" # Filename comes here +allow_empty: false +subsections: + - name: "Dataset Card for X" # First-level markdown heading + allow_empty: true + subsections: + - name: "Table of Contents" + allow_empty: false + subsections: null # meaning it should not be checked. 
+ - name: "Dataset Description" + allow_empty: false + subsections: + - name: "Dataset Summary" + allow_empty: false + subsections: null + - name: "Supported Tasks and Leaderboards" + allow_empty: true + subsections: null + - name: Languages + allow_empty: true + subsections: null + - name: "Dataset Structure" + allow_empty: true + subsections: + - name: "Data Instances" + allow_empty: false + subsections: null + - name: "Data Fields" + allow_empty: false + subsections: null + - name: "Data Splits" + allow_empty: false + subsections: null + - name: "Dataset Creation" + allow_empty: true + subsections: + - name: "Curation Rationale" + allow_empty: true + subsections: null + - name: "Source Data" + allow_empty: true + subsections: + - name: "Initial Data Collection and Normalization" + allow_empty: true + subsections: null + - name: "Who are the source language producers?" + allow_empty: true + subsections: null + - name: "Annotations" + allow_empty: true + subsections: + - name: "Annotation process" + allow_empty: true + subsections: null + - name: "Who are the annotators?" + allow_empty: true + subsections: null + - name: "Personal and Sensitive Information" + allow_empty: true + subsections: null + - name: "Considerations for Using the Data" + allow_empty: true + subsections: + - name: "Social Impact of Dataset" + allow_empty: true + subsections: null + - name: "Discussion of Biases" + allow_empty: true + subsections: null + - name: "Other Known Limitations" + allow_empty: true + subsections: null + - name: "Additional Information" + allow_empty: true + subsections: + - name: "Dataset Curators" + allow_empty: true + subsections: null + - name: "Licensing Information" + allow_empty: true + subsections: null + - name: "Citation Information" + allow_empty: false + subsections: null + - name: "Contributions" + allow_empty: false + subsections: null \ No newline at end of file From ee31e1511ca056dcde579c9f5e0f0c7b232bbb69 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 27 Apr 2021 13:54:52 +0530 Subject: [PATCH 08/28] Update readme validator class --- src/datasets/utils/readme.py | 247 +++++++++++++++++++++++------------ 1 file changed, 167 insertions(+), 80 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index c3500b675ab..1e5d9b9a42e 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -1,45 +1,61 @@ import logging from dataclasses import dataclass from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Tuple +from typing import Any, List, Tuple import yaml +# loading package files: https://stackoverflow.com/a/20885799 +try: + import importlib.resources as pkg_resources +except ImportError: + # Try backported to PY<37 `importlib_resources`. 
+ import importlib_resources as pkg_resources + +from .import resources + + BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" this_url = f"{BASE_REF_URL}/{__file__}" logger = logging.getLogger(__name__) def load_yaml_resource(resource: str) -> Tuple[Any, str]: - with open(resource) as f: - content = yaml.safe_load(f) - return content, f"{BASE_REF_URL}/resources/{resource}" + content = pkg_resources.read_text(resources, resource) + return yaml.safe_load(content), f"{BASE_REF_URL}/resources/{resource}" readme_structure, known_readme_structure_url = load_yaml_resource("readme_structure.yaml") -filler_text = [ + +FILLER_TEXT = [ "[Needs More Information]", "[More Information Needed]", "(https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)", ] +# Dictionary representation of section/readme, error_list, warning_list +ReadmeValidatorOutput = Tuple[dict, List[str], List[str]] + +@dataclass class Section: - def __init__(self, name, level, lines=None): - self.name = name - self.level = level + name: str + level: str + lines: List[str] = None + + def __post_init__(self): self.text = "" self.is_empty = True self.content = {} - if lines is not None: - self.parse(lines) + if self.lines is not None: + self.parse() - def parse(self, lines): + def parse(self): current_sub_level = "" current_lines = [] code_start = False - for line in lines: + for line in self.lines: if line.strip(" \n") == "": continue elif line.strip(" \n")[:3] == "```": @@ -51,7 +67,7 @@ def parse(self, lines): else: if current_lines != []: self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: + if self.text != "" and self.text not in FILLER_TEXT: self.is_empty = False current_lines = [] @@ -60,107 +76,178 @@ def parse(self, lines): current_lines.append(line) else: if current_sub_level != "": + if current_sub_level in self.content: + print( + f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found." + ) self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: if current_lines != []: self.text += "".join(current_lines).strip() - if self.text != "" and self.text not in filler_text: + if self.text != "" and self.text not in FILLER_TEXT: self.is_empty = False - def to_dict(self): - return { - "name": self.name, - "text": self.text, - "is_empty": self.is_empty, - "subsections": [value.to_dict() for value in self.content.values()], - } + def validate(self, structure: dict) -> ReadmeValidatorOutput: + """Validates a Section class object recursively using the structure provided as a dictionary. + Args: + structute (:obj: `dict`): The dictionary representing expected structure. -class ReadMe(Section): # Level 0 - def __init__(self, file_path): - super().__init__(name=file_path, level="") - self.yaml_tags_line_count = -2 - self.parse(file_path) - - def parse(self, file_path): - with open(self.name) as f: - # Skip Tags - tag_count = 0 - for line in f: - self.yaml_tags_line_count += 1 - if line.strip(" \n") == "---": - tag_count += 1 - - if tag_count == 2: - break - else: - raise ValueError( - "The README doesn't contain proper tags. Please ensure you add the correct YAML tags." - ) - super().parse(f) - - def _validate_section(self, section, structure): - # Text validation + Returns: + :obj: `ReadmeValidatorOutput`: The dictionary representation of the section, and the errors. 
+ """ + # Header text validation error_list = [] + warning_list = [] if structure["allow_empty"] == False: - if section.is_empty: - error_list.append(f"Expected some text for section '{section.name}'") + # If header text is expected + if self.is_empty: + # If no header text is found, mention it in the error_list + error_list.append( + f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}." + ) + # Subsections Validation if structure["subsections"] is not None: - # If no subsections present - if section.content == {}: + # If subsections are expected + if self.content == {}: + # If no subsections are present values = [subsection["name"] for subsection in structure["subsections"]] - error_list.append(f"'{section.name}' expected the following subsections: {values}, found `None`.") + # Mention the expected values in the error_list + error_list.append( + f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}." + ) else: - # Each key validation + # If some subsections are present structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): - if name not in section.content: - error_list.append(f"'{section.name}' is missing subsection: '{name}'.") + if name not in self.content: + # If the expected subsection is not present + error_list.append( + f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}." + ) else: - error_list += self._validate_section(section.content[name], structure["subsections"][idx]) + # If the subsection is present, validate subsection, return the result + # and concat the errors from subsection to section error_list + _, subsec_error_list, subsec_warning_list = self.content[name].validate( + structure["subsections"][idx] + ) + error_list += subsec_error_list + warning_list += subsec_warning_list - for name in section.content: + for name in self.content: if name not in structure_names: - error_list.append( - f"'{section.name}' has an extra subsection: '{name}'. Skipping validation checks for this subsection." + # If an extra subsection is present + warning_list.append( + f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." 
) + if error_list: + # If there are errors, do not return the dictionary as it is invalid + return {}, error_list, warning_list + else: + return self.to_dict(), error_list, warning_list + + def to_dict(self) -> dict: + """Returns the dictionary representation of a section.""" + return { + "name": self.name, + "text": self.text, + "is_empty": self.is_empty, + "subsections": [value.to_dict() for value in self.content.values()], + } + + +class ReadMe(Section): # Level 0 + def __init__(self, name: str, lines: List[str], structure: dict = None): + super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse + self.structure = structure + self.yaml_tags_line_count = -2 + self.lines = lines + if self.lines is not None: + self.parse() + if self.structure is None: + content, error_list, warning_list = self.validate(readme_structure) + else: + content, error_list, warning_list = self.validate(self.structure) + + if error_list != [] or warning_list != []: + errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) + error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors + raise ValueError(error_string) + + @classmethod + def from_readme(cls, path: Path, structure: dict = None): + with open(path) as f: + lines = f.readlines() + return cls(path, lines, structure) - return error_list + @classmethod + def from_string(cls, string: str, structure: dict = None, root_name:str="root"): + lines = string.split("\n") + return cls(root_name, lines, structure) + + def parse(self): + # Skip Tags + tag_count = 0 + line_count = 0 + + for line in self.lines: + self.yaml_tags_line_count += 1 + if line.strip(" \n") == "---": + tag_count += 1 + if tag_count == 2: + break + line_count+=1 + + self.lines = self.lines[line_count+1:] # Get the last + 1 th item. + super().parse() def __str__(self): + """Returns the string of dictionary representation of the ReadMe.""" return str(self.to_dict()) def validate(self, readme_structure): error_list = [] + warning_list = [] + if self.yaml_tags_line_count == 0: + warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.") + elif self.yaml_tags_line_count == -1: + warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.") + + # Check how many first level sections are present. num_first_level_keys = len(self.content.keys()) if num_first_level_keys > 1: + # If more than one, add to the error list, continue error_list.append( - f"The README has found several first-level headings: {list(self.content.keys())}. Only one heading is expected." + f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README." ) elif num_first_level_keys < 1: - error_list.append(f"The README has no first-level headings.") + # If less than one, append error. + error_list.append( + f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README." 
+ ) else: - start_key = list(self.content.keys())[0] - if start_key.startswith("Dataset Card for"): - error_list += self._validate_section(self.content[start_key], readme_structure["subsections"][0]) - else: - error_list.append("No first-level heading starting with `Dataset Card for` found.") - return error_list - + # If one exactly + start_key = list(self.content.keys())[0] # Get the key + if start_key.startswith("Dataset Card for"): # Check correct start -def validate_readme(file_path): - readme = ReadMe(file_path) - if readme.yaml_tags_line_count == 0: - raise Warning("YAML Tags are not present in this README.") - elif readme.yaml_tags_line_count == -1: - raise Warning("Only the start of YAML tags present in this README.") - error_list = readme.validate(readme_structure) - if error_list != []: - errors = "\n".join(list(map(lambda x: "-\t" + x, error_list))) - error_string = "The following issues were found with the README\n" + errors - raise ValueError(error_string) + # If the starting is correct, validate all the sections + _, sec_error_list, sec_warning_list = self.content[start_key].validate( + readme_structure["subsections"][0] + ) + error_list += sec_error_list + warning_list += sec_warning_list + else: + # If not found, append error + error_list.append( + f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README." + ) + if error_list: + # If there are errors, do not return the dictionary as it is invalid + return {}, error_list, warning_list + else: + return self.to_dict(), error_list, warning_list if __name__ == "__main__": @@ -170,4 +257,4 @@ def validate_readme(file_path): ap.add_argument("readme_filepath") args = ap.parse_args() readme_filepath = Path(args.readme_filepath) - validate_readme(readme_filepath) + readme = ReadMe.from_readme(readme_filepath) From ae60ce58e6ab4791f9d745fe574026e5aa6255f0 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 27 Apr 2021 14:09:14 +0530 Subject: [PATCH 09/28] Add from_string tests --- src/datasets/utils/readme.py | 8 +- .../utils/resources/readme_structure.yaml | 50 ++-- tests/test_readme_util.py | 242 ++++++++++++++++++ 3 files changed, 271 insertions(+), 29 deletions(-) create mode 100644 tests/test_readme_util.py diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 1e5d9b9a42e..ffc38b1596d 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -13,7 +13,7 @@ # Try backported to PY<37 `importlib_resources`. import importlib_resources as pkg_resources -from .import resources +from . import resources BASE_REF_URL = "https://github.com/huggingface/datasets/tree/master/src/datasets/utils" @@ -182,7 +182,7 @@ def from_readme(cls, path: Path, structure: dict = None): return cls(path, lines, structure) @classmethod - def from_string(cls, string: str, structure: dict = None, root_name:str="root"): + def from_string(cls, string: str, structure: dict = None, root_name: str = "root"): lines = string.split("\n") return cls(root_name, lines, structure) @@ -197,9 +197,9 @@ def parse(self): tag_count += 1 if tag_count == 2: break - line_count+=1 + line_count += 1 - self.lines = self.lines[line_count+1:] # Get the last + 1 th item. + self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. 
super().parse() def __str__(self): diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml index 7fa2f663df0..fc3356f2675 100644 --- a/src/datasets/utils/resources/readme_structure.yaml +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -34,30 +34,30 @@ subsections: - name: "Dataset Creation" allow_empty: true subsections: - - name: "Curation Rationale" - allow_empty: true - subsections: null - - name: "Source Data" - allow_empty: true - subsections: - - name: "Initial Data Collection and Normalization" - allow_empty: true - subsections: null - - name: "Who are the source language producers?" - allow_empty: true - subsections: null - - name: "Annotations" - allow_empty: true - subsections: - - name: "Annotation process" - allow_empty: true - subsections: null - - name: "Who are the annotators?" - allow_empty: true - subsections: null - - name: "Personal and Sensitive Information" - allow_empty: true - subsections: null + - name: "Curation Rationale" + allow_empty: true + subsections: null + - name: "Source Data" + allow_empty: true + subsections: + - name: "Initial Data Collection and Normalization" + allow_empty: true + subsections: null + - name: "Who are the source language producers?" + allow_empty: true + subsections: null + - name: "Annotations" + allow_empty: true + subsections: + - name: "Annotation process" + allow_empty: true + subsections: null + - name: "Who are the annotators?" + allow_empty: true + subsections: null + - name: "Personal and Sensitive Information" + allow_empty: true + subsections: null - name: "Considerations for Using the Data" allow_empty: true subsections: @@ -84,4 +84,4 @@ subsections: subsections: null - name: "Contributions" allow_empty: false - subsections: null \ No newline at end of file + subsections: null diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py new file mode 100644 index 00000000000..84ab24cde45 --- /dev/null +++ b/tests/test_readme_util.py @@ -0,0 +1,242 @@ +import tempfile +import unittest +from pathlib import Path + +import yaml + +from datasets.utils.readme import ReadMe + + +def _dedent(string: str) -> str: + return "\n".join([line.lstrip() for line in string.splitlines()]) + + +EXPECTED_STRUCTURE = yaml.safe_load( + """\ +name: "" +allow_empty: false +subsections: + - name: "Dataset Card for X" # First-level markdown heading + allow_empty: true + subsections: + - name: "Table of Contents" + allow_empty: false + subsections: null # meaning it should not be checked. + - name: "Dataset Description" + allow_empty: false + subsections: + - name: "Dataset Summary" + allow_empty: false + subsections: null + - name: "Supported Tasks and Leaderboards" + allow_empty: true + subsections: null + - name: Languages + allow_empty: true + subsections: null +""" +) + +README_CORRECT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_EMPTY_YAML = """\ +--- +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_INCORRECT_YAML = """\ +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+### Supported Tasks and Leaderboards +### Languages +""" + +README_NO_YAML = """\ +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_MISSING_TEXT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +### Supported Tasks and Leaderboards +### Languages +""" + +README_MISSING_SUBSECTION = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Languages +""" + +README_MISSING_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + + +README_MULTIPLE_WRONG_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +# Dataset Card My Dataset +""" + +README_WRONG_FIRST_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +README_EMPTY = "" + +README_MULTIPLE_SAME_HEADING_1 = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. 
+### Supported Tasks and Leaderboards +### Languages +""" + + +class TestReadMeUtils(unittest.TestCase): + def test_from_string(self): + ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_EMPTY_YAML, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_INCORRECT_YAML, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_NO_YAML, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MISSING_TEXT, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MISSING_SUBSECTION, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MISSING_FIRST_LEVEL, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) + with self.assertRaises(ValueError): + ReadMe.from_string(README_EMPTY, EXPECTED_STRUCTURE) + + ReadMe.from_string(README_MULTIPLE_SAME_HEADING_1, EXPECTED_STRUCTURE) + # ReadMe.from_string(MISSING_SUBSECTION, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) + + +if __name__ == "__main__": + unittest.main() From 057d0d95aab96d3dce45901beb31c68eda233688 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Wed, 28 Apr 2021 21:30:09 +0530 Subject: [PATCH 10/28] Add PyTest tests --- src/datasets/utils/readme.py | 52 +++++++------ tests/test_readme_util.py | 137 +++++++++++++++++++++-------------- 2 files changed, 112 insertions(+), 77 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index ffc38b1596d..a102cda38c9 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -48,6 +48,8 @@ def __post_init__(self): self.text = "" self.is_empty = True self.content = {} + self.parsing_error_list = [] + self.parsing_warning_list = [] if self.lines is not None: self.parse() @@ -77,8 +79,8 @@ def parse(self): else: if current_sub_level != "": if current_sub_level in self.content: - print( - f"Multiple sections with the same heading '{current_sub_level}' have been found. Using the latest one found." + self.parsing_error_list.append( + f"Multiple sections with the same heading '{current_sub_level}' have been found. Please keep only one of these sections." ) self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: @@ -103,9 +105,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list - error_list.append( - f"Expected some header text for section '{self.name}', reference at {known_readme_structure_url}." 
- ) + error_list.append(f"Expected some header text for section '{self.name}'.") # Subsections Validation if structure["subsections"] is not None: @@ -114,18 +114,14 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If no subsections are present values = [subsection["name"] for subsection in structure["subsections"]] # Mention the expected values in the error_list - error_list.append( - f"Section '{self.name}' expected the following subsections: {values}, found `None`, reference at {known_readme_structure_url}." - ) + error_list.append(f"Section '{self.name}' expected the following subsections: {values}, found `None`.") else: # If some subsections are present structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): if name not in self.content: # If the expected subsection is not present - error_list.append( - f"Section '{self.name}' is missing subsection: '{name}', reference at {known_readme_structure_url}." - ) + error_list.append(f"Section '{self.name}' is missing subsection: '{name}'.") else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list @@ -141,6 +137,8 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: warning_list.append( f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." ) + error_list = self.parsing_error_list + error_list + warning_list = self.parsing_warning_list + warning_list if error_list: # If there are errors, do not return the dictionary as it is invalid return {}, error_list, warning_list @@ -162,14 +160,19 @@ def __init__(self, name: str, lines: List[str], structure: dict = None): super().__init__(name=name, level="") # Not using lines here as we need to use a child class parse self.structure = structure self.yaml_tags_line_count = -2 + self.tag_count = 0 self.lines = lines if self.lines is not None: self.parse() + + # Validation if self.structure is None: content, error_list, warning_list = self.validate(readme_structure) else: content, error_list, warning_list = self.validate(self.structure) + error_list = self.parsing_error_list + error_list + warning_list = self.parsing_warning_list + warning_list if error_list != [] or warning_list != []: errors = "\n".join(list(map(lambda x: "-\t" + x, error_list + warning_list))) error_string = f"The following issues were found for the README at `{self.name}`:\n" + errors @@ -188,18 +191,19 @@ def from_string(cls, string: str, structure: dict = None, root_name: str = "root def parse(self): # Skip Tags - tag_count = 0 line_count = 0 for line in self.lines: self.yaml_tags_line_count += 1 if line.strip(" \n") == "---": - tag_count += 1 - if tag_count == 2: + self.tag_count += 1 + if self.tag_count == 2: break line_count += 1 - - self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. + if self.tag_count == 2: + self.lines = self.lines[line_count + 1 :] # Get the last + 1 th item. 
+        else:
+            self.lines = self.lines[self.tag_count :]
         super().parse()
 
     def __str__(self):
@@ -210,21 +214,23 @@ def validate(self, readme_structure):
         error_list = []
         warning_list = []
         if self.yaml_tags_line_count == 0:
-            warning_list.append(f"YAML Tags are not present in the README at `{self.name}`.")
-        elif self.yaml_tags_line_count == -1:
-            warning_list.append(f"Only the start of YAML tags present in the README at `{self.name}`.")
-
+            warning_list.append("Empty YAML markers are present in the README.")
+        elif self.tag_count == 0:
+            warning_list.append("No YAML markers are present in the README.")
+        elif self.tag_count == 1:
+            warning_list.append("Only the start of YAML tags present in the README.")
         # Check how many first level sections are present.
         num_first_level_keys = len(self.content.keys())
+        print(self.content)
         if num_first_level_keys > 1:
             # If more than one, add to the error list, continue
             error_list.append(
-                f"The README present at `{self.name}` has found several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README."
+                f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README."
             )
         elif num_first_level_keys < 1:
             # If less than one, append error.
             error_list.append(
-                f"The README present as `{self.name}` has no first-level headings. One heading is expected. Skipping further validation for this README."
+                f"The README has no first-level headings. One heading is expected. Skipping further validation for this README."
             )
 
         else:
@@ -241,7 +247,7 @@ def validate(self, readme_structure):
             else:
                 # If not found, append error
                 error_list.append(
-                    f"No first-level heading starting with `Dataset Card for` found in README present at `{self.name}`. Skipping further validation for this README."
+                    f"No first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README."
) if error_list: # If there are errors, do not return the dictionary as it is invalid diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 84ab24cde45..ac84d2f64ef 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -1,17 +1,15 @@ import tempfile -import unittest from pathlib import Path +import pytest import yaml from datasets.utils.readme import ReadMe -def _dedent(string: str) -> str: - return "\n".join([line.lstrip() for line in string.splitlines()]) - - -EXPECTED_STRUCTURE = yaml.safe_load( +# @pytest.fixture +# def example_yaml_structure(): +example_yaml_structure = yaml.safe_load( """\ name: "" allow_empty: false @@ -37,6 +35,43 @@ def _dedent(string: str) -> str: """ ) + +CORRECT_DICT = { + "name": "root", + "text": "", + "is_empty": True, + "subsections": [ + { + "name": "Dataset Card for My Dataset", + "text": "", + "is_empty": True, + "subsections": [ + {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + { + "name": "Dataset Description", + "text": "Some text here.", + "is_empty": False, + "subsections": [ + { + "name": "Dataset Summary", + "text": "Some text here.", + "is_empty": False, + "subsections": [], + }, + { + "name": "Supported Tasks and Leaderboards", + "text": "", + "is_empty": True, + "subsections": [], + }, + {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + ], + }, + ], + } + ], +} + README_CORRECT = """\ --- languages: @@ -58,7 +93,6 @@ def _dedent(string: str) -> str: README_EMPTY_YAML = """\ --- --- - # Dataset Card for My Dataset ## Table of Contents Some text here. @@ -70,19 +104,9 @@ def _dedent(string: str) -> str: ### Languages """ -README_INCORRECT_YAML = """\ ---- - -# Dataset Card for My Dataset -## Table of Contents -Some text here. -## Dataset Description -Some text here. -### Dataset Summary -Some text here. -### Supported Tasks and Leaderboards -### Languages -""" +EXPECTED_ERROR_README_EMPTY_YAML = ( + "The following issues were found for the README at `root`:\n-\tEmpty YAML markers are present in the README." +) README_NO_YAML = """\ # Dataset Card for My Dataset @@ -96,6 +120,10 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_NO_YAML = ( + "The following issues were found for the README at `root`:\n-\tNo YAML markers are present in the README." +) + README_MISSING_TEXT = """\ --- languages: @@ -112,6 +140,7 @@ def _dedent(string: str) -> str: ### Supported Tasks and Leaderboards ### Languages """ +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `root`:\n-\tExpected some header text for section 'Dataset Summary'." README_MISSING_SUBSECTION = """\ --- @@ -130,6 +159,8 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `root`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." + README_MISSING_FIRST_LEVEL = """\ --- languages: @@ -146,7 +177,7 @@ def _dedent(string: str) -> str: ### Supported Tasks and Leaderboards ### Languages """ - +EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." 
README_MULTIPLE_WRONG_FIRST_LEVEL = """\ --- @@ -167,6 +198,8 @@ def _dedent(string: str) -> str: # Dataset Card My Dataset """ +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. Only one heading is expected. Skipping further validation for this README." + README_WRONG_FIRST_LEVEL = """\ --- languages: @@ -185,8 +218,12 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." + README_EMPTY = "" +EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." + README_MULTIPLE_SAME_HEADING_1 = """\ --- languages: @@ -206,37 +243,29 @@ def _dedent(string: str) -> str: ### Languages """ +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `root`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." + + +def test_readme_from_string_correct(): + + assert ReadMe.from_string(README_CORRECT, example_yaml_structure).to_dict() == CORRECT_DICT + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), + (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), + (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), + (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), + (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), + (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + ], +) +def test_readme_from_string_errors(readme_md, expected_error): -class TestReadMeUtils(unittest.TestCase): - def test_from_string(self): - ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_EMPTY_YAML, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_INCORRECT_YAML, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_NO_YAML, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MISSING_TEXT, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MISSING_SUBSECTION, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MISSING_FIRST_LEVEL, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_WRONG_FIRST_LEVEL, EXPECTED_STRUCTURE) - with self.assertRaises(ValueError): - ReadMe.from_string(README_EMPTY, EXPECTED_STRUCTURE) - - ReadMe.from_string(README_MULTIPLE_SAME_HEADING_1, EXPECTED_STRUCTURE) - # ReadMe.from_string(MISSING_SUBSECTION, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # 
ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - # ReadMe.from_string(README_CORRECT, EXPECTED_STRUCTURE) - - -if __name__ == "__main__": - unittest.main() + with pytest.raises(ValueError, match=expected_error): + ReadMe.from_string(readme_md, example_yaml_structure) From 35e08d84584fc6d97f9f61f181a2ccb99f4bda3b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:18:51 +0530 Subject: [PATCH 11/28] Add tests for from_readme --- tests/test_readme_util.py | 56 +++++++++++++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index ac84d2f64ef..c792d79450a 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -105,7 +105,7 @@ """ EXPECTED_ERROR_README_EMPTY_YAML = ( - "The following issues were found for the README at `root`:\n-\tEmpty YAML markers are present in the README." + "The following issues were found for the README at `{path}`:\n-\tEmpty YAML markers are present in the README." ) README_NO_YAML = """\ @@ -121,7 +121,7 @@ """ EXPECTED_ERROR_README_NO_YAML = ( - "The following issues were found for the README at `root`:\n-\tNo YAML markers are present in the README." + "The following issues were found for the README at `{path}`:\n-\tNo YAML markers are present in the README." ) README_MISSING_TEXT = """\ @@ -140,7 +140,7 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `root`:\n-\tExpected some header text for section 'Dataset Summary'." +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section 'Dataset Summary'." README_MISSING_SUBSECTION = """\ --- @@ -159,7 +159,7 @@ ### Languages """ -EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `root`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." README_MISSING_FIRST_LEVEL = """\ --- @@ -177,7 +177,7 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." +EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." README_MULTIPLE_WRONG_FIRST_LEVEL = """\ --- @@ -198,7 +198,7 @@ # Dataset Card My Dataset """ -EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. Only one heading is expected. Skipping further validation for this README." +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. Only one heading is expected. 
Skipping further validation for this README." README_WRONG_FIRST_LEVEL = """\ --- @@ -218,11 +218,11 @@ ### Languages """ -EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `root`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." +EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." README_EMPTY = "" -EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `root`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." +EXPECTED_ERROR_README_EMPTY = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README.\n-\tNo YAML markers are present in the README." README_MULTIPLE_SAME_HEADING_1 = """\ --- @@ -243,7 +243,7 @@ ### Languages """ -EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `root`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." def test_readme_from_string_correct(): @@ -266,6 +266,40 @@ def test_readme_from_string_correct(): ], ) def test_readme_from_string_errors(readme_md, expected_error): - - with pytest.raises(ValueError, match=expected_error): + with pytest.raises(ValueError, match=expected_error.format(path='root')): ReadMe.from_string(readme_md, example_yaml_structure) + + +def test_readme_from_readme_correct(): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(README_CORRECT) + out = ReadMe.from_readme(path, example_yaml_structure).to_dict() + assert out['name']==path + assert out['text']=="" + assert out['is_empty']==True + assert out['subsections']==CORRECT_DICT['subsections'] + + +@pytest.mark.parametrize( + "readme_md, expected_error", + [ + (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), + (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), + (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), + (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), + (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), + (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), + (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), + (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + ], +) +def test_readme_from_readme_error(readme_md, expected_error): + with tempfile.TemporaryDirectory() as tmp_dir: + path = Path(tmp_dir) / "README.md" + with open(path, "w+") as readme_file: + readme_file.write(readme_md) + with pytest.raises(ValueError, match=expected_error.format(path=path)): + ReadMe.from_readme(path, example_yaml_structure) \ No newline at end of file From a3de91abd9d76c0c528ee99504bbda0589419c55 Mon Sep 17 00:00:00 2001 
From: Gunjan Chhablani Date: Sun, 2 May 2021 14:40:10 +0530 Subject: [PATCH 12/28] Add ReadMe validator script --- scripts/datasets_readme_validator.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py index c56b59f2601..13f1ffe68fb 100644 --- a/scripts/datasets_readme_validator.py +++ b/scripts/datasets_readme_validator.py @@ -1,13 +1,13 @@ #!/usr/bin/env python -""" This script will run in CI and make sure all new changes to datasets readme files have valid content present. +""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content. """ from pathlib import Path from subprocess import check_output from typing import List -from datasets.utils.readme import validate_readme +from datasets.utils.readme import ReadMe def get_changed_files(repo_path: Path) -> List[Path]: @@ -41,11 +41,11 @@ def get_changed_files(repo_path: Path) -> List[Path]: failed: List[Path] = [] for readme in sorted(readmes): try: - DatasetMetadata.from_readme(readme) + ReadMe.from_readme(readme) logging.debug(f"✅️ Validated '{readme.relative_to(repo_path)}'") - except TypeError as e: + except ValueError as e: failed.append(readme) - logging.warning(f"❌ Failed to validate '{readme.relative_to(repo_path)}':\n{e}") + logging.warning(f"❌ Validation failed for '{readme.relative_to(repo_path)}':\n{e}") except Exception as e: failed.append(readme) logging.warning(f"⁉️ Something unexpected happened on '{readme.relative_to(repo_path)}':\n{e}") From 8dd3feb9c18a1866c7ce536321774f3aefac044f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:40:44 +0530 Subject: [PATCH 13/28] Fix style --- tests/test_readme_util.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index c792d79450a..539d5989267 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -266,7 +266,7 @@ def test_readme_from_string_correct(): ], ) def test_readme_from_string_errors(readme_md, expected_error): - with pytest.raises(ValueError, match=expected_error.format(path='root')): + with pytest.raises(ValueError, match=expected_error.format(path="root")): ReadMe.from_string(readme_md, example_yaml_structure) @@ -275,11 +275,11 @@ def test_readme_from_readme_correct(): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(README_CORRECT) - out = ReadMe.from_readme(path, example_yaml_structure).to_dict() - assert out['name']==path - assert out['text']=="" - assert out['is_empty']==True - assert out['subsections']==CORRECT_DICT['subsections'] + out = ReadMe.from_readme(path, example_yaml_structure).to_dict() + assert out["name"] == path + assert out["text"] == "" + assert out["is_empty"] == True + assert out["subsections"] == CORRECT_DICT["subsections"] @pytest.mark.parametrize( @@ -302,4 +302,4 @@ def test_readme_from_readme_error(readme_md, expected_error): with open(path, "w+") as readme_file: readme_file.write(readme_md) with pytest.raises(ValueError, match=expected_error.format(path=path)): - ReadMe.from_readme(path, example_yaml_structure) \ No newline at end of file + ReadMe.from_readme(path, example_yaml_structure) From 87b06683fc691df6f026fc47e3f2b4051407513b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:43:14 +0530 Subject: [PATCH 14/28] Remove print statement --- src/datasets/utils/readme.py | 1 - 1 file changed, 1 
deletion(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index a102cda38c9..615cf3c3afe 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -221,7 +221,6 @@ def validate(self, readme_structure): warning_list.append("Only the start of YAML tags present in the README.") # Check how many first level sections are present. num_first_level_keys = len(self.content.keys()) - print(self.content) if num_first_level_keys > 1: # If more than one, add to the error list, continue error_list.append( From 1d49a4da41a86d3edb0354a9630309d0a021864c Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 14:44:48 +0530 Subject: [PATCH 15/28] Add validator to CircleCI --- .circleci/config.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2361409c4c9..c1e870b8fd7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -32,7 +32,6 @@ jobs: - run: pip install pyarrow==1.0.0 - run: HF_SCRIPTS_VERSION=master python -m pytest -sv ./tests/ - run_dataset_script_tests_pyarrow_latest_WIN: working_directory: ~/datasets executor: @@ -82,6 +81,7 @@ jobs: - run: isort --check-only tests src benchmarks datasets metrics - run: flake8 tests src benchmarks datasets metrics - run: ./scripts/datasets_metadata_validator.py + - run: ./scripts/datasets_readme_validator.py build_doc: working_directory: ~/datasets @@ -100,8 +100,8 @@ jobs: - image: circleci/python:3.6 steps: - add_ssh_keys: - fingerprints: - - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" + fingerprints: + - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71" - checkout - run: sudo pip install .[docs] - run: ./.circleci/deploy.sh From d9f0ac3e57d37f23dcee28bcdbff2ae360fb1b19 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 15:15:24 +0530 Subject: [PATCH 16/28] Fix style --- src/datasets/utils/readme.py | 4 ++-- tests/test_readme_util.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 615cf3c3afe..2a1134dbbec 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -101,7 +101,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # Header text validation error_list = [] warning_list = [] - if structure["allow_empty"] == False: + if structure["allow_empty"] is False: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list @@ -224,7 +224,7 @@ def validate(self, readme_structure): if num_first_level_keys > 1: # If more than one, add to the error list, continue error_list.append( - f"The README has several first-level headings: {list(self.content.keys())}. Only one heading is expected. Skipping further validation for this README." + f"The README has several first-level headings: {', '.join(['`'+x+'`' for x in list(self.content.keys())])}. Only one heading is expected. Skipping further validation for this README." ) elif num_first_level_keys < 1: # If less than one, append error. diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 539d5989267..2eccc4af8f6 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -198,7 +198,7 @@ # Dataset Card My Dataset """ -EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: \['Dataset Card for My Dataset', 'Dataset Card My Dataset'\]. 
Only one heading is expected. Skipping further validation for this README." +EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has several first-level headings: `Dataset Card for My Dataset`, `Dataset Card My Dataset`. Only one heading is expected. Skipping further validation for this README." README_WRONG_FIRST_LEVEL = """\ --- @@ -278,7 +278,7 @@ def test_readme_from_readme_correct(): out = ReadMe.from_readme(path, example_yaml_structure).to_dict() assert out["name"] == path assert out["text"] == "" - assert out["is_empty"] == True + assert out["is_empty"] assert out["subsections"] == CORRECT_DICT["subsections"] From 414fc2e997d4406c0257f58e207d2c1b70b1feb9 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 15:36:39 +0530 Subject: [PATCH 17/28] Add YAML files to setup resources --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e745428377f..6a52d94d45b 100644 --- a/setup.py +++ b/setup.py @@ -216,7 +216,7 @@ license="Apache 2.0", package_dir={"": "src"}, packages=find_packages("src"), - package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json"]}, + package_data={"datasets": ["scripts/templates/*"], "datasets.utils.resources": ["*.json", "*.yaml"]}, entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]}, install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, From 0c3425a56a55d5889b5bbb49adac6f85d756c55f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 15:49:10 +0530 Subject: [PATCH 18/28] Make validator executable --- scripts/datasets_readme_validator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) mode change 100644 => 100755 scripts/datasets_readme_validator.py diff --git a/scripts/datasets_readme_validator.py b/scripts/datasets_readme_validator.py old mode 100644 new mode 100755 index 13f1ffe68fb..af0cc05445d --- a/scripts/datasets_readme_validator.py +++ b/scripts/datasets_readme_validator.py @@ -1,7 +1,6 @@ #!/usr/bin/env python -""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content. -""" +""" This script will run in CI and make sure all new changes to datasets readme files have valid readme content.""" from pathlib import Path from subprocess import check_output From 933fdf76c1e3ecad4210cb66dd31bcda92da8928 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 16:16:13 +0530 Subject: [PATCH 19/28] Add no subsections test --- src/datasets/utils/readme.py | 10 +++++----- tests/test_readme_util.py | 20 +++++++++++++++++--- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 2a1134dbbec..9b0a393d845 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -80,7 +80,7 @@ def parse(self): if current_sub_level != "": if current_sub_level in self.content: self.parsing_error_list.append( - f"Multiple sections with the same heading '{current_sub_level}' have been found. Please keep only one of these sections." + f"Multiple sections with the same heading `{current_sub_level}` have been found. Please keep only one of these sections." 
) self.content[current_sub_level] = Section(current_sub_level, self.level + "#", current_lines) else: @@ -105,7 +105,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list - error_list.append(f"Expected some header text for section '{self.name}'.") + error_list.append(f"Expected some header text for section `{self.name}`.") # Subsections Validation if structure["subsections"] is not None: @@ -114,14 +114,14 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If no subsections are present values = [subsection["name"] for subsection in structure["subsections"]] # Mention the expected values in the error_list - error_list.append(f"Section '{self.name}' expected the following subsections: {values}, found `None`.") + error_list.append(f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'.") else: # If some subsections are present structure_names = [subsection["name"] for subsection in structure["subsections"]] for idx, name in enumerate(structure_names): if name not in self.content: # If the expected subsection is not present - error_list.append(f"Section '{self.name}' is missing subsection: '{name}'.") + error_list.append(f"Section `{self.name}` is missing subsection: `{name}`.") else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list @@ -135,7 +135,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: if name not in structure_names: # If an extra subsection is present warning_list.append( - f"'{self.name}' has an extra subsection: '{name}'. Skipping further validation checks for this subsection as expected structure is unknown." + f"`{self.name}` has an extra subsection: `{name}`. Skipping further validation checks for this subsection as expected structure is unknown." ) error_list = self.parsing_error_list + error_list warning_list = self.parsing_warning_list + warning_list diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 2eccc4af8f6..6b1ef94dece 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -140,7 +140,19 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section 'Dataset Summary'." +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section `Dataset Summary`." + + +README_NONE_SUBSECTION = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +""" +EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." README_MISSING_SUBSECTION = """\ --- @@ -159,7 +171,7 @@ ### Languages """ -EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection 'Dataset Description' is missing subsection: 'Supported Tasks and Leaderboards'." +EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Description` is missing subsection: `Supported Tasks and Leaderboards`." 
README_MISSING_FIRST_LEVEL = """\ --- @@ -243,7 +255,7 @@ ### Languages """ -EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading 'Dataset Card for My Dataset' have been found. Please keep only one of these sections." +EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." def test_readme_from_string_correct(): @@ -257,6 +269,7 @@ def test_readme_from_string_correct(): (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION), (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), @@ -288,6 +301,7 @@ def test_readme_from_readme_correct(): (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML), (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML), (README_EMPTY, EXPECTED_ERROR_README_EMPTY), + (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION), (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL), (README_MISSING_SUBSECTION, EXPECTED_ERROR_README_MISSING_SUBSECTION), (README_MISSING_TEXT, EXPECTED_ERROR_README_MISSING_TEXT), From cd895a1510705ee3c252f21d547b6db44a79ad7b Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 16:23:09 +0530 Subject: [PATCH 20/28] Add incorrect YAML test --- tests/test_readme_util.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 6b1ef94dece..50bcc7af565 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -124,6 +124,23 @@ "The following issues were found for the README at `{path}`:\n-\tNo YAML markers are present in the README." ) +README_INCORRECT_YAML = """\ +--- +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +EXPECTED_ERROR_README_INCORRECT_YAML = ( + "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README." 
+)
 
 README_MISSING_TEXT = """\
 ---
@@ -268,6 +285,7 @@ def test_readme_from_string_correct():
     [
         (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML),
         (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML),
+        (README_INCORRECT_YAML, EXPECTED_ERROR_README_INCORRECT_YAML),
         (README_EMPTY, EXPECTED_ERROR_README_EMPTY),
         (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION),
         (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL),
@@ -300,6 +318,7 @@ def test_readme_from_readme_correct():
     [
         (README_NO_YAML, EXPECTED_ERROR_README_NO_YAML),
         (README_EMPTY_YAML, EXPECTED_ERROR_README_EMPTY_YAML),
+        (README_INCORRECT_YAML, EXPECTED_ERROR_README_INCORRECT_YAML),
         (README_EMPTY, EXPECTED_ERROR_README_EMPTY),
         (README_NONE_SUBSECTION, EXPECTED_ERROR_README_NONE_SUBSECTION),
         (README_MISSING_FIRST_LEVEL, EXPECTED_ERROR_README_MISSING_FIRST_LEVEL),
From a3bdb1f13a9cd35d4ac060daf8bf0269fd2c1995 Mon Sep 17 00:00:00 2001
From: Gunjan Chhablani <chhablani.gunjan@gmail.com>
Date: Sun, 2 May 2021 16:23:37 +0530
Subject: [PATCH 21/28] Fix style

---
 src/datasets/utils/readme.py | 4 +++-
 tests/test_readme_util.py    | 4 +---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py
index 9b0a393d845..b9814544bfb 100644
--- a/src/datasets/utils/readme.py
+++ b/src/datasets/utils/readme.py
@@ -114,7 +114,9 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput:
                 # If no subsections are present
                 values = [subsection["name"] for subsection in structure["subsections"]]
                 # Mention the expected values in the error_list
-                error_list.append(f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'.")
+                error_list.append(
+                    f"Section `{self.name}` expected the following subsections: {', '.join(['`'+x+'`' for x in values])}. Found 'None'."
+                )
             else:
                 # If some subsections are present
                 structure_names = [subsection["name"] for subsection in structure["subsections"]]
diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py
index 50bcc7af565..2eed7947db0 100644
--- a/tests/test_readme_util.py
+++ b/tests/test_readme_util.py
@@ -137,9 +137,7 @@
 ### Languages
 """
 
-EXPECTED_ERROR_README_INCORRECT_YAML = (
-    "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README."
README_MISSING_TEXT = """\ --- From 6e85d4a5c641088588c4f246ee32ea06cb444d49 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 19:27:21 +0530 Subject: [PATCH 22/28] Fix tests --- tests/test_readme_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 2eed7947db0..c7a9d112fd6 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -332,5 +332,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - with pytest.raises(ValueError, match=expected_error.format(path=path)): + expected_error = expected_error.format(path=path) + with pytest.raises(ValueError, match=expected_error): ReadMe.from_readme(path, example_yaml_structure) From 10386e719283475598f39fc848c6c0a9cc7b039d Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 19:42:43 +0530 Subject: [PATCH 23/28] Fix tests --- tests/test_readme_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index c7a9d112fd6..bb994496bb9 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -332,6 +332,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - expected_error = expected_error.format(path=path) + expected_error = expected_error.format(path=path).encode('unicode_escape').decode('ascii') with pytest.raises(ValueError, match=expected_error): ReadMe.from_readme(path, example_yaml_structure) From b4ca9ca45495942c9508d0cc065d19f281172157 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Sun, 2 May 2021 19:44:32 +0530 Subject: [PATCH 24/28] Fix style --- tests/test_readme_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index bb994496bb9..ff2ae4afc37 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -332,6 +332,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - expected_error = expected_error.format(path=path).encode('unicode_escape').decode('ascii') + expected_error = expected_error.format(path=path).encode("unicode_escape").decode("ascii") with pytest.raises(ValueError, match=expected_error): ReadMe.from_readme(path, example_yaml_structure) From a69c019274d7429ed121b960c8ffa2ccb2fafc33 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Tue, 4 May 2021 18:30:59 +0530 Subject: [PATCH 25/28] Fix escape character issue --- tests/test_readme_util.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index ff2ae4afc37..9d06648823b 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -1,3 +1,4 @@ +import re import tempfile from pathlib import Path @@ -332,6 +333,6 @@ def test_readme_from_readme_error(readme_md, expected_error): path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: readme_file.write(readme_md) - expected_error = expected_error.format(path=path).encode("unicode_escape").decode("ascii") - with pytest.raises(ValueError, match=expected_error): + expected_error = expected_error.format(path=path) + with pytest.raises(ValueError, 
match=re.escape(expected_error)): ReadMe.from_readme(path, example_yaml_structure) From d45ec9be5bcf0035d33d2d9e27575f87afa5f379 Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 7 May 2021 21:47:03 +0530 Subject: [PATCH 26/28] Add three-level heading validation limit --- src/datasets/utils/readme.py | 13 ++++-- tests/test_readme_util.py | 88 ++++++++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 12 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index b9814544bfb..803759e21e7 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -105,7 +105,9 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: # If header text is expected if self.is_empty: # If no header text is found, mention it in the error_list - error_list.append(f"Expected some header text for section `{self.name}`.") + error_list.append( + f"Expected some text in section `{self.name}` but it is empty (text in subsections are ignored)." + ) # Subsections Validation if structure["subsections"] is not None: @@ -127,9 +129,12 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list - _, subsec_error_list, subsec_warning_list = self.content[name].validate( - structure["subsections"][idx] - ) + if self.level == "###": + continue + else: + _, subsec_error_list, subsec_warning_list = self.content[name].validate( + structure["subsections"][idx] + ) error_list += subsec_error_list warning_list += subsec_warning_list diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 9d06648823b..2b4b2acf1de 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -10,6 +10,7 @@ # @pytest.fixture # def example_yaml_structure(): + example_yaml_structure = yaml.safe_load( """\ name: "" @@ -91,6 +92,64 @@ ### Languages """ + +README_CORRECT_FOUR_LEVEL = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +#### Extra Ignored Subsection +### Supported Tasks and Leaderboards +### Languages +""" + +CORRECT_DICT_FOUR_LEVEL = { + "name": "root", + "text": "", + "is_empty": True, + "subsections": [ + { + "name": "Dataset Card for My Dataset", + "text": "", + "is_empty": True, + "subsections": [ + {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + { + "name": "Dataset Description", + "text": "Some text here.", + "is_empty": False, + "subsections": [ + { + "name": "Dataset Summary", + "text": "Some text here.", + "is_empty": False, + "subsections": [ + {"name": "Extra Ignored Subsection", "text": "", "is_empty": True, "subsections": []} + ], + }, + { + "name": "Supported Tasks and Leaderboards", + "text": "", + "is_empty": True, + "subsections": [], + }, + {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + ], + }, + ], + } + ], +} + README_EMPTY_YAML = """\ --- --- @@ -156,7 +215,7 @@ ### Supported Tasks and Leaderboards ### Languages """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some header text for section `Dataset Summary`." 
+EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." README_NONE_SUBSECTION = """\ @@ -274,9 +333,15 @@ EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." -def test_readme_from_string_correct(): - - assert ReadMe.from_string(README_CORRECT, example_yaml_structure).to_dict() == CORRECT_DICT +@pytest.mark.parametrize( + "readme_md, expected_dict", + [ + (README_CORRECT, CORRECT_DICT), + (README_CORRECT_FOUR_LEVEL, CORRECT_DICT_FOUR_LEVEL), + ], +) +def test_readme_from_string_correct(readme_md, expected_dict): + assert ReadMe.from_string(readme_md, example_yaml_structure).to_dict() == expected_dict @pytest.mark.parametrize( @@ -296,20 +361,27 @@ def test_readme_from_string_correct(): ], ) def test_readme_from_string_errors(readme_md, expected_error): - with pytest.raises(ValueError, match=expected_error.format(path="root")): + with pytest.raises(ValueError, match=re.escape(expected_error.format(path="root"))): ReadMe.from_string(readme_md, example_yaml_structure) -def test_readme_from_readme_correct(): +@pytest.mark.parametrize( + "readme_md, expected_dict", + [ + (README_CORRECT, CORRECT_DICT), + (README_CORRECT_FOUR_LEVEL, CORRECT_DICT_FOUR_LEVEL), + ], +) +def test_readme_from_readme_correct(readme_md, expected_dict): with tempfile.TemporaryDirectory() as tmp_dir: path = Path(tmp_dir) / "README.md" with open(path, "w+") as readme_file: - readme_file.write(README_CORRECT) + readme_file.write(readme_md) out = ReadMe.from_readme(path, example_yaml_structure).to_dict() assert out["name"] == path assert out["text"] == "" assert out["is_empty"] - assert out["subsections"] == CORRECT_DICT["subsections"] + assert out["subsections"] == expected_dict["subsections"] @pytest.mark.parametrize( From cdcffe0818fa5e8ca1e0a8f4a4215bd0e3a8a1bc Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 7 May 2021 23:08:18 +0530 Subject: [PATCH 27/28] Add either text or subsection option --- src/datasets/utils/readme.py | 23 +++-- .../utils/resources/readme_structure.yaml | 39 ++++++-- tests/test_readme_util.py | 89 ++++++++++++++----- 3 files changed, 117 insertions(+), 34 deletions(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 803759e21e7..6d4b8cb1a10 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -46,7 +46,7 @@ class Section: def __post_init__(self): self.text = "" - self.is_empty = True + self.is_empty_text = True self.content = {} self.parsing_error_list = [] self.parsing_warning_list = [] @@ -70,7 +70,7 @@ def parse(self): if current_lines != []: self.text += "".join(current_lines).strip() if self.text != "" and self.text not in FILLER_TEXT: - self.is_empty = False + self.is_empty_text = False current_lines = [] current_sub_level = " ".join(line.split()[1:]).strip(" \n") @@ -87,7 +87,7 @@ def parse(self): if current_lines != []: self.text += "".join(current_lines).strip() if self.text != "" and self.text not in FILLER_TEXT: - self.is_empty = False + self.is_empty_text = False def validate(self, structure: dict) -> ReadmeValidatorOutput: """Validates a Section class object recursively using the structure provided as a dictionary. 
@@ -102,13 +102,18 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: error_list = [] warning_list = [] if structure["allow_empty"] is False: - # If header text is expected - if self.is_empty: - # If no header text is found, mention it in the error_list + # If content is expected + if self.is_empty_text and self.content == {}: + # If no content is found, mention it in the error_list + error_list.append(f"Expected some content in section `{self.name}` but it is empty.") + + if structure["allow_empty_text"] is False: + # If some text is expected + if self.is_empty_text: + # If no text is found, mention it in the error_list error_list.append( f"Expected some text in section `{self.name}` but it is empty (text in subsections are ignored)." ) - # Subsections Validation if structure["subsections"] is not None: # If subsections are expected @@ -129,6 +134,8 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: else: # If the subsection is present, validate subsection, return the result # and concat the errors from subsection to section error_list + + # Skip sublevel validation if current level is `###` if self.level == "###": continue else: @@ -157,7 +164,7 @@ def to_dict(self) -> dict: return { "name": self.name, "text": self.text, - "is_empty": self.is_empty, + "is_empty_text": self.is_empty_text, "subsections": [value.to_dict() for value in self.content.values()], } diff --git a/src/datasets/utils/resources/readme_structure.yaml b/src/datasets/utils/resources/readme_structure.yaml index fc3356f2675..755483d1d4f 100644 --- a/src/datasets/utils/resources/readme_structure.yaml +++ b/src/datasets/utils/resources/readme_structure.yaml @@ -1,87 +1,116 @@ name: "" # Filename comes here allow_empty: false +allow_empty_text: true subsections: - name: "Dataset Card for X" # First-level markdown heading - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Table of Contents" allow_empty: false + allow_empty_text: false subsections: null # meaning it should not be checked. - name: "Dataset Description" allow_empty: false + allow_empty_text: false subsections: - name: "Dataset Summary" allow_empty: false + allow_empty_text: false subsections: null - name: "Supported Tasks and Leaderboards" allow_empty: true + allow_empty_text: true subsections: null - name: Languages allow_empty: true + allow_empty_text: true subsections: null - name: "Dataset Structure" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Data Instances" allow_empty: false + allow_empty_text: false subsections: null - name: "Data Fields" allow_empty: false + allow_empty_text: false subsections: null - name: "Data Splits" allow_empty: false + allow_empty_text: false subsections: null - name: "Dataset Creation" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Curation Rationale" allow_empty: true + allow_empty_text: true subsections: null - name: "Source Data" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Initial Data Collection and Normalization" allow_empty: true + allow_empty_text: true subsections: null - name: "Who are the source language producers?" allow_empty: true + allow_empty_text: true subsections: null - name: "Annotations" - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Annotation process" allow_empty: true + allow_empty_text: true subsections: null - name: "Who are the annotators?" 
allow_empty: true + allow_empty_text: true subsections: null - name: "Personal and Sensitive Information" allow_empty: true + allow_empty_text: true subsections: null - name: "Considerations for Using the Data" allow_empty: true + allow_empty_text: true subsections: - name: "Social Impact of Dataset" allow_empty: true + allow_empty_text: true subsections: null - name: "Discussion of Biases" allow_empty: true + allow_empty_text: true subsections: null - name: "Other Known Limitations" allow_empty: true + allow_empty_text: true subsections: null - name: "Additional Information" allow_empty: true + allow_empty_text: true subsections: - name: "Dataset Curators" allow_empty: true + allow_empty_text: true subsections: null - name: "Licensing Information" allow_empty: true + allow_empty_text: true subsections: null - name: "Citation Information" allow_empty: false + allow_empty_text: true subsections: null - name: "Contributions" allow_empty: false + allow_empty_text: false subsections: null diff --git a/tests/test_readme_util.py b/tests/test_readme_util.py index 2b4b2acf1de..711c6ea55c5 100644 --- a/tests/test_readme_util.py +++ b/tests/test_readme_util.py @@ -15,24 +15,31 @@ """\ name: "" allow_empty: false +allow_empty_text: true subsections: - name: "Dataset Card for X" # First-level markdown heading - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: - name: "Table of Contents" allow_empty: false - subsections: null # meaning it should not be checked. + allow_empty_text: false + subsections: null - name: "Dataset Description" allow_empty: false + allow_empty_text: false subsections: - name: "Dataset Summary" allow_empty: false + allow_empty_text: false subsections: null - name: "Supported Tasks and Leaderboards" allow_empty: true + allow_empty_text: true subsections: null - name: Languages - allow_empty: true + allow_empty: false + allow_empty_text: true subsections: null """ ) @@ -41,32 +48,32 @@ CORRECT_DICT = { "name": "root", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ { "name": "Dataset Card for My Dataset", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ - {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + {"name": "Table of Contents", "text": "Some text here.", "is_empty_text": False, "subsections": []}, { "name": "Dataset Description", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [ { "name": "Dataset Summary", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [], }, { "name": "Supported Tasks and Leaderboards", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [], }, - {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + {"name": "Languages", "text": "Language Text", "is_empty_text": False, "subsections": []}, ], }, ], @@ -74,6 +81,7 @@ ], } + README_CORRECT = """\ --- languages: @@ -90,6 +98,7 @@ Some text here. 
### Supported Tasks and Leaderboards ### Languages +Language Text """ @@ -110,39 +119,45 @@ #### Extra Ignored Subsection ### Supported Tasks and Leaderboards ### Languages +Language Text """ CORRECT_DICT_FOUR_LEVEL = { "name": "root", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ { "name": "Dataset Card for My Dataset", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [ - {"name": "Table of Contents", "text": "Some text here.", "is_empty": False, "subsections": []}, + {"name": "Table of Contents", "text": "Some text here.", "is_empty_text": False, "subsections": []}, { "name": "Dataset Description", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [ { "name": "Dataset Summary", "text": "Some text here.", - "is_empty": False, + "is_empty_text": False, "subsections": [ - {"name": "Extra Ignored Subsection", "text": "", "is_empty": True, "subsections": []} + { + "name": "Extra Ignored Subsection", + "text": "", + "is_empty_text": True, + "subsections": [], + } ], }, { "name": "Supported Tasks and Leaderboards", "text": "", - "is_empty": True, + "is_empty_text": True, "subsections": [], }, - {"name": "Languages", "text": "", "is_empty": True, "subsections": []}, + {"name": "Languages", "text": "Language Text", "is_empty_text": False, "subsections": []}, ], }, ], @@ -162,6 +177,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_EMPTY_YAML = ( @@ -178,6 +194,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_NO_YAML = ( @@ -195,6 +212,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_INCORRECT_YAML = "The following issues were found for the README at `{path}`:\n-\tOnly the start of YAML tags present in the README." @@ -214,8 +232,9 @@ ### Dataset Summary ### Supported Tasks and Leaderboards ### Languages +Language Text """ -EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." +EXPECTED_ERROR_README_MISSING_TEXT = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Dataset Summary` but it is empty.\n-\tExpected some text in section `Dataset Summary` but it is empty (text in subsections are ignored)." README_NONE_SUBSECTION = """\ @@ -227,7 +246,7 @@ # Dataset Card for My Dataset """ -EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." +EXPECTED_ERROR_README_NONE_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Dataset Card for My Dataset` but it is empty.\n-\tSection `Dataset Card for My Dataset` expected the following subsections: `Table of Contents`, `Dataset Description`. Found 'None'." README_MISSING_SUBSECTION = """\ --- @@ -244,10 +263,32 @@ ### Dataset Summary Some text here. ### Languages +Language Text """ EXPECTED_ERROR_README_MISSING_SUBSECTION = "The following issues were found for the README at `{path}`:\n-\tSection `Dataset Description` is missing subsection: `Supported Tasks and Leaderboards`." 
+ +README_MISSING_CONTENT = """\ +--- +languages: +- zh +- en +--- + +# Dataset Card for My Dataset +## Table of Contents +Some text here. +## Dataset Description +Some text here. +### Dataset Summary +Some text here. +### Supported Tasks and Leaderboards +### Languages +""" + +EXPECTED_ERROR_README_MISSING_CONTENT = "The following issues were found for the README at `{path}`:\n-\tExpected some content in section `Languages` but it is empty." + README_MISSING_FIRST_LEVEL = """\ --- languages: @@ -263,6 +304,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_MISSING_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tThe README has no first-level headings. One heading is expected. Skipping further validation for this README." @@ -282,6 +324,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text # Dataset Card My Dataset """ @@ -303,6 +346,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_WRONG_FIRST_LEVEL = "The following issues were found for the README at `{path}`:\n-\tNo first-level heading starting with `Dataset Card for` found in README. Skipping further validation for this README." @@ -328,6 +372,7 @@ Some text here. ### Supported Tasks and Leaderboards ### Languages +Language Text """ EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1 = "The following issues were found for the README at `{path}`:\n-\tMultiple sections with the same heading `Dataset Card for My Dataset` have been found. Please keep only one of these sections." @@ -358,6 +403,7 @@ def test_readme_from_string_correct(readme_md, expected_dict): (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), ], ) def test_readme_from_string_errors(readme_md, expected_error): @@ -380,7 +426,7 @@ def test_readme_from_readme_correct(readme_md, expected_dict): out = ReadMe.from_readme(path, example_yaml_structure).to_dict() assert out["name"] == path assert out["text"] == "" - assert out["is_empty"] + assert out["is_empty_text"] assert out["subsections"] == expected_dict["subsections"] @@ -398,6 +444,7 @@ def test_readme_from_readme_correct(readme_md, expected_dict): (README_MULTIPLE_SAME_HEADING_1, EXPECTED_ERROR_README_MULTIPLE_SAME_HEADING_1), (README_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_WRONG_FIRST_LEVEL), (README_MULTIPLE_WRONG_FIRST_LEVEL, EXPECTED_ERROR_README_MULTIPLE_WRONG_FIRST_LEVEL), + (README_MISSING_CONTENT, EXPECTED_ERROR_README_MISSING_CONTENT), ], ) def test_readme_from_readme_error(readme_md, expected_error): From ffdfcb642dae76a0b80b000ab76984f8d4dd9f4f Mon Sep 17 00:00:00 2001 From: Gunjan Chhablani Date: Fri, 7 May 2021 23:18:57 +0530 Subject: [PATCH 28/28] Fix style --- src/datasets/utils/readme.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/datasets/utils/readme.py b/src/datasets/utils/readme.py index 6d4b8cb1a10..45b8713f085 100644 --- a/src/datasets/utils/readme.py +++ b/src/datasets/utils/readme.py @@ -106,7 +106,7 @@ def validate(self, structure: dict) -> ReadmeValidatorOutput: if self.is_empty_text and self.content == {}: # If no content is found, mention it in the error_list error_list.append(f"Expected some content in section `{self.name}` but it is 
empty.") - + if structure["allow_empty_text"] is False: # If some text is expected if self.is_empty_text: