Add The Pile dataset and PubMed Central subset

huggingface · Nov 17, 2021 · c88b62b · c88b62b · github-actions · Nov 17, 2021
1 parent d8a998c
commit c88b62b
Showing 1 changed file with 156 additions and 0 deletions.
diff --git a/datasets/the_pile/the_pile.py b/datasets/the_pile/the_pile.py
@@ -0,0 +1,156 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The Pile dataset."""
+
+import json
+
+import datasets
+
+
+# BibTeX entry handed to DatasetInfo as the dataset citation.
+_CITATION = """\
+@misc{gao2020pile,
+      title={The Pile: An 800GB Dataset of Diverse Text for Language Modeling}, 
+      author={Leo Gao and Stella Biderman and Sid Black and Laurence Golding and Travis Hoppe and Charles Foster and Jason Phang and Horace He and Anish Thite and Noa Nabeshima and Shawn Presser and Connor Leahy},
+      year={2020},
+      eprint={2101.00027},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
+}
+"""
+
+# Human-readable summary handed to DatasetInfo as the dataset description.
+_DESCRIPTION = """\
+The Pile is a 825 GiB diverse, open source language modelling data set that consists of 22 smaller, high-quality
+datasets combined together.
+"""
+
+# Project homepage handed to DatasetInfo.
+_HOMEPAGE = "https://pile.eleuther.ai/"
+
+# License string for each config, keyed by config name (same keys as _DATA_URLS).
+_LICENSES = {
+    "all": "MIT License",
+    "pubmed_central": "MIT License",
+}
+
+# Download locations keyed by config name. The keys of this dict also define
+# which builder configs exist (the builder iterates over it).
+#   - "all": maps each split name to a list of `.jsonl.zst` URLs; the train
+#     split is spread over 30 shards numbered 00..29.
+#   - "pubmed_central": a single `.tar.gz` archive URL.
+_DATA_URLS = {
+    "all": {
+        "train": [f"https://the-eye.eu/public/AI/pile/train/{i:0>2}.jsonl.zst" for i in range(30)],
+        "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"],
+        "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"],
+    },
+    "pubmed_central": "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz",
+}
+
+# Feature schema for each config, keyed by config name.
+#   - "all": the text plus a nested "meta" record holding "pile_set_name".
+#   - "pubmed_central": an "id" and the text.
+_FEATURES = {
+    "all": datasets.Features({
+        "text": datasets.Value("string"),
+        "meta": {"pile_set_name": datasets.Value("string")},
+    }),
+    "pubmed_central": datasets.Features({
+        "id": datasets.Value("string"),
+        "text": datasets.Value("string"),
+    }),
+}
+
+
+class ThePileConfig(datasets.BuilderConfig):
+    """Builder configuration recording which subsets of The Pile to load."""
+
+    def __init__(self, *args, subsets, **kwargs):
+        """Create a config named after the requested subsets.
+
+        Args:
+            *args: positional arguments forwarded to super.
+            subsets (:obj:`List[str]`): Names of the subsets to load. The
+                config name is these names joined with ``"+"``.
+            **kwargs: keyword arguments forwarded to super.
+        """
+        config_name = "+".join(subsets)
+        super().__init__(*args, name=config_name, **kwargs)
+        self.subsets = subsets
+
+
+class ThePile(datasets.GeneratorBasedBuilder):
+    """Dataset builder for The Pile.
+
+    One single-subset config is registered per key of ``_DATA_URLS``; the
+    default config is ``"all"``.
+    """
+
+    VERSION = datasets.Version("1.1.0")
+
+    BUILDER_CONFIG_CLASS = ThePileConfig
+    BUILDER_CONFIGS = [ThePileConfig(subsets=[subset]) for subset in _DATA_URLS]
+    DEFAULT_CONFIG_NAME = "all"
+
+    def _info(self):
+        """Give information and typings for the dataset.
+
+        Returns:
+            :obj:`datasets.DatasetInfo`: metadata for the selected config.
+            Features and license are both looked up by config name.
+        """
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # This defines the different columns of the dataset and their types
+            features=_FEATURES[self.config.name],
+            # If there's a common (input, target) tuple from the features,
+            # specify them here. They'll be used if as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=None,
+            # Homepage of the dataset for documentation
+            homepage=_HOMEPAGE,
+            # License for the dataset if available. Licenses are stored per
+            # config in _LICENSES, so index by config name like the features.
+            license=_LICENSES[self.config.name],
+            # Citation for the dataset
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Return SplitGenerators.
+
+        Args:
+            dl_manager: download manager used to fetch the data files.
+
+        Returns:
+            For the ``"all"`` config, one generator per train/validation/test
+            split whose ``files`` is the list of downloaded-and-extracted
+            paths. For any other config, a single train generator whose
+            ``files`` maps each subset name to an iterator over the members
+            of its downloaded archive.
+        """
+        if self.config.name == "all":
+            data_dir = dl_manager.download_and_extract(_DATA_URLS[self.config.name])
+            return [
+                datasets.SplitGenerator(
+                    name=split,
+                    gen_kwargs={
+                        "files": data_dir[split],
+                    },
+                ) for split in [datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST]
+            ]
+        else:
+            data_urls = {subset: _DATA_URLS[subset] for subset in self.config.subsets}
+            # Download only (no extraction): archive members are streamed
+            # through iter_archive instead.
+            archive = dl_manager.download(data_urls)
+            return [
+                datasets.SplitGenerator(
+                    name=datasets.Split.TRAIN,
+                    gen_kwargs={
+                        "files": {subset: dl_manager.iter_archive(archive[subset]) for subset in self.config.subsets},
+                    },
+                ),
+            ]
+
+    def _generate_examples(self, files):
+        """Yield examples as (key, example) tuples.
+
+        Args:
+            files: either a list of local file paths holding one JSON
+                document per line, or a dict mapping a subset name to an
+                iterable of ``(path, file_object)`` pairs.
+
+        Yields:
+            ``(key, example)`` tuples, where ``key`` is a running integer
+            counter starting at 0.
+        """
+        key = 0
+        if isinstance(files, list):
+            for path in files:
+                with open(path, encoding="utf-8") as f:
+                    for row in f:
+                        data = json.loads(row)
+                        yield key, data
+                        key += 1
+        else:
+            for subset in files:
+                if subset == "pubmed_central":
+                    for path, file in files[subset]:
+                        # The example id is the member's base file name up to
+                        # its first dot.
+                        id_ = path.split("/")[-1].split(".")[0]
+                        text = file.read().decode("utf-8")
+                        yield key, {
+                            "id": id_,
+                            "text": text,
+                        }
+                        key += 1