Skip to content

Commit

Permalink
Add The Pile dataset and PubMed Central subset
Browse files Browse the repository at this point in the history
  • Loading branch information
albertvillanova committed Nov 17, 2021
1 parent d8a998c commit c88b62b
Showing 1 changed file with 156 additions and 0 deletions.
156 changes: 156 additions & 0 deletions datasets/the_pile/the_pile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The Pile dataset."""

import json

import datasets


# BibTeX entry for the paper introducing The Pile.
_CITATION = """\
@misc{gao2020pile,
title={The Pile: An 800GB Dataset of Diverse Text for Language Modeling},
author={Leo Gao and Stella Biderman and Sid Black and Laurence Golding and Travis Hoppe and Charles Foster and Jason Phang and Horace He and Anish Thite and Noa Nabeshima and Shawn Presser and Connor Leahy},
year={2020},
eprint={2101.00027},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""

# Short human-readable summary of the dataset.
_DESCRIPTION = """\
The Pile is a 825 GiB diverse, open source language modelling data set that consists of 22 smaller, high-quality
datasets combined together.
"""

_HOMEPAGE = "https://pile.eleuther.ai/"

# License string for each configuration, keyed by configuration name.
_LICENSES = dict(
    all="MIT License",
    pubmed_central="MIT License",
)

# Download locations, keyed by configuration name:
#   * "all" maps each split name to a list of file URLs (the train split is
#     spread over 30 shards numbered 00 through 29);
#   * "pubmed_central" is a single archive URL.
_DATA_URLS = {
    "all": {
        "train": [
            f"https://the-eye.eu/public/AI/pile/train/{shard:02d}.jsonl.zst"
            for shard in range(30)
        ],
        "validation": ["https://the-eye.eu/public/AI/pile/val.jsonl.zst"],
        "test": ["https://the-eye.eu/public/AI/pile/test.jsonl.zst"],
    },
    "pubmed_central": "https://the-eye.eu/public/AI/pile_preliminary_components/PMC_extracts.tar.gz",
}

# Column schema for each configuration, keyed by configuration name.
_FEATURES = {
    # Full Pile: the document text plus a nested "meta" record holding the
    # name of the Pile subset.
    "all": datasets.Features(
        {
            "text": datasets.Value("string"),
            "meta": {"pile_set_name": datasets.Value("string")},
        }
    ),
    # PubMed Central subset: an identifier plus the document text.
    "pubmed_central": datasets.Features(
        {
            "id": datasets.Value("string"),
            "text": datasets.Value("string"),
        }
    ),
}


class ThePileConfig(datasets.BuilderConfig):
    """Builder configuration selecting which subsets of The Pile to load."""

    def __init__(self, *args, subsets, **kwargs):
        """Create a configuration named after the requested subsets.

        The configuration name is the subset names joined with ``"+"``.

        Args:
            *args: positional arguments forwarded to super.
            subsets (:obj:`List[str]`): List of subsets to load.
            **kwargs: keyword arguments forwarded to super.
        """
        config_name = "+".join(subsets)
        super().__init__(*args, name=config_name, **kwargs)
        self.subsets = subsets


class ThePile(datasets.GeneratorBasedBuilder):
    """Dataset builder for The Pile.

    One builder configuration is registered per key of ``_DATA_URLS``; the
    ``"all"`` configuration is the default.
    """

    VERSION = datasets.Version("1.1.0")

    BUILDER_CONFIG_CLASS = ThePileConfig
    BUILDER_CONFIGS = [ThePileConfig(subsets=[subset]) for subset in _DATA_URLS]
    DEFAULT_CONFIG_NAME = "all"

    def _info(self):
        """Give information and typings for the dataset.

        Returns:
            A ``datasets.DatasetInfo`` whose features and license are looked
            up by the active configuration name.

        Raises:
            KeyError: if the configuration name has no entry in ``_FEATURES``
                or ``_LICENSES``.
        """
        return datasets.DatasetInfo(
            # This is the description that will appear on the datasets page.
            description=_DESCRIPTION,
            # This defines the different columns of the dataset and their types
            features=_FEATURES[self.config.name],
            # If there's a common (input, target) tuple from the features,
            # specify them here. They'll be used if as_supervised=True in
            # builder.as_dataset.
            supervised_keys=None,
            # Homepage of the dataset for documentation
            homepage=_HOMEPAGE,
            # License for the dataset if available. Looked up per
            # configuration in ``_LICENSES``, the same way as the features.
            license=_LICENSES[self.config.name],
            # Citation for the dataset
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return SplitGenerators.

        For the ``"all"`` configuration the files of every split are
        downloaded and extracted, and one generator is returned for each of
        the train, validation and test splits. For any other configuration
        the archive of each requested subset is downloaded without being
        extracted, and a single train split is returned whose ``files``
        argument maps each subset name to the result of
        ``dl_manager.iter_archive`` on that subset's archive.

        Args:
            dl_manager: download manager supplied by the ``datasets`` library.

        Returns:
            A list of ``datasets.SplitGenerator``.
        """
        if self.config.name == "all":
            # NOTE(review): relies on download_and_extract returning a mapping
            # with the same split keys as the URL dict, and on
            # datasets.Split members matching the plain-string keys
            # "train" / "validation" / "test" -- confirm against the library.
            data_dir = dl_manager.download_and_extract(_DATA_URLS[self.config.name])
            return [
                datasets.SplitGenerator(
                    name=split,
                    gen_kwargs={
                        "files": data_dir[split],
                    },
                )
                for split in [datasets.Split.TRAIN, datasets.Split.VALIDATION, datasets.Split.TEST]
            ]
        else:
            data_urls = {subset: _DATA_URLS[subset] for subset in self.config.subsets}
            archive = dl_manager.download(data_urls)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "files": {subset: dl_manager.iter_archive(archive[subset]) for subset in self.config.subsets},
                    },
                ),
            ]

    def _generate_examples(self, files):
        """Yield examples as (key, example) tuples.

        Args:
            files: either a list of paths to JSON-lines files, where every
                line is parsed and yielded as one example, or a mapping from
                subset name to an iterable of ``(path, file)`` pairs, where
                each file becomes one example.

        Yields:
            ``(key, example)`` tuples; ``key`` is a running integer counter
            that starts at 0 and keeps increasing across files.
        """
        key = 0
        if isinstance(files, list):
            for path in files:
                with open(path, encoding="utf-8") as f:
                    for row in f:
                        data = json.loads(row)
                        yield key, data
                        key += 1
        else:
            for subset in files:
                # Only the PubMed Central subset is handled here; any other
                # subset name yields no examples.
                if subset == "pubmed_central":
                    for path, file in files[subset]:
                        # Use the file's base name up to its first "." as id.
                        id_ = path.split("/")[-1].split(".")[0]
                        text = file.read().decode("utf-8")
                        yield key, {
                            "id": id_,
                            "text": text,
                        }
                        key += 1

1 comment on commit c88b62b

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.075907 / 0.011353 (0.064554) 0.004166 / 0.011008 (-0.006842) 0.031569 / 0.038508 (-0.006939) 0.038632 / 0.023109 (0.015523) 0.288825 / 0.275898 (0.012927) 0.332129 / 0.323480 (0.008650) 0.088952 / 0.007986 (0.080966) 0.004483 / 0.004328 (0.000154) 0.009353 / 0.004250 (0.005103) 0.046482 / 0.037052 (0.009430) 0.292578 / 0.258489 (0.034089) 0.331134 / 0.293841 (0.037293) 0.086066 / 0.128546 (-0.042481) 0.009292 / 0.075646 (-0.066354) 0.253058 / 0.419271 (-0.166213) 0.046510 / 0.043533 (0.002977) 0.297827 / 0.255139 (0.042688) 0.316470 / 0.283200 (0.033271) 0.087673 / 0.141683 (-0.054010) 1.793391 / 1.452155 (0.341236) 1.811539 / 1.492716 (0.318822)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.310523 / 0.018006 (0.292517) 0.569203 / 0.000490 (0.568713) 0.003604 / 0.000200 (0.003404) 0.000092 / 0.000054 (0.000038)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.037822 / 0.037411 (0.000411) 0.023094 / 0.014526 (0.008568) 0.030106 / 0.176557 (-0.146450) 0.200195 / 0.737135 (-0.536940) 0.030655 / 0.296338 (-0.265683)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.421769 / 0.215209 (0.206560) 4.222471 / 2.077655 (2.144817) 1.797840 / 1.504120 (0.293721) 1.586083 / 1.541195 (0.044889) 1.711199 / 1.468490 (0.242708) 0.419395 / 4.584777 (-4.165382) 4.539257 / 3.745712 (0.793545) 3.564953 / 5.269862 (-1.704909) 0.883815 / 4.565676 (-3.681861) 0.050310 / 0.424275 (-0.373965) 0.011073 / 0.007607 (0.003466) 0.529015 / 0.226044 (0.302971) 5.270088 / 2.268929 (3.001159) 2.296465 / 55.444624 (-53.148159) 1.906762 / 6.876477 (-4.969714) 2.067494 / 2.142072 (-0.074578) 0.535147 / 4.805227 (-4.270081) 0.115240 / 6.500664 (-6.385424) 0.057062 / 0.075469 (-0.018407)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.558582 / 1.841788 (-0.283206) 12.098505 / 8.074308 (4.024197) 24.948519 / 10.191392 (14.757127) 0.607663 / 0.680424 (-0.072761) 0.452559 / 0.534201 (-0.081642) 0.327603 / 0.579283 (-0.251680) 0.535659 / 0.434364 (0.101296) 0.225629 / 0.540337 (-0.314709) 0.255296 / 1.386936 (-1.131640)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.062234 / 0.011353 (0.050881) 0.003767 / 0.011008 (-0.007241) 0.026582 / 0.038508 (-0.011926) 0.030977 / 0.023109 (0.007868) 0.259232 / 0.275898 (-0.016666) 0.289746 / 0.323480 (-0.033734) 0.083406 / 0.007986 (0.075420) 0.003524 / 0.004328 (-0.000804) 0.006831 / 0.004250 (0.002580) 0.036162 / 0.037052 (-0.000890) 0.257081 / 0.258489 (-0.001408) 0.297362 / 0.293841 (0.003521) 0.075848 / 0.128546 (-0.052698) 0.008329 / 0.075646 (-0.067317) 0.221083 / 0.419271 (-0.198189) 0.041994 / 0.043533 (-0.001539) 0.259268 / 0.255139 (0.004129) 0.276895 / 0.283200 (-0.006305) 0.084217 / 0.141683 (-0.057466) 1.481090 / 1.452155 (0.028935) 1.524700 / 1.492716 (0.031984)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.329197 / 0.018006 (0.311191) 0.537441 / 0.000490 (0.536951) 0.002080 / 0.000200 (0.001880) 0.000089 / 0.000054 (0.000035)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.030658 / 0.037411 (-0.006754) 0.019124 / 0.014526 (0.004599) 0.026923 / 0.176557 (-0.149634) 0.179464 / 0.737135 (-0.557671) 0.028582 / 0.296338 (-0.267756)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.394403 / 0.215209 (0.179194) 3.970659 / 2.077655 (1.893004) 1.781225 / 1.504120 (0.277105) 1.592347 / 1.541195 (0.051152) 1.710659 / 1.468490 (0.242169) 0.373069 / 4.584777 (-4.211708) 4.285758 / 3.745712 (0.540046) 1.933385 / 5.269862 (-3.336477) 0.796450 / 4.565676 (-3.769226) 0.044108 / 0.424275 (-0.380167) 0.010035 / 0.007607 (0.002428) 0.487690 / 0.226044 (0.261646) 4.883693 / 2.268929 (2.614764) 2.156229 / 55.444624 (-53.288395) 1.816015 / 6.876477 (-5.060461) 1.974982 / 2.142072 (-0.167090) 0.472388 / 4.805227 (-4.332839) 0.101413 / 6.500664 (-6.399251) 0.050419 / 0.075469 (-0.025050)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.369683 / 1.841788 (-0.472105) 11.836493 / 8.074308 (3.762185) 26.244468 / 10.191392 (16.053076) 0.755372 / 0.680424 (0.074948) 0.531234 / 0.534201 (-0.002967) 0.374510 / 0.579283 (-0.204773) 0.524679 / 0.434364 (0.090315) 0.226800 / 0.540337 (-0.313538) 0.253324 / 1.386936 (-1.133612)

CML watermark

Please sign in to comment.