diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 27309cad610..dcd7d8895c2 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -73,7 +73,7 @@
 from .formatting import format_table, get_format_type_from_alias, get_formatter, query_table
 from .info import DatasetInfo
 from .search import IndexableMixin
-from .splits import NamedSplit, Split
+from .splits import NamedSplit, Split, SplitInfo
 from .table import (
     InMemoryTable,
     MemoryMappedTable,
@@ -3404,7 +3404,7 @@ def to_parquet(

         return ParquetDatasetWriter(self, path_or_buf, batch_size=batch_size, **parquet_writer_kwargs).write()

-    def push_to_hub(
+    def _push_parquet_shards_to_hub(
         self,
         repo_id: str,
         split: Optional[str] = None,
@@ -3412,7 +3412,7 @@
         token: Optional[str] = None,
         branch: Optional[str] = None,
         shard_size: Optional[int] = 500 << 20,
-    ):
+    ) -> Tuple[str, str, int, int]:
         """Pushes the dataset to the hub.
         The dataset is pushed using HTTP requests and does not need to have neither git or git-lfs installed.

@@ -3437,12 +3437,18 @@
                 The size of the dataset shards to be uploaded to the hub. The dataset will be pushed in files
                 of the size specified here, in bytes. Defaults to a shard size of 500MB.

+        Returns:
+            repo_id (:obj:`str`): ID of the repository in `<organization>/<dataset_name>` or `<user>/<dataset_name>` format
+            split (:obj:`str`): name of the uploaded split
+            uploaded_size (:obj:`int`): number of uploaded bytes
+            dataset_nbytes (:obj:`int`): approximate size in bytes of the uploaded dataset after uncompression
+
         Example:
             .. code-block:: python

                 >>> dataset.push_to_hub("<organization>/<dataset_id>", split="evaluation")
         """
-        api = HfApi()
+        api = HfApi(endpoint=config.HF_ENDPOINT)
         token = token if token is not None else HfFolder.get_token()

         if token is None:
@@ -3518,6 +3524,7 @@ def delete_file(file):
             ):
                 delete_file(file)

+        uploaded_size = 0
         for index, shard in utils.tqdm(
             enumerate(shards),
             desc="Pushing dataset shards to the dataset hub",
@@ -3526,6 +3533,7 @@ def delete_file(file):
         ):
             buffer = BytesIO()
             shard.to_parquet(buffer)
+            uploaded_size += buffer.tell()
             api.upload_file(
                 path_or_fileobj=buffer.getvalue(),
                 path_in_repo=path_in_repo(index),
@@ -3535,6 +3543,71 @@ def delete_file(file):
                 revision=branch,
                 identical_ok=True,
             )
+        return repo_id, split, uploaded_size, dataset_nbytes
+
+    def push_to_hub(
+        self,
+        repo_id: str,
+        split: Optional[str] = None,
+        private: Optional[bool] = False,
+        token: Optional[str] = None,
+        branch: Optional[str] = None,
+        shard_size: Optional[int] = 500 << 20,
+    ):
+        """Pushes the dataset to the hub.
+        The dataset is pushed using HTTP requests and does not need to have git or git-lfs installed.
+
+        Args:
+            repo_id (:obj:`str`):
+                The ID of the repository to push to in the following format: `<user>/<dataset_name>` or
+                `<organization>/<dataset_name>`. Also accepts `<dataset_name>`, which will default to the namespace
+                of the logged-in user.
+            split (Optional, :obj:`str`):
+                The name of the split that will be given to that dataset. Defaults to `self.split`.
+            private (Optional :obj:`bool`, defaults to :obj:`False`):
+                Whether the dataset repository should be set to private or not. Only affects repository creation:
+                a repository that already exists will not be affected by that parameter.
+            token (Optional :obj:`str`):
+                An optional authentication token for the Hugging Face Hub. If no token is passed, will default
+                to the token saved locally when logging in with ``huggingface-cli login``. Will raise an error
+                if no token is passed and the user is not logged-in.
+            branch (Optional :obj:`str`):
+                The git branch on which to push the dataset. This defaults to the default branch as specified
+                in your repository, which defaults to `"main"`.
+            shard_size (Optional :obj:`int`):
+                The size of the dataset shards to be uploaded to the hub. The dataset will be pushed in files
+                of the size specified here, in bytes. Defaults to a shard size of 500MB.
+
+        Example:
+            .. code-block:: python
+
+                >>> dataset.push_to_hub("<organization>/<dataset_id>", split="evaluation")
+        """
+        repo_id, split, uploaded_size, dataset_nbytes = self._push_parquet_shards_to_hub(
+            repo_id=repo_id, split=split, private=private, token=token, branch=branch, shard_size=shard_size
+        )
+        organization, dataset_name = repo_id.split("/")
+        info_to_dump = self.info.copy()
+        info_to_dump.download_checksums = None
+        info_to_dump.download_size = uploaded_size
+        info_to_dump.dataset_size = dataset_nbytes
+        info_to_dump.size_in_bytes = uploaded_size + dataset_nbytes
+        info_to_dump.splits = {
+            split: SplitInfo(split, num_bytes=dataset_nbytes, num_examples=len(self), dataset_name=dataset_name)
+        }
+        buffer = BytesIO()
+        buffer.write(f'{{"{organization}--{dataset_name}": '.encode())
+        info_to_dump._dump_info(buffer)
+        buffer.write(b"}")
+        HfApi(endpoint=config.HF_ENDPOINT).upload_file(
+            path_or_fileobj=buffer.getvalue(),
+            path_in_repo=config.DATASETDICT_INFOS_FILENAME,
+            repo_id=repo_id,
+            token=token,
+            repo_type="dataset",
+            revision=branch,
+            identical_ok=True,
+        )

     @transmit_format
     @fingerprint_transform(inplace=False)
diff --git a/src/datasets/builder.py b/src/datasets/builder.py
index df4edce8160..331ea132105 100644
--- a/src/datasets/builder.py
+++ b/src/datasets/builder.py
@@ -203,6 +203,7 @@ def __init__(
         name: Optional[str] = None,
         hash: Optional[str] = None,
         base_path: Optional[str] = None,
+        info: Optional[DatasetInfo] = None,
         features: Optional[Features] = None,
         use_auth_token: Optional[Union[bool, str]] = None,
         namespace: Optional[str] = None,
@@ -263,11 +264,12 @@ def __init__(

         # prepare info: DatasetInfo are a standardized dataclass across all datasets
         # Prefill datasetinfo
-        info = self.get_exported_dataset_info()
-        info.update(self._info())
-        info.builder_name = self.name
-        info.config_name = self.config.name
-        info.version = self.config.version
+        if info is None:
+            info = self.get_exported_dataset_info()
+            info.update(self._info())
+            info.builder_name = self.name
+            info.config_name = self.config.name
+            info.version = self.config.version
         self.info = info
         # update info with user specified infos
         if features is not None:
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 859b5fc1db3..5fd1db0fae7 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -3,23 +3,25 @@
 import json
 import os
 import re
+from io import BytesIO
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import fsspec
 import numpy as np
-
-from datasets.splits import NamedSplit, Split
-from datasets.utils.doc_utils import is_documented_by
+from huggingface_hub import HfApi

 from . import config
 from .arrow_dataset import Dataset
 from .features import Features
 from .filesystems import extract_path_from_uri, is_remote_filesystem
+from .info import DatasetInfo
+from .splits import NamedSplit, Split, SplitDict, SplitInfo
 from .table import Table
 from .tasks import TaskTemplate
 from .utils import logging
 from .utils.deprecation_utils import deprecated
+from .utils.doc_utils import is_documented_by
 from .utils.typing import PathLike
@@ -931,12 +933,41 @@ def push_to_hub(

                 >>> dataset_dict.push_to_hub("<organization>/<dataset_id>")
         """
-        for key in self.keys():
-            logger.warning(f"Pushing split {key} to the Hub.")
+        self._check_values_type()
+        total_uploaded_size = 0
+        total_dataset_nbytes = 0
+        info_to_dump: DatasetInfo = next(iter(self.values())).info.copy()
+        dataset_name = repo_id.split("/")[-1]
+        info_to_dump.splits = SplitDict(dataset_name=dataset_name)
+        for split in self.keys():
+            logger.warning(f"Pushing split {split} to the Hub.")
             # The split=key needs to be removed before merging
-            self[key].push_to_hub(
-                repo_id, split=key, private=private, token=token, branch=branch, shard_size=shard_size
+            repo_id, split, uploaded_size, dataset_nbytes = self[split]._push_parquet_shards_to_hub(
+                repo_id, split=split, private=private, token=token, branch=branch, shard_size=shard_size
             )
+            total_uploaded_size += uploaded_size
+            total_dataset_nbytes += dataset_nbytes
+            info_to_dump.splits[split] = SplitInfo(
+                str(split), num_bytes=dataset_nbytes, num_examples=len(self[split]), dataset_name=dataset_name
+            )
+        organization, dataset_name = repo_id.split("/")
+        info_to_dump.download_checksums = None
+        info_to_dump.download_size = total_uploaded_size
+        info_to_dump.dataset_size = total_dataset_nbytes
+        info_to_dump.size_in_bytes = total_uploaded_size + total_dataset_nbytes
+        buffer = BytesIO()
+        buffer.write(f'{{"{organization}--{dataset_name}": '.encode())
+        info_to_dump._dump_info(buffer)
+        buffer.write(b"}")
+        HfApi(endpoint=config.HF_ENDPOINT).upload_file(
+            path_or_fileobj=buffer.getvalue(),
+            path_in_repo=config.DATASETDICT_INFOS_FILENAME,
+            repo_id=repo_id,
+            token=token,
+            repo_type="dataset",
+            revision=branch,
+            identical_ok=True,
+        )


 class IterableDatasetDict(dict):
diff --git a/src/datasets/info.py b/src/datasets/info.py
index adf49ca6fd3..706aa812b9c 100644
--- a/src/datasets/info.py
+++ b/src/datasets/info.py
@@ -34,7 +34,7 @@
 import json
 import os
 from dataclasses import asdict, dataclass, field
-from typing import List, Optional, Union
+from typing import Dict, List, Optional, Union

 from datasets.tasks.text_classification import TextClassification

@@ -274,7 +274,7 @@ def copy(self) -> "DatasetInfo":
         return self.__class__(**{k: copy.deepcopy(v) for k, v in self.__dict__.items()})


-class DatasetInfosDict(dict):
+class DatasetInfosDict(Dict[str, DatasetInfo]):
     def write_to_directory(self, dataset_infos_dir, overwrite=False):
         total_dataset_infos = {}
         dataset_infos_path = os.path.join(dataset_infos_dir, config.DATASETDICT_INFOS_FILENAME)
diff --git a/src/datasets/load.py b/src/datasets/load.py
index 3622ff01273..d718e047810 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -47,6 +47,7 @@
 from .dataset_dict import DatasetDict, IterableDatasetDict
 from .features import Features
 from .filesystems import extract_path_from_uri, is_remote_filesystem
+from .info import DatasetInfo, DatasetInfosDict
 from .iterable_dataset import IterableDataset
 from .metric import Metric
 from .packaged_modules import _EXTENSION_TO_MODULE, _PACKAGED_DATASETS_MODULES, hash_python_lines
@@ -398,7 +399,7 @@ def _create_importable_file(
     name: str,
     download_mode: GenerateMode,
 ) -> Tuple[str, str]:
-    importable_directory_path = os.path.join(dynamic_modules_path, module_namespace, name.replace("/", "___"))
+    importable_directory_path = os.path.join(dynamic_modules_path, module_namespace, name.replace("/", "--"))
     Path(importable_directory_path).mkdir(parents=True, exist_ok=True)
     (Path(importable_directory_path).parent / "__init__.py").touch(exist_ok=True)
     hash = files_to_hash([local_path] + [loc[1] for loc in local_imports])
@@ -413,7 +414,7 @@ def _create_importable_file(
     )
     logger.debug(f"Created importable dataset file at {importable_local_file}")
     module_path = ".".join(
-        [os.path.basename(dynamic_modules_path), module_namespace, name.replace("/", "___"), hash, name.split("/")[-1]]
+        [os.path.basename(dynamic_modules_path), module_namespace, name.replace("/", "--"), hash, name.split("/")[-1]]
     )
     return module_path, hash

@@ -741,6 +742,11 @@ def get_module(self) -> DatasetModule:
             "name": os.path.basename(self.path),
             "base_path": self.path,
         }
+        if os.path.isfile(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME)):
+            with open(os.path.join(self.path, config.DATASETDICT_INFOS_FILENAME), encoding="utf-8") as f:
+                dataset_infos: DatasetInfosDict = json.load(f)
+            builder_kwargs["name"] = next(iter(dataset_infos))
+            builder_kwargs["info"] = DatasetInfo.from_dict(dataset_infos[builder_kwargs["name"]])
         return DatasetModule(module_path, hash, builder_kwargs)

@@ -799,7 +805,7 @@ def get_module(self) -> DatasetModule:
             token = HfFolder.get_token() if self.download_config.use_auth_token else None
         else:
             token = self.download_config.use_auth_token
-        dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
+        hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info(
             self.name,
             revision=self.revision,
             token=token,
@@ -808,11 +814,11 @@ def get_module(self) -> DatasetModule:
         patterns = (
             sanitize_patterns(self.data_files)
             if self.data_files is not None
-            else get_patterns_in_dataset_repository(dataset_info)
+            else get_patterns_in_dataset_repository(hfh_dataset_info)
         )
         data_files = DataFilesDict.from_hf_repo(
             patterns,
-            dataset_info=dataset_info,
+            dataset_info=hfh_dataset_info,
             allowed_extensions=ALL_ALLOWED_EXTENSIONS,
         )
         infered_module_names = {
@@ -828,9 +834,20 @@ def get_module(self) -> DatasetModule:
         builder_kwargs = {
             "hash": hash,
             "data_files": data_files,
-            "name": self.name.replace("/", "___"),
+            "name": self.name.replace("/", "--"),
             "base_path": hf_hub_url(self.name, "", revision=self.revision),
         }
+        try:
+            dataset_infos_path = cached_path(
+                hf_hub_url(self.name, config.DATASETDICT_INFOS_FILENAME, revision=self.revision),
+                download_config=self.download_config,
+            )
+            with open(dataset_infos_path, encoding="utf-8") as f:
+                dataset_infos: DatasetInfosDict = json.load(f)
+            builder_kwargs["name"] = next(iter(dataset_infos))
+            builder_kwargs["info"] = DatasetInfo.from_dict(dataset_infos[builder_kwargs["name"]])
+        except FileNotFoundError:
+            pass
         return DatasetModule(module_path, hash, builder_kwargs)

@@ -918,7 +935,7 @@ def __init__(

     def get_module(self) -> DatasetModule:
         dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules()
-        importable_directory_path = os.path.join(dynamic_modules_path, "datasets", self.name.replace("/", "___"))
+        importable_directory_path = os.path.join(dynamic_modules_path, "datasets", self.name.replace("/", "--"))
         hashes = (
             [h for h in os.listdir(importable_directory_path) if len(h) == 64]
             if os.path.isdir(importable_directory_path)
@@ -945,7 +962,7 @@ def _get_modification_time(module_hash):
                 [
                     os.path.basename(dynamic_modules_path),
                     "datasets",
-                    self.name.replace("/", "___"),
+                    self.name.replace("/", "--"),
                     hash,
                     self.name.split("/")[-1],
                 ]
diff --git a/tests/test_inspect.py b/tests/test_inspect.py
index 2aff0d2df8d..f17d13e3f8a 100644
--- a/tests/test_inspect.py
+++ b/tests/test_inspect.py
@@ -10,7 +10,7 @@
         ("acronym_identification", "default"),
         ("lhoestq/squad", "plain_text"),
         ("lhoestq/test", "default"),
-        ("lhoestq/demo1", "lhoestq___demo1"),
+        ("lhoestq/demo1", "lhoestq--demo1"),
     ],
 )
 def test_get_dataset_config_names(path, expected):
diff --git a/tests/test_load.py b/tests/test_load.py
index 5f860d7a5b1..2a63186f0a3 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -453,14 +453,14 @@ def test_load_dataset_builder_for_community_dataset_with_script():
     assert builder.info.features == Features({"text": Value("string")})
     namespace = SAMPLE_DATASET_IDENTIFIER[: SAMPLE_DATASET_IDENTIFIER.index("/")]
     assert builder._relative_data_dir().startswith(namespace)
-    assert SAMPLE_DATASET_IDENTIFIER.replace("/", "___") in builder.__module__
+    assert SAMPLE_DATASET_IDENTIFIER.replace("/", "--") in builder.__module__


 def test_load_dataset_builder_for_community_dataset_without_script():
     builder = datasets.load_dataset_builder(SAMPLE_DATASET_IDENTIFIER2)
     assert isinstance(builder, DatasetBuilder)
     assert builder.name == "text"
-    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.replace("/", "___")
+    assert builder.config.name == SAMPLE_DATASET_IDENTIFIER2.replace("/", "--")
     assert isinstance(builder.config.data_files, DataFilesDict)
     assert len(builder.config.data_files["train"]) > 0
     assert len(builder.config.data_files["test"]) > 0
diff --git a/tests/test_upstream_hub.py b/tests/test_upstream_hub.py
index f0d37efa0db..138b7d7a6c4 100644
--- a/tests/test_upstream_hub.py
+++ b/tests/test_upstream_hub.py
@@ -9,7 +9,7 @@
 from huggingface_hub import HfApi
 from huggingface_hub.hf_api import HfFolder

-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import ClassLabel, Dataset, DatasetDict, Features, Value, load_dataset


 REPO_NAME = "repo-{}".format(int(time.time() * 10e3))
@@ -84,8 +84,8 @@ def test_push_dataset_dict_to_hub_no_token(self):
             self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

             # Ensure that there is a single file on the repository that has the correct name
-            files = self._api.list_repo_files(ds_name, repo_type="dataset")
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet"])
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
+            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
         finally:
             self._api.delete_repo(ds_name.split("/")[1], organization=ds_name.split("/")[0], repo_type="dataset")
@@ -104,8 +104,8 @@ def test_push_dataset_dict_to_hub_name_without_namespace(self):
             self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

             # Ensure that there is a single file on the repository that has the correct name
-            files = self._api.list_repo_files(ds_name, repo_type="dataset")
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet"])
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
+            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
         finally:
             self._api.delete_repo(ds_name.split("/")[1], organization=ds_name.split("/")[0], repo_type="dataset")
@@ -124,8 +124,8 @@ def test_push_dataset_dict_to_hub_private(self):
             self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

             # Ensure that there is a single file on the repository that has the correct name
-            files = self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token)
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet"])
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
         finally:
             self._api.delete_repo(
                 ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
@@ -146,8 +146,8 @@ def test_push_dataset_dict_to_hub(self):
             self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

             # Ensure that there is a single file on the repository that has the correct name
-            files = self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token)
-            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet"])
+            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
+            self.assertListEqual(files, [".gitattributes", "data/train-00000-of-00001.parquet", "dataset_infos.json"])
         finally:
             self._api.delete_repo(
                 ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
@@ -170,7 +170,13 @@ def test_push_dataset_dict_to_hub_multiple_files(self):
             # Ensure that there are two files on the repository that have the correct name
             files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
             self.assertListEqual(
-                files, [".gitattributes", "data/train-00000-of-00002.parquet", "data/train-00001-of-00002.parquet"]
+                files,
+                [
+                    ".gitattributes",
+                    "data/train-00000-of-00002.parquet",
+                    "data/train-00001-of-00002.parquet",
+                    "dataset_infos.json",
+                ],
             )
         finally:
             self._api.delete_repo(
@@ -212,6 +218,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self):
                     "data/train-00000-of-00002.parquet",
                     "data/train-00001-of-00002.parquet",
                     "datafile.txt",
+                    "dataset_infos.json",
                 ],
             )

@@ -254,6 +261,7 @@ def test_push_dataset_dict_to_hub_overwrite_files(self):
                     "data/random-00000-of-00001.parquet",
                     "data/train-00000-of-00001.parquet",
                     "datafile.txt",
+                    "dataset_infos.json",
                 ],
             )

@@ -293,6 +301,42 @@ def test_push_dataset_to_hub(self):
                 ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
             )

+    def test_push_dataset_to_hub_custom_features(self):
+        features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
+        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
+
+        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
+        try:
+            ds.push_to_hub(ds_name, token=self._token)
+            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+
+            self.assertListEqual(ds.column_names, hub_ds["train"].column_names)
+            self.assertListEqual(list(ds.features.keys()), list(hub_ds["train"].features.keys()))
+            self.assertDictEqual(ds.features, hub_ds["train"].features)
+        finally:
+            self._api.delete_repo(
+                ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
+            )
+
+    def test_push_dataset_dict_to_hub_custom_features(self):
+        features = Features({"x": Value("int64"), "y": ClassLabel(names=["neg", "pos"])})
+        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [0, 0, 1]}, features=features)
+
+        local_ds = DatasetDict({"test": ds})
+
+        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
+        try:
+            local_ds.push_to_hub(ds_name, token=self._token)
+            hub_ds = load_dataset(ds_name, download_mode="force_redownload")
+
+            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
+            self.assertListEqual(list(local_ds["test"].features.keys()), list(hub_ds["test"].features.keys()))
+            self.assertDictEqual(local_ds["test"].features, hub_ds["test"].features)
+        finally:
+            self._api.delete_repo(
+                ds_name.split("/")[1], organization=ds_name.split("/")[0], token=self._token, repo_type="dataset"
+            )
+
     def test_push_dataset_to_hub_custom_splits(self):
         ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})