Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Finish deprecating the fs argument #5393

Merged
merged 9 commits into from Jan 18, 2023
63 changes: 54 additions & 9 deletions src/datasets/info.py
Expand Up @@ -33,11 +33,12 @@
import json
import os
import posixpath
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import ClassVar, Dict, List, Optional, Union

from fsspec.implementations.local import LocalFileSystem
import fsspec

from . import config
from .features import Features, Value
Expand Down Expand Up @@ -208,18 +209,30 @@ def __post_init__(self):
template.align_with_features(self.features) for template in (self.task_templates)
]

def write_to_directory(self, dataset_info_dir, pretty_print=False, fs=None):
def write_to_directory(
self, dataset_info_dir, pretty_print=False, fs="deprecated", storage_options: Optional[dict] = None
):
"""Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

Args:
dataset_info_dir (`str`):
Destination directory.
pretty_print (`bool`, defaults to `False`):
If `True`, the JSON will be pretty-printed with the indent level of 4.
fs (`fsspec.spec.AbstractFileSystem`, *optional*, defaults to `None`):
fs (`fsspec.spec.AbstractFileSystem`, *optional*):
dconathan marked this conversation as resolved.
Show resolved Hide resolved
Instance of the remote filesystem used to download the files from.

<Added version="2.5.0"/>
<Deprecated version="2.8.0">
dconathan marked this conversation as resolved.
Show resolved Hide resolved

`fs` was deprecated in version 2.8.0 and will be removed in 3.0.0.
dconathan marked this conversation as resolved.
Show resolved Hide resolved
Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`
dconathan marked this conversation as resolved.
Show resolved Hide resolved

</Deprecated>

storage_options (`dict`, *optional*):
Key/value pairs to be passed on to the file-system backend, if any.

<Added version="2.8.1"/>
dconathan marked this conversation as resolved.
Show resolved Hide resolved

Example:

Expand All @@ -229,7 +242,17 @@ def write_to_directory(self, dataset_info_dir, pretty_print=False, fs=None):
>>> ds.info.write_to_directory("/path/to/directory/")
```
"""
fs = fs or LocalFileSystem()
if fs != "deprecated":
warnings.warn(
"'fs' was is deprecated in favor of 'storage_options' in version 2.8.0 and will be removed in 3.0.0.\n"
dconathan marked this conversation as resolved.
Show resolved Hide resolved
"You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
FutureWarning,
)
storage_options = fs.storage_options

fs_token_paths = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
fs: fsspec.AbstractFileSystem = fs_token_paths[0]

is_local = not is_remote_filesystem(fs)
path_join = os.path.join if is_local else posixpath.join

Expand Down Expand Up @@ -278,7 +301,9 @@ def from_merge(cls, dataset_infos: List["DatasetInfo"]):
)

@classmethod
def from_directory(cls, dataset_info_dir: str, fs=None) -> "DatasetInfo":
def from_directory(
cls, dataset_info_dir: str, fs="deprecated", storage_options: Optional[dict] = None
) -> "DatasetInfo":
"""Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

This function updates all the dynamically generated fields (num_examples,
Expand All @@ -290,10 +315,20 @@ def from_directory(cls, dataset_info_dir: str, fs=None) -> "DatasetInfo":
dataset_info_dir (`str`):
The directory containing the metadata file. This
should be the root directory of a specific dataset version.
fs (`fsspec.spec.AbstractFileSystem`, *optional*, defaults to `None`):
fs (`fsspec.spec.AbstractFileSystem`, *optional*):
Instance of the remote filesystem used to download the files from.

<Added version="2.5.0"/>
<Deprecated version="2.8.0">
dconathan marked this conversation as resolved.
Show resolved Hide resolved

`fs` was deprecated in version 2.8.0 and will be removed in 3.0.0.
dconathan marked this conversation as resolved.
Show resolved Hide resolved
Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`
dconathan marked this conversation as resolved.
Show resolved Hide resolved

</Deprecated>

storage_options (`dict`, *optional*):
Key/value pairs to be passed on to the file-system backend, if any.

<Added version="2.8.1"/>
dconathan marked this conversation as resolved.
Show resolved Hide resolved

Example:

Expand All @@ -302,7 +337,17 @@ def from_directory(cls, dataset_info_dir: str, fs=None) -> "DatasetInfo":
>>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
```
"""
fs = fs or LocalFileSystem()
if fs != "deprecated":
warnings.warn(
"'fs' was is deprecated in favor of 'storage_options' in version 2.8.0 and will be removed in 3.0.0.\n"
dconathan marked this conversation as resolved.
Show resolved Hide resolved
"You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
FutureWarning,
)
storage_options = fs.storage_options

fs_token_paths = fsspec.get_fs_token_paths(dataset_info_dir, storage_options=storage_options)
fs: fsspec.AbstractFileSystem = fs_token_paths[0]

logger.info(f"Loading Dataset info from {dataset_info_dir}")
if not dataset_info_dir:
raise ValueError("Calling DatasetInfo.from_directory() with undefined dataset_info_dir.")
Expand Down
30 changes: 28 additions & 2 deletions src/datasets/load.py
Expand Up @@ -1776,7 +1776,9 @@ def load_dataset(
return ds


def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> Union[Dataset, DatasetDict]:
def load_from_disk(
dataset_path: str, fs="deprecated", keep_in_memory: Optional[bool] = None, storage_options: Optional[dict] = None
) -> Union[Dataset, DatasetDict]:
"""
Loads a dataset that was previously saved using [`~Dataset.save_to_disk`] from a dataset directory, or
from a filesystem using either [`~datasets.filesystems.S3FileSystem`] or any implementation of
Expand All @@ -1787,13 +1789,26 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
Path (e.g. `"dataset/train"`) or remote URI (e.g.
`"s3://my-bucket/dataset/train"`) of the [`Dataset`] or [`DatasetDict`] directory where the dataset will be
loaded from.
fs (`~filesystems.S3FileSystem` or `fsspec.spec.AbstractFileSystem`, *optional*, defaults to `None`):
fs (`~filesystems.S3FileSystem` or `fsspec.spec.AbstractFileSystem`, *optional*):
Instance of the remote filesystem used to download the files from.

<Deprecated version="2.8.0">
dconathan marked this conversation as resolved.
Show resolved Hide resolved

`fs` was deprecated in version 2.8.0 and will be removed in 3.0.0.
dconathan marked this conversation as resolved.
Show resolved Hide resolved
Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`
dconathan marked this conversation as resolved.
Show resolved Hide resolved

</Deprecated>

keep_in_memory (`bool`, defaults to `None`):
Whether to copy the dataset in-memory. If `None`, the dataset
will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
nonzero. See more details in the [improve performance](./cache#improve-performance) section.

storage_options (`dict`, *optional*):
Key/value pairs to be passed on to the file-system backend, if any.

<Added version="2.8.1"/>
dconathan marked this conversation as resolved.
Show resolved Hide resolved

Returns:
[`Dataset`] or [`DatasetDict`]:
- If `dataset_path` is a path of a dataset directory: the dataset requested.
Expand All @@ -1806,6 +1821,17 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
>>> ds = load_from_disk('path/to/dataset/directory')
```
"""
if fs != "deprecated":
warnings.warn(
"'fs' was is deprecated in favor of 'storage_options' in version 2.8.0 and will be removed in 3.0.0.\n"
dconathan marked this conversation as resolved.
Show resolved Hide resolved
"You can remove this warning by passing 'storage_options=fs.storage_options' instead.",
FutureWarning,
)
storage_options = fs.storage_options

fs_token_paths = fsspec.get_fs_token_paths(dataset_path, storage_options=storage_options)
fs: fsspec.AbstractFileSystem = fs_token_paths[0]

# gets filesystem from dataset, either s3:// or file:// and adjusted dataset_path
if is_remote_filesystem(fs):
dest_dataset_path = extract_path_from_uri(dataset_path)
Expand Down