Add env variable for MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES #2399

Merged (6 commits, May 26, 2021)

Changes from all commits

docs/source/loading_datasets.rst (3 additions, 2 deletions)

@@ -68,8 +68,9 @@ This call to :func:`datasets.load_dataset` does the following steps under the hood:
 (RAM) by setting the ``keep_in_memory`` argument of :func:`datasets.load_datasets` to ``True``.
 The default in 🤗Datasets is to memory-map the dataset on drive if its size is larger than
 ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (default ``250`` MiB); otherwise, the dataset is copied
-in-memory. This behavior can be disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``,
-and in this case the dataset is not loaded in memory.
+in-memory. This behavior can be disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either
+the configuration option ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the
+environment variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
 3. Return a **dataset built from the splits** asked by the user (default: all); in the above example we create a dataset with the train split.

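For concreteness, here is a brief usage sketch of the documented precedence (the "squad" dataset name is only an illustrative choice): the environment variable is read once when `datasets` is imported, so it must be set beforehand, while the config attribute can be changed afterwards and takes precedence.

```python
import os

# Lower-precedence knob: read by datasets.config at import time,
# so it has to be set before importing datasets.
os.environ["MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES"] = "0"

import datasets

# Higher-precedence knob: overrides whatever the env variable produced.
# A threshold of 0 disables in-memory copying, so datasets stay memory-mapped.
datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 0

ds = datasets.load_dataset("squad", split="train")  # memory-mapped, not copied into RAM
```
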
src/datasets/arrow_dataset.py (3 additions, 2 deletions)

@@ -653,8 +653,9 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
     keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
         dataset will be copied in-memory if its size is smaller than
         `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
-        disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and
-        in this case the dataset is not loaded in memory.
+        disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration
+        option ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the
+        environment variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
 Returns:
     :class:`Dataset` or :class:`DatasetDict`.

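As a usage sketch of the keep_in_memory semantics described in this docstring (the path and the "squad" dataset are hypothetical choices): an explicit boolean bypasses the size threshold entirely, while the default None defers to it.

```python
from datasets import Dataset, load_dataset

ds = load_dataset("squad", split="train")
ds.save_to_disk("/tmp/squad_train")  # hypothetical location

# An explicit True/False ignores the size threshold.
in_ram = Dataset.load_from_disk("/tmp/squad_train", keep_in_memory=True)    # always copied to RAM
on_disk = Dataset.load_from_disk("/tmp/squad_train", keep_in_memory=False)  # always memory-mapped

# keep_in_memory=None (the default) copies to RAM only if the dataset is
# smaller than datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES.
auto = Dataset.load_from_disk("/tmp/squad_train")
```
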
src/datasets/config.py (4 additions, 1 deletion)

@@ -144,7 +144,10 @@
 HF_DATASETS_OFFLINE = False
 
 # In-memory
-MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 250 * 2 ** 20  # 250 MiB
+DEFAULT_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 250 * 2 ** 20  # 250 MiB
+MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = float(
+    os.environ.get("MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", DEFAULT_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES)
+)
 
 # File names
 DATASET_ARROW_FILENAME = "dataset.arrow"

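For reference, a standalone sketch that mirrors the lookup above without importing datasets: environment variables arrive as strings, hence the float() coercion, and the default threshold works out to 262,144,000 bytes.

```python
import os

DEFAULT = 250 * 2 ** 20  # 262_144_000 bytes, i.e. 250 MiB

# Same pattern as the config module: the env variable, if set, overrides the default.
threshold = float(os.environ.get("MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", DEFAULT))

print(threshold)        # 262144000.0 when the env variable is unset
print(bool(threshold))  # False only for a threshold of 0, i.e. in-memory copying disabled
```
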
src/datasets/dataset_dict.py (3 additions, 2 deletions)

@@ -688,8 +688,9 @@ def load_from_disk(dataset_dict_path: str, fs=None, keep_in_memory: Optional[boo
     keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
         dataset will be copied in-memory if its size is smaller than
         `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
-        disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case
-        the dataset is not loaded in memory.
+        disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration
+        option ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment
+        variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
 Returns:
     :class:`DatasetDict`

src/datasets/load.py (8 additions, 6 deletions)

@@ -684,9 +684,10 @@ def load_dataset(
     ignore_verifications (:obj:`bool`, default ``False``): Ignore the verifications of the downloaded/processed dataset information (checksums/size/splits/...).
     keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
         dataset will be copied in-memory if its size is smaller than
-        `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled by
-        setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case the dataset is not
-        loaded in memory.
+        `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled
+        (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration option
+        ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment variable
+        ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
     save_infos (:obj:`bool`, default ``False``): Save the dataset information (checksums/size/splits/...).
     script_version (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:
 
@@ -776,9 +777,10 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
     Instance of the remote filesystem used to download the files from.
     keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
         dataset will be copied in-memory if its size is smaller than
-        `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled by
-        setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case the dataset is
-        not loaded in memory.
+        `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled
+        (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration option
+        ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment variable
+        ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
 Returns:
     ``datasets.Dataset`` or ``datasets.DatasetDict``

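To illustrate the "Returns" note above, a hedged sketch (the path and the "imdb" dataset are only examples): the top-level load_from_disk returns whatever was saved, a Dataset or a DatasetDict, and its keep_in_memory argument follows the same threshold logic documented here.

```python
import datasets

dset_dict = datasets.load_dataset("imdb")  # a DatasetDict with its splits
dset_dict.save_to_disk("/tmp/imdb")        # hypothetical path

# Returns a Dataset or a DatasetDict depending on what was saved;
# keep_in_memory=None defers to MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES.
reloaded = datasets.load_from_disk("/tmp/imdb", keep_in_memory=None)
print(type(reloaded))  # a DatasetDict in this case
```
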
src/datasets/utils/info_utils.py (1 addition, 1 deletion)

@@ -93,7 +93,7 @@ def is_small_dataset(dataset_size):
     Returns:
         bool: Whether `dataset_size` is smaller than `config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`.
     """
-    if dataset_size and config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES is not None:
+    if dataset_size and config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES:
         return dataset_size < config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
     else:
         return False

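The one-word change above replaces the explicit `is not None` check with plain truthiness, so a threshold of 0 (now the documented way to disable in-memory loading) short-circuits to False just like an unknown dataset size. A quick illustration, assuming the default 250 MiB threshold:

```python
from datasets import config
from datasets.utils.info_utils import is_small_dataset

config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 250 * 2 ** 20  # default threshold
print(is_small_dataset(100 * 2 ** 20))  # True: 100 MiB < 250 MiB, would be copied to RAM
print(is_small_dataset(400 * 2 ** 20))  # False: larger than the threshold
print(is_small_dataset(None))           # False: unknown size is never treated as small

config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 0  # disable in-memory copying
print(is_small_dataset(100 * 2 ** 20))  # False: a zero threshold is falsy
```
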
tests/test_info_utils.py (23 additions, 9 deletions)

@@ -4,17 +4,31 @@
 from datasets.utils.info_utils import is_small_dataset
 
 
+@pytest.fixture(params=[None, 0, 100 * 2 ** 20, 900 * 2 ** 20])
+def env_max_in_memory_dataset_size(request, monkeypatch):
+    if request.param:
+        monkeypatch.setenv("MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", request.param)
+
+
 @pytest.mark.parametrize("dataset_size", [None, 400 * 2 ** 20, 600 * 2 ** 20])
-@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", None, 0, 100 * 2 ** 20, 900 * 2 ** 20])
-def test_is_small_dataset(dataset_size, max_in_memory_dataset_size, monkeypatch):
-    if max_in_memory_dataset_size == "default":
-        # default = 250 * 2 ** 20
-        max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
-    else:
-        monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size)
-    if dataset_size is None or max_in_memory_dataset_size is None:
-        expected = False
+@pytest.mark.parametrize("config_max_in_memory_dataset_size", ["default", 0, 100 * 2 ** 20, 900 * 2 ** 20])
+def test_is_small_dataset(
+    dataset_size, config_max_in_memory_dataset_size, env_max_in_memory_dataset_size, monkeypatch
+):
+    if config_max_in_memory_dataset_size != "default":
+        monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", config_max_in_memory_dataset_size)
+
+    max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
+    if config_max_in_memory_dataset_size == "default":
+        if env_max_in_memory_dataset_size:
+            assert max_in_memory_dataset_size == env_max_in_memory_dataset_size
+        else:
+            assert max_in_memory_dataset_size == 250 * 2 ** 20
+    else:
+        assert max_in_memory_dataset_size == config_max_in_memory_dataset_size
+    if dataset_size and max_in_memory_dataset_size:
         expected = dataset_size < max_in_memory_dataset_size
     else:
+        expected = False
     result = is_small_dataset(dataset_size)
     assert result == expected

tests/test_load.py (8 additions, 9 deletions)

@@ -226,7 +226,7 @@ def assert_auth(url, *args, headers, **kwargs):
     mock_head.assert_called()
 
 
-@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", None, 0, 50, 500])
+@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 50, 500])
 def test_load_dataset_local_with_default_in_memory(
     max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, monkeypatch
 ):
@@ -236,18 +236,17 @@ def test_load_dataset_local_with_default_in_memory(
         max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
     else:
         monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size)
-    if max_in_memory_dataset_size is None:
-        max_in_memory_dataset_size = 0
-        expected_in_memory = False
-    else:
+    if max_in_memory_dataset_size:
         expected_in_memory = current_dataset_size < max_in_memory_dataset_size
+    else:
+        expected_in_memory = False
 
     with assert_arrow_memory_increases() if expected_in_memory else assert_arrow_memory_doesnt_increase():
         dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir)
     assert (dataset["train"].dataset_size < max_in_memory_dataset_size) is expected_in_memory
 
 
-@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", None, 0, 100, 1000])
+@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 100, 1000])
 def test_load_from_disk_with_default_in_memory(
     max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, tmp_path, monkeypatch
 ):
@@ -257,10 +256,10 @@ def test_load_from_disk_with_default_in_memory(
         max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
     else:
         monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size)
-    if max_in_memory_dataset_size is None:
-        expected_in_memory = False
-    else:
+    if max_in_memory_dataset_size:
         expected_in_memory = current_dataset_size < max_in_memory_dataset_size
+    else:
+        expected_in_memory = False
 
     dset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, keep_in_memory=True)
     dataset_path = os.path.join(tmp_path, "saved_dataset")