diff --git a/docs/source/loading_datasets.rst b/docs/source/loading_datasets.rst
index e393091fb46..43cedbd9511 100644
--- a/docs/source/loading_datasets.rst
+++ b/docs/source/loading_datasets.rst
@@ -68,8 +68,9 @@ This call to :func:`datasets.load_dataset` does the following steps under the hood:
    (RAM) by setting the ``keep_in_memory`` argument of :func:`datasets.load_datasets` to ``True``. The default in
    🤗Datasets is to memory-map the dataset on drive if its size is larger than
    ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (default ``250`` MiB); otherwise, the dataset is copied
-   in-memory. This behavior can be disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``,
-   and in this case the dataset is not loaded in memory.
+   in-memory. This behavior can be disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either
+   the configuration option ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the
+   environment variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
 3. Return a **dataset built from the splits** asked by the user (default: all); in the above example we create a
    dataset with the train split.
diff --git a/src/datasets/arrow_dataset.py b/src/datasets/arrow_dataset.py
index 59c011b79d0..6be0c8dfce0 100644
--- a/src/datasets/arrow_dataset.py
+++ b/src/datasets/arrow_dataset.py
@@ -653,8 +653,9 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
             keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
                 dataset will be copied in-memory if its size is smaller than
                 `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
-                disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and
-                in this case the dataset is not loaded in memory.
+                disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration
+                option ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the
+                environment variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
         Returns:
             :class:`Dataset` or :class:`DatasetDict`.
diff --git a/src/datasets/config.py b/src/datasets/config.py
index 3c4927858c4..c2752435c9b 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -144,7 +144,10 @@
 HF_DATASETS_OFFLINE = False
 
 # In-memory
-MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 250 * 2 ** 20  # 250 MiB
+DEFAULT_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 250 * 2 ** 20  # 250 MiB
+MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = float(
+    os.environ.get("MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", DEFAULT_MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES)
+)
 
 # File names
 DATASET_ARROW_FILENAME = "dataset.arrow"
diff --git a/src/datasets/dataset_dict.py b/src/datasets/dataset_dict.py
index 0223704ac75..7e07324b3af 100644
--- a/src/datasets/dataset_dict.py
+++ b/src/datasets/dataset_dict.py
@@ -688,8 +688,9 @@ def load_from_disk(dataset_dict_path: str, fs=None, keep_in_memory: Optional[bool] =
             keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the
                 dataset will be copied in-memory if its size is smaller than
                 `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be
-                disabled by setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case
-                the dataset is not loaded in memory.
+                disabled (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration
+                option ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment
+                variable ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
         Returns:
             :class:`DatasetDict`
diff --git a/src/datasets/load.py b/src/datasets/load.py
index 073caede72e..f1ed48b0706 100644
--- a/src/datasets/load.py
+++ b/src/datasets/load.py
@@ -684,9 +684,10 @@ def load_dataset(
         ignore_verifications (:obj:`bool`, default ``False``): Ignore the verifications of the downloaded/processed dataset
             information (checksums/size/splits/...).
         keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the dataset
             will be copied in-memory if its size is smaller than
-            `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled by
-            setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case the dataset is not
-            loaded in memory.
+            `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled
+            (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration option
+            ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment variable
+            ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
         save_infos (:obj:`bool`, default ``False``): Save the dataset information (checksums/size/splits/...).
         script_version (:class:`~utils.Version` or :obj:`str`, optional): Version of the dataset script to load:
@@ -776,9 +777,10 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =
             Instance of of the remote filesystem used to download the files from.
         keep_in_memory (:obj:`bool`, default ``None``): Whether to copy the dataset in-memory. If `None`, the dataset
             will be copied in-memory if its size is smaller than
-            `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled by
-            setting ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = None``, and in this case the dataset is
-            not loaded in memory.
+            `datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES` (default `250 MiB`). This behavior can be disabled
+            (i.e., the dataset will not be loaded in memory) by setting to ``0`` either the configuration option
+            ``datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (higher precedence) or the environment variable
+            ``MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`` (lower precedence).
 
     Returns:
         ``datasets.Dataset`` or ``datasets.DatasetDict``
diff --git a/src/datasets/utils/info_utils.py b/src/datasets/utils/info_utils.py
index c2529a70fdf..24954a600c0 100644
--- a/src/datasets/utils/info_utils.py
+++ b/src/datasets/utils/info_utils.py
@@ -93,7 +93,7 @@ def is_small_dataset(dataset_size):
     Returns:
         bool: Whether `dataset_size` is smaller than `config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES`.
     """
-    if dataset_size and config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES is not None:
+    if dataset_size and config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES:
         return dataset_size < config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
     else:
         return False
diff --git a/tests/test_info_utils.py b/tests/test_info_utils.py
index 489fde5fa48..91530e4efd1 100644
--- a/tests/test_info_utils.py
+++ b/tests/test_info_utils.py
@@ -4,17 +4,31 @@
 from datasets.utils.info_utils import is_small_dataset
 
 
+@pytest.fixture(params=[None, 0, 100 * 2 ** 20, 900 * 2 ** 20])
+def env_max_in_memory_dataset_size(request, monkeypatch):
+    if request.param:
+        monkeypatch.setenv("MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", request.param)
+
+
 @pytest.mark.parametrize("dataset_size", [None, 400 * 2 ** 20, 600 * 2 ** 20])
-@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", None, 0, 100 * 2 ** 20, 900 * 2 ** 20])
-def test_is_small_dataset(dataset_size, max_in_memory_dataset_size, monkeypatch):
-    if max_in_memory_dataset_size == "default":
-        # default = 250 * 2 ** 20
-        max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
-    else:
-        monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size)
-    if dataset_size is None or max_in_memory_dataset_size is None:
-        expected = False
+@pytest.mark.parametrize("config_max_in_memory_dataset_size", ["default", 0, 100 * 2 ** 20, 900 * 2 ** 20])
+def test_is_small_dataset(
+    dataset_size, config_max_in_memory_dataset_size, env_max_in_memory_dataset_size, monkeypatch
+):
+    if config_max_in_memory_dataset_size != "default":
+        monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", config_max_in_memory_dataset_size)
+
+    max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
+    if config_max_in_memory_dataset_size == "default":
+        if env_max_in_memory_dataset_size:
+            assert max_in_memory_dataset_size == env_max_in_memory_dataset_size
+        else:
+            assert max_in_memory_dataset_size == 250 * 2 ** 20
     else:
+        assert max_in_memory_dataset_size == config_max_in_memory_dataset_size
+    if dataset_size and max_in_memory_dataset_size:
         expected = dataset_size < max_in_memory_dataset_size
+    else:
+        expected = False
     result = is_small_dataset(dataset_size)
     assert result == expected
diff --git a/tests/test_load.py b/tests/test_load.py
index 28b5e2ed127..5e8397baba5 100644
--- a/tests/test_load.py
+++ b/tests/test_load.py
@@ -226,7 +226,7 @@ def assert_auth(url, *args, headers, **kwargs):
     mock_head.assert_called()
 
 
-@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", None, 0, 50, 500])
+@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 50, 500])
 def test_load_dataset_local_with_default_in_memory(
     max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, monkeypatch
 ):
@@ -236,18 +236,17 @@ def test_load_dataset_local_with_default_in_memory(
         max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
     else:
         monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size)
-    if max_in_memory_dataset_size is None:
-        max_in_memory_dataset_size = 0
-        expected_in_memory = False
-    else:
+    if max_in_memory_dataset_size:
         expected_in_memory = current_dataset_size < max_in_memory_dataset_size
+    else:
+        expected_in_memory = False
 
     with assert_arrow_memory_increases() if expected_in_memory else assert_arrow_memory_doesnt_increase():
         dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir)
     assert (dataset["train"].dataset_size < max_in_memory_dataset_size) is expected_in_memory
 
 
-@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", None, 0, 100, 1000])
+@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 100, 1000])
 def test_load_from_disk_with_default_in_memory(
     max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, tmp_path, monkeypatch
 ):
@@ -257,10 +256,10 @@ def test_load_from_disk_with_default_in_memory(
         max_in_memory_dataset_size = datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES
     else:
         monkeypatch.setattr(datasets.config, "MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES", max_in_memory_dataset_size)
-    if max_in_memory_dataset_size is None:
-        expected_in_memory = False
-    else:
+    if max_in_memory_dataset_size:
         expected_in_memory = current_dataset_size < max_in_memory_dataset_size
+    else:
+        expected_in_memory = False
 
     dset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, keep_in_memory=True)
     dataset_path = os.path.join(tmp_path, "saved_dataset")
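
A minimal usage sketch of the precedence rules documented above, assuming this patch is applied; it is not part of the patch, and the "imdb" dataset name and the 100 MiB value are only illustrative:

import os

# Lower precedence: config.py reads the environment variable once, at the time
# `datasets` (and therefore `datasets.config`) is first imported, so it must be
# set before the import.
os.environ["MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES"] = str(100 * 2 ** 20)  # 100 MiB

import datasets
from datasets import load_dataset

# Higher precedence: the module attribute can be changed at runtime and overrides
# the environment variable; 0 disables in-memory copying because is_small_dataset
# treats a falsy threshold as "never small enough".
datasets.config.MAX_IN_MEMORY_DATASET_SIZE_IN_BYTES = 0

# With the threshold disabled, the dataset is memory-mapped from disk unless the
# caller explicitly passes keep_in_memory=True.
dataset = load_dataset("imdb", split="train")  # keep_in_memory defaults to None

Since config.py wraps the value in float(), the environment variable can be given as a plain integer string such as the one built above.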