From a6ccf944e42c1a84de81bf326accab9999b86c90 Mon Sep 17 00:00:00 2001 From: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com> Date: Wed, 19 Jun 2024 16:26:37 +0200 Subject: [PATCH] Update docs on trust_remote_code defaults to False (#6981) * Set trust_remote_code defaults to False in docstrings * Replace warning tip with version added in docstrings * Update docs * Rephrase * Fix typo --- docs/source/dataset_script.mdx | 2 +- docs/source/load_hub.mdx | 4 +-- src/datasets/hub.py | 8 ++--- src/datasets/load.py | 54 ++++++++++++++++++---------------- 4 files changed, 36 insertions(+), 32 deletions(-) diff --git a/docs/source/dataset_script.mdx b/docs/source/dataset_script.mdx index a881621a91e..8fa524406db 100644 --- a/docs/source/dataset_script.mdx +++ b/docs/source/dataset_script.mdx @@ -12,7 +12,7 @@ as long as your dataset repository has a [required structure](./repository_struc -In the next major release, the new safety features of 🤗 Datasets will disable running dataset loading scripts by default, and you will have to pass `trust_remote_code=True` to load datasets that require running a dataset script. +For security reasons, 🤗 Datasets does not allow running dataset loading scripts by default, and you have to pass `trust_remote_code=True` to load datasets that require running a dataset script. diff --git a/docs/source/load_hub.mdx b/docs/source/load_hub.mdx index e7fb5b0c9f6..4d3796ae810 100644 --- a/docs/source/load_hub.mdx +++ b/docs/source/load_hub.mdx @@ -106,7 +106,7 @@ Certain datasets repositories contain a loading script with the Python code used Those datasets are generally exported to Parquet by Hugging Face, so that 🤗 Datasets can load the dataset fast and without running a loading script. Even if a Parquet export is not available, you can still use any dataset with Python code in its repository with `load_dataset`. 
-All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still review the dataset loading scripts and authors to avoid executing malicious code on your machine. You should set `trust_remote_code=True` to use a dataset with a loading script, or you will get a warning: +All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still review the dataset loading scripts and authors to avoid executing malicious code on your machine. You should set `trust_remote_code=True` to use a dataset with a loading script, or you will get an error: ```py >>> from datasets import get_dataset_config_names, get_dataset_split_names, load_dataset @@ -120,6 +120,6 @@ All files and code uploaded to the Hub are scanned for malware (refer to the Hub -In the next major release, the new safety features of 🤗 Datasets will disable running dataset loading scripts by default, and you will have to pass `trust_remote_code=True` to load datasets that require running a dataset script. +For security reasons, 🤗 Datasets does not allow running dataset loading scripts by default, and you have to pass `trust_remote_code=True` to load datasets that require running a dataset script. diff --git a/src/datasets/hub.py b/src/datasets/hub.py index 2d8b60c9fea..06d4f0ddb95 100644 --- a/src/datasets/hub.py +++ b/src/datasets/hub.py @@ -42,15 +42,15 @@ def convert_to_parquet( `/`. revision (`str`, *optional*): Branch of the source Hub dataset repository. Defaults to the `"main"` branch. token (`bool` or `str`, *optional*): Authentication token for the Hugging Face Hub. - trust_remote_code (`bool`, defaults to `True`): Whether you trust the remote code of the Hub script-based + trust_remote_code (`bool`, defaults to `False`): Whether you trust the remote code of the Hub script-based dataset to be executed locally on your machine. 
This option should only be set to `True` for repositories where you have read the code and which you trust. - + - `trust_remote_code` will default to False in the next major release. + `trust_remote_code` defaults to `False` if not specified. - + Returns: `huggingface_hub.CommitInfo` diff --git a/src/datasets/load.py b/src/datasets/load.py index 2ce877c2d04..4499912fd6d 100644 --- a/src/datasets/load.py +++ b/src/datasets/load.py @@ -1749,18 +1749,19 @@ def dataset_module_factory( Directory to read/write data. Defaults to `"~/.cache/huggingface/datasets"`. - trust_remote_code (`bool`, defaults to `True`): + trust_remote_code (`bool`, defaults to `False`): Whether or not to allow for datasets defined on the Hub using a dataset script. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - + - `trust_remote_code` will default to False in the next major release. + - + `trust_remote_code` defaults to `False` if not specified. + + - **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override the attributes in download_config if supplied. @@ -1961,18 +1962,19 @@ def metric_module_factory( dynamic_modules_path (Optional str, defaults to HF_MODULES_CACHE / "datasets_modules", i.e. ~/.cache/huggingface/modules/datasets_modules): Optional path to the directory in which the dynamic modules are saved. It must have been initialized with :obj:`init_dynamic_modules`. By default, the datasets and metrics are stored inside the `datasets_modules` module. - trust_remote_code (`bool`, defaults to `True`): + trust_remote_code (`bool`, defaults to `False`): Whether or not to allow for datasets defined on the Hub using a dataset script. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. 
- + - `trust_remote_code` will default to False in the next major release. + - + `trust_remote_code` defaults to `False` if not specified. + + - **download_kwargs (additional keyword arguments): optional attributes for DownloadConfig() which will override the attributes in download_config if supplied. @@ -2078,18 +2080,18 @@ def load_metric( revision (Optional ``Union[str, datasets.Version]``): if specified, the module will be loaded from the datasets repository at this version. By default, it is set to the local version of the lib. Specifying a version that is different from your local version of the lib might cause compatibility issues. - trust_remote_code (`bool`, defaults to `True`): + trust_remote_code (`bool`, defaults to `False`): Whether or not to allow for datasets defined on the Hub using a dataset script. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - + - `trust_remote_code` will default to False in the next major release. + - + `trust_remote_code` defaults to `False` if not specified. - + Returns: `datasets.Metric` @@ -2220,18 +2222,19 @@ def load_dataset_builder( **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any. - trust_remote_code (`bool`, defaults to `True`): + trust_remote_code (`bool`, defaults to `False`): Whether or not to allow for datasets defined on the Hub using a dataset script. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - + + + - `trust_remote_code` will default to False in the next major release. + `trust_remote_code` defaults to `False` if not specified. - + - **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`]. 
@@ -2481,18 +2484,19 @@ def load_dataset( **Experimental**. Key/value pairs to be passed on to the dataset file-system backend, if any. - trust_remote_code (`bool`, defaults to `True`): + trust_remote_code (`bool`, defaults to `False`): Whether or not to allow for datasets defined on the Hub using a dataset script. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine. - + - `trust_remote_code` will default to False in the next major release. + - + `trust_remote_code` defaults to `False` if not specified. + + - **config_kwargs (additional keyword arguments): Keyword arguments to be passed to the `BuilderConfig` and used in the [`DatasetBuilder`].