ADD S3 support for downloading and uploading processed datasets #1723

Merged: 57 commits, Jan 26, 2021

Changes from 16 commits

Commits (57)
6cf8813
added fsspec and fsspec[s3] adjusted save_to_disk function
philschmid Jan 7, 2021
aa90496
added reading from s3
philschmid Jan 7, 2021
6dba448
fixed save_to_disk for s3 path
philschmid Jan 8, 2021
ac1d90f
implemented tests
philschmid Jan 8, 2021
49b2fd8
added filesystem utils to arrow dataset and dataset dict
philschmid Jan 11, 2021
7eb6160
added tests for filesystem_utils
philschmid Jan 11, 2021
82f891d
added DatasetDict test
philschmid Jan 11, 2021
ed17c5e
changed var from s3_ to proc_
philschmid Jan 12, 2021
3bbab8c
Merge remote-tracking branch 'upstream/master'
philschmid Jan 12, 2021
574c9dd
fixed error in load from disk function
philschmid Jan 12, 2021
0315025
fixing directory creation
philschmid Jan 12, 2021
8d072e6
removed fs.makedirs since files has to be saved temp local
philschmid Jan 12, 2021
7a70258
fixed code quality checks
philschmid Jan 12, 2021
29313ab
fixed quality check
philschmid Jan 12, 2021
5f6749f
added noqa for pytest to work with moto
philschmid Jan 12, 2021
354e39f
stupid mistake with wrong order at imports
philschmid Jan 12, 2021
bc36832
adjuste boto3 version to work with moto in tests
philschmid Jan 12, 2021
57bcbe7
removed pytest fixtures from unittest class
philschmid Jan 12, 2021
3493e87
forgot to remove fixture as parameter...
philschmid Jan 12, 2021
b4fa6a9
Make it working with Windows style paths.
mfuntowicz Jan 13, 2021
0be3a11
Merge pull request #1 from philschmid/add_s3
philschmid Jan 13, 2021
5fbfcd7
fixed code quality
philschmid Jan 13, 2021
b540612
Merge remote-tracking branch 'upstream/master'
philschmid Jan 13, 2021
9a1b282
fixed hopefully the last path problems for WIN
philschmid Jan 13, 2021
081f4bc
added Path().pathjoin with posix to load_from_disk for DatasetDict keys
philschmid Jan 13, 2021
2ce5fec
fixed win path problem
philschmid Jan 13, 2021
d346c6f
create conditional dataset_dict_split_path for creating correct path …
philschmid Jan 13, 2021
f25a036
added s3 as extra requires
philschmid Jan 14, 2021
df78d8b
fixed boto imports for docs
philschmid Jan 14, 2021
e3fa922
added S3FileSystem with documentation
philschmid Jan 17, 2021
fb992a5
reworked everything for datasets.filesystem
philschmid Jan 17, 2021
53a6a4b
documentation and styling
philschmid Jan 17, 2021
85f0297
added s3fs for documentation
philschmid Jan 17, 2021
8885a7b
handle optional s3fs dependency
lhoestq Jan 18, 2021
b91345c
fix test
lhoestq Jan 18, 2021
93a5f5b
adjusted doc order and renamed preproc_dataset_path to extract_path_f…
philschmid Jan 18, 2021
8b55b89
added temp dir when saving
philschmid Jan 19, 2021
2bf289d
fixed quality
philschmid Jan 19, 2021
83e4673
added documentation
philschmid Jan 19, 2021
04042ea
implemented save_to_disk for local remote filesystem with temp dir
philschmid Jan 19, 2021
ec29076
fixed documentation example
philschmid Jan 19, 2021
187e01d
fixed documentation for botocore and boto3
philschmid Jan 19, 2021
7785f90
Merge branch 'master' of git://github.com/huggingface/datasets
philschmid Jan 19, 2021
926f31c
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
22b33d7
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
72440ba
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
ea273a8
Update src/datasets/arrow_dataset.py
philschmid Jan 22, 2021
5359003
Update src/datasets/arrow_dataset.py
philschmid Jan 22, 2021
fd106e4
Update src/datasets/filesystems/__init__.py
philschmid Jan 22, 2021
878f8b7
Update src/datasets/filesystems/s3filesystem.py
philschmid Jan 22, 2021
0b1a2f8
Update src/datasets/filesystems/s3filesystem.py
philschmid Jan 22, 2021
a3bebd5
Update src/datasets/load.py
philschmid Jan 22, 2021
eb69cdb
removed unnecessary @mock_s3
philschmid Jan 22, 2021
8b7cd48
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
9d7f5c6
Update docs/source/filesystems.rst
philschmid Jan 26, 2021
8514bee
Update src/datasets/filesystems/s3filesystem.py
philschmid Jan 26, 2021
a8738ca
Update docs/source/processing.rst
philschmid Jan 26, 2021
114 changes: 61 additions & 53 deletions setup.py
@@ -59,19 +59,19 @@
from setuptools import find_packages
from setuptools import setup

DOCLINES = __doc__.split('\n')
DOCLINES = __doc__.split("\n")

REQUIRED_PKGS = [
# We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
'numpy>=1.17',
"numpy>=1.17",
# Backend and serialization. Minimum 0.17.1 to support extension array
'pyarrow>=0.17.1',
"pyarrow>=0.17.1",
# For smart caching dataset processing
'dill',
"dill",
# For performance gains with apache arrow
'pandas',
"pandas",
# for downloading datasets over HTTPS
'requests>=2.19.0',
"requests>=2.19.0",
# progress bars in download and scripts
# tqdm 4.50.0 introduced permission errors on windows
# see https://app.circleci.com/pipelines/github/huggingface/datasets/235/workflows/cfb6a39f-68eb-4802-8b17-2cd5e8ea7369/jobs/1111
@@ -82,38 +82,44 @@
"xxhash",
# for better multiprocessing
"multiprocess",
# for saving datsets to local or s3
"fsspec",
"fsspec[s3]",
# for getting credentials from aws_profile
"boto3",
# to get metadata of optional dependencies such as torch or tensorflow for Python versions that don't have it
"importlib_metadata;python_version<'3.8'"
"importlib_metadata;python_version<'3.8'",
]

BENCHMARKS_REQUIRE = [
'numpy==1.18.5',
'tensorflow==2.3.0',
'torch==1.6.0',
'transformers==3.0.2',
"numpy==1.18.5",
"tensorflow==2.3.0",
"torch==1.6.0",
"transformers==3.0.2",
]

TESTS_REQUIRE = [
'apache-beam',
'absl-py',
'bs4',
'conllu',
'elasticsearch',
'faiss-cpu',
'langdetect',
'lxml',
'mwparserfromhell',
'nltk',
'openpyxl',
'py7zr',
'pytest',
'pytest-xdist',
'tensorflow',
'torch',
'tldextract',
'transformers',
'zstandard',
'rarfile',
"apache-beam",
"absl-py",
"bs4",
"conllu",
"elasticsearch",
"faiss-cpu",
"langdetect",
"lxml",
"mwparserfromhell",
"nltk",
"openpyxl",
"py7zr",
"pytest",
"pytest-xdist",
"tensorflow",
"torch",
"tldextract",
"transformers",
"zstandard",
"rarfile",
"moto[s3]",
]

if os.name == "nt": # windows
@@ -128,34 +134,36 @@


EXTRAS_REQUIRE = {
'apache-beam': ['apache-beam'],
'tensorflow': ['tensorflow>=2.2.0'],
'tensorflow_gpu': ['tensorflow-gpu>=2.2.0'],
'torch': ['torch'],
'dev': TESTS_REQUIRE + QUALITY_REQUIRE,
'tests': TESTS_REQUIRE,
'quality': QUALITY_REQUIRE,
'benchmarks': BENCHMARKS_REQUIRE,
'docs': ["recommonmark", "sphinx==3.1.2", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
"apache-beam": ["apache-beam"],
"tensorflow": ["tensorflow>=2.2.0"],
"tensorflow_gpu": ["tensorflow-gpu>=2.2.0"],
"torch": ["torch"],
"dev": TESTS_REQUIRE + QUALITY_REQUIRE,
"tests": TESTS_REQUIRE,
"quality": QUALITY_REQUIRE,
"benchmarks": BENCHMARKS_REQUIRE,
"docs": [
"recommonmark",
"sphinx==3.1.2",
"sphinx-markdown-tables",
"sphinx-rtd-theme==0.4.3",
"sphinx-copybutton",
],
}

setup(
name='datasets',
name="datasets",
version="1.2.0",
description=DOCLINES[0],
long_description='\n'.join(DOCLINES[2:]),
author='HuggingFace Inc.',
author_email='thomas@huggingface.co',
url='https://github.com/huggingface/datasets',
download_url='https://github.com/huggingface/datasets/tags',
license='Apache 2.0',
long_description="\n".join(DOCLINES[2:]),
author="HuggingFace Inc.",
author_email="thomas@huggingface.co",
url="https://github.com/huggingface/datasets",
download_url="https://github.com/huggingface/datasets/tags",
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={
'datasets': [
'scripts/templates/*',
],
},
package_data={"datasets": ["scripts/templates/*",],},
scripts=["datasets-cli"],
install_requires=REQUIRED_PKGS,
extras_require=EXTRAS_REQUIRE,
@@ -171,5 +179,5 @@
"Programming Language :: Python :: 3.7",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords='datasets machine learning datasets metrics',
keywords="datasets machine learning datasets metrics",
)
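
Note: in the 16-commit state shown above, fsspec[s3] and boto3 are added to REQUIRED_PKGS, so every install pulls in the S3 stack. A later commit in this PR ("added s3 as extra requires") moves this behind an optional extra instead. A minimal sketch of that idea; the extra name and the exact package list here are assumptions for illustration, not the merged code:

    # Hypothetical sketch: ship the S3 stack as an opt-in extra rather than a hard requirement,
    # so `pip install datasets[s3]` pulls in the filesystem dependencies on demand.
    # The extra name "s3" and the package list are assumptions.
    EXTRAS_REQUIRE = {
        # ... existing extras (tensorflow, torch, dev, ...) ...
        "s3": ["fsspec", "boto3", "s3fs"],
    }
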
52 changes: 42 additions & 10 deletions src/datasets/arrow_dataset.py
@@ -44,7 +44,7 @@
from .info import DatasetInfo
from .search import IndexableMixin
from .splits import NamedSplit
from .utils import map_nested
from .utils import get_filesystem_from_dataset_path, is_remote_filesystem, map_nested
from .utils.logging import WARNING, get_logger, get_verbosity, set_verbosity_warning


@@ -425,17 +425,26 @@ def __setstate__(self, state):
if self._indices is None and self._indices_data_files:
self._indices = reader._read_files(self._indices_data_files)

def save_to_disk(self, dataset_path: str):
def save_to_disk(
self, dataset_path: str, aws_profile="default", aws_access_key_id=None, aws_secret_access_key=None
):
"""
Save the dataset in a dataset directory
Save the dataset in a dataset directory or to a s3 bucket

Args:
dataset_path (``str``): path of the dataset directory where the dataset will be saved to
dataset_path (``str``): path or s3 uri of the dataset directory where the dataset will be saved to
aws_profile (:obj:`str`, `optional`, defaults to :obj:``default``): the aws profile used to create the `boto_session` for uploading the data to s3
aws_access_key_id (:obj:`str`, `optional`, defaults to :obj:``None``): the aws access key id used to create the `boto_session` for uploading the data to s3
aws_secret_access_key (:obj:`str`, `optional`, defaults to :obj:``None``): the aws secret access key used to create the `boto_session` for uploading the data to s3
[Inline review thread on the new credential parameters]

julien-c (Jan 13, 2021): If we have boto3 as an optional dependency, maybe there's a way to not use those (verbose) params and use something like a profile name instead? (Not sure, just a question.)

n1t0 (Jan 13, 2021): As @lhoestq mentioned, we can probably avoid using these params by letting the user provide a custom fs directly. I think this has several advantages (avoids having too many params, lets us remove code specific to s3, ...).

philschmid (author): So you would suggest something like that?

    import fsspec

    s3 = fsspec.filesystem("s3", anon=False, key=aws_access_key_id, secret=aws_secret_access_key)

    dataset.save_to_disk('s3://my-s3-bucket-with-region/dataset/train', fs=s3)

What I don't like about that is the manual creation, since fsspec is not that well documented for the remote filesystems, e.g. when you want to know which "credentials" you need, you have to go to the s3fs documentation.

What do you think if we remove the named arguments aws_profile... and handle it how fsspec does, with a storage_options dict?

    dataset.save_to_disk('s3://my-s3-bucket-with-region/dataset/train',
                         storage_options={
                             'aws_access_key_id': 123,
                             'aws_secret_access_key': 123
                         })

lhoestq: Indeed, the docstring of fsspec.filesystem is not ideal:

    Signature: fsspec.filesystem(protocol, **storage_options)
    Docstring:
    Instantiate filesystems for given protocol and arguments

    ``storage_options`` are specific to the protocol being chosen, and are
    passed directly to the class.

Maybe we can have better documentation on our side instead, using a wrapper:

    import datasets

    fs = datasets.filesystem("s3", anon=False, key=aws_access_key_id, secret=aws_secret_access_key)

where the docstring of datasets.filesystem is more complete and includes examples for popular filesystems like s3.

lhoestq (Jan 14, 2021): Another option would be to make the filesystem classes easily available:

    >>> from datasets.filesystems import S3FileSystem  # S3FileSystem is simply the class from s3fs
    >>> S3FileSystem?  # show the docstring

shows

    Init signature: s3fs.core.S3FileSystem(*args, **kwargs)
    Docstring:
    Access S3 as if it were a file system.

    This exposes a filesystem-like API (ls, cp, open, etc.) on top of S3
    storage.

    Provide credentials either explicitly (``key=``, ``secret=``) or depend
    on boto's credential methods. See botocore documentation for more
    information. If no credentials are available, use ``anon=True``.

    Parameters
    ----------
    anon : bool (False)
        Whether to use anonymous connection (public buckets only). If False,
        uses the key/secret given, or boto's credential resolver (client_kwargs,
        environment, variables, config files, EC2 IAM server, in that order)
    key : string (None)
        If not anonymous, use this access key ID, if specified
    secret : string (None)
        If not anonymous, use this secret access key, if specified
    token : string (None)
        If not anonymous, use this security token, if specified
    etc.

Member:
  • If we add the various params separately, the more new filesystems we support, the more complicated it becomes. For the user it becomes difficult to know which params to look at, and for us to document everything. It seems like a good option to test things out, but having to change it down the road will require breaking changes, and we usually try to avoid these as much as possible.
  • Using some kind of storage_options just like fsspec.filesystem might be better, but it seems difficult to document as well. I think the same argument applies to having a datasets.filesystem helper.

I think I have a preference for your second option @lhoestq:

  • It seems easy to show all the available filesystems to the user, with each of them having meaningful documentation.
  • We can probably add tests for those we support explicitly.
  • If all of them share the same interface, then power users can probably use anything they want from fsspec without explicit support from us?

What do you guys think?

Member: Yes, I think the second option is interesting and I totally agree with your three points. Maybe let's start by having S3FileSystem in datasets.filesystems and we can add the other ones later.

In the documentation of save_to_disk/load_from_disk we can then say that any filesystem from datasets.filesystems or fsspec can be used.

philschmid (author): I have rebuilt everything so that you can now pass in an fsspec-like filesystem.

    from datasets import S3FileSystem, load_from_disk

    s3 = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)  # doctest: +SKIP

    dataset = load_from_disk('s3://my-private-datasets/imdb/train', fs=s3)  # doctest: +SKIP

    print(len(dataset))
    # 25000

I also created a "draft" documentation version with a few examples for S3FileSystem. Your feedback would be nice. Afterwards, I would adjust the documentation of save_to_disk/load_from_disk.

[screenshot of the draft documentation]

[End of review thread]
"""
assert (
not self.list_indexes()
), "please remove all the indexes using `dataset.drop_index` before saving a dataset"
self = pickle.loads(pickle.dumps(self))
# gets filesystem from dataset, either s3:// or file:// and adjusted dataset_path
fs, dataset_path = get_filesystem_from_dataset_path(
dataset_path, aws_profile, aws_access_key_id, aws_secret_access_key
)
os.makedirs(dataset_path, exist_ok=True)
# Write indices if needed
if self._indices is not None:
@@ -455,12 +464,13 @@ def save_to_disk(self, dataset_path: str):
self._inplace_history = [{"transforms": []}]
# Copy all files into the dataset directory
for data_file in self._data_files + self._indices_data_files:
# Copy file to destination directory
src = data_file["filename"]
filename = Path(src).name
dest = os.path.join(dataset_path, filename)
if src != dest:
shutil.copy(src, dest)
fs.put(src, dest)
elif fs.protocol != "file":
fs.put(src, dest)
# Change path to relative path from inside the destination directory
data_file["filename"] = filename
# Get state
@@ -472,19 +482,38 @@ def save_to_disk(self, dataset_path: str):
len(h["transforms"]) == 0 for h in state.get("_inplace_history", [])
), "in-place history needs to be empty"
# Serialize state
with open(os.path.join(dataset_path, "state.json"), "w", encoding="utf-8") as state_file:
with fs.open(os.path.join(dataset_path, "state.json"), "w", encoding="utf-8") as state_file:
json.dump(state, state_file, indent=2, sort_keys=True)
with open(os.path.join(dataset_path, "dataset_info.json"), "w", encoding="utf-8") as dataset_info_file:
with fs.open(os.path.join(dataset_path, "dataset_info.json"), "w", encoding="utf-8") as dataset_info_file:
json.dump(dataset_info, dataset_info_file, indent=2, sort_keys=True)
logger.info("Dataset saved in {}".format(dataset_path))
# removes temp empty directory if files are uploaded to s3
if "s3" in fs.protocol:
shutil.rmtree(dataset_path.split("/")[0])

@staticmethod
def load_from_disk(dataset_path: str) -> "Dataset":
def load_from_disk(
dataset_path: str, aws_profile="default", aws_access_key_id=None, aws_secret_access_key=None, anon=False
) -> "Dataset":
"""Load the dataset from a dataset directory

Args:
dataset_path (``str``): path of the dataset directory where the dataset will be loaded from
dataset_path (``str``): path or s3 uri of the dataset directory where the dataset will be loaded from
aws_profile (:obj:`str`, `optional`, defaults to :obj:``default``): the aws profile used to create the `boto_session` for downloading the data to s3
aws_access_key_id (:obj:`str`, `optional`, defaults to :obj:``None``): the aws access key id used to create the `boto_session` for downloading the data to s3
aws_secret_access_key (:obj:`str`, `optional`, defaults to :obj:``None``): the aws secret access key used to create the `boto_session` for downloading the data to s3
anon (:obj:`boolean`, `optional`, defaults to :obj:``False``): The connection can be anonymous - in which case only publicly-available, read-only buckets are accessible, for anonymous connection use `anon=True`
"""
# copies file from filesystem if it is s3 to local filesystem and modifies dataset_path to temp directory containing local copies
if is_remote_filesystem(dataset_path):
# gets filesystem from dataset, either s3:// or file://
fs, proc_dataset_path = get_filesystem_from_dataset_path(
dataset_path, aws_profile, aws_access_key_id, aws_secret_access_key, anon
)
tmp_dir = tempfile.TemporaryDirectory()
dataset_path = os.path.join(tmp_dir.name, proc_dataset_path)
fs.download(proc_dataset_path, dataset_path, recursive=True)

with open(os.path.join(dataset_path, "state.json"), "r", encoding="utf-8") as state_file:
state = json.load(state_file)
with open(os.path.join(dataset_path, "dataset_info.json"), "r", encoding="utf-8") as dataset_info_file:
@@ -496,6 +525,9 @@ def load_from_disk(dataset_path: str) -> "Dataset":
for data_file in state.get("_data_files", []) + state.get("_indices_data_files", []):
data_file["filename"] = os.path.join(dataset_path, data_file["filename"])
dataset.__setstate__(state)

if "tmp_dir" in vars():
tmp_dir.cleanup()
return dataset

@property
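
Taken together, the save_to_disk/load_from_disk changes above make a round trip through S3 possible with the credential arguments of this revision. A usage sketch based only on the signatures in this diff; the bucket name and credentials are placeholders, and later commits in this PR replace these arguments with a filesystem object:

    from datasets import Dataset, load_dataset

    dataset = load_dataset("imdb", split="train")

    # Upload the processed split to S3 (per the signature at this point of the PR).
    dataset.save_to_disk(
        "s3://my-bucket/imdb/train",                      # placeholder bucket/prefix
        aws_access_key_id="<aws_access_key_id>",          # or rely on aws_profile="default"
        aws_secret_access_key="<aws_secret_access_key>",
    )

    # Download it back: the files are copied to a local temporary directory first,
    # then loaded as a regular on-disk dataset.
    reloaded = Dataset.load_from_disk(
        "s3://my-bucket/imdb/train",
        aws_access_key_id="<aws_access_key_id>",
        aws_secret_access_key="<aws_secret_access_key>",
    )
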
46 changes: 37 additions & 9 deletions src/datasets/dataset_dict.py
@@ -9,6 +9,7 @@

from .arrow_dataset import Dataset
from .features import Features
from .utils import get_filesystem_from_dataset_path


class DatasetDict(dict):
@@ -478,31 +479,58 @@ def shuffle(
}
)

def save_to_disk(self, dataset_dict_path: str):
def save_to_disk(
self, dataset_dict_path: str, aws_profile="default", aws_access_key_id=None, aws_secret_access_key=None
):
"""
Save the dataset dict in a dataset dict directory.
Save the dataset dict in a dataset dict directory or to a s3 bucket

Args:
dataset_dict_path (``str``): path of the dataset dict directory where the dataset dict will be saved to
aws_profile (:obj:`str`, `optional`, defaults to :obj:``default``): the aws profile used to create the `boto_session` for uploading the data to s3
aws_access_key_id (:obj:`str`, `optional`, defaults to :obj:``None``): the aws access key id used to create the `boto_session` for uploading the data to s3
aws_secret_access_key (:obj:`str`, `optional`, defaults to :obj:``None``): the aws secret access key used to create the `boto_session` for uploading the data to s3
"""
os.makedirs(dataset_dict_path, exist_ok=True)
fs, proc_dataset_dict_path = get_filesystem_from_dataset_path(
dataset_dict_path, aws_profile, aws_access_key_id, aws_secret_access_key
)
os.makedirs(proc_dataset_dict_path, exist_ok=True)
json.dump(
{"splits": list(self)}, open(os.path.join(dataset_dict_path, "dataset_dict.json"), "w", encoding="utf-8")
{"splits": list(self)},
fs.open(os.path.join(proc_dataset_dict_path, "dataset_dict.json"), "w", encoding="utf-8"),
)
for k, dataset in self.items():
dataset.save_to_disk(os.path.join(dataset_dict_path, k))
dataset.save_to_disk(
os.path.join(dataset_dict_path, k), aws_profile, aws_access_key_id, aws_secret_access_key
)

@staticmethod
def load_from_disk(dataset_dict_path: str) -> "DatasetDict":
def load_from_disk(
dataset_dict_path: str, aws_profile="default", aws_access_key_id=None, aws_secret_access_key=None, anon=False
) -> "DatasetDict":
"""
Load the dataset dict from a dataset dict directory
Load the dataset dict from a dataset dict directory or from a s3 bucket

Args:
dataset_dict_path (``str``): path of the dataset dict directory where the dataset dict will be loaded from
aws_profile (:obj:`str`, `optional`, defaults to :obj:``default``): the aws profile used to create the `boto_session` for uploading the data to s3
aws_access_key_id (:obj:`str`, `optional`, defaults to :obj:``None``): the aws access key id used to create the `boto_session` for uploading the data to s3
aws_secret_access_key (:obj:`str`, `optional`, defaults to :obj:``None``): the aws secret access key used to create the `boto_session` for uploading the data to s3
anon (:obj:`boolean`, `optional`, defaults to :obj:``False``): The connection can be anonymous - in which case only publicly-available, read-only buckets are accessible, for anonymous connection use `anon=True`

"""
dataset_dict = DatasetDict()
for k in json.load(open(os.path.join(dataset_dict_path, "dataset_dict.json"), "r", encoding="utf-8"))[
fs, proc_dataset_dict_path = get_filesystem_from_dataset_path(
dataset_dict_path, aws_profile, aws_access_key_id, aws_secret_access_key
)
for k in json.load(fs.open(os.path.join(proc_dataset_dict_path, "dataset_dict.json"), "r", encoding="utf-8"))[
"splits"
]:
dataset_dict[k] = Dataset.load_from_disk(os.path.join(dataset_dict_path, k))
dataset_dict[k] = Dataset.load_from_disk(
os.path.join(dataset_dict_path, k),
aws_profile,
aws_access_key_id,
aws_secret_access_key,
anon,
)
return dataset_dict
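
The DatasetDict variant mirrors the single-dataset case: a dataset_dict.json listing the splits is written at the root, and each split is saved to <root>/<split> via Dataset.save_to_disk. A short sketch with the same placeholder bucket and credentials:

    from datasets import load_dataset

    dataset_dict = load_dataset("imdb")  # DatasetDict with train/test/unsupervised splits

    # Writes s3://my-bucket/imdb/dataset_dict.json plus one directory per split.
    dataset_dict.save_to_disk(
        "s3://my-bucket/imdb",                            # placeholder bucket/prefix
        aws_access_key_id="<aws_access_key_id>",
        aws_secret_access_key="<aws_secret_access_key>",
    )
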
29 changes: 22 additions & 7 deletions src/datasets/load.py
@@ -34,6 +34,7 @@
from .info import DATASET_INFOS_DICT_FILE_NAME
from .metric import Metric
from .splits import Split
from .utils import get_filesystem_from_dataset_path
from .utils.download_manager import GenerateMode
from .utils.file_utils import HF_MODULES_CACHE, DownloadConfig, cached_path, head_hf_s3, hf_bucket_url, hf_github_url
from .utils.filelock import FileLock
@@ -620,24 +621,38 @@ def load_dataset(
return ds


def load_from_disk(dataset_path: str) -> Union[Dataset, DatasetDict]:
def load_from_disk(
dataset_path: str,
aws_profile="default",
aws_access_key_id=None,
aws_secret_access_key=None,
anon=False,
) -> Union[Dataset, DatasetDict]:
"""
Load a dataset that was previously saved using ``dataset.save_to_disk(dataset_path)``.
Load a dataset that was previously saved using ``dataset.save_to_disk(dataset_path)`` from s3 or local filesystem.

Args:
dataset_path (``str``): path of a Dataset directory or a DatasetDict directory
aws_profile (:obj:`str`, `optional`, defaults to :obj:``default``): the aws profile used to create the `boto_session` for downloading the data to s3
aws_access_key_id (:obj:`str`, `optional`, defaults to :obj:``None``): the aws access key id used to create the `boto_session` for downloading the data to s3
aws_secret_access_key (:obj:`str`, `optional`, defaults to :obj:``None``): the aws secret access key used to create the `boto_session` for downloading the data to s3
anon (:obj:`boolean`, `optional`, defaults to :obj:``False``): The connection can be anonymous - in which case only publicly-available, read-only buckets are accessible, for anonymous connection use `anon=True`

Returns:
``datasets.Dataset`` or ``datasets.DatasetDict``
if `dataset_path` is a path of a dataset directory: the dataset requested,
if `dataset_path` is a path of a dataset dict directory: a ``datasets.DatasetDict`` with each split.
"""
if not os.path.isdir(dataset_path):
# gets filesystem from dataset, either s3:// or file:// and adjusted dataset_path
fs, proc_dataset_path = get_filesystem_from_dataset_path(
dataset_path, aws_profile, aws_access_key_id, aws_secret_access_key
)
if not fs.exists(proc_dataset_path):
raise FileNotFoundError("Directory {} not found".format(dataset_path))
if os.path.exists(os.path.join(dataset_path, "dataset_info.json")):
return Dataset.load_from_disk(dataset_path)
elif os.path.exists(os.path.join(dataset_path, "dataset_dict.json")):
return DatasetDict.load_from_disk(dataset_path)
if fs.isfile(os.path.join(proc_dataset_path, "dataset_info.json")):
return Dataset.load_from_disk(dataset_path, aws_profile, aws_access_key_id, aws_secret_access_key, anon)
elif fs.isfile(os.path.join(proc_dataset_path, "dataset_dict.json")):
return DatasetDict.load_from_disk(dataset_path, aws_profile, aws_access_key_id, aws_secret_access_key, anon)
else:
raise FileNotFoundError(
"Directory {} is neither a dataset directory nor a dataset dict directory.".format(dataset_path)
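
For reference, the API that the review thread above converged on, and that later commits in this PR implement, passes a filesystem object instead of individual credential arguments. A sketch based on the example posted in the discussion (bucket name and credentials are placeholders):

    from datasets import load_from_disk
    from datasets.filesystems import S3FileSystem

    s3 = S3FileSystem(key="<aws_access_key_id>", secret="<aws_secret_access_key>")

    dataset = load_from_disk("s3://my-private-datasets/imdb/train", fs=s3)
    print(len(dataset))
    # 25000
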