
ADD S3 support for downloading and uploading processed datasets #1723

Merged: 57 commits, Jan 26, 2021

Changes from all commits (57 commits)
6cf8813
added fsspec and fsspec[s3] adjusted save_to_disk function
philschmid Jan 7, 2021
aa90496
added reading from s3
philschmid Jan 7, 2021
6dba448
fixed save_to_disk for s3 path
philschmid Jan 8, 2021
ac1d90f
implemented tests
philschmid Jan 8, 2021
49b2fd8
added filesystem utils to arrow dataset and dataset dict
philschmid Jan 11, 2021
7eb6160
added tests for filesystem_utils
philschmid Jan 11, 2021
82f891d
added DatasetDict test
philschmid Jan 11, 2021
ed17c5e
changed var from s3_ to proc_
philschmid Jan 12, 2021
3bbab8c
Merge remote-tracking branch 'upstream/master'
philschmid Jan 12, 2021
574c9dd
fixed error in load from disk function
philschmid Jan 12, 2021
0315025
fixing directory creation
philschmid Jan 12, 2021
8d072e6
removed fs.makedirs since files has to be saved temp local
philschmid Jan 12, 2021
7a70258
fixed code quality checks
philschmid Jan 12, 2021
29313ab
fixed quality check
philschmid Jan 12, 2021
5f6749f
added noqa for pytest to work with moto
philschmid Jan 12, 2021
354e39f
stupid mistake with wrong order at imports
philschmid Jan 12, 2021
bc36832
adjuste boto3 version to work with moto in tests
philschmid Jan 12, 2021
57bcbe7
removed pytest fixtures from unittest class
philschmid Jan 12, 2021
3493e87
forgot to remove fixture as parameter...
philschmid Jan 12, 2021
b4fa6a9
Make it working with Windows style paths.
mfuntowicz Jan 13, 2021
0be3a11
Merge pull request #1 from philschmid/add_s3
philschmid Jan 13, 2021
5fbfcd7
fixed code quality
philschmid Jan 13, 2021
b540612
Merge remote-tracking branch 'upstream/master'
philschmid Jan 13, 2021
9a1b282
fixed hopefully the last path problems for WIN
philschmid Jan 13, 2021
081f4bc
added Path().pathjoin with posix to load_from_disk for DatasetDict keys
philschmid Jan 13, 2021
2ce5fec
fixed win path problem
philschmid Jan 13, 2021
d346c6f
create conditional dataset_dict_split_path for creating correct path …
philschmid Jan 13, 2021
f25a036
added s3 as extra requires
philschmid Jan 14, 2021
df78d8b
fixed boto imports for docs
philschmid Jan 14, 2021
e3fa922
added S3FileSystem with documentation
philschmid Jan 17, 2021
fb992a5
reworked everything for datasets.filesystem
philschmid Jan 17, 2021
53a6a4b
documentation and styling
philschmid Jan 17, 2021
85f0297
added s3fs for documentation
philschmid Jan 17, 2021
8885a7b
handle optional s3fs dependency
lhoestq Jan 18, 2021
b91345c
fix test
lhoestq Jan 18, 2021
93a5f5b
adjusted doc order and renamed preproc_dataset_path to extract_path_f…
philschmid Jan 18, 2021
8b55b89
added temp dir when saving
philschmid Jan 19, 2021
2bf289d
fixed quality
philschmid Jan 19, 2021
83e4673
added documentation
philschmid Jan 19, 2021
04042ea
implemented save_to_disk for local remote filesystem with temp dir
philschmid Jan 19, 2021
ec29076
fixed documentation example
philschmid Jan 19, 2021
187e01d
fixed documentation for botocore and boto3
philschmid Jan 19, 2021
7785f90
Merge branch 'master' of git://github.com/huggingface/datasets
philschmid Jan 19, 2021
926f31c
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
22b33d7
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
72440ba
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
ea273a8
Update src/datasets/arrow_dataset.py
philschmid Jan 22, 2021
5359003
Update src/datasets/arrow_dataset.py
philschmid Jan 22, 2021
fd106e4
Update src/datasets/filesystems/__init__.py
philschmid Jan 22, 2021
878f8b7
Update src/datasets/filesystems/s3filesystem.py
philschmid Jan 22, 2021
0b1a2f8
Update src/datasets/filesystems/s3filesystem.py
philschmid Jan 22, 2021
a3bebd5
Update src/datasets/load.py
philschmid Jan 22, 2021
eb69cdb
removed unnecessary @mock_s3
philschmid Jan 22, 2021
8b7cd48
Update docs/source/filesystems.rst
philschmid Jan 22, 2021
9d7f5c6
Update docs/source/filesystems.rst
philschmid Jan 26, 2021
8514bee
Update src/datasets/filesystems/s3filesystem.py
philschmid Jan 26, 2021
a8738ca
Update docs/source/processing.rst
philschmid Jan 26, 2021
154 changes: 154 additions & 0 deletions docs/source/filesystems.rst
@@ -0,0 +1,154 @@
FileSystems Integration for Cloud Storage
==========================================

Supported Filesystems
---------------------

Currently ``datasets`` offers an S3 filesystem implementation with :class:`datasets.filesystems.S3FileSystem`. ``S3FileSystem`` is a subclass of `s3fs.S3FileSystem <https://s3fs.readthedocs.io/en/latest/api.html>`_, which is a well-known implementation of ``fsspec``.
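
Because ``S3FileSystem`` subclasses ``s3fs.S3FileSystem``, the usual ``fsspec``/``s3fs`` methods should also be available on it. A minimal, untested sketch (bucket and local paths are placeholders) that downloads a saved dataset directory:

.. code-block::

    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # anonymous access to a public bucket (placeholder path)
    >>> s3 = S3FileSystem(anon=True)  # doctest: +SKIP
    >>> # recursively copy the remote directory to a local folder
    >>> s3.get('public-datasets/imdb/train', './imdb/train', recursive=True)  # doctest: +SKIP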

Furthermore, ``datasets`` supports all ``fsspec`` implementations. The currently known implementations are:

- `s3fs <https://s3fs.readthedocs.io/en/latest/>`_ for Amazon S3 and other compatible stores
- `gcsfs <https://gcsfs.readthedocs.io/en/latest/>`_ for Google Cloud Storage
- `adl <https://github.com/dask/adlfs>`_ for Azure DataLake storage
- `abfs <https://github.com/dask/adlfs>`_ for Azure Blob service
- `dropbox <https://github.com/MarineChap/dropboxdrivefs>`_ for access to dropbox shares
- `gdrive <https://github.com/intake/gdrivefs>`_ to access Google Drive and shares (experimental)

These known implementations are going to be natively supported within ``datasets`` in the near future.
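
As a sketch of what this could look like with another ``fsspec`` implementation (an untested assumption: ``gcsfs`` is installed and the generic ``fs`` argument accepts any ``fsspec`` filesystem; the project and bucket names are placeholders):

.. code-block::

    >>> import gcsfs
    >>> from datasets import load_from_disk
    >>>
    >>> # instantiate a Google Cloud Storage filesystem (placeholder project name)
    >>> gcs = gcsfs.GCSFileSystem(project='my-google-project')  # doctest: +SKIP
    >>>
    >>> # save and reload a processed dataset through the generic fs argument
    >>> encoded_dataset.save_to_disk('gcs://my-private-datasets/imdb/train', fs=gcs)  # doctest: +SKIP
    >>> dataset = load_from_disk('gcs://my-private-datasets/imdb/train', fs=gcs)  # doctest: +SKIP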

**Examples:**

Example using :class:`datasets.filesystems.S3FileSystem` within ``datasets``. First install the S3 dependencies:

.. code-block:: bash

    pip install datasets[s3]

Listing files from a public S3 bucket.

.. code-block::

    >>> import datasets
    >>> s3 = datasets.filesystems.S3FileSystem(anon=True)  # doctest: +SKIP
    >>> s3.ls('public-datasets/imdb/train')  # doctest: +SKIP
    ['dataset_info.json', 'dataset.arrow', 'state.json']

Listing files from a private S3 bucket using ``aws_access_key_id`` and ``aws_secret_access_key``.

.. code-block::

    >>> import datasets
    >>> s3 = datasets.filesystems.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)  # doctest: +SKIP
    >>> s3.ls('my-private-datasets/imdb/train')  # doctest: +SKIP
    ['dataset_info.json', 'dataset.arrow', 'state.json']

Using ``S3FileSystem`` with a ``botocore.session.Session`` and a custom ``aws_profile``.

.. code-block::

    >>> import botocore
    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> s3_session = botocore.session.Session(profile_name='my_profile_name')
    >>> s3 = S3FileSystem(session=s3_session)  # doctest: +SKIP



Saving a processed dataset to S3
--------------------------------

Once you have your final dataset you can save it to S3 and reuse it later using :obj:`datasets.load_from_disk`.
Saving a dataset to S3 uploads various files to your bucket:

- arrow files (e.g. ``dataset.arrow``): they contain your dataset's data
- ``dataset_info.json``: contains the description, citations, etc. of the dataset
- ``state.json``: contains the list of the arrow files and other information like the dataset format type, if any (torch or tensorflow for example)

Saving ``encoded_dataset`` to a private S3 bucket using ``aws_access_key_id`` and ``aws_secret_access_key``.

.. code-block::

    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # create an S3FileSystem instance with aws_access_key_id and aws_secret_access_key
    >>> s3 = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)  # doctest: +SKIP
    >>>
    >>> # save encoded_dataset to your s3 bucket
    >>> encoded_dataset.save_to_disk('s3://my-private-datasets/imdb/train', fs=s3)  # doctest: +SKIP
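
To double-check the upload, you could list the target path with the same ``s3`` instance; a minimal sketch whose output is illustrative:

.. code-block::

    >>> # list the uploaded files (illustrative output)
    >>> s3.ls('my-private-datasets/imdb/train')  # doctest: +SKIP
    ['dataset_info.json', 'dataset.arrow', 'state.json']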

Saving ``encoded_dataset`` to a private S3 bucket using a ``botocore.session.Session`` and a custom ``aws_profile``.

.. code-block::

    >>> import botocore
    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # create a botocore session with the provided aws_profile
    >>> s3_session = botocore.session.Session(profile_name='my_profile_name')
    >>>
    >>> # create an S3FileSystem instance with the botocore session
    >>> s3 = S3FileSystem(session=s3_session)  # doctest: +SKIP
    >>>
    >>> # save encoded_dataset to your s3 bucket
    >>> encoded_dataset.save_to_disk('s3://my-private-datasets/imdb/train', fs=s3)  # doctest: +SKIP
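
The same call is expected to work for a :obj:`datasets.DatasetDict`, with each split written to its own sub-directory. A sketch, assuming ``encoded_datasets`` is a ``DatasetDict`` (the bucket path is a placeholder):

.. code-block::

    >>> # save every split of the DatasetDict under s3://my-private-datasets/imdb
    >>> encoded_datasets.save_to_disk('s3://my-private-datasets/imdb', fs=s3)  # doctest: +SKIP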


Loading a processed dataset from S3
-----------------------------------

After you have saved your processed dataset to S3 you can load it using :obj:`datasets.load_from_disk`.
You can only load datasets from S3 that were saved using :func:`datasets.Dataset.save_to_disk`
or :func:`datasets.DatasetDict.save_to_disk`.

Loading ``encoded_dataset`` from a public S3 bucket.

.. code-block::

    >>> from datasets import load_from_disk
    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # create an S3FileSystem instance without credentials
    >>> s3 = S3FileSystem(anon=True)  # doctest: +SKIP
    >>>
    >>> # load encoded_dataset from the s3 bucket
    >>> dataset = load_from_disk('s3://a-public-datasets/imdb/train', fs=s3)  # doctest: +SKIP
    >>>
    >>> print(len(dataset))
    25000

Loading ``encoded_dataset`` from a private S3 bucket using ``aws_access_key_id`` and ``aws_secret_access_key``.

.. code-block::

    >>> from datasets import load_from_disk
    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # create an S3FileSystem instance with aws_access_key_id and aws_secret_access_key
    >>> s3 = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)  # doctest: +SKIP
    >>>
    >>> # load encoded_dataset from the s3 bucket
    >>> dataset = load_from_disk('s3://my-private-datasets/imdb/train', fs=s3)  # doctest: +SKIP
    >>>
    >>> print(len(dataset))
    25000

Loading ``encoded_dataset`` from a private S3 bucket using a ``botocore.session.Session`` and a custom ``aws_profile``.

.. code-block::

    >>> import botocore
    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # create a botocore session with the provided aws_profile
    >>> s3_session = botocore.session.Session(profile_name='my_profile_name')
    >>>
    >>> # create an S3FileSystem instance with the botocore session
    >>> s3 = S3FileSystem(session=s3_session)  # doctest: +SKIP
    >>>
    >>> # load encoded_dataset from the s3 bucket
    >>> dataset = load_from_disk('s3://my-private-datasets/imdb/train', fs=s3)  # doctest: +SKIP
    >>>
    >>> print(len(dataset))
    25000
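
Similarly, a :obj:`datasets.DatasetDict` saved to S3 should be reloadable as a whole, with its splits accessed by name. A sketch under that assumption (URI, credentials and the row count are placeholders):

.. code-block::

    >>> from datasets import load_from_disk
    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> s3 = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)  # doctest: +SKIP
    >>> # load the whole DatasetDict and pick a split by name
    >>> datasets_dict = load_from_disk('s3://my-private-datasets/imdb', fs=s3)  # doctest: +SKIP
    >>> print(datasets_dict['train'].num_rows)  # doctest: +SKIP
    25000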
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -43,6 +43,7 @@ The documentation is organized in five parts:
exploring
processing
torch_tensorflow
filesystems
faiss_and_ea

.. toctree::
11 changes: 11 additions & 0 deletions docs/source/package_reference/main_classes.rst
@@ -96,3 +96,14 @@ The base class ``Metric`` implements a Metric backed by one or several :class:`d

.. autoclass:: datasets.Metric
:members:


``Filesystems``
~~~~~~~~~~~~~~~~~~~~~


.. autoclass:: datasets.filesystems.S3FileSystem(anon=False, key=None, secret=None, token=None, use_ssl=True, client_kwargs=None, requester_pays=False, default_block_size=None, default_fill_cache=True, default_cache_type='bytes', version_aware=False, config_kwargs=None, s3_additional_kwargs=None, session=None, username=None, password=None, asynchronous=False, loop=None, **kwargs)

.. autofunction:: datasets.filesystems.extract_path_from_uri

.. autofunction:: datasets.filesystems.is_remote_filesystem
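
A hedged sketch of how these helpers are expected to behave; the return values shown are illustrative assumptions, not captured output:

.. code-block::

    >>> from datasets.filesystems import S3FileSystem, extract_path_from_uri, is_remote_filesystem
    >>>
    >>> # strip the protocol prefix from a remote URI (illustrative)
    >>> extract_path_from_uri('s3://my-private-datasets/imdb/train')
    'my-private-datasets/imdb/train'
    >>>
    >>> # check whether a filesystem instance points to remote storage
    >>> s3 = S3FileSystem(anon=True)  # doctest: +SKIP
    >>> is_remote_filesystem(s3)  # doctest: +SKIP
    True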
4 changes: 4 additions & 0 deletions docs/source/processing.rst
@@ -488,6 +488,10 @@ Saving a dataset creates a directory with various files:

Both :obj:`datasets.Dataset` and :obj:`datasets.DatasetDict` objects can be saved on disk, by using respectively :func:`datasets.Dataset.save_to_disk` and :func:`datasets.DatasetDict.save_to_disk`.

Furthermore, it is also possible to save :obj:`datasets.Dataset` and :obj:`datasets.DatasetDict` objects to other filesystems and cloud storage such as S3, by using respectively :func:`datasets.Dataset.save_to_disk`
and :func:`datasets.DatasetDict.save_to_disk` and providing a filesystem instance as the ``fs`` argument. To learn more about saving your ``datasets`` to other filesystems, take a look at :doc:`filesystems` and the minimal sketch below.
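
A minimal sketch (the credential variables, bucket path and ``encoded_dataset`` are placeholders):

.. code-block::

    >>> from datasets.filesystems import S3FileSystem
    >>>
    >>> # create an S3FileSystem instance with your credentials
    >>> s3 = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)  # doctest: +SKIP
    >>> # upload the processed dataset to the given bucket path
    >>> encoded_dataset.save_to_disk('s3://my-private-datasets/imdb/train', fs=s3)  # doctest: +SKIP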


Controling the cache behavior
-----------------------------------

Expand Down
118 changes: 68 additions & 50 deletions setup.py
@@ -59,19 +59,19 @@
from setuptools import find_packages
from setuptools import setup

DOCLINES = __doc__.split('\n')
DOCLINES = __doc__.split("\n")

REQUIRED_PKGS = [
# We use numpy>=1.17 to have np.random.Generator (Dataset shuffling)
'numpy>=1.17',
"numpy>=1.17",
# Backend and serialization. Minimum 0.17.1 to support extension array
'pyarrow>=0.17.1',
"pyarrow>=0.17.1",
# For smart caching dataset processing
'dill',
"dill",
# For performance gains with apache arrow
'pandas',
"pandas",
# for downloading datasets over HTTPS
'requests>=2.19.0',
"requests>=2.19.0",
# progress bars in download and scripts
# tqdm 4.50.0 introduced permission errors on windows
# see https://app.circleci.com/pipelines/github/huggingface/datasets/235/workflows/cfb6a39f-68eb-4802-8b17-2cd5e8ea7369/jobs/1111
@@ -83,37 +83,43 @@
# for better multiprocessing
"multiprocess",
# to get metadata of optional dependencies such as torch or tensorflow for Python versions that don't have it
"importlib_metadata;python_version<'3.8'"
"importlib_metadata;python_version<'3.8'",
# for saving datasets to a local filesystem
"fsspec",
Review comment on lines +87 to +88

Member: We need fsspec in the requirements just for the local filesystem, right? Maybe we can simply have our own LocalFileSystem from their implementation instead, in datasets.filesystem.LocalFileSystem.

Member (Author): Yes, we need fsspec for the local filesystem. As for copying their implementation instead of installing fsspec, I can give it a try later.

Member (Author): Sadly not possible, since LocalFileSystem has AbstractFileSystem as a dependency, which has several more dependencies.
https://filesystem-spec.readthedocs.io/en/latest/_modules/fsspec/spec.html#AbstractFileSystem

Member: We've taken a look with @philschmid and it's not just a class to copy, so we'll need to have fsspec as a hard dependency.

]

BENCHMARKS_REQUIRE = [
'numpy==1.18.5',
'tensorflow==2.3.0',
'torch==1.6.0',
'transformers==3.0.2',
"numpy==1.18.5",
"tensorflow==2.3.0",
"torch==1.6.0",
"transformers==3.0.2",
]

TESTS_REQUIRE = [
'apache-beam',
'absl-py',
'bs4',
'conllu',
'elasticsearch',
'faiss-cpu',
'langdetect',
'lxml',
'mwparserfromhell',
'nltk',
'openpyxl',
'py7zr',
'pytest',
'pytest-xdist',
'tensorflow',
'torch',
'tldextract',
'transformers',
'zstandard',
'rarfile',
"apache-beam",
"absl-py",
"bs4",
"conllu",
"elasticsearch",
"faiss-cpu",
"langdetect",
"lxml",
"mwparserfromhell",
"nltk",
"openpyxl",
"py7zr",
"pytest",
"pytest-xdist",
"tensorflow",
"torch",
"tldextract",
"transformers",
"zstandard",
"rarfile",
"moto[s3]==1.3.16",
"fsspec[s3]",
"boto3==1.16.43",
"botocore==1.19.43",
]

if os.name == "nt": # windows
@@ -128,32 +134,44 @@


EXTRAS_REQUIRE = {
'apache-beam': ['apache-beam'],
'tensorflow': ['tensorflow>=2.2.0'],
'tensorflow_gpu': ['tensorflow-gpu>=2.2.0'],
'torch': ['torch'],
'dev': TESTS_REQUIRE + QUALITY_REQUIRE,
'tests': TESTS_REQUIRE,
'quality': QUALITY_REQUIRE,
'benchmarks': BENCHMARKS_REQUIRE,
'docs': ["recommonmark", "sphinx==3.1.2", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", "sphinx-copybutton"]
"apache-beam": ["apache-beam"],
"tensorflow": ["tensorflow>=2.2.0"],
"tensorflow_gpu": ["tensorflow-gpu>=2.2.0"],
"torch": ["torch"],
"s3": [
"fsspec[s3]",
"boto3==1.16.43",
"botocore==1.19.43",
],
"dev": TESTS_REQUIRE + QUALITY_REQUIRE,
"tests": TESTS_REQUIRE,
"quality": QUALITY_REQUIRE,
"benchmarks": BENCHMARKS_REQUIRE,
"docs": [
"recommonmark",
"sphinx==3.1.2",
"sphinx-markdown-tables",
"sphinx-rtd-theme==0.4.3",
"sphinx-copybutton",
"fsspec[s3]",
],
}

setup(
name='datasets',
name="datasets",
version="1.2.1",
description=DOCLINES[0],
long_description='\n'.join(DOCLINES[2:]),
author='HuggingFace Inc.',
author_email='thomas@huggingface.co',
url='https://github.com/huggingface/datasets',
download_url='https://github.com/huggingface/datasets/tags',
license='Apache 2.0',
long_description="\n".join(DOCLINES[2:]),
author="HuggingFace Inc.",
author_email="thomas@huggingface.co",
url="https://github.com/huggingface/datasets",
download_url="https://github.com/huggingface/datasets/tags",
license="Apache 2.0",
package_dir={"": "src"},
packages=find_packages("src"),
package_data={
'datasets': [
'scripts/templates/*',
"datasets": [
"scripts/templates/*",
],
},
scripts=["datasets-cli"],
@@ -171,5 +189,5 @@
"Programming Language :: Python :: 3.7",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
keywords='datasets machine learning datasets metrics',
keywords="datasets machine learning datasets metrics",
)