Skip to content

Commit

Permalink
Fix temporary dataset_path creation for URIs related to remote fs (#3296)
Browse files Browse the repository at this point in the history

This aims to close #3295
  • Loading branch information
francisco-perez-sorrosal committed Dec 6, 2021
1 parent 16f562b commit 73ed661
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
20 changes: 18 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,23 @@ def save_to_disk(self, dataset_path: str, fs=None):
json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)
logger.info(f"Dataset saved in {dataset_path}")

@staticmethod
def _build_local_temp_path(uri_or_path: str) -> Path:
    """
    Build a local temporary path for a dataset by joining the temporary cache
    directory with the given path (or the path previously extracted from a
    remote URI), with any leading anchor (e.g. `/` or a drive root) stripped
    so the result always lands inside the temp dir.

    Args:
        uri_or_path (:obj:`str`): Path (e.g. `"dataset/train"`) or remote URI
            (e.g. `"s3://my-bucket/dataset/train"`) to concatenate.

    Returns:
        :class:`Path`: the concatenated path (temp dir + path)
    """
    # Make the path relative by dropping its anchor; otherwise an absolute
    # path would override the temp-dir prefix when joined.
    relative_part = Path(uri_or_path)
    relative_part = relative_part.relative_to(relative_part.anchor)
    return Path(get_temporary_cache_files_directory()) / relative_part

@staticmethod
def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> "Dataset":
"""
Expand Down Expand Up @@ -1034,8 +1051,7 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =

if is_remote_filesystem(fs):
src_dataset_path = extract_path_from_uri(dataset_path)
tmp_dir = get_temporary_cache_files_directory()
dataset_path = Path(tmp_dir, src_dataset_path)
dataset_path = Dataset._build_local_temp_path(src_dataset_path)
fs.download(src_dataset_path, dataset_path.as_posix(), recursive=True)

with open(
Expand Down
24 changes: 24 additions & 0 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from datasets.arrow_dataset import Dataset, transmit_format, update_metadata_with_features
from datasets.dataset_dict import DatasetDict
from datasets.features import Array2D, Array3D, ClassLabel, Features, Sequence, Value
from datasets.filesystems import extract_path_from_uri
from datasets.info import DatasetInfo
from datasets.splits import NamedSplit
from datasets.table import ConcatenationTable, InMemoryTable, MemoryMappedTable
Expand Down Expand Up @@ -2804,6 +2805,29 @@ def test_dummy_dataset_serialize_s3(s3, dataset):
assert dataset["id"][0] == 0


@pytest.mark.parametrize(
    "uri_or_path",
    [
        "relative/path",
        "/absolute/path",
        "s3://bucket/relative/path",
        "hdfs://relative/path",
        "hdfs:///absolute/path",
    ],
)
def test_build_local_temp_path(uri_or_path):
    # Strip any remote-fs scheme, then build the temp path from the remainder.
    extracted_path = extract_path_from_uri(uri_or_path)
    local_temp_path = Dataset._build_local_temp_path(extracted_path)
    path_posix = local_temp_path.as_posix()

    # The result must live under the temp dir, carry no scheme residue,
    # and end with (but not be rooted at) the extracted path.
    checks = (
        "tmp" in path_posix,
        "hdfs" not in path_posix,
        "s3" not in path_posix,
        not path_posix.startswith(extracted_path),
        path_posix.endswith(extracted_path),
    )
    assert all(checks), f"Local temp path: {local_temp_path.as_posix()}"


class TaskTemplatesTest(TestCase):
def test_task_text_classification(self):
labels = sorted(["pos", "neg"])
Expand Down

1 comment on commit 73ed661

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.083456 / 0.011353 (0.072103) 0.004486 / 0.011008 (-0.006522) 0.037017 / 0.038508 (-0.001491) 0.041550 / 0.023109 (0.018441) 0.355934 / 0.275898 (0.080036) 0.391467 / 0.323480 (0.067987) 0.099090 / 0.007986 (0.091104) 0.004632 / 0.004328 (0.000303) 0.010669 / 0.004250 (0.006419) 0.045643 / 0.037052 (0.008590) 0.349398 / 0.258489 (0.090909) 0.386518 / 0.293841 (0.092677) 0.100075 / 0.128546 (-0.028471) 0.010150 / 0.075646 (-0.065497) 0.302824 / 0.419271 (-0.116448) 0.053346 / 0.043533 (0.009813) 0.360494 / 0.255139 (0.105355) 0.385814 / 0.283200 (0.102615) 0.093483 / 0.141683 (-0.048200) 2.006723 / 1.452155 (0.554568) 2.102447 / 1.492716 (0.609730)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.261294 / 0.018006 (0.243288) 0.481148 / 0.000490 (0.480658) 0.004873 / 0.000200 (0.004673) 0.000103 / 0.000054 (0.000048)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.043003 / 0.037411 (0.005592) 0.025331 / 0.014526 (0.010805) 0.030422 / 0.176557 (-0.146135) 0.227114 / 0.737135 (-0.510022) 0.031621 / 0.296338 (-0.264717)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.491913 / 0.215209 (0.276704) 4.837987 / 2.077655 (2.760332) 2.058927 / 1.504120 (0.554807) 1.824324 / 1.541195 (0.283129) 1.905727 / 1.468490 (0.437237) 0.484709 / 4.584777 (-4.100068) 5.748164 / 3.745712 (2.002452) 2.379986 / 5.269862 (-2.889876) 1.007344 / 4.565676 (-3.558333) 0.057195 / 0.424275 (-0.367080) 0.013135 / 0.007607 (0.005528) 0.616642 / 0.226044 (0.390598) 6.140592 / 2.268929 (3.871664) 2.600819 / 55.444624 (-52.843805) 2.201352 / 6.876477 (-4.675125) 2.357745 / 2.142072 (0.215673) 0.624816 / 4.805227 (-4.180411) 0.135037 / 6.500664 (-6.365627) 0.067012 / 0.075469 (-0.008457)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.827103 / 1.841788 (-0.014684) 14.294101 / 8.074308 (6.219793) 30.893883 / 10.191392 (20.702491) 0.953652 / 0.680424 (0.273228) 0.622659 / 0.534201 (0.088458) 0.431622 / 0.579283 (-0.147662) 0.646931 / 0.434364 (0.212567) 0.303840 / 0.540337 (-0.236497) 0.309934 / 1.386936 (-1.077002)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.080951 / 0.011353 (0.069598) 0.004296 / 0.011008 (-0.006713) 0.034738 / 0.038508 (-0.003770) 0.039591 / 0.023109 (0.016482) 0.342720 / 0.275898 (0.066822) 0.381125 / 0.323480 (0.057645) 0.099680 / 0.007986 (0.091695) 0.005472 / 0.004328 (0.001143) 0.008561 / 0.004250 (0.004310) 0.048865 / 0.037052 (0.011812) 0.343882 / 0.258489 (0.085393) 0.387574 / 0.293841 (0.093733) 0.099369 / 0.128546 (-0.029177) 0.009827 / 0.075646 (-0.065819) 0.296346 / 0.419271 (-0.122926) 0.054068 / 0.043533 (0.010536) 0.348950 / 0.255139 (0.093811) 0.372394 / 0.283200 (0.089194) 0.091117 / 0.141683 (-0.050566) 2.112130 / 1.452155 (0.659975) 2.112627 / 1.492716 (0.619911)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.349771 / 0.018006 (0.331764) 0.519072 / 0.000490 (0.518582) 0.042636 / 0.000200 (0.042436) 0.000694 / 0.000054 (0.000639)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.039989 / 0.037411 (0.002578) 0.024662 / 0.014526 (0.010136) 0.030650 / 0.176557 (-0.145907) 0.238172 / 0.737135 (-0.498963) 0.031813 / 0.296338 (-0.264526)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.497189 / 0.215209 (0.281980) 4.848608 / 2.077655 (2.770954) 2.088782 / 1.504120 (0.584662) 1.833015 / 1.541195 (0.291820) 1.904825 / 1.468490 (0.436335) 0.481447 / 4.584777 (-4.103330) 5.832075 / 3.745712 (2.086363) 2.473656 / 5.269862 (-2.796205) 1.099798 / 4.565676 (-3.465879) 0.057914 / 0.424275 (-0.366361) 0.013033 / 0.007607 (0.005426) 0.621075 / 0.226044 (0.395030) 6.194744 / 2.268929 (3.925815) 2.611181 / 55.444624 (-52.833443) 2.175320 / 6.876477 (-4.701157) 2.272825 / 2.142072 (0.130753) 0.616162 / 4.805227 (-4.189065) 0.130652 / 6.500664 (-6.370012) 0.064658 / 0.075469 (-0.010811)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.813620 / 1.841788 (-0.028168) 13.809869 / 8.074308 (5.735561) 31.032232 / 10.191392 (20.840840) 0.952592 / 0.680424 (0.272169) 0.619119 / 0.534201 (0.084918) 0.427298 / 0.579283 (-0.151985) 0.623593 / 0.434364 (0.189229) 0.297434 / 0.540337 (-0.242904) 0.330026 / 1.386936 (-1.056910)

CML watermark

Please sign in to comment.