huggingface · lhoestq · Dec 1, 2022 · Dec 1, 2022 · Dec 1, 2022 · albertvillanova
diff --git a/src/datasets/arrow_reader.py b/src/datasets/arrow_reader.py
@@ -21,6 +21,7 @@
 import re
 import shutil
 from dataclasses import dataclass
+from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Union
 
 import pyarrow as pa
@@ -294,11 +295,12 @@ def download_from_hf_gcs(self, download_config: DownloadConfig, relative_data_di
                     split_infos=self._info.splits.values(),
                 )
                 for file_instruction in file_instructions:
-                    remote_prepared_filename = os.path.join(remote_cache_dir, file_instruction["filename"])
+                    file_to_download = str(Path(file_instruction["filename"]).relative_to(self._path))
+                    remote_prepared_filename = os.path.join(remote_cache_dir, file_to_download)
                     downloaded_prepared_filename = cached_path(
                         remote_prepared_filename.replace(os.sep, "/"), download_config=download_config
                     )
-                    shutil.move(downloaded_prepared_filename, os.path.join(self._path, file_instruction["filename"]))
+                    shutil.move(downloaded_prepared_filename, file_instruction["filename"])
         except FileNotFoundError as err:
             raise MissingFilesOnHfGcsError(err) from None
 

diff --git a/tests/test_hf_gcp.py b/tests/test_hf_gcp.py
@@ -2,6 +2,7 @@
 from tempfile import TemporaryDirectory
 from unittest import TestCase
 
+import pytest
 from absl.testing import parameterized
 
 from datasets import config
@@ -67,3 +68,23 @@ def test_dataset_info_available(self, dataset, config_name):
             ).replace(os.sep, "/")
             datset_info_path = cached_path(dataset_info_url, cache_dir=tmp_dir)
             self.assertTrue(os.path.exists(datset_info_path))
+
+
+@pytest.mark.integration
+def test_wikipedia_frr(tmp_path_factory):
+    """Prepare the wikipedia `20220301.frr` config in a temporary cache and load it as a dataset."""
+    tmp_dir = tmp_path_factory.mktemp("test_hf_gcp") / "test_wikipedia_frr"
+    dataset_module = dataset_module_factory("wikipedia", cache_dir=tmp_dir)
+    builder_cls = import_main_class(dataset_module.module_path, dataset=True)
+    builder_instance: DatasetBuilder = builder_cls(
+        cache_dir=tmp_dir,
+        config_name="20220301.frr",
+        hash=dataset_module.hash,
+    )
+
+    # use the HF cloud storage, not the original download_and_prepare that uses apache-beam
+    # NOTE(review): presumably nulling the hook makes any local-preparation fallback fail loudly — confirm
+    builder_instance._download_and_prepare = None
+    builder_instance.download_and_prepare()
+    ds = builder_instance.as_dataset()
+    assert ds is not None