Skip to content

Commit

Permalink
#2837 Use cache folder for lockfile (#2887)
Browse files Browse the repository at this point in the history
* #2837 Use cache folder for lockfile

* #2837 lint

* #2837 simplify

* #2837 Fix according to review

* minor change in fixture

* Apply suggestions from code review

* fix test

Co-authored-by: fr.branchaud-charron <fr.branchaud-charron@servicenow.com>
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
  • Loading branch information
4 people committed Oct 5, 2021
1 parent 9379a5a commit dcd523c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,8 +711,10 @@ def _get_modification_time(module_hash):
local_file_path = os.path.join(hash_folder_path, name)
dataset_infos_path = os.path.join(hash_folder_path, config.DATASETDICT_INFOS_FILENAME)

# Create the lock file where we know we have write permissions.
lock_path = (datasets_modules_path if dataset else metrics_modules_path) + f"{short_name}.lock"

# Prevent parallel disk operations
lock_path = local_path + ".lock"
with FileLock(lock_path):
# Create main dataset/metrics folder if needed
if download_mode == GenerateMode.FORCE_REDOWNLOAD and os.path.exists(main_folder_path):
Expand Down
26 changes: 26 additions & 0 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,21 @@ def dataset_loading_script_dir(tmp_path):
return str(script_dir)


@pytest.fixture
def dataset_loading_script_dir_readonly(tmp_path):
    """Write the dataset loading script into a directory made read-only.

    Returns the directory path as a string; both the directory and the
    script file inside it have write permission removed (mode 0o555), so a
    test can check that loading never needs to write next to the script.
    """
    name = DATASET_LOADING_SCRIPT_NAME
    directory = tmp_path / "readonly" / name
    directory.mkdir(parents=True)
    (directory / f"{name}.py").write_text(DATASET_LOADING_SCRIPT_CODE)
    readonly_dir = str(directory)
    # Strip write permission from the folder and the script it contains.
    os.chmod(readonly_dir, 0o555)
    os.chmod(os.path.join(readonly_dir, f"{name}.py"), 0o555)
    return readonly_dir


class LoadTest(TestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
Expand Down Expand Up @@ -422,6 +437,17 @@ def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir
assert dataset._fingerprint != fingerprint1


def test_load_dataset_readonly(dataset_loading_script_dir, dataset_loading_script_dir_readonly, data_dir, tmp_path):
    """A script in a read-only folder must load and produce the same fingerprint as a writable one."""
    writable_cache = tmp_path / "cache1"
    readonly_cache = tmp_path / "cache2"
    # First load from the writable copy of the script to get a reference fingerprint.
    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=writable_cache)
    fingerprint1 = dataset._fingerprint
    del dataset
    # Then load from the read-only copy: it must succeed and match the reference.
    dataset = load_dataset(
        dataset_loading_script_dir_readonly, data_dir=data_dir, split="train", cache_dir=readonly_cache
    )
    assert dataset._fingerprint == fingerprint1, "Cannot load a dataset in a readonly folder."


@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 50, 500])
def test_load_dataset_local_with_default_in_memory(
max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, monkeypatch
Expand Down

1 comment on commit dcd523c

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008487 / 0.011353 (-0.002866) 0.003786 / 0.011008 (-0.007223) 0.028521 / 0.038508 (-0.009987) 0.032040 / 0.023109 (0.008931) 0.263115 / 0.275898 (-0.012783) 0.300860 / 0.323480 (-0.022620) 0.007499 / 0.007986 (-0.000486) 0.003608 / 0.004328 (-0.000721) 0.008647 / 0.004250 (0.004396) 0.042557 / 0.037052 (0.005504) 0.262390 / 0.258489 (0.003901) 0.304177 / 0.293841 (0.010336) 0.020894 / 0.128546 (-0.107652) 0.007265 / 0.075646 (-0.068382) 0.227915 / 0.419271 (-0.191356) 0.041900 / 0.043533 (-0.001633) 0.259926 / 0.255139 (0.004787) 0.285733 / 0.283200 (0.002533) 0.081325 / 0.141683 (-0.060358) 1.509555 / 1.452155 (0.057400) 1.649267 / 1.492716 (0.156551)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.234937 / 0.018006 (0.216931) 0.536058 / 0.000490 (0.535568) 0.004656 / 0.000200 (0.004456) 0.000086 / 0.000054 (0.000031)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.037493 / 0.037411 (0.000082) 0.022418 / 0.014526 (0.007893) 0.028220 / 0.176557 (-0.148337) 0.127410 / 0.737135 (-0.609725) 0.028284 / 0.296338 (-0.268055)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.312123 / 0.215209 (0.096914) 3.126490 / 2.077655 (1.048835) 1.559180 / 1.504120 (0.055060) 1.402622 / 1.541195 (-0.138573) 1.451148 / 1.468490 (-0.017343) 0.274148 / 4.584777 (-4.310629) 4.241411 / 3.745712 (0.495699) 0.814308 / 5.269862 (-4.455554) 0.723972 / 4.565676 (-3.841705) 0.032836 / 0.424275 (-0.391439) 0.004310 / 0.007607 (-0.003297) 0.400032 / 0.226044 (0.173987) 3.994562 / 2.268929 (1.725634) 1.956819 / 55.444624 (-53.487805) 1.666318 / 6.876477 (-5.210158) 1.706194 / 2.142072 (-0.435878) 0.367165 / 4.805227 (-4.438062) 0.087510 / 6.500664 (-6.413154) 0.046544 / 0.075469 (-0.028925)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 0.785174 / 1.841788 (-1.056614) 13.103566 / 8.074308 (5.029258) 22.299414 / 10.191392 (12.108022) 0.823358 / 0.680424 (0.142934) 0.504877 / 0.534201 (-0.029324) 0.225474 / 0.579283 (-0.353809) 0.485050 / 0.434364 (0.050686) 0.159637 / 0.540337 (-0.380701) 0.168763 / 1.386936 (-1.218173)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008693 / 0.011353 (-0.002660) 0.003700 / 0.011008 (-0.007308) 0.027968 / 0.038508 (-0.010540) 0.032701 / 0.023109 (0.009592) 0.257367 / 0.275898 (-0.018531) 0.292078 / 0.323480 (-0.031402) 0.007562 / 0.007986 (-0.000424) 0.003447 / 0.004328 (-0.000882) 0.008352 / 0.004250 (0.004101) 0.040498 / 0.037052 (0.003445) 0.257781 / 0.258489 (-0.000708) 0.300406 / 0.293841 (0.006565) 0.021822 / 0.128546 (-0.106724) 0.007094 / 0.075646 (-0.068552) 0.224239 / 0.419271 (-0.195032) 0.043063 / 0.043533 (-0.000470) 0.256247 / 0.255139 (0.001108) 0.279703 / 0.283200 (-0.003497) 0.083783 / 0.141683 (-0.057900) 1.521922 / 1.452155 (0.069767) 1.664353 / 1.492716 (0.171637)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.385230 / 0.018006 (0.367224) 0.575653 / 0.000490 (0.575163) 0.047837 / 0.000200 (0.047637) 0.000548 / 0.000054 (0.000493)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.032498 / 0.037411 (-0.004913) 0.020246 / 0.014526 (0.005720) 0.025928 / 0.176557 (-0.150629) 0.114434 / 0.737135 (-0.622701) 0.027680 / 0.296338 (-0.268658)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.307696 / 0.215209 (0.092487) 3.078179 / 2.077655 (1.000524) 1.623534 / 1.504120 (0.119414) 1.502450 / 1.541195 (-0.038744) 1.547059 / 1.468490 (0.078569) 0.272320 / 4.584777 (-4.312457) 4.388645 / 3.745712 (0.642933) 0.875869 / 5.269862 (-4.393992) 0.816349 / 4.565676 (-3.749327) 0.037352 / 0.424275 (-0.386923) 0.004927 / 0.007607 (-0.002680) 0.449139 / 0.226044 (0.223095) 4.531909 / 2.268929 (2.262981) 2.236620 / 55.444624 (-53.208004) 1.898049 / 6.876477 (-4.978427) 1.948350 / 2.142072 (-0.193722) 0.420630 / 4.805227 (-4.384597) 0.099351 / 6.500664 (-6.401313) 0.053024 / 0.075469 (-0.022445)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 0.788064 / 1.841788 (-1.053723) 12.673276 / 8.074308 (4.598968) 19.801436 / 10.191392 (9.610044) 0.605664 / 0.680424 (-0.074760) 0.450804 / 0.534201 (-0.083396) 0.202498 / 0.579283 (-0.376785) 0.503991 / 0.434364 (0.069627) 0.182834 / 0.540337 (-0.357504) 0.195332 / 1.386936 (-1.191604)

CML watermark

Please sign in to comment.