Skip to content

Commit

Permalink
#2837 Use cache folder for lockfile (#2887)
Browse files Browse the repository at this point in the history
* #2837 Use cache folder for lockfile

* #2837 lint

* #2837 simplify

* #2837 Fix according to review

* minor change in fixture

* Apply suggestions from code review

* fix test

Co-authored-by: fr.branchaud-charron <fr.branchaud-charron@servicenow.com>
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
  • Loading branch information
4 people committed Oct 5, 2021
1 parent 9379a5a commit dcd523c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/datasets/load.py
Original file line number Diff line number Diff line change
Expand Up @@ -711,8 +711,10 @@ def _get_modification_time(module_hash):
local_file_path = os.path.join(hash_folder_path, name)
dataset_infos_path = os.path.join(hash_folder_path, config.DATASETDICT_INFOS_FILENAME)

# Create the lock file where we know we have write permissions.
lock_path = (datasets_modules_path if dataset else metrics_modules_path) + f"{short_name}.lock"

# Prevent parallel disk operations
lock_path = local_path + ".lock"
with FileLock(lock_path):
# Create main dataset/metrics folder if needed
if download_mode == GenerateMode.FORCE_REDOWNLOAD and os.path.exists(main_folder_path):
Expand Down
26 changes: 26 additions & 0 deletions tests/test_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,21 @@ def dataset_loading_script_dir(tmp_path):
return str(script_dir)


@pytest.fixture
def dataset_loading_script_dir_readonly(tmp_path):
    """Write the dataset loading script into a directory made read-only.

    Returns the directory path as a string; both the directory and the
    script file inside it have write permission removed (mode 0o555), so a
    test can check that loading never needs to write next to the script.
    """
    name = DATASET_LOADING_SCRIPT_NAME
    directory = tmp_path / "readonly" / name
    directory.mkdir(parents=True)
    (directory / f"{name}.py").write_text(DATASET_LOADING_SCRIPT_CODE)
    readonly_dir = str(directory)
    # Strip write permission from the folder and the script it contains.
    os.chmod(readonly_dir, 0o555)
    os.chmod(os.path.join(readonly_dir, f"{name}.py"), 0o555)
    return readonly_dir


class LoadTest(TestCase):
@pytest.fixture(autouse=True)
def inject_fixtures(self, caplog):
Expand Down Expand Up @@ -422,6 +437,17 @@ def test_load_dataset_then_move_then_reload(dataset_loading_script_dir, data_dir
assert dataset._fingerprint != fingerprint1


def test_load_dataset_readonly(dataset_loading_script_dir, dataset_loading_script_dir_readonly, data_dir, tmp_path):
    """A script in a read-only folder must load and produce the same fingerprint as a writable one."""
    writable_cache = tmp_path / "cache1"
    readonly_cache = tmp_path / "cache2"
    # First load from the writable copy of the script to get a reference fingerprint.
    dataset = load_dataset(dataset_loading_script_dir, data_dir=data_dir, split="train", cache_dir=writable_cache)
    fingerprint1 = dataset._fingerprint
    del dataset
    # Then load from the read-only copy: it must succeed and match the reference.
    dataset = load_dataset(
        dataset_loading_script_dir_readonly, data_dir=data_dir, split="train", cache_dir=readonly_cache
    )
    assert dataset._fingerprint == fingerprint1, "Cannot load a dataset in a readonly folder."


@pytest.mark.parametrize("max_in_memory_dataset_size", ["default", 0, 50, 500])
def test_load_dataset_local_with_default_in_memory(
max_in_memory_dataset_size, dataset_loading_script_dir, data_dir, monkeypatch
Expand Down

1 comment on commit dcd523c

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008487 / 0.011353 (-0.002866) 0.003786 / 0.011008 (-0.007223) 0.028521 / 0.038508 (-0.009987) 0.032040 / 0.023109 (0.008931) 0.263115 / 0.275898 (-0.012783) 0.300860 / 0.323480 (-0.022620) 0.007499 / 0.007986 (-0.000486) 0.003608 / 0.004328 (-0.000721) 0.008647 / 0.004250 (0.004396) 0.042557 / 0.037052 (0.005504) 0.262390 / 0.258489 (0.003901) 0.304177 / 0.293841 (0.010336) 0.020894 / 0.128546 (-0.107652) 0.007265 / 0.075646 (-0.068382) 0.227915 / 0.419271 (-0.191356) 0.041900 / 0.043533 (-0.001633) 0.259926 / 0.255139 (0.004787) 0.285733 / 0.283200 (0.002533) 0.081325 / 0.141683 (-0.060358) 1.509555 / 1.452155 (0.057400) 1.649267 / 1.492716 (0.156551)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.234937 / 0.018006 (0.216931) 0.536058 / 0.000490 (0.535568) 0.004656 / 0.000200 (0.004456) 0.000086 / 0.000054 (0.000031)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.037493 / 0.037411 (0.000082) 0.022418 / 0.014526 (0.007893) 0.028220 / 0.176557 (-0.148337) 0.127410 / 0.737135 (-0.609725) 0.028284 / 0.296338 (-0.268055)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.312123 / 0.215209 (0.096914) 3.126490 / 2.077655 (1.048835) 1.559180 / 1.504120 (0.055060) 1.402622 / 1.541195 (-0.138573) 1.451148 / 1.468490 (-0.017343) 0.274148 / 4.584777 (-4.310629) 4.241411 / 3.745712 (0.495699) 0.814308 / 5.269862 (-4.455554) 0.723972 / 4.565676 (-3.841705) 0.032836 / 0.424275 (-0.391439) 0.004310 / 0.007607 (-0.003297) 0.400032 / 0.226044 (0.173987) 3.994562 / 2.268929 (1.725634) 1.956819 / 55.444624 (-53.487805) 1.666318 / 6.876477 (-5.210158) 1.706194 / 2.142072 (-0.435878) 0.367165 / 4.805227 (-4.438062) 0.087510 / 6.500664 (-6.413154) 0.046544 / 0.075469 (-0.028925)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 0.785174 / 1.841788 (-1.056614) 13.103566 / 8.074308 (5.029258) 22.299414 / 10.191392 (12.108022) 0.823358 / 0.680424 (0.142934) 0.504877 / 0.534201 (-0.029324) 0.225474 / 0.579283 (-0.353809) 0.485050 / 0.434364 (0.050686) 0.159637 / 0.540337 (-0.380701) 0.168763 / 1.386936 (-1.218173)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.008693 / 0.011353 (-0.002660) 0.003700 / 0.011008 (-0.007308) 0.027968 / 0.038508 (-0.010540) 0.032701 / 0.023109 (0.009592) 0.257367 / 0.275898 (-0.018531) 0.292078 / 0.323480 (-0.031402) 0.007562 / 0.007986 (-0.000424) 0.003447 / 0.004328 (-0.000882) 0.008352 / 0.004250 (0.004101) 0.040498 / 0.037052 (0.003445) 0.257781 / 0.258489 (-0.000708) 0.300406 / 0.293841 (0.006565) 0.021822 / 0.128546 (-0.106724) 0.007094 / 0.075646 (-0.068552) 0.224239 / 0.419271 (-0.195032) 0.043063 / 0.043533 (-0.000470) 0.256247 / 0.255139 (0.001108) 0.279703 / 0.283200 (-0.003497) 0.083783 / 0.141683 (-0.057900) 1.521922 / 1.452155 (0.069767) 1.664353 / 1.492716 (0.171637)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.385230 / 0.018006 (0.367224) 0.575653 / 0.000490 (0.575163) 0.047837 / 0.000200 (0.047637) 0.000548 / 0.000054 (0.000493)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.032498 / 0.037411 (-0.004913) 0.020246 / 0.014526 (0.005720) 0.025928 / 0.176557 (-0.150629) 0.114434 / 0.737135 (-0.622701) 0.027680 / 0.296338 (-0.268658)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.307696 / 0.215209 (0.092487) 3.078179 / 2.077655 (1.000524) 1.623534 / 1.504120 (0.119414) 1.502450 / 1.541195 (-0.038744) 1.547059 / 1.468490 (0.078569) 0.272320 / 4.584777 (-4.312457) 4.388645 / 3.745712 (0.642933) 0.875869 / 5.269862 (-4.393992) 0.816349 / 4.565676 (-3.749327) 0.037352 / 0.424275 (-0.386923) 0.004927 / 0.007607 (-0.002680) 0.449139 / 0.226044 (0.223095) 4.531909 / 2.268929 (2.262981) 2.236620 / 55.444624 (-53.208004) 1.898049 / 6.876477 (-4.978427) 1.948350 / 2.142072 (-0.193722) 0.420630 / 4.805227 (-4.384597) 0.099351 / 6.500664 (-6.401313) 0.053024 / 0.075469 (-0.022445)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 0.788064 / 1.841788 (-1.053723) 12.673276 / 8.074308 (4.598968) 19.801436 / 10.191392 (9.610044) 0.605664 / 0.680424 (-0.074760) 0.450804 / 0.534201 (-0.083396) 0.202498 / 0.579283 (-0.376785) 0.503991 / 0.434364 (0.069627) 0.182834 / 0.540337 (-0.357504) 0.195332 / 1.386936 (-1.191604)

CML watermark

Please sign in to comment.