Skip to content

Commit

Permalink
Fix temporary dataset_path creation for URIs related to remote fs (#3296)
Browse files Browse the repository at this point in the history

This aims to close #3295
  • Loading branch information
francisco-perez-sorrosal committed Dec 6, 2021
1 parent 16f562b commit 73ed661
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 2 deletions.
20 changes: 18 additions & 2 deletions src/datasets/arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1001,6 +1001,23 @@ def save_to_disk(self, dataset_path: str, fs=None):
json.dump(sorted_keys_dataset_info, dataset_info_file, indent=2)
logger.info(f"Dataset saved in {dataset_path}")

@staticmethod
def _build_local_temp_path(uri_or_path: str) -> Path:
    """
    Build a local temporary path for a dataset by joining the temporary cache
    directory with the given path (or the path previously extracted from a
    remote URI), with any leading anchor (e.g. `/` or a drive root) stripped
    so the result always lands inside the temp dir.

    Args:
        uri_or_path (:obj:`str`): Path (e.g. `"dataset/train"`) or remote URI
            (e.g. `"s3://my-bucket/dataset/train"`) to concatenate.

    Returns:
        :class:`Path`: the concatenated path (temp dir + path)
    """
    # Make the path relative by dropping its anchor; otherwise an absolute
    # path would override the temp-dir prefix when joined.
    relative_part = Path(uri_or_path)
    relative_part = relative_part.relative_to(relative_part.anchor)
    return Path(get_temporary_cache_files_directory()) / relative_part

@staticmethod
def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] = None) -> "Dataset":
"""
Expand Down Expand Up @@ -1034,8 +1051,7 @@ def load_from_disk(dataset_path: str, fs=None, keep_in_memory: Optional[bool] =

if is_remote_filesystem(fs):
src_dataset_path = extract_path_from_uri(dataset_path)
tmp_dir = get_temporary_cache_files_directory()
dataset_path = Path(tmp_dir, src_dataset_path)
dataset_path = Dataset._build_local_temp_path(src_dataset_path)
fs.download(src_dataset_path, dataset_path.as_posix(), recursive=True)

with open(
Expand Down
24 changes: 24 additions & 0 deletions tests/test_arrow_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from datasets.arrow_dataset import Dataset, transmit_format, update_metadata_with_features
from datasets.dataset_dict import DatasetDict
from datasets.features import Array2D, Array3D, ClassLabel, Features, Sequence, Value
from datasets.filesystems import extract_path_from_uri
from datasets.info import DatasetInfo
from datasets.splits import NamedSplit
from datasets.table import ConcatenationTable, InMemoryTable, MemoryMappedTable
Expand Down Expand Up @@ -2804,6 +2805,29 @@ def test_dummy_dataset_serialize_s3(s3, dataset):
assert dataset["id"][0] == 0


@pytest.mark.parametrize(
    "uri_or_path",
    [
        "relative/path",
        "/absolute/path",
        "s3://bucket/relative/path",
        "hdfs://relative/path",
        "hdfs:///absolute/path",
    ],
)
def test_build_local_temp_path(uri_or_path):
    # Strip any remote-fs scheme, then build the temp path from the remainder.
    extracted_path = extract_path_from_uri(uri_or_path)
    local_temp_path = Dataset._build_local_temp_path(extracted_path)
    path_posix = local_temp_path.as_posix()

    # The result must live under the temp dir, carry no scheme residue,
    # and end with (but not be rooted at) the extracted path.
    checks = (
        "tmp" in path_posix,
        "hdfs" not in path_posix,
        "s3" not in path_posix,
        not path_posix.startswith(extracted_path),
        path_posix.endswith(extracted_path),
    )
    assert all(checks), f"Local temp path: {local_temp_path.as_posix()}"


class TaskTemplatesTest(TestCase):
def test_task_text_classification(self):
labels = sorted(["pos", "neg"])
Expand Down

1 comment on commit 73ed661

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==3.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.083456 / 0.011353 (0.072103) 0.004486 / 0.011008 (-0.006522) 0.037017 / 0.038508 (-0.001491) 0.041550 / 0.023109 (0.018441) 0.355934 / 0.275898 (0.080036) 0.391467 / 0.323480 (0.067987) 0.099090 / 0.007986 (0.091104) 0.004632 / 0.004328 (0.000303) 0.010669 / 0.004250 (0.006419) 0.045643 / 0.037052 (0.008590) 0.349398 / 0.258489 (0.090909) 0.386518 / 0.293841 (0.092677) 0.100075 / 0.128546 (-0.028471) 0.010150 / 0.075646 (-0.065497) 0.302824 / 0.419271 (-0.116448) 0.053346 / 0.043533 (0.009813) 0.360494 / 0.255139 (0.105355) 0.385814 / 0.283200 (0.102615) 0.093483 / 0.141683 (-0.048200) 2.006723 / 1.452155 (0.554568) 2.102447 / 1.492716 (0.609730)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.261294 / 0.018006 (0.243288) 0.481148 / 0.000490 (0.480658) 0.004873 / 0.000200 (0.004673) 0.000103 / 0.000054 (0.000048)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.043003 / 0.037411 (0.005592) 0.025331 / 0.014526 (0.010805) 0.030422 / 0.176557 (-0.146135) 0.227114 / 0.737135 (-0.510022) 0.031621 / 0.296338 (-0.264717)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.491913 / 0.215209 (0.276704) 4.837987 / 2.077655 (2.760332) 2.058927 / 1.504120 (0.554807) 1.824324 / 1.541195 (0.283129) 1.905727 / 1.468490 (0.437237) 0.484709 / 4.584777 (-4.100068) 5.748164 / 3.745712 (2.002452) 2.379986 / 5.269862 (-2.889876) 1.007344 / 4.565676 (-3.558333) 0.057195 / 0.424275 (-0.367080) 0.013135 / 0.007607 (0.005528) 0.616642 / 0.226044 (0.390598) 6.140592 / 2.268929 (3.871664) 2.600819 / 55.444624 (-52.843805) 2.201352 / 6.876477 (-4.675125) 2.357745 / 2.142072 (0.215673) 0.624816 / 4.805227 (-4.180411) 0.135037 / 6.500664 (-6.365627) 0.067012 / 0.075469 (-0.008457)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.827103 / 1.841788 (-0.014684) 14.294101 / 8.074308 (6.219793) 30.893883 / 10.191392 (20.702491) 0.953652 / 0.680424 (0.273228) 0.622659 / 0.534201 (0.088458) 0.431622 / 0.579283 (-0.147662) 0.646931 / 0.434364 (0.212567) 0.303840 / 0.540337 (-0.236497) 0.309934 / 1.386936 (-1.077002)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.080951 / 0.011353 (0.069598) 0.004296 / 0.011008 (-0.006713) 0.034738 / 0.038508 (-0.003770) 0.039591 / 0.023109 (0.016482) 0.342720 / 0.275898 (0.066822) 0.381125 / 0.323480 (0.057645) 0.099680 / 0.007986 (0.091695) 0.005472 / 0.004328 (0.001143) 0.008561 / 0.004250 (0.004310) 0.048865 / 0.037052 (0.011812) 0.343882 / 0.258489 (0.085393) 0.387574 / 0.293841 (0.093733) 0.099369 / 0.128546 (-0.029177) 0.009827 / 0.075646 (-0.065819) 0.296346 / 0.419271 (-0.122926) 0.054068 / 0.043533 (0.010536) 0.348950 / 0.255139 (0.093811) 0.372394 / 0.283200 (0.089194) 0.091117 / 0.141683 (-0.050566) 2.112130 / 1.452155 (0.659975) 2.112627 / 1.492716 (0.619911)

Benchmark: benchmark_getitem_100B.json

metric get_batch_of_1024_random_rows get_batch_of_1024_rows get_first_row get_last_row
new / old (diff) 0.349771 / 0.018006 (0.331764) 0.519072 / 0.000490 (0.518582) 0.042636 / 0.000200 (0.042436) 0.000694 / 0.000054 (0.000639)

Benchmark: benchmark_indices_mapping.json

metric select shard shuffle sort train_test_split
new / old (diff) 0.039989 / 0.037411 (0.002578) 0.024662 / 0.014526 (0.010136) 0.030650 / 0.176557 (-0.145907) 0.238172 / 0.737135 (-0.498963) 0.031813 / 0.296338 (-0.264526)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.497189 / 0.215209 (0.281980) 4.848608 / 2.077655 (2.770954) 2.088782 / 1.504120 (0.584662) 1.833015 / 1.541195 (0.291820) 1.904825 / 1.468490 (0.436335) 0.481447 / 4.584777 (-4.103330) 5.832075 / 3.745712 (2.086363) 2.473656 / 5.269862 (-2.796205) 1.099798 / 4.565676 (-3.465879) 0.057914 / 0.424275 (-0.366361) 0.013033 / 0.007607 (0.005426) 0.621075 / 0.226044 (0.395030) 6.194744 / 2.268929 (3.925815) 2.611181 / 55.444624 (-52.833443) 2.175320 / 6.876477 (-4.701157) 2.272825 / 2.142072 (0.130753) 0.616162 / 4.805227 (-4.189065) 0.130652 / 6.500664 (-6.370012) 0.064658 / 0.075469 (-0.010811)

Benchmark: benchmark_map_filter.json

metric filter map fast-tokenizer batched map identity map identity batched map no-op batched map no-op batched numpy map no-op batched pandas map no-op batched pytorch map no-op batched tensorflow
new / old (diff) 1.813620 / 1.841788 (-0.028168) 13.809869 / 8.074308 (5.735561) 31.032232 / 10.191392 (20.840840) 0.952592 / 0.680424 (0.272169) 0.619119 / 0.534201 (0.084918) 0.427298 / 0.579283 (-0.151985) 0.623593 / 0.434364 (0.189229) 0.297434 / 0.540337 (-0.242904) 0.330026 / 1.386936 (-1.056910)

CML watermark

Please sign in to comment.