Fix filter indices when batched (#5113)
* Test filter indices

* Fix filter indices when batched

* Rename test
albertvillanova committed Oct 14, 2022
1 parent 3ad9644 commit d60f5ff
Showing 2 changed files with 7 additions and 1 deletion.
src/datasets/arrow_dataset.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -2961,7 +2961,7 @@ def init_buffer_and_writer():
                     else:
                         writer.write(example)
             else:
-                for i, batch in enumerate(pbar):
+                for i, batch in zip(range(0, num_rows, batch_size), pbar):
                     indices = list(
                         range(*(slice(i, i + batch_size).indices(input_dataset.num_rows)))
                     )  # Something simpler?
```
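For context on the one-line change: with `enumerate(pbar)`, `i` counts batches (0, 1, 2, ...) rather than row offsets, so `slice(i, i + batch_size)` drifts off the true row positions as soon as `batch_size > 1`; stepping `i` by `batch_size` keeps it aligned with the first row of each batch. A minimal standalone sketch of that index arithmetic (illustration only, using the 4-row / 2-per-batch shape from the new test below, not the library's internal code):

```python
# Standalone illustration of the index computation in the hunk above.
num_rows, batch_size = 4, 2
batches = list(range(0, num_rows, batch_size))  # stand-in for the batches behind `pbar`

# Old behaviour: i is the batch counter, not a row offset.
old = [list(range(*slice(i, i + batch_size).indices(num_rows)))
       for i, _batch in enumerate(batches)]
# old == [[0, 1], [1, 2]]  -> second batch is mapped to rows 1-2 instead of 2-3

# Fixed behaviour: i advances by batch_size, so it is the first row of each batch.
new = [list(range(*slice(i, i + batch_size).indices(num_rows)))
       for i, _batch in zip(range(0, num_rows, batch_size), batches)]
# new == [[0, 1], [2, 3]]
```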
tests/test_arrow_dataset.py (6 changes: 6 additions & 0 deletions)

```diff
@@ -3081,6 +3081,12 @@ def test_dataset_add_item_introduce_feature_type():
     assert dataset[:] == {"col_1": [None, None, None, "a"]}
 
 
+def test_dataset_filter_batched_indices():
+    ds = Dataset.from_dict({"num": [0, 1, 2, 3]})
+    ds = ds.filter(lambda num: num % 2 == 0, input_columns="num", batch_size=2)
+    assert all(item["num"] % 2 == 0 for item in ds)
+
+
 @pytest.mark.parametrize("in_memory", [False, True])
 def test_dataset_from_file(in_memory, dataset, arrow_file):
     filename = arrow_file
```
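As a usage sketch of what the new test checks (the call is copied from the test; the printed result is reasoned from the even/odd predicate, so treat it as an expectation rather than captured output):

```python
from datasets import Dataset

ds = Dataset.from_dict({"num": [0, 1, 2, 3]})
filtered = ds.filter(lambda num: num % 2 == 0, input_columns="num", batch_size=2)

# With the corrected batch offsets, the second batch's keep-mask (computed on
# rows 2-3) is applied to rows 2-3 themselves, so only even values remain.
print(filtered["num"])  # expected: [0, 2]
```

Under the old indexing, that second mask would have been applied starting at row 1, which is exactly the off-by-batch error the assertion `all(item["num"] % 2 == 0 for item in ds)` guards against.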

1 comment on commit d60f5ff

@github-actions

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.009487 / 0.011353 (-0.001866) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005009 / 0.011008 (-0.006000) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.102165 / 0.038508 (0.063657) |
| read_batch_unformated after write_array2d | 0.035013 / 0.023109 (0.011904) |
| read_batch_unformated after write_flattened_sequence | 0.305510 / 0.275898 (0.029612) |
| read_batch_unformated after write_nested_sequence | 0.349633 / 0.323480 (0.026153) |
| read_col_formatted_as_numpy after write_array2d | 0.007507 / 0.007986 (-0.000479) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.003978 / 0.004328 (-0.000350) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.078873 / 0.004250 (0.074623) |
| read_col_unformated after write_array2d | 0.042072 / 0.037052 (0.005020) |
| read_col_unformated after write_flattened_sequence | 0.302243 / 0.258489 (0.043754) |
| read_col_unformated after write_nested_sequence | 0.338429 / 0.293841 (0.044588) |
| read_formatted_as_numpy after write_array2d | 0.042174 / 0.128546 (-0.086373) |
| read_formatted_as_numpy after write_flattened_sequence | 0.015242 / 0.075646 (-0.060404) |
| read_formatted_as_numpy after write_nested_sequence | 0.340286 / 0.419271 (-0.078986) |
| read_unformated after write_array2d | 0.050401 / 0.043533 (0.006868) |
| read_unformated after write_flattened_sequence | 0.295157 / 0.255139 (0.040018) |
| read_unformated after write_nested_sequence | 0.318938 / 0.283200 (0.035738) |
| write_array2d | 0.102057 / 0.141683 (-0.039626) |
| write_flattened_sequence | 1.537195 / 1.452155 (0.085041) |
| write_nested_sequence | 1.596300 / 1.492716 (0.103584) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.010585 / 0.018006 (-0.007422) |
| get_batch_of_1024_rows | 0.445262 / 0.000490 (0.444772) |
| get_first_row | 0.004462 / 0.000200 (0.004262) |
| get_last_row | 0.000078 / 0.000054 (0.000024) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.021765 / 0.037411 (-0.015647) |
| shard | 0.099355 / 0.014526 (0.084829) |
| shuffle | 0.109730 / 0.176557 (-0.066826) |
| sort | 0.151286 / 0.737135 (-0.585850) |
| train_test_split | 0.113706 / 0.296338 (-0.182633) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.396023 / 0.215209 (0.180814) |
| read 50000 | 3.949905 / 2.077655 (1.872251) |
| read_batch 50000 10 | 1.758432 / 1.504120 (0.254313) |
| read_batch 50000 100 | 1.570835 / 1.541195 (0.029640) |
| read_batch 50000 1000 | 1.621656 / 1.468490 (0.153166) |
| read_formatted numpy 5000 | 0.681074 / 4.584777 (-3.903703) |
| read_formatted pandas 5000 | 3.725628 / 3.745712 (-0.020084) |
| read_formatted tensorflow 5000 | 2.022575 / 5.269862 (-3.247287) |
| read_formatted torch 5000 | 1.294247 / 4.565676 (-3.271430) |
| read_formatted_batch numpy 5000 10 | 0.083302 / 0.424275 (-0.340973) |
| read_formatted_batch numpy 5000 1000 | 0.011685 / 0.007607 (0.004078) |
| shuffled read 5000 | 0.503189 / 0.226044 (0.277145) |
| shuffled read 50000 | 5.027661 / 2.268929 (2.758732) |
| shuffled read_batch 50000 10 | 2.229232 / 55.444624 (-53.215392) |
| shuffled read_batch 50000 100 | 1.901913 / 6.876477 (-4.974563) |
| shuffled read_batch 50000 1000 | 2.025664 / 2.142072 (-0.116408) |
| shuffled read_formatted numpy 5000 | 0.835958 / 4.805227 (-3.969269) |
| shuffled read_formatted_batch numpy 5000 10 | 0.166055 / 6.500664 (-6.334609) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062192 / 0.075469 (-0.013277) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.484218 / 1.841788 (-0.357569) |
| map fast-tokenizer batched | 13.349571 / 8.074308 (5.275263) |
| map identity | 24.703538 / 10.191392 (14.512146) |
| map identity batched | 0.858426 / 0.680424 (0.178002) |
| map no-op batched | 0.553432 / 0.534201 (0.019231) |
| map no-op batched numpy | 0.437619 / 0.579283 (-0.141664) |
| map no-op batched pandas | 0.424830 / 0.434364 (-0.009534) |
| map no-op batched pytorch | 0.277975 / 0.540337 (-0.262362) |
| map no-op batched tensorflow | 0.273927 / 1.386936 (-1.113009) |
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.007066 / 0.011353 (-0.004287) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.005118 / 0.011008 (-0.005890) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.096736 / 0.038508 (0.058228) |
| read_batch_unformated after write_array2d | 0.032702 / 0.023109 (0.009593) |
| read_batch_unformated after write_flattened_sequence | 0.351971 / 0.275898 (0.076073) |
| read_batch_unformated after write_nested_sequence | 0.406183 / 0.323480 (0.082703) |
| read_col_formatted_as_numpy after write_array2d | 0.005592 / 0.007986 (-0.002394) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.004098 / 0.004328 (-0.000231) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.074124 / 0.004250 (0.069874) |
| read_col_unformated after write_array2d | 0.039094 / 0.037052 (0.002041) |
| read_col_unformated after write_flattened_sequence | 0.361349 / 0.258489 (0.102860) |
| read_col_unformated after write_nested_sequence | 0.402415 / 0.293841 (0.108575) |
| read_formatted_as_numpy after write_array2d | 0.039134 / 0.128546 (-0.089413) |
| read_formatted_as_numpy after write_flattened_sequence | 0.012282 / 0.075646 (-0.063365) |
| read_formatted_as_numpy after write_nested_sequence | 0.333995 / 0.419271 (-0.085277) |
| read_unformated after write_array2d | 0.069100 / 0.043533 (0.025567) |
| read_unformated after write_flattened_sequence | 0.358314 / 0.255139 (0.103175) |
| read_unformated after write_nested_sequence | 0.362756 / 0.283200 (0.079556) |
| write_array2d | 0.102422 / 0.141683 (-0.039261) |
| write_flattened_sequence | 1.444130 / 1.452155 (-0.008024) |
| write_nested_sequence | 1.537449 / 1.492716 (0.044732) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.223694 / 0.018006 (0.205688) |
| get_batch_of_1024_rows | 0.436049 / 0.000490 (0.435559) |
| get_first_row | 0.002036 / 0.000200 (0.001837) |
| get_last_row | 0.000081 / 0.000054 (0.000027) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.023901 / 0.037411 (-0.013510) |
| shard | 0.102800 / 0.014526 (0.088274) |
| shuffle | 0.113500 / 0.176557 (-0.063056) |
| sort | 0.155505 / 0.737135 (-0.581630) |
| train_test_split | 0.119523 / 0.296338 (-0.176816) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.441898 / 0.215209 (0.226689) |
| read 50000 | 4.403923 / 2.077655 (2.326268) |
| read_batch 50000 10 | 2.240364 / 1.504120 (0.736244) |
| read_batch 50000 100 | 2.039983 / 1.541195 (0.498789) |
| read_batch 50000 1000 | 2.036279 / 1.468490 (0.567789) |
| read_formatted numpy 5000 | 0.709367 / 4.584777 (-3.875410) |
| read_formatted pandas 5000 | 3.766417 / 3.745712 (0.020705) |
| read_formatted tensorflow 5000 | 2.067296 / 5.269862 (-3.202565) |
| read_formatted torch 5000 | 1.332714 / 4.565676 (-3.232963) |
| read_formatted_batch numpy 5000 10 | 0.087870 / 0.424275 (-0.336405) |
| read_formatted_batch numpy 5000 1000 | 0.012215 / 0.007607 (0.004607) |
| shuffled read 5000 | 0.538309 / 0.226044 (0.312264) |
| shuffled read 50000 | 5.399288 / 2.268929 (3.130360) |
| shuffled read_batch 50000 10 | 2.650876 / 55.444624 (-52.793749) |
| shuffled read_batch 50000 100 | 2.329649 / 6.876477 (-4.546827) |
| shuffled read_batch 50000 1000 | 2.408317 / 2.142072 (0.266245) |
| shuffled read_formatted numpy 5000 | 0.851201 / 4.805227 (-3.954026) |
| shuffled read_formatted_batch numpy 5000 10 | 0.168194 / 6.500664 (-6.332470) |
| shuffled read_formatted_batch numpy 5000 1000 | 0.061773 / 0.075469 (-0.013696) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 1.539742 / 1.841788 (-0.302046) |
| map fast-tokenizer batched | 13.450716 / 8.074308 (5.376408) |
| map identity | 12.475132 / 10.191392 (2.283740) |
| map identity batched | 0.919183 / 0.680424 (0.238759) |
| map no-op batched | 0.609618 / 0.534201 (0.075417) |
| map no-op batched numpy | 0.418553 / 0.579283 (-0.160730) |
| map no-op batched pandas | 0.475985 / 0.434364 (0.041621) |
| map no-op batched pytorch | 0.263449 / 0.540337 (-0.276889) |
| map no-op batched tensorflow | 0.278185 / 1.386936 (-1.108751) |

