Skip to content

Commit

Permalink
Revert input_columns change (#5006)
Browse files (browse the repository at this point in the history)
revert input_columns change
  • Loading branch information
lhoestq committed Sep 21, 2022
1 parent cec23d5 commit 4889d5d
Show file tree
Hide file tree
Showing 2 changed files with 2 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/datasets/arrow_dataset.py
Expand Up @@ -2733,7 +2733,7 @@ def apply_function_on_filtered_inputs(inputs, indices, check_same_num_examples=F
validate_function_output(processed_inputs, indices)
if not update_data:
return None # Nothing to update, let's move on
- if self._format_type is not None or input_columns:
+ if self._format_type:
inputs = self._getitem(
key=(indices if isinstance(indices, int) else slice(indices[0], indices[-1] + 1)),
format_type=None,
Expand Down
4 changes: 1 addition & 3 deletions tests/test_arrow_dataset.py
Expand Up @@ -1362,14 +1362,12 @@ def test_map_input_columns(self, in_memory):
with tempfile.TemporaryDirectory() as tmp_dir:
with self._create_dummy_dataset(in_memory, tmp_dir, multiple_columns=True) as dset:
with dset.map(lambda col_1: {"label": col_1 % 2}, input_columns="col_1") as mapped_dset:
- self.assertEqual(mapped_dset[0].keys(), {"col_1", "col_2", "col_3", "label"})
+ self.assertEqual(mapped_dset[0].keys(), {"col_1", "label"})
self.assertEqual(
mapped_dset.features,
Features(
{
"col_1": Value("int64"),
- "col_2": Value("string"),
- "col_3": Value("bool"),
"label": Value("int64"),
}
),
Expand Down

1 comment on commit 4889d5d

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Show benchmarks

PyArrow==6.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.007365 / 0.011353 (-0.003987) 0.003451 / 0.011008 (-0.007557) 0.029884 / 0.038508 (-0.008624) 0.028993 / 0.023109 (0.005884) 0.298714 / 0.275898 (0.022816) 0.357807 / 0.323480 (0.034327) 0.005239 / 0.007986 (-0.002747) 0.002914 / 0.004328 (-0.001414) 0.006578 / 0.004250 (0.002328) 0.037384 / 0.037052 (0.000332) 0.310864 / 0.258489 (0.052375) 0.353125 / 0.293841 (0.059284) 0.028796 / 0.128546 (-0.099751) 0.009208 / 0.075646 (-0.066438) 0.247399 / 0.419271 (-0.171872) 0.044127 / 0.043533 (0.000594) 0.302885 / 0.255139 (0.047746) 0.330378 / 0.283200 (0.047178) 0.084423 / 0.141683 (-0.057260) 1.494902 / 1.452155 (0.042747) 1.535163 / 1.492716 (0.042447)

Benchmark: benchmark_getitem_100B.json

| metric | new | old | diff |
|---|---|---|---|
| get_batch_of_1024_random_rows | 0.205819 | 0.018006 | 0.187813 |
| get_batch_of_1024_rows | 0.410935 | 0.000490 | 0.410446 |
| get_first_row | 0.004631 | 0.000200 | 0.004431 |
| get_last_row | 0.000074 | 0.000054 | 0.000020 |

Benchmark: benchmark_indices_mapping.json

| metric | new | old | diff |
|---|---|---|---|
| select | 0.019628 | 0.037411 | -0.017783 |
| shard | 0.090115 | 0.014526 | 0.075590 |
| shuffle | 0.101849 | 0.176557 | -0.074708 |
| sort | 0.140150 | 0.737135 | -0.596985 |
| train_test_split | 0.103868 | 0.296338 | -0.192471 |

Benchmark: benchmark_iterating.json

| metric | new | old | diff |
|---|---|---|---|
| read 5000 | 0.405345 | 0.215209 | 0.190135 |
| read 50000 | 4.034912 | 2.077655 | 1.957257 |
| read_batch 50000 10 | 1.825393 | 1.504120 | 0.321273 |
| read_batch 50000 100 | 1.627150 | 1.541195 | 0.085955 |
| read_batch 50000 1000 | 1.640655 | 1.468490 | 0.172165 |
| read_formatted numpy 5000 | 0.435028 | 4.584777 | -4.149749 |
| read_formatted pandas 5000 | 3.441412 | 3.745712 | -0.304300 |
| read_formatted tensorflow 5000 | 2.920039 | 5.269862 | -2.349823 |
| read_formatted torch 5000 | 1.472848 | 4.565676 | -3.092828 |
| read_formatted_batch numpy 5000 10 | 0.052544 | 0.424275 | -0.371731 |
| read_formatted_batch numpy 5000 1000 | 0.010944 | 0.007607 | 0.003337 |
| shuffled read 5000 | 0.516537 | 0.226044 | 0.290493 |
| shuffled read 50000 | 5.224385 | 2.268929 | 2.955456 |
| shuffled read_batch 50000 10 | 2.270154 | 55.444624 | -53.174471 |
| shuffled read_batch 50000 100 | 1.936729 | 6.876477 | -4.939748 |
| shuffled read_batch 50000 1000 | 2.015754 | 2.142072 | -0.126318 |
| shuffled read_formatted numpy 5000 | 0.553698 | 4.805227 | -4.251529 |
| shuffled read_formatted_batch numpy 5000 10 | 0.116558 | 6.500664 | -6.384107 |
| shuffled read_formatted_batch numpy 5000 1000 | 0.062154 | 0.075469 | -0.013315 |

Benchmark: benchmark_map_filter.json

| metric | new | old | diff |
|---|---|---|---|
| filter | 1.539300 | 1.841788 | -0.302487 |
| map fast-tokenizer batched | 12.508641 | 8.074308 | 4.434333 |
| map identity | 26.052350 | 10.191392 | 15.860958 |
| map identity batched | 0.927668 | 0.680424 | 0.247245 |
| map no-op batched | 0.625272 | 0.534201 | 0.091071 |
| map no-op batched numpy | 0.343950 | 0.579283 | -0.235333 |
| map no-op batched pandas | 0.394236 | 0.434364 | -0.040128 |
| map no-op batched pytorch | 0.239840 | 0.540337 | -0.300497 |
| map no-op batched tensorflow | 0.238093 | 1.386936 | -1.148843 |
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.005470 / 0.011353 (-0.005883) 0.003556 / 0.011008 (-0.007453) 0.026832 / 0.038508 (-0.011676) 0.027638 / 0.023109 (0.004528) 0.384136 / 0.275898 (0.108238) 0.410879 / 0.323480 (0.087399) 0.003281 / 0.007986 (-0.004705) 0.004273 / 0.004328 (-0.000055) 0.004637 / 0.004250 (0.000386) 0.035574 / 0.037052 (-0.001478) 0.381333 / 0.258489 (0.122844) 0.405730 / 0.293841 (0.111889) 0.027321 / 0.128546 (-0.101225) 0.009386 / 0.075646 (-0.066260) 0.249979 / 0.419271 (-0.169293) 0.046797 / 0.043533 (0.003265) 0.397395 / 0.255139 (0.142256) 0.387038 / 0.283200 (0.103838) 0.088038 / 0.141683 (-0.053645) 1.493803 / 1.452155 (0.041648) 1.525629 / 1.492716 (0.032913)

Benchmark: benchmark_getitem_100B.json

| metric | new | old | diff |
|---|---|---|---|
| get_batch_of_1024_random_rows | 0.231094 | 0.018006 | 0.213087 |
| get_batch_of_1024_rows | 0.411390 | 0.000490 | 0.410901 |
| get_first_row | 0.001160 | 0.000200 | 0.000960 |
| get_last_row | 0.000077 | 0.000054 | 0.000023 |

Benchmark: benchmark_indices_mapping.json

| metric | new | old | diff |
|---|---|---|---|
| select | 0.020337 | 0.037411 | -0.017074 |
| shard | 0.091713 | 0.014526 | 0.077187 |
| shuffle | 0.102475 | 0.176557 | -0.074082 |
| sort | 0.141958 | 0.737135 | -0.595178 |
| train_test_split | 0.104402 | 0.296338 | -0.191937 |

Benchmark: benchmark_iterating.json

| metric | new | old | diff |
|---|---|---|---|
| read 5000 | 0.471597 | 0.215209 | 0.256388 |
| read 50000 | 4.697288 | 2.077655 | 2.619634 |
| read_batch 50000 10 | 2.439847 | 1.504120 | 0.935727 |
| read_batch 50000 100 | 2.235656 | 1.541195 | 0.694462 |
| read_batch 50000 1000 | 2.283188 | 1.468490 | 0.814698 |
| read_formatted numpy 5000 | 0.448324 | 4.584777 | -4.136453 |
| read_formatted pandas 5000 | 3.394838 | 3.745712 | -0.350874 |
| read_formatted tensorflow 5000 | 1.841039 | 5.269862 | -3.428822 |
| read_formatted torch 5000 | 1.106870 | 4.565676 | -3.458806 |
| read_formatted_batch numpy 5000 10 | 0.053220 | 0.424275 | -0.371055 |
| read_formatted_batch numpy 5000 1000 | 0.010809 | 0.007607 | 0.003202 |
| shuffled read 5000 | 0.579737 | 0.226044 | 0.353693 |
| shuffled read 50000 | 5.821869 | 2.268929 | 3.552940 |
| shuffled read_batch 50000 10 | 2.921302 | 55.444624 | -52.523322 |
| shuffled read_batch 50000 100 | 2.595409 | 6.876477 | -4.281068 |
| shuffled read_batch 50000 1000 | 2.682210 | 2.142072 | 0.540138 |
| shuffled read_formatted numpy 5000 | 0.558393 | 4.805227 | -4.246834 |
| shuffled read_formatted_batch numpy 5000 10 | 0.119654 | 6.500664 | -6.381010 |
| shuffled read_formatted_batch numpy 5000 1000 | 0.065004 | 0.075469 | -0.010465 |

Benchmark: benchmark_map_filter.json

| metric | new | old | diff |
|---|---|---|---|
| filter | 1.570809 | 1.841788 | -0.270979 |
| map fast-tokenizer batched | 12.451048 | 8.074308 | 4.376740 |
| map identity | 26.195375 | 10.191392 | 16.003983 |
| map identity batched | 0.923602 | 0.680424 | 0.243179 |
| map no-op batched | 0.640484 | 0.534201 | 0.106283 |
| map no-op batched numpy | 0.345901 | 0.579283 | -0.233382 |
| map no-op batched pandas | 0.404261 | 0.434364 | -0.030103 |
| map no-op batched pytorch | 0.238112 | 0.540337 | -0.302226 |
| map no-op batched tensorflow | 0.246128 | 1.386936 | -1.140808 |

CML watermark

Please sign in to comment.