Commit a877fff

update format, fingerprint and indices after add_item

lhoestq committed Apr 23, 2021
1 parent 1f83a89 commit a877fff

Showing 2 changed files with 34 additions and 10 deletions.
src/datasets/arrow_dataset.py (18 changes: 16 additions & 2 deletions)

```diff
@@ -2858,7 +2858,9 @@ def add_elasticsearch_index(
         )
         return self
 
-    def add_item(self, item: dict):
+    @transmit_format
+    @fingerprint_transform(inplace=False)
+    def add_item(self, item: dict, new_fingerprint: str):
         """Add item to Dataset.
 
         .. versionadded:: 1.6
@@ -2875,7 +2877,19 @@ def add_item(self, item: dict):
             item_table = item_table.cast(schema)
         # Concatenate tables
         table = concat_tables([self._data, item_table])
-        return Dataset(table)
+        if self._indices is None:
+            indices_table = None
+        else:
+            new_indices_array = pa.array([len(self._data)], type=pa.uint64())
+            new_indices_table = InMemoryTable.from_arrays([new_indices_array], names=["indices"])
+            indices_table = concat_tables([self._indices, new_indices_table])
+        return Dataset(
+            table,
+            info=copy.deepcopy(self.info),
+            split=self.split,
+            indices_table=indices_table,
+            fingerprint=new_fingerprint,
+        )
 
 
 def concatenate_datasets(
```
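With this change, `add_item` transmits the dataset's output format, derives a fresh fingerprint, and extends the indices mapping when one exists. A minimal usage sketch of that behavior (assuming `datasets` >= 1.6; the column names and values here are illustrative, not from the library):

```python
# Minimal sketch of the behavior this commit adds (assumes datasets >= 1.6).
from datasets import Dataset

ds = Dataset.from_dict({"col_1": ["a", "b"], "col_2": [1, 2]})
ds = ds.shuffle(seed=42)       # creates an indices mapping over the Arrow table
ds = ds.with_format("pandas")  # sets an output format

new_ds = ds.add_item({"col_1": "c", "col_2": 3})

assert new_ds.format["type"] == "pandas"       # @transmit_format carries the format over
assert new_ds._fingerprint != ds._fingerprint  # @fingerprint_transform assigns a new fingerprint
assert len(new_ds) == len(ds) + 1              # the indices mapping covers the appended row
```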
tests/test_arrow_dataset.py (26 changes: 18 additions & 8 deletions)

```diff
@@ -1948,6 +1948,10 @@ def test_concatenate_datasets_duplicate_columns(dataset):
     assert "duplicated" in str(excinfo.value)
 
 
+@pytest.mark.parametrize(
+    "transform",
+    [None, ("shuffle", (42,), {}), ("with_format", ("pandas",), {}), ("class_encode_column", ("col_2",), {})],
+)
 @pytest.mark.parametrize("in_memory", [False, True])
 @pytest.mark.parametrize(
     "item",
@@ -1958,22 +1962,28 @@ def test_concatenate_datasets_duplicate_columns(dataset):
         {"col_1": 4.0, "col_2": 4.0, "col_3": 4.0},
     ],
 )
-def test_dataset_add_item(item, in_memory, dataset_dict, arrow_path):
-    dataset = (
+def test_dataset_add_item(item, in_memory, dataset_dict, arrow_path, transform):
+    dataset_to_test = (
         Dataset(InMemoryTable.from_pydict(dataset_dict))
         if in_memory
         else Dataset(MemoryMappedTable.from_file(arrow_path))
     )
-    dataset = dataset.add_item(item)
+    if transform is not None:
+        transform_name, args, kwargs = transform
+        dataset_to_test: Dataset = getattr(dataset_to_test, transform_name)(*args, **kwargs)
+    dataset = dataset_to_test.add_item(item)
     assert dataset.data.shape == (5, 3)
-    expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
+    expected_features = dataset_to_test.features
     assert dataset.data.column_names == list(expected_features.keys())
     for feature, expected_dtype in expected_features.items():
-        assert dataset.features[feature].dtype == expected_dtype
-    assert len(dataset.data.blocks) == 1 if in_memory else 2  # multiple InMemoryTables are consolidated as one
-    dataset = dataset.add_item(item)
-    assert dataset.data.shape == (6, 3)
+        assert dataset.features[feature] == expected_dtype
+    assert len(dataset.data.blocks) == 1 if in_memory else 2  # multiple InMemoryTables are consolidated as one
+    assert dataset.format["type"] == dataset_to_test.format["type"]
+    assert dataset._fingerprint != dataset_to_test._fingerprint
+    dataset.reset_format()
+    dataset_to_test.reset_format()
+    assert dataset[:-1] == dataset_to_test[:]
+    assert {k: int(v) for k, v in dataset[-1].items()} == {k: int(v) for k, v in item.items()}
 
 
 @pytest.mark.parametrize("keep_in_memory", [False, True])
```
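The subtle part the test exercises is the indices bookkeeping: the new row is always appended at the end of the underlying Arrow table, so when an indices mapping exists (e.g. after `shuffle`), `add_item` appends position `len(self._data)` to that mapping, which is why `dataset[:-1] == dataset_to_test[:]` holds even for a shuffled dataset. A standalone sketch of that arithmetic with pyarrow (hypothetical data, not library code):

```python
# Standalone sketch of the indices bookkeeping in add_item, using pyarrow
# directly; the data here is hypothetical.
import pyarrow as pa

data = pa.table({"col_1": ["a", "b", "c"]})  # underlying rows, in storage order
# An indices mapping, e.g. produced by a shuffle: row i of the dataset view
# is row indices[i] of the data table.
indices = pa.table({"indices": pa.array([2, 0, 1], type=pa.uint64())})

# The new item lands at the end of the data table, so its position is len(data).
new_index = pa.table({"indices": pa.array([len(data)], type=pa.uint64())})
indices = pa.concat_tables([indices, new_index])

# The view now ends with the freshly added row.
assert indices["indices"].to_pylist() == [2, 0, 1, 3]
```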

1 comment on commit a877fff

@github-actions

Show benchmarks

PyArrow==1.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.025207 / 0.011353 (0.013854) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.017460 / 0.011008 (0.006451) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.052445 / 0.038508 (0.013937) |
| read_batch_unformated after write_array2d | 0.041685 / 0.023109 (0.018576) |
| read_batch_unformated after write_flattened_sequence | 0.390150 / 0.275898 (0.114252) |
| read_batch_unformated after write_nested_sequence | 0.422553 / 0.323480 (0.099073) |
| read_col_formatted_as_numpy after write_array2d | 0.012169 / 0.007986 (0.004184) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005554 / 0.004328 (0.001225) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.011805 / 0.004250 (0.007554) |
| read_col_unformated after write_array2d | 0.054225 / 0.037052 (0.017173) |
| read_col_unformated after write_flattened_sequence | 0.373073 / 0.258489 (0.114583) |
| read_col_unformated after write_nested_sequence | 0.426951 / 0.293841 (0.133110) |
| read_formatted_as_numpy after write_array2d | 0.174344 / 0.128546 (0.045798) |
| read_formatted_as_numpy after write_flattened_sequence | 0.141721 / 0.075646 (0.066075) |
| read_formatted_as_numpy after write_nested_sequence | 0.472948 / 0.419271 (0.053677) |
| read_unformated after write_array2d | 0.669354 / 0.043533 (0.625821) |
| read_unformated after write_flattened_sequence | 0.409316 / 0.255139 (0.154177) |
| read_unformated after write_nested_sequence | 0.403262 / 0.283200 (0.120062) |
| write_array2d | 2.221655 / 0.141683 (2.079972) |
| write_flattened_sequence | 2.055332 / 1.452155 (0.603177) |
| write_nested_sequence | 2.074828 / 1.492716 (0.582112) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.018300 / 0.018006 (0.000294) |
| get_batch_of_1024_rows | 0.000696 / 0.000490 (0.000207) |
| get_first_row | 0.000194 / 0.000200 (-0.000006) |
| get_last_row | 0.000062 / 0.000054 (0.000008) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.052616 / 0.037411 (0.015205) |
| shard | 0.024205 / 0.014526 (0.009679) |
| shuffle | 0.036316 / 0.176557 (-0.140241) |
| sort | 0.048287 / 0.737135 (-0.688848) |
| train_test_split | 0.034926 / 0.296338 (-0.261412) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.513954 / 0.215209 (0.298745) |
| read 50000 | 5.267366 / 2.077655 (3.189712) |
| read_batch 50000 10 | 2.411089 / 1.504120 (0.906969) |
| read_batch 50000 100 | 2.100485 / 1.541195 (0.559290) |
| read_batch 50000 1000 | 2.105564 / 1.468490 (0.637074) |
| read_formatted numpy 5000 | 7.512495 / 4.584777 (2.927718) |
| read_formatted pandas 5000 | 6.843566 / 3.745712 (3.097853) |
| read_formatted tensorflow 5000 | 9.318846 / 5.269862 (4.048985) |
| read_formatted torch 5000 | 8.350672 / 4.565676 (3.784996) |
| read_formatted_batch numpy 5000 10 | 0.755945 / 0.424275 (0.331669) |
| read_formatted_batch numpy 5000 1000 | 0.012106 / 0.007607 (0.004499) |
| shuffled read 5000 | 0.655972 / 0.226044 (0.429927) |
| shuffled read 50000 | 6.553674 / 2.268929 (4.284746) |
| shuffled read_batch 50000 10 | 3.537356 / 55.444624 (-51.907269) |
| shuffled read_batch 50000 100 | 2.993846 / 6.876477 (-3.882631) |
| shuffled read_batch 50000 1000 | 3.084085 / 2.142072 (0.942013) |
| shuffled read_formatted numpy 5000 | 7.709649 / 4.805227 (2.904422) |
| shuffled read_formatted_batch numpy 5000 10 | 6.389187 / 6.500664 (-0.111477) |
| shuffled read_formatted_batch numpy 5000 1000 | 8.282616 / 0.075469 (8.207147) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 12.613240 / 1.841788 (10.771452) |
| map fast-tokenizer batched | 14.786437 / 8.074308 (6.712129) |
| map identity | 43.161095 / 10.191392 (32.969703) |
| map identity batched | 0.976766 / 0.680424 (0.296342) |
| map no-op batched | 0.646842 / 0.534201 (0.112641) |
| map no-op batched numpy | 0.835950 / 0.579283 (0.256667) |
| map no-op batched pandas | 0.674547 / 0.434364 (0.240183) |
| map no-op batched pytorch | 0.776304 / 0.540337 (0.235966) |
| map no-op batched tensorflow | 1.678382 / 1.386936 (0.291446) |
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

| metric | new / old (diff) |
|---|---|
| read_batch_formatted_as_numpy after write_array2d | 0.024544 / 0.011353 (0.013191) |
| read_batch_formatted_as_numpy after write_flattened_sequence | 0.016914 / 0.011008 (0.005905) |
| read_batch_formatted_as_numpy after write_nested_sequence | 0.052221 / 0.038508 (0.013713) |
| read_batch_unformated after write_array2d | 0.039571 / 0.023109 (0.016462) |
| read_batch_unformated after write_flattened_sequence | 0.368835 / 0.275898 (0.092937) |
| read_batch_unformated after write_nested_sequence | 0.391036 / 0.323480 (0.067556) |
| read_col_formatted_as_numpy after write_array2d | 0.012133 / 0.007986 (0.004147) |
| read_col_formatted_as_numpy after write_flattened_sequence | 0.005140 / 0.004328 (0.000811) |
| read_col_formatted_as_numpy after write_nested_sequence | 0.011607 / 0.004250 (0.007357) |
| read_col_unformated after write_array2d | 0.062131 / 0.037052 (0.025079) |
| read_col_unformated after write_flattened_sequence | 0.354150 / 0.258489 (0.095661) |
| read_col_unformated after write_nested_sequence | 0.398131 / 0.293841 (0.104290) |
| read_formatted_as_numpy after write_array2d | 0.178621 / 0.128546 (0.050075) |
| read_formatted_as_numpy after write_flattened_sequence | 0.137131 / 0.075646 (0.061484) |
| read_formatted_as_numpy after write_nested_sequence | 0.460031 / 0.419271 (0.040759) |
| read_unformated after write_array2d | 0.473246 / 0.043533 (0.429713) |
| read_unformated after write_flattened_sequence | 0.366755 / 0.255139 (0.111616) |
| read_unformated after write_nested_sequence | 0.396735 / 0.283200 (0.113535) |
| write_array2d | 1.821190 / 0.141683 (1.679507) |
| write_flattened_sequence | 1.976736 / 1.452155 (0.524582) |
| write_nested_sequence | 2.033133 / 1.492716 (0.540417) |

Benchmark: benchmark_getitem_100B.json

| metric | new / old (diff) |
|---|---|
| get_batch_of_1024_random_rows | 0.018820 / 0.018006 (0.000814) |
| get_batch_of_1024_rows | 0.000519 / 0.000490 (0.000030) |
| get_first_row | 0.000196 / 0.000200 (-0.000004) |
| get_last_row | 0.000062 / 0.000054 (0.000007) |

Benchmark: benchmark_indices_mapping.json

| metric | new / old (diff) |
|---|---|
| select | 0.042171 / 0.037411 (0.004760) |
| shard | 0.023886 / 0.014526 (0.009360) |
| shuffle | 0.029554 / 0.176557 (-0.147003) |
| sort | 0.048379 / 0.737135 (-0.688757) |
| train_test_split | 0.030540 / 0.296338 (-0.265799) |

Benchmark: benchmark_iterating.json

| metric | new / old (diff) |
|---|---|
| read 5000 | 0.512972 / 0.215209 (0.297763) |
| read 50000 | 5.086536 / 2.077655 (3.008881) |
| read_batch 50000 10 | 2.246412 / 1.504120 (0.742292) |
| read_batch 50000 100 | 1.961951 / 1.541195 (0.420756) |
| read_batch 50000 1000 | 1.956855 / 1.468490 (0.488365) |
| read_formatted numpy 5000 | 7.310182 / 4.584777 (2.725405) |
| read_formatted pandas 5000 | 6.591374 / 3.745712 (2.845662) |
| read_formatted tensorflow 5000 | 9.194191 / 5.269862 (3.924330) |
| read_formatted torch 5000 | 8.048809 / 4.565676 (3.483132) |
| read_formatted_batch numpy 5000 10 | 0.749657 / 0.424275 (0.325382) |
| read_formatted_batch numpy 5000 1000 | 0.011355 / 0.007607 (0.003748) |
| shuffled read 5000 | 0.641688 / 0.226044 (0.415644) |
| shuffled read 50000 | 6.494086 / 2.268929 (4.225158) |
| shuffled read_batch 50000 10 | 3.539842 / 55.444624 (-51.904783) |
| shuffled read_batch 50000 100 | 2.989907 / 6.876477 (-3.886570) |
| shuffled read_batch 50000 1000 | 3.011933 / 2.142072 (0.869861) |
| shuffled read_formatted numpy 5000 | 7.545490 / 4.805227 (2.740263) |
| shuffled read_formatted_batch numpy 5000 10 | 5.658451 / 6.500664 (-0.842213) |
| shuffled read_formatted_batch numpy 5000 1000 | 9.183832 / 0.075469 (9.108363) |

Benchmark: benchmark_map_filter.json

| metric | new / old (diff) |
|---|---|
| filter | 12.296886 / 1.841788 (10.455098) |
| map fast-tokenizer batched | 14.584541 / 8.074308 (6.510233) |
| map identity | 41.037014 / 10.191392 (30.845622) |
| map identity batched | 0.924479 / 0.680424 (0.244056) |
| map no-op batched | 0.627755 / 0.534201 (0.093554) |
| map no-op batched numpy | 0.837541 / 0.579283 (0.258258) |
| map no-op batched pandas | 0.681707 / 0.434364 (0.247343) |
| map no-op batched pytorch | 0.752561 / 0.540337 (0.212224) |
| map no-op batched tensorflow | 1.656765 / 1.386936 (0.269829) |
