Implement Dataset add_item (#1870)
* Test Dataset.add_item

* Implement Dataset.add_item

* tmp

* Use InMemoryTable for new item

* Add dataset_dict and arrow_path for tests

* Fix test Dataset.add_item

* Add docstring

* Return new Dataset

* Fix test with returned new dataset

* Test multiple InMemoryTables are consolidated

* Test for consolidated InMemoryTables after multiple calls

* Add versionadded to docstring

* Add method docstring to the docs

* Simplify cast schema

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>

albertvillanova and lhoestq committed Apr 23, 2021
1 parent d843090 commit 1f83a89
Showing 3 changed files with 50 additions and 1 deletion.
4 changes: 3 additions & 1 deletion docs/source/package_reference/main_classes.rst
@@ -14,7 +14,9 @@ Main classes
The base class :class:`datasets.Dataset` implements a Dataset backed by an Apache Arrow table.

.. autoclass:: datasets.Dataset
-    :members: from_file, from_buffer, from_pandas, from_dict,
+    :members:
+        add_item,
+        from_file, from_buffer, from_pandas, from_dict,
         data, cache_files, num_columns, num_rows, column_names, shape,
         unique,
         flatten_, cast_, remove_columns_, rename_column_,
19 changes: 19 additions & 0 deletions src/datasets/arrow_dataset.py
@@ -2858,6 +2858,25 @@ def add_elasticsearch_index(
        )
        return self

    def add_item(self, item: dict):
        """Add item to Dataset.

        .. versionadded:: 1.6

        Args:
            item (dict): Item data to be added.

        Returns:
            :class:`Dataset`
        """
        item_table = InMemoryTable.from_pydict({k: [v] for k, v in item.items()})
        # Cast the one-row item table to the dataset's schema
        schema = pa.schema(self.features.type)
        item_table = item_table.cast(schema)
        # Concatenate the existing data with the new row and wrap the result in a new Dataset
        table = concat_tables([self._data, item_table])
        return Dataset(table)


def concatenate_datasets(
dsets: List[Dataset],
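The core flow of the new method (wrap each scalar into a one-element column, cast the resulting one-row table to the existing schema, then concatenate) can be sketched without pyarrow. This is a minimal, hypothetical stand-in, not the `datasets` API: here a "table" is just a dict of column name to list of values, and `add_row` mimics `add_item`:

```python
def add_row(table: dict, item: dict) -> dict:
    """Return a new table with `item` appended as a single row (the input is untouched)."""
    # Wrap each scalar in a one-element list, mirroring InMemoryTable.from_pydict({k: [v], ...})
    item_table = {k: [v] for k, v in item.items()}
    # "Cast" the item to the table's schema: coerce each value to the column's existing type
    schema = {k: type(v[0]) for k, v in table.items()}
    item_table = {k: [schema[k](v[0])] for k, v in item_table.items()}
    # Concatenate column-wise and return a new table, as add_item returns a new Dataset
    return {k: table[k] + item_table[k] for k in table}
```

As in the real method, a mismatched item type (e.g. a float added to an int column) is coerced by the cast step rather than changing the dataset's schema.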
28 changes: 28 additions & 0 deletions tests/test_arrow_dataset.py
@@ -1948,6 +1948,34 @@ def test_concatenate_datasets_duplicate_columns(dataset):
    assert "duplicated" in str(excinfo.value)


@pytest.mark.parametrize("in_memory", [False, True])
@pytest.mark.parametrize(
    "item",
    [
        {"col_1": "4", "col_2": 4, "col_3": 4.0},
        {"col_1": "4", "col_2": "4", "col_3": "4"},
        {"col_1": 4, "col_2": 4, "col_3": 4},
        {"col_1": 4.0, "col_2": 4.0, "col_3": 4.0},
    ],
)
def test_dataset_add_item(item, in_memory, dataset_dict, arrow_path):
    dataset = (
        Dataset(InMemoryTable.from_pydict(dataset_dict))
        if in_memory
        else Dataset(MemoryMappedTable.from_file(arrow_path))
    )
    dataset = dataset.add_item(item)
    assert dataset.data.shape == (5, 3)
    expected_features = {"col_1": "string", "col_2": "int64", "col_3": "float64"}
    assert dataset.data.column_names == list(expected_features.keys())
    for feature, expected_dtype in expected_features.items():
        assert dataset.features[feature].dtype == expected_dtype
    # Multiple InMemoryTables are consolidated into one block; a memory-mapped
    # dataset keeps its original block plus one in-memory block
    assert len(dataset.data.blocks) == (1 if in_memory else 2)
    dataset = dataset.add_item(item)
    assert dataset.data.shape == (6, 3)
    assert len(dataset.data.blocks) == (1 if in_memory else 2)


@pytest.mark.parametrize("keep_in_memory", [False, True])
@pytest.mark.parametrize(
"features",
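The block-count assertions in the test rely on consecutive in-memory tables being consolidated when concatenated, so repeated `add_item` calls do not pile up one tiny block per added row. A hypothetical sketch of that consolidation rule, where `("in_memory", rows)` tuples stand in for table blocks (this is not the library's actual block representation):

```python
def consolidate_blocks(blocks: list) -> list:
    """Merge runs of adjacent in-memory blocks; memory-mapped blocks stay separate."""
    out = []
    for kind, rows in blocks:
        if out and kind == "in_memory" and out[-1][0] == "in_memory":
            # Extend the previous in-memory block instead of appending a new one
            out[-1] = ("in_memory", out[-1][1] + rows)
        else:
            out.append((kind, rows))
    return out
```

This is why the test expects one block for a fully in-memory dataset but two for a memory-mapped one (the original mapped block plus a single consolidated in-memory block), no matter how many items are added.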

2 comments on commit 1f83a89

@github-actions

Show benchmarks

PyArrow==1.0.0 and PyArrow==latest

[Automated benchmark report comparing new vs. old timings for benchmark_array_xd.json, benchmark_getitem_100B.json, benchmark_indices_mapping.json, benchmark_iterating.json, and benchmark_map_filter.json]

CML watermark

@github-actions

Show benchmarks

[A second automated benchmark report in the same format, again under PyArrow==1.0.0 and PyArrow==latest]

CML watermark
