add capability to add a row to a column after IO (#426)
* add capability to add a row to a column that is using DataIO
* add capability to add a row to a column that is an h5py.Dataset
bendichter committed Oct 12, 2020
1 parent 023353d commit abc291c
Showing 4 changed files with 114 additions and 44 deletions.
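In practical terms, this change lets a DynamicTable that was already written to disk be re-opened and grown row by row. A minimal sketch of that workflow, modeled on the test_append test added below ('table.h5' and the foo/bar column names are illustrative, not part of this commit):

import numpy as np
from hdmf.common import get_manager
from hdmf.backends.hdf5 import HDF5IO

# Re-open a previously written file in append mode; after read(), the
# table's columns are h5py.Dataset objects rather than in-memory lists.
with HDF5IO('table.h5', manager=get_manager(), mode='a') as io:
    table = io.read()
    # add_row now works even though the column data lives on disk,
    # provided the datasets were written as resizable (see maxshape below).
    table.add_row(foo=np.arange(30).reshape(5, 2, 3),
                  bar=np.arange(30).reshape(5, 2, 3))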
2 changes: 1 addition & 1 deletion src/hdmf/common/table.py
@@ -91,7 +91,7 @@ def add_vector(self, arg):
     def __check_precision(self, idx):
         """
         Check precision of current dataset and, if
-        necessary, adjust precision to accomodate new value.
+        necessary, adjust precision to accommodate new value.

         Returns:
             unsigned integer encoding of idx
29 changes: 3 additions & 26 deletions src/hdmf/container.py
@@ -3,9 +3,8 @@
 from uuid import uuid4
 from .utils import (docval, get_docval, call_docval_func, getargs, ExtenderMeta, get_data_shape, fmt_docval_args,
                     popargs, LabelledDict)
-from .data_utils import DataIO
+from .data_utils import DataIO, append_data, extend_data
 from warnings import warn
-import h5py
 import types


@@ -477,32 +476,10 @@ def get(self, args):
         return self.data[args]

     def append(self, arg):
-        if isinstance(self.data, list):
-            self.data.append(arg)
-        elif isinstance(self.data, np.ndarray):
-            self.__data = np.append(self.__data, [arg])
-        elif isinstance(self.data, h5py.Dataset):
-            shape = list(self.__data.shape)
-            shape[0] += 1
-            self.__data.resize(shape)
-            self.__data[-1] = arg
-        else:
-            msg = "Data cannot append to object of type '%s'" % type(self.__data)
-            raise ValueError(msg)
+        self.__data = append_data(self.__data, arg)

     def extend(self, arg):
-        if isinstance(self.data, list):
-            self.data.extend(arg)
-        elif isinstance(self.data, np.ndarray):
-            self.__data = np.append(self.__data, [arg])
-        elif isinstance(self.data, h5py.Dataset):
-            shape = list(self.__data.shape)
-            shape[0] += len(arg)
-            self.__data.resize(shape)
-            self.__data[-len(arg):] = arg
-        else:
-            msg = "Data cannot extend object of type '%s'" % type(self.__data)
-            raise ValueError(msg)
+        self.__data = extend_data(self.__data, arg)


 class DataRegion(Data):
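With the type dispatch moved into shared helpers, Data.append and Data.extend behave the same whether a column is backed by a list, a NumPy array, or an h5py.Dataset. A small in-memory sketch (the column name and values are illustrative):

from hdmf.common import VectorData

# VectorData inherits append/extend from Data, which now delegate to
# append_data/extend_data in hdmf.data_utils.
col = VectorData(name='foo', description='an example column', data=[1, 2, 3])
col.append(4)
col.extend([5, 6])
print(col.data)  # [1, 2, 3, 4, 5, 6]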
41 changes: 41 additions & 0 deletions src/hdmf/data_utils.py
@@ -3,10 +3,45 @@
 import numpy as np
 from warnings import warn
 import copy
+import h5py

 from .utils import docval, getargs, popargs, docval_macro, get_data_shape


+def append_data(data, arg):
+    if isinstance(data, (list, DataIO)):
+        data.append(arg)
+        return data
+    elif isinstance(data, np.ndarray):
+        return np.append(data, [arg])
+    elif isinstance(data, h5py.Dataset):
+        shape = list(data.shape)
+        shape[0] += 1
+        data.resize(shape)
+        data[-1] = arg
+        return data
+    else:
+        msg = "Data cannot append to object of type '%s'" % type(data)
+        raise ValueError(msg)
+
+
+def extend_data(data, arg):
+    if isinstance(data, (list, DataIO)):
+        data.extend(arg)
+        return data
+    elif isinstance(data, np.ndarray):
+        return np.vstack((data, arg))
+    elif isinstance(data, h5py.Dataset):
+        shape = list(data.shape)
+        shape[0] += len(arg)
+        data.resize(shape)
+        data[-len(arg):] = arg
+        return data
+    else:
+        msg = "Data cannot extend object of type '%s'" % type(data)
+        raise ValueError(msg)
+
+
 @docval_macro('array_data')
 class AbstractDataChunkIterator(metaclass=ABCMeta):
     """
@@ -602,6 +637,12 @@ def __copy__(self):
         newobj = DataIO(data=self.data)
         return newobj

+    def append(self, arg):
+        self.__data = append_data(self.__data, arg)
+
+    def extend(self, arg):
+        self.__data = extend_data(self.__data, arg)
+
     def __deepcopy__(self, memo):
         """
         Define a custom copy method for deep copy.
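DataIO gains the same append/extend interface by delegating to the helpers, so wrapped data can keep growing before it is written. A minimal sketch (the values are illustrative):

from hdmf.data_utils import DataIO

dio = DataIO(data=[0, 1, 2])
dio.append(3)       # delegates to append_data on the wrapped list
dio.extend([4, 5])  # delegates to extend_data
print(dio.data)     # [0, 1, 2, 3, 4, 5]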
86 changes: 69 additions & 17 deletions tests/unit/common/test_table.py
@@ -1,7 +1,8 @@
 import unittest
-from hdmf.common import DynamicTable, VectorData, VectorIndex, ElementIdentifiers, DynamicTableRegion, VocabData
+from hdmf.common import DynamicTable, VectorData, VectorIndex, ElementIdentifiers, \
+    DynamicTableRegion, VocabData, get_manager
 from hdmf.testing import TestCase, H5RoundTripMixin
-from hdmf.backends.hdf5 import H5DataIO
+from hdmf.backends.hdf5 import H5DataIO, HDF5IO

 from collections import OrderedDict
 import h5py
@@ -543,7 +544,7 @@ def test_dynamic_table_region_to_dataframe(self):
     def test_dynamic_table_region_to_dataframe_exclude_cols(self):
         table = self.with_columns_and_data()
         dynamic_table_region = DynamicTableRegion('dtr', [0, 1, 2, 2], 'desc', table=table)
-        res = dynamic_table_region.to_dataframe(exclude=set(['baz', 'foo']))
+        res = dynamic_table_region.to_dataframe(exclude={'baz', 'foo'})
         self.assertListEqual(res.index.tolist(), [0, 1, 2, 2])
         self.assertEqual(len(res.columns), 1)
         self.assertListEqual(res['bar'].tolist(), [10.0, 20.0, 30.0, 30.0])
@@ -1180,6 +1181,45 @@ def setUpContainer(self):


+class TestDataIOColumns(H5RoundTripMixin, TestCase):
+    def setUpContainer(self):
+        self.chunked_data = H5DataIO(
+            data=[i for i in range(10)],
+            chunks=(3,),
+            fillvalue=-1,
+        )
+        self.compressed_data = H5DataIO(
+            data=np.arange(10),
+            compression=1,
+            shuffle=True,
+            fletcher32=True,
+            allow_plugin_filters=True,
+        )
+        foo = VectorData(name='foo', description='chunked column', data=self.chunked_data)
+        bar = VectorData(name='bar', description='chunked column', data=self.compressed_data)
+
+        # NOTE: on construct, columns are ordered such that indices go before data, so create the table that way
+        # for proper comparison of the columns list
+        table = DynamicTable('table0', 'an example table', columns=[foo, bar])
+        table.add_row(foo=1, bar=1)
+        return table
+
+    def test_roundtrip(self):
+        super().test_roundtrip()
+
+        with h5py.File(self.filename, 'r') as f:
+            chunked_dset = f['foo']
+            self.assertTrue(np.all(chunked_dset[:] == self.chunked_data.data))
+            self.assertEqual(chunked_dset.chunks, (3,))
+            self.assertEqual(chunked_dset.fillvalue, -1)
+
+            compressed_dset = f['bar']
+            self.assertTrue(np.all(compressed_dset[:] == self.compressed_data.data))
+            self.assertEqual(compressed_dset.compression, 'gzip')
+            self.assertEqual(compressed_dset.shuffle, True)
+            self.assertEqual(compressed_dset.fletcher32, True)
+
+
 class TestDataIOIndexedColumns(H5RoundTripMixin, TestCase):

     def setUpContainer(self):
         self.chunked_data = H5DataIO(
@@ -1202,6 +1242,10 @@ def setUpContainer(self):
         # NOTE: on construct, columns are ordered such that indices go before data, so create the table that way
         # for proper comparison of the columns list
         table = DynamicTable('table0', 'an example table', columns=[foo_ind, foo, bar_ind, bar])
+
+        # check for add_row
+        table.add_row(foo=np.arange(30).reshape(5, 2, 3), bar=np.arange(30).reshape(5, 2, 3))
+
         return table

     def test_roundtrip(self):
@@ -1227,25 +1271,29 @@ def setUpContainer(self):
             data=np.arange(30).reshape(5, 2, 3),
             chunks=(1, 1, 3),
             fillvalue=-1,
+            maxshape=(None, 2, 3)
         )
         self.chunked_index_data = H5DataIO(
             data=np.array([2, 3, 5], dtype=np.uint),
             chunks=(2, ),
             fillvalue=np.uint(10),
+            maxshape=(None,)
         )
         self.compressed_data = H5DataIO(
             data=np.arange(30).reshape(5, 2, 3),
             compression=1,
             shuffle=True,
             fletcher32=True,
             allow_plugin_filters=True,
+            maxshape=(None, 2, 3)
         )
         self.compressed_index_data = H5DataIO(
-            data=np.array([2, 3, 5], dtype=np.uint),
+            data=np.array([2, 4, 5], dtype=np.uint),
             compression=1,
             shuffle=True,
             fletcher32=False,
             allow_plugin_filters=True,
+            maxshape=(None,)
         )
         foo = VectorData(name='foo', description='chunked column', data=self.chunked_data)
         foo_ind = VectorIndex(name='foo_index', target=foo, data=self.chunked_index_data)
@@ -1254,20 +1302,24 @@ def setUpContainer(self):

         # NOTE: on construct, columns are ordered such that indices go before data, so create the table that way
         # for proper comparison of the columns list
-        table = DynamicTable('table0', 'an example table', columns=[foo_ind, foo, bar_ind, bar])
+        table = DynamicTable('table0', 'an example table', columns=[foo_ind, foo, bar_ind, bar],
+                             id=H5DataIO(data=[0, 1, 2], chunks=True, maxshape=(None,)))
+
+        # check for add_row
+        table.add_row(foo=np.arange(30).reshape(5, 2, 3),
+                      bar=np.arange(30).reshape(5, 2, 3))
+
         return table

-    def test_roundtrip(self):
-        super().test_roundtrip()
+    def test_append(self, cache_spec=False):
+        """Write the container to an HDF5 file, read the container from the file, and append to it."""
+        with HDF5IO(self.filename, manager=get_manager(), mode='w') as write_io:
+            write_io.write(self.container, cache_spec=cache_spec)

-        with h5py.File(self.filename, 'r') as f:
-            chunked_dset = f['foo_index']
-            self.assertTrue(np.all(chunked_dset[:] == self.chunked_index_data.data))
-            self.assertEqual(chunked_dset.chunks, (2, ))
-            self.assertEqual(chunked_dset.fillvalue, 10)
+        self.reader = HDF5IO(self.filename, manager=get_manager(), mode='a')
+        read_table = self.reader.read()

-            compressed_dset = f['bar_index']
-            self.assertTrue(np.all(compressed_dset[:] == self.compressed_index_data.data))
-            self.assertEqual(compressed_dset.compression, 'gzip')
-            self.assertEqual(compressed_dset.shuffle, True)
-            self.assertEqual(compressed_dset.fletcher32, False)
+        data = np.arange(30, 60).reshape(5, 2, 3)
+        read_table.add_row(foo=data, bar=data)
+
+        np.testing.assert_array_equal(read_table['foo'][-1], data)
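The maxshape arguments added to the H5DataIO calls above are what make these appends possible: an h5py.Dataset can only be resized along dimensions declared unlimited (None) at creation time. A standalone sketch of that constraint using plain h5py ('example.h5' is a hypothetical path):

import h5py
import numpy as np
from hdmf.data_utils import append_data

with h5py.File('example.h5', 'w') as f:
    # Without maxshape=(None,), the resize() inside append_data would raise.
    dset = f.create_dataset('col', data=np.arange(5), maxshape=(None,))
    dset = append_data(dset, 5)
    print(dset[:])  # [0 1 2 3 4 5]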
