Skip to content

Commit

Permalink
TermSet Update #1 (#935)
Browse files Browse the repository at this point in the history
* add_ref_term_set update

* name changes and ruff

* gallery

* Update container.py

* clean

* clean

* Update CHANGELOG.md

* rename

* test

* test

* test

* document

* document line

* Update src/hdmf/container.py

Co-authored-by: Ryan Ly <rly@lbl.gov>

* Update src/hdmf/container.py

Co-authored-by: Ryan Ly <rly@lbl.gov>

---------

Co-authored-by: Ryan Ly <rly@lbl.gov>
  • Loading branch information
mavaylon1 and rly committed Aug 17, 2023
1 parent 918e6ba commit 901e124
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 183 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Increase raw data chunk cache size for reading HDF5 files from 1 MiB to 32 MiB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
- Increase default chunk size for `GenericDataChunkIterator` from 1 MB to 10 MB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
- Added the magic `__reduce__` method as well as two private semi-abstract helper methods to enable pickling of the `GenericDataChunkIterator`. @codycbakerphd [#924](https://github.com/hdmf-dev/hdmf/pull/924)
- Updated `add_ref_termset` to add all instances of `TermSet` within a given root container. @mavaylon1 [#935](https://github.com/hdmf-dev/hdmf/pull/935)
- Added Dynamic Enumerations and Schemasheets support to `TermSet`. @mavaylon1 [#923](https://github.com/hdmf-dev/hdmf/pull/923)
- Updated `HERD` to support user defined file name for the `HERD` zip file. @mavaylon1 [#941](https://github.com/hdmf-dev/hdmf/pull/941)
- Added method `Container.set_data_io`, which wraps an existing data field in a `DataIO`. @bendichter [#938](https://github.com/hdmf-dev/hdmf/pull/938)
Expand Down
30 changes: 0 additions & 30 deletions docs/gallery/plot_external_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,33 +323,3 @@ def __init__(self, **kwargs):

er_read = HERD.from_zip(path='./HERD.zip')
os.remove('./HERD.zip')

###############################################################################
# Using TermSet with HERD
# ------------------------------------------------
# :py:class:`~hdmf.term_set.TermSet` allows for an easier way to add references to
# :py:class:`~hdmf.common.resources.HERD`. These enumerations take the place of the
# entity_id and entity_uri parameters. :py:class:`~hdmf.common.resources.Key` values will have
# to match the name of the term in the :py:class:`~hdmf.term_set.TermSet`.
from hdmf.term_set import TermSet

try:
dir_path = os.path.dirname(os.path.abspath(__file__))
yaml_file = os.path.join(dir_path, 'example_term_set.yaml')
except NameError:
dir_path = os.path.dirname(os.path.abspath('.'))
yaml_file = os.path.join(dir_path, 'gallery/example_term_set.yaml')

terms = TermSet(term_schema_path=yaml_file)
col1 = VectorData(
name='Species_Data',
description='...',
data=['Homo sapiens', 'Ursus arctos horribilis'],
term_set=terms,
)

species = DynamicTable(name='species', description='My species', columns=[col1],)
er.add_ref_term_set(file=file,
container=species,
attribute='Species_Data',
)
94 changes: 25 additions & 69 deletions src/hdmf/common/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@
import numpy as np
from . import register_class, EXP_NAMESPACE
from . import get_type_map
from ..container import Table, Row, Container, AbstractContainer, Data, HERDManager
from ..data_utils import DataIO
from ..container import Table, Row, Container, AbstractContainer, HERDManager
from ..utils import docval, popargs, AllowPositional
from ..build import TypeMap
from ..term_set import TermSet
from glob import glob
import os
import zipfile
Expand Down Expand Up @@ -410,77 +408,35 @@ def _get_file_from_container(self, **kwargs):
msg = 'Could not find file. Add container to the file.'
raise ValueError(msg)

@docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
'default': None},
{'name': 'container', 'type': (str, AbstractContainer), 'default': None,
'doc': ('The Container/Data object that uses the key or '
'the object_id for the Container/Data object that uses the key.')},
{'name': 'attribute', 'type': str,
'doc': 'The attribute of the container for the external reference.', 'default': None},
{'name': 'field', 'type': str, 'default': '',
'doc': ('The field of the compound data type using an external resource.')},
{'name': 'key', 'type': (str, Key), 'default': None,
'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
{'name': 'term_set', 'type': TermSet, 'default': None,
'doc': 'The TermSet to be used if the container/attribute does not have one.'}
)
@docval({'name': 'root_container', 'type': HERDManager,
'doc': 'The root container or file containing objects with a TermSet.'})
def add_ref_term_set(self, **kwargs):
file = kwargs['file']
container = kwargs['container']
attribute = kwargs['attribute']
key = kwargs['key']
field = kwargs['field']
term_set = kwargs['term_set']

if term_set is None:
if attribute is None:
try:
term_set = container.term_set
except AttributeError:
msg = "Cannot Find TermSet"
raise AttributeError(msg)
else:
term_set = container[attribute].term_set
if term_set is None:
msg = "Cannot Find TermSet"
raise ValueError(msg)
"""
Method to search through the root_container for all instances of TermSet.
Currently, only datasets are supported. By using a TermSet, the data comes validated
and can use the permissible values within the set to populate HERD.
"""
root_container = kwargs['root_container']

if file is None:
file = self._get_file_from_container(container=container)
all_children = root_container.all_objects # dictionary of objects with the IDs as keys

# if key is provided then add_ref proceeds as normal
# use key provided as the term in the term_set for entity look-up
if key is not None:
data = [key]
else:
if attribute is None:
data_object = container
else:
data_object = getattr(container, attribute)
if isinstance(data_object, (Data, DataIO)):
data = data_object.data
elif isinstance(data_object, (list, np.ndarray)):
data = data_object
missing_terms = []
for term in data:
for child in all_children:
try:
term_info = term_set[term]
except ValueError:
missing_terms.append(term)
term_set = all_children[child].term_set
data = all_children[child].data # TODO: This will be expanded to not just support data
except AttributeError:
continue
entity_id = term_info[0]
entity_uri = term_info[2]
self.add_ref(file=file,
container=container,
attribute=attribute,
key=term,
field=field,
entity_id=entity_id,
entity_uri=entity_uri)
if len(missing_terms)>0:
return {"Missing Values in TermSet": missing_terms}
else:
return True

if term_set is not None:
for term in data:
term_info = term_set[term]
entity_id = term_info[0]
entity_uri = term_info[2]
self.add_ref(file=root_container,
container=all_children[child],
key=term,
entity_id=entity_id,
entity_uri=entity_uri)

@docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'},
{'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
Expand Down
32 changes: 32 additions & 0 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def __init__(self, **kwargs):
self.__name = name
self.__field_values = dict()
self.__read_io = None
self.__obj = None

@property
def read_io(self):
Expand Down Expand Up @@ -302,6 +303,37 @@ def get_ancestor(self, **kwargs):
p = p.parent
return None

def all_children(self):
    """Get a list of all child objects and their child objects recursively.

    Performs an iterative depth-first traversal starting at this object.
    Every visited object is appended to the returned list. Objects that have
    an object_id are also indexed (by that id) into the private registry that
    backs the ``all_objects`` property; the registry is rebuilt from scratch
    on every call.

    :returns: list containing this object and all of its descendants
    """
    stack = [self]  # list of containers, including self, to add and later parse for children
    ret = list()
    self.__obj = LabelledDict(label='all_objects', key_attr='object_id')
    while len(stack):  # search until there's nothing in the list
        n = stack.pop()
        ret.append(n)
        if n.object_id is not None:
            self.__obj[n.object_id] = n
        else:  # pragma: no cover
            # warn that a child does not have an object_id, which is unusual
            # BUGFIX: use type(n).__name__ for the class name; the previous
            # type(n).__class__ always evaluated to the metaclass ("<class 'type'>")
            # instead of the object's actual class name.
            warn('%s "%s" does not have an object_id' % (type(n).__name__, n.name))
        if hasattr(n, 'children'):
            for c in n.children:
                stack.append(c)
    return ret

@property
def all_objects(self):
    """Get a LabelledDict that indexes all child objects and their children by object ID.

    Built lazily: the first access runs a full traversal via ``all_children()``,
    which populates the underlying registry as a side effect.
    """
    if self.__obj is not None:
        return self.__obj
    self.all_children()
    return self.__obj

@docval()
def get_ancestors(self, **kwargs):
p = self.parent
Expand Down
91 changes: 10 additions & 81 deletions tests/unit/common/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,93 +275,22 @@ def test_add_ref_termset(self):
em = HERDManagerContainer()
em.link_resources(er)

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens'],
term_set=terms)

species = DynamicTable(name='species', description='My species', columns=[col1],)

er.add_ref_term_set(file=em,
container=species,
attribute='Species_Data',
)
self.assertEqual(er.keys.data, [('Homo sapiens',)])
self.assertEqual(er.entities.data, [('NCBI_TAXON:9606',
'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')])
self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')])

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_termset(self):
er = HERD()
em = HERDManagerContainer()
em.link_resources(er)

species = DynamicTable(name='species', description='My species')

with self.assertRaises(AttributeError):
er.add_ref_term_set(file=em,
container=species,
)

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_attribute_termset_value(self):
er = HERD()
em = HERDManagerContainer()
em.link_resources(er)

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens'])
species = DynamicTable(name='species', description='My species', columns=[col1],)

with self.assertRaises(ValueError):
er.add_ref_term_set(file=em,
container=species,
attribute='Species_Data',
)

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_terms(self):
terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
er = HERD()
em = HERDManagerContainer()
em.link_resources(er)

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens', 'missing_term'])
# create children and add parent
col1 = VectorData(
name='Species_1',
description='...',
data=['Homo sapiens'],
term_set=terms,
)
species = DynamicTable(name='species', description='My species', columns=[col1])

species = DynamicTable(name='species', description='My species', columns=[col1],)
species.parent = em

missing_terms = er.add_ref_term_set(file=em,
container=species,
attribute='Species_Data',
term_set=terms
)
er.add_ref_term_set(root_container=em)
self.assertEqual(er.keys.data, [('Homo sapiens',)])
self.assertEqual(er.entities.data, [('NCBI_TAXON:9606',
'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')])
self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')])
self.assertEqual(missing_terms, {'Missing Values in TermSet': ['missing_term']})

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_file_error(self):
terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
er = HERD()

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens'],
term_set=terms)

species = DynamicTable(name='species', description='My species', columns=[col1],)

with self.assertRaises(ValueError):
er.add_ref_term_set(
container=species,
attribute='Species_Data',
)

def test_get_file_from_container(self):
file = HERDManagerContainer(name='file')
Expand Down
19 changes: 16 additions & 3 deletions tests/unit/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,16 @@ def test_set_modified_parent(self):
child_obj.set_modified()
self.assertTrue(child_obj.parent.modified)

def test_all_children(self):
    """The all_objects registry should index the table, its id column, and the data column."""
    data_col = VectorData(
        name='Species_1',
        description='...',
        data=['Homo sapiens'],
    )
    table = DynamicTable(name='species', description='My species', columns=[data_col])
    registry = table.all_objects
    expected = sorted([table.object_id, table.id.object_id, data_col.object_id])
    self.assertEqual(sorted(list(registry.keys())), expected)

def test_add_child(self):
"""Test that add child creates deprecation warning and also properly sets child's parent and modified
"""
Expand Down Expand Up @@ -578,7 +588,8 @@ class EmptyFields(AbstractContainer):
self.assertTupleEqual(EmptyFields.get_fields_conf(), tuple())

props = TestAbstractContainerFieldsConf.find_all_properties(EmptyFields)
expected = ['children', 'container_source', 'fields', 'modified', 'name', 'object_id', 'parent', 'read_io']
expected = ['all_objects', 'children', 'container_source', 'fields', 'modified',
'name', 'object_id', 'parent', 'read_io']
self.assertListEqual(props, expected)

def test_named_fields(self):
Expand All @@ -598,7 +609,8 @@ def __init__(self, **kwargs):
self.assertTupleEqual(NamedFields.get_fields_conf(), expected)

props = TestAbstractContainerFieldsConf.find_all_properties(NamedFields)
expected = ['children', 'container_source', 'field1', 'field2', 'fields', 'modified', 'name', 'object_id',
expected = ['all_objects', 'children', 'container_source', 'field1', 'field2',
'fields', 'modified', 'name', 'object_id',
'parent', 'read_io']
self.assertListEqual(props, expected)

Expand Down Expand Up @@ -679,7 +691,8 @@ class NamedFieldsChild(NamedFields):
self.assertTupleEqual(NamedFieldsChild.get_fields_conf(), expected)

props = TestAbstractContainerFieldsConf.find_all_properties(NamedFieldsChild)
expected = ['children', 'container_source', 'field1', 'field2', 'fields', 'modified', 'name', 'object_id',
expected = ['all_objects', 'children', 'container_source', 'field1', 'field2',
'fields', 'modified', 'name', 'object_id',
'parent', 'read_io']
self.assertListEqual(props, expected)

Expand Down

0 comments on commit 901e124

Please sign in to comment.