Skip to content

Commit

Permalink
TermSet Update #1 (#935)
Browse files Browse the repository at this point in the history
* add_ref_term_set update

* name changes and ruff

* gallery

* Update container.py

* clean

* clean

* Update CHANGELOG.md

* rename

* test

* test

* test

* document

* document line

* Update src/hdmf/container.py

Co-authored-by: Ryan Ly <rly@lbl.gov>

* Update src/hdmf/container.py

Co-authored-by: Ryan Ly <rly@lbl.gov>

---------

Co-authored-by: Ryan Ly <rly@lbl.gov>
  • Loading branch information
mavaylon1 and rly committed Aug 17, 2023
1 parent 918e6ba commit 901e124
Show file tree
Hide file tree
Showing 6 changed files with 84 additions and 183 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
- Increase raw data chunk cache size for reading HDF5 files from 1 MiB to 32 MiB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
- Increase default chunk size for `GenericDataChunkIterator` from 1 MB to 10 MB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
- Added the magic `__reduce__` method as well as two private semi-abstract helper methods to enable pickling of the `GenericDataChunkIterator`. @codycbakerphd [#924](https://github.com/hdmf-dev/hdmf/pull/924)
- Updated `add_ref_termset` to add all instances of `TermSet` within a given root container. @mavaylon1 [#935](https://github.com/hdmf-dev/hdmf/pull/935)
- Added Dynamic Enumerations and Schemasheets support to `TermSet`. @mavaylon1 [#923](https://github.com/hdmf-dev/hdmf/pull/923)
- Updated `HERD` to support user defined file name for the `HERD` zip file. @mavaylon1 [#941](https://github.com/hdmf-dev/hdmf/pull/941)
- Added method `Container.set_data_io`, which wraps an existing data field in a `DataIO`. @bendichter [#938](https://github.com/hdmf-dev/hdmf/pull/938)
Expand Down
30 changes: 0 additions & 30 deletions docs/gallery/plot_external_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,33 +323,3 @@ def __init__(self, **kwargs):

er_read = HERD.from_zip(path='./HERD.zip')
os.remove('./HERD.zip')

###############################################################################
# Using TermSet with HERD
# ------------------------------------------------
# :py:class:`~hdmf.term_set.TermSet` allows for an easier way to add references to
# :py:class:`~hdmf.common.resources.HERD`. These enumerations take the place of the
# entity_id and entity_uri parameters. :py:class:`~hdmf.common.resources.Key` values will have
# to match the name of the term in the :py:class:`~hdmf.term_set.TermSet`.
from hdmf.term_set import TermSet

try:
dir_path = os.path.dirname(os.path.abspath(__file__))
yaml_file = os.path.join(dir_path, 'example_term_set.yaml')
except NameError:
dir_path = os.path.dirname(os.path.abspath('.'))
yaml_file = os.path.join(dir_path, 'gallery/example_term_set.yaml')

terms = TermSet(term_schema_path=yaml_file)
col1 = VectorData(
name='Species_Data',
description='...',
data=['Homo sapiens', 'Ursus arctos horribilis'],
term_set=terms,
)

species = DynamicTable(name='species', description='My species', columns=[col1],)
er.add_ref_term_set(file=file,
container=species,
attribute='Species_Data',
)
94 changes: 25 additions & 69 deletions src/hdmf/common/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,9 @@
import numpy as np
from . import register_class, EXP_NAMESPACE
from . import get_type_map
from ..container import Table, Row, Container, AbstractContainer, Data, HERDManager
from ..data_utils import DataIO
from ..container import Table, Row, Container, AbstractContainer, HERDManager
from ..utils import docval, popargs, AllowPositional
from ..build import TypeMap
from ..term_set import TermSet
from glob import glob
import os
import zipfile
Expand Down Expand Up @@ -410,77 +408,35 @@ def _get_file_from_container(self, **kwargs):
msg = 'Could not find file. Add container to the file.'
raise ValueError(msg)

@docval({'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
'default': None},
{'name': 'container', 'type': (str, AbstractContainer), 'default': None,
'doc': ('The Container/Data object that uses the key or '
'the object_id for the Container/Data object that uses the key.')},
{'name': 'attribute', 'type': str,
'doc': 'The attribute of the container for the external reference.', 'default': None},
{'name': 'field', 'type': str, 'default': '',
'doc': ('The field of the compound data type using an external resource.')},
{'name': 'key', 'type': (str, Key), 'default': None,
'doc': 'The name of the key or the Key object from the KeyTable for the key to add a resource for.'},
{'name': 'term_set', 'type': TermSet, 'default': None,
'doc': 'The TermSet to be used if the container/attribute does not have one.'}
)
@docval({'name': 'root_container', 'type': HERDManager,
'doc': 'The root container or file containing objects with a TermSet.'})
def add_ref_term_set(self, **kwargs):
file = kwargs['file']
container = kwargs['container']
attribute = kwargs['attribute']
key = kwargs['key']
field = kwargs['field']
term_set = kwargs['term_set']

if term_set is None:
if attribute is None:
try:
term_set = container.term_set
except AttributeError:
msg = "Cannot Find TermSet"
raise AttributeError(msg)
else:
term_set = container[attribute].term_set
if term_set is None:
msg = "Cannot Find TermSet"
raise ValueError(msg)
"""
Method to search through the root_container for all instances of TermSet.
Currently, only datasets are supported. By using a TermSet, the data comes validated
and can use the permissible values within the set to populate HERD.
"""
root_container = kwargs['root_container']

if file is None:
file = self._get_file_from_container(container=container)
all_children = root_container.all_objects # dictionary of objects with the IDs as keys

# if key is provided then add_ref proceeds as normal
# use key provided as the term in the term_set for entity look-up
if key is not None:
data = [key]
else:
if attribute is None:
data_object = container
else:
data_object = getattr(container, attribute)
if isinstance(data_object, (Data, DataIO)):
data = data_object.data
elif isinstance(data_object, (list, np.ndarray)):
data = data_object
missing_terms = []
for term in data:
for child in all_children:
try:
term_info = term_set[term]
except ValueError:
missing_terms.append(term)
term_set = all_children[child].term_set
data = all_children[child].data # TODO: This will be expanded to not just support data
except AttributeError:
continue
entity_id = term_info[0]
entity_uri = term_info[2]
self.add_ref(file=file,
container=container,
attribute=attribute,
key=term,
field=field,
entity_id=entity_id,
entity_uri=entity_uri)
if len(missing_terms)>0:
return {"Missing Values in TermSet": missing_terms}
else:
return True

if term_set is not None:
for term in data:
term_info = term_set[term]
entity_id = term_info[0]
entity_uri = term_info[2]
self.add_ref(file=root_container,
container=all_children[child],
key=term,
entity_id=entity_id,
entity_uri=entity_uri)

@docval({'name': 'key_name', 'type': str, 'doc': 'The name of the Key to get.'},
{'name': 'file', 'type': HERDManager, 'doc': 'The file associated with the container.',
Expand Down
32 changes: 32 additions & 0 deletions src/hdmf/container.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ def __init__(self, **kwargs):
self.__name = name
self.__field_values = dict()
self.__read_io = None
self.__obj = None

@property
def read_io(self):
Expand Down Expand Up @@ -302,6 +303,37 @@ def get_ancestor(self, **kwargs):
p = p.parent
return None

def all_children(self):
    """Get a list of all child objects and their child objects recursively.

    Performs an iterative depth-first traversal starting at this object.
    Every visited object is appended to the returned list. Objects that have
    an object_id are also indexed (by that id) into the private registry that
    backs the ``all_objects`` property; the registry is rebuilt from scratch
    on every call.

    :returns: list containing this object and all of its descendants
    """
    stack = [self]  # list of containers, including self, to add and later parse for children
    ret = list()
    self.__obj = LabelledDict(label='all_objects', key_attr='object_id')
    while len(stack):  # search until there's nothing in the list
        n = stack.pop()
        ret.append(n)
        if n.object_id is not None:
            self.__obj[n.object_id] = n
        else:  # pragma: no cover
            # warn that a child does not have an object_id, which is unusual
            # BUGFIX: use type(n).__name__ for the class name; the previous
            # type(n).__class__ always evaluated to the metaclass ("<class 'type'>")
            # instead of the object's actual class name.
            warn('%s "%s" does not have an object_id' % (type(n).__name__, n.name))
        if hasattr(n, 'children'):
            for c in n.children:
                stack.append(c)
    return ret

@property
def all_objects(self):
    """Get a LabelledDict that indexes all child objects and their children by object ID.

    Built lazily: the first access runs a full traversal via ``all_children()``,
    which populates the underlying registry as a side effect.
    """
    if self.__obj is not None:
        return self.__obj
    self.all_children()
    return self.__obj

@docval()
def get_ancestors(self, **kwargs):
p = self.parent
Expand Down
91 changes: 10 additions & 81 deletions tests/unit/common/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,93 +275,22 @@ def test_add_ref_termset(self):
em = HERDManagerContainer()
em.link_resources(er)

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens'],
term_set=terms)

species = DynamicTable(name='species', description='My species', columns=[col1],)

er.add_ref_term_set(file=em,
container=species,
attribute='Species_Data',
)
self.assertEqual(er.keys.data, [('Homo sapiens',)])
self.assertEqual(er.entities.data, [('NCBI_TAXON:9606',
'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')])
self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')])

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_termset(self):
er = HERD()
em = HERDManagerContainer()
em.link_resources(er)

species = DynamicTable(name='species', description='My species')

with self.assertRaises(AttributeError):
er.add_ref_term_set(file=em,
container=species,
)

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_attribute_termset_value(self):
er = HERD()
em = HERDManagerContainer()
em.link_resources(er)

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens'])
species = DynamicTable(name='species', description='My species', columns=[col1],)

with self.assertRaises(ValueError):
er.add_ref_term_set(file=em,
container=species,
attribute='Species_Data',
)

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_terms(self):
terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
er = HERD()
em = HERDManagerContainer()
em.link_resources(er)

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens', 'missing_term'])
# create children and add parent
col1 = VectorData(
name='Species_1',
description='...',
data=['Homo sapiens'],
term_set=terms,
)
species = DynamicTable(name='species', description='My species', columns=[col1])

species = DynamicTable(name='species', description='My species', columns=[col1],)
species.parent = em

missing_terms = er.add_ref_term_set(file=em,
container=species,
attribute='Species_Data',
term_set=terms
)
er.add_ref_term_set(root_container=em)
self.assertEqual(er.keys.data, [('Homo sapiens',)])
self.assertEqual(er.entities.data, [('NCBI_TAXON:9606',
'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?mode=Info&id=9606')])
self.assertEqual(er.objects.data, [(0, col1.object_id, 'VectorData', '', '')])
self.assertEqual(missing_terms, {'Missing Values in TermSet': ['missing_term']})

@unittest.skipIf(not LINKML_INSTALLED, "optional LinkML module is not installed")
def test_add_ref_termset_missing_file_error(self):
terms = TermSet(term_schema_path='tests/unit/example_test_term_set.yaml')
er = HERD()

col1 = VectorData(name='Species_Data',
description='species from NCBI and Ensemble',
data=['Homo sapiens'],
term_set=terms)

species = DynamicTable(name='species', description='My species', columns=[col1],)

with self.assertRaises(ValueError):
er.add_ref_term_set(
container=species,
attribute='Species_Data',
)

def test_get_file_from_container(self):
file = HERDManagerContainer(name='file')
Expand Down
19 changes: 16 additions & 3 deletions tests/unit/test_container.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,16 @@ def test_set_modified_parent(self):
child_obj.set_modified()
self.assertTrue(child_obj.parent.modified)

def test_all_children(self):
    """The all_objects registry should index the table, its id column, and the data column."""
    data_col = VectorData(
        name='Species_1',
        description='...',
        data=['Homo sapiens'],
    )
    table = DynamicTable(name='species', description='My species', columns=[data_col])
    registry = table.all_objects
    expected = sorted([table.object_id, table.id.object_id, data_col.object_id])
    self.assertEqual(sorted(list(registry.keys())), expected)

def test_add_child(self):
"""Test that add child creates deprecation warning and also properly sets child's parent and modified
"""
Expand Down Expand Up @@ -578,7 +588,8 @@ class EmptyFields(AbstractContainer):
self.assertTupleEqual(EmptyFields.get_fields_conf(), tuple())

props = TestAbstractContainerFieldsConf.find_all_properties(EmptyFields)
expected = ['children', 'container_source', 'fields', 'modified', 'name', 'object_id', 'parent', 'read_io']
expected = ['all_objects', 'children', 'container_source', 'fields', 'modified',
'name', 'object_id', 'parent', 'read_io']
self.assertListEqual(props, expected)

def test_named_fields(self):
Expand All @@ -598,7 +609,8 @@ def __init__(self, **kwargs):
self.assertTupleEqual(NamedFields.get_fields_conf(), expected)

props = TestAbstractContainerFieldsConf.find_all_properties(NamedFields)
expected = ['children', 'container_source', 'field1', 'field2', 'fields', 'modified', 'name', 'object_id',
expected = ['all_objects', 'children', 'container_source', 'field1', 'field2',
'fields', 'modified', 'name', 'object_id',
'parent', 'read_io']
self.assertListEqual(props, expected)

Expand Down Expand Up @@ -679,7 +691,8 @@ class NamedFieldsChild(NamedFields):
self.assertTupleEqual(NamedFieldsChild.get_fields_conf(), expected)

props = TestAbstractContainerFieldsConf.find_all_properties(NamedFieldsChild)
expected = ['children', 'container_source', 'field1', 'field2', 'fields', 'modified', 'name', 'object_id',
expected = ['all_objects', 'children', 'container_source', 'field1', 'field2',
'fields', 'modified', 'name', 'object_id',
'parent', 'read_io']
self.assertListEqual(props, expected)

Expand Down

0 comments on commit 901e124

Please sign in to comment.