ExternalResources I/O (#895)
* externalresources

* hdf5io

* Update CHANGELOG.md

* format

* fix

* ruff

* test

* Update src/hdmf/backends/io.py

Co-authored-by: Oliver Ruebel <oruebel@users.noreply.github.com>

* feedback

---------

Co-authored-by: Oliver Ruebel <oruebel@users.noreply.github.com>
mavaylon1 and oruebel committed Jul 9, 2023
1 parent 0c01dd7 commit b89679d
Showing 5 changed files with 129 additions and 10 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@
- Updated `ExternalResources` to have EntityKeyTable with updated tests/documentation and minor bug fix to ObjectKeyTable. @mavaylon1 [#872](https://github.com/hdmf-dev/hdmf/pull/872)
- Added warning for DynamicTableRegion links that are not added to the same parent as the original container object. @mavaylon1 [#891](https://github.com/hdmf-dev/hdmf/pull/891)
- Added the `TermSet` class along with integrated validation methods for any child of `AbstractContainer`, e.g., `VectorData`, `Data`, `DynamicTable`. @mavaylon1 [#880](https://github.com/hdmf-dev/hdmf/pull/880)
- Updated `HDMFIO` and `HDF5IO` to support `ExternalResources`. @mavaylon1 [#895](https://github.com/hdmf-dev/hdmf/pull/895)

### Documentation and tutorial enhancements:

13 changes: 9 additions & 4 deletions src/hdmf/backends/hdf5/h5tools.py
@@ -47,13 +47,17 @@ class HDF5IO(HDMFIO):
{'name': 'comm', 'type': 'Intracomm',
'doc': 'the MPI communicator to use for parallel I/O', 'default': None},
{'name': 'file', 'type': [File, "S3File"], 'doc': 'a pre-existing h5py.File object', 'default': None},
{'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None})
{'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
{'name': 'external_resources_path', 'type': str,
'doc': 'The path to the ExternalResources', 'default': None},)
def __init__(self, **kwargs):
"""Open an HDF5 file for IO.
"""
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
path, manager, mode, comm, file_obj, driver = popargs('path', 'manager', 'mode', 'comm', 'file', 'driver',
kwargs)
path, manager, mode, comm, file_obj, driver, external_resources_path = popargs('path', 'manager', 'mode',
'comm', 'file', 'driver',
'external_resources_path',
kwargs)

self.__open_links = [] # keep track of other files opened from links in this file
self.__file = None # This will be set below, but set to None first in case an error occurs and we need to close
@@ -76,7 +80,8 @@ def __init__(self, **kwargs):
self.__comm = comm
self.__mode = mode
self.__file = file_obj
super().__init__(manager, source=path) # NOTE: source is not set if path is None and file_obj is passed
super().__init__(manager, source=path, external_resources_path=external_resources_path)
# NOTE: source is not set if path is None and file_obj is passed
self.__built = dict() # keep track of each builder for each dataset/group/link for each file
self.__read = dict() # keep track of which files have been read. Key is the filename value is the builder
self.__ref_queue = deque() # a queue of the references that need to be added
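For context, a minimal usage sketch of the new keyword argument on the read side; the file name, the directory, and the use of `hdmf.common.get_manager()` are illustrative placeholders, not part of this diff:

from hdmf.backends.hdf5 import HDF5IO
from hdmf.common import get_manager

# Open an existing HDMF HDF5 file and point the backend at a directory that
# holds ExternalResources saved as normalized TSV files ("data.h5" and
# "./er_dir/" are placeholder paths).
with HDF5IO("data.h5", manager=get_manager(), mode="r",
            external_resources_path="./er_dir/") as io:
    container = io.read()
    er = io.external_resources  # populated during read(); None if loading failed
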
24 changes: 21 additions & 3 deletions src/hdmf/backends/io.py
@@ -3,18 +3,21 @@
from pathlib import Path

from ..build import BuildManager, GroupBuilder
from ..container import Container
from ..container import Container, ExternalResourcesManager
from .errors import UnsupportedOperation
from ..utils import docval, getargs, popargs
from warnings import warn


class HDMFIO(metaclass=ABCMeta):
@docval({'name': 'manager', 'type': BuildManager,
'doc': 'the BuildManager to use for I/O', 'default': None},
{"name": "source", "type": (str, Path),
"doc": "the source of container being built i.e. file path", 'default': None})
"doc": "the source of container being built i.e. file path", 'default': None},
{'name': 'external_resources_path', 'type': str,
'doc': 'The path to the ExternalResources', 'default': None},)
def __init__(self, **kwargs):
manager, source = getargs('manager', 'source', kwargs)
manager, source, external_resources_path = getargs('manager', 'source', 'external_resources_path', kwargs)
if isinstance(source, Path):
source = source.resolve()
elif (isinstance(source, str) and
@@ -26,6 +29,8 @@ def __init__(self, **kwargs):
self.__manager = manager
self.__built = dict()
self.__source = source
self.external_resources_path = external_resources_path
self.external_resources = None
self.open()

@property
@@ -46,6 +51,19 @@ def read(self, **kwargs):
# TODO also check that the keys are appropriate. print a better error message
raise UnsupportedOperation('Cannot build data. There are no values.')
container = self.__manager.construct(f_builder)
if self.external_resources_path is not None:
from hdmf.common import ExternalResources
try:
self.external_resources = ExternalResources.from_norm_tsv(path=self.external_resources_path)
if isinstance(container, ExternalResourcesManager):
container.link_resources(external_resources=self.external_resources)
except FileNotFoundError:
msg = "File not found at {}. ExternalResources not added.".format(self.external_resources_path)
warn(msg)
except ValueError:
msg = "Check ExternalResources separately for alterations. ExternalResources not added."
warn(msg)

return container

@docval({'name': 'container', 'type': Container, 'doc': 'the Container object to write'},
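The directory given as `external_resources_path` is expected to contain the normalized TSV files produced by `ExternalResources.to_norm_tsv`. A sketch of that producer side, assuming a file-level container that mixes in `ExternalResourcesManager` (the same pattern the `FooFile` test helper adopts in the next file); the class name, key, and entity values are illustrative only:

from hdmf import Data
from hdmf.common import ExternalResources
from hdmf.container import Container, ExternalResourcesManager


class MyFile(Container, ExternalResourcesManager):
    """Hypothetical file-level container; FooFile plays this role in the tests."""


er = ExternalResources()
species = Data(name="species", data=["Homo sapiens", "Mus musculus"])
er.add_ref(file=MyFile(name="my_file"),
           container=species,
           key="Homo sapiens",
           entity_id="NCBI_TAXON:9606",
           entity_uri="https://www.ncbi.nlm.nih.gov/taxonomy/9606")

# Writes keys.tsv, entities.tsv, objects.tsv, etc. into the current directory;
# this is the layout that ExternalResources.from_norm_tsv() reads back in read() above.
er.to_norm_tsv(path="./")
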
4 changes: 2 additions & 2 deletions tests/unit/helpers/utils.py
@@ -3,7 +3,7 @@
from copy import copy, deepcopy

from hdmf.build import BuildManager, ObjectMapper, TypeMap
from hdmf.container import Container, Data
from hdmf.container import Container, ExternalResourcesManager, Data
from hdmf.spec import (
AttributeSpec,
DatasetSpec,
@@ -117,7 +117,7 @@ def remove_foo(self, foo_name):
return foo


class FooFile(Container):
class FooFile(Container, ExternalResourcesManager):
"""
NOTE: if the ROOT_NAME for the backend is not 'root' then we must set FooFile.ROOT_NAME before use
and should be reset to 'root' when use is finished to avoid potential cross-talk between tests.
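For reference, the two `ExternalResourcesManager` methods that `HDMFIO.read()` relies on, sketched with a hypothetical container class:

from hdmf.common import ExternalResources
from hdmf.container import Container, ExternalResourcesManager


class MyFile(Container, ExternalResourcesManager):
    pass


f = MyFile(name="my_file")
f.link_resources(external_resources=ExternalResources())  # called by read() when a path is given
assert isinstance(f.get_linked_resources(), ExternalResources)
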
97 changes: 96 additions & 1 deletion tests/unit/test_io_hdf5_h5tools.py
@@ -6,6 +6,8 @@
from pathlib import Path
import shutil
import tempfile
from glob import glob
import zipfile

import h5py
import numpy as np
@@ -18,11 +20,14 @@
from hdmf.backends.errors import UnsupportedOperation
from hdmf.build import GroupBuilder, DatasetBuilder, BuildManager, TypeMap, OrphanContainerBuildError, LinkBuilder
from hdmf.container import Container
from hdmf import Data
from hdmf.data_utils import DataChunkIterator, GenericDataChunkIterator, InvalidDataIOError
from hdmf.spec.catalog import SpecCatalog
from hdmf.spec.namespace import NamespaceCatalog, SpecNamespace
from hdmf.spec.spec import GroupSpec
from hdmf.testing import TestCase
from hdmf.testing import TestCase, remove_test_file
from hdmf.common.resources import ExternalResources


from tests.unit.helpers.utils import (Foo, FooBucket, FooFile, get_foo_buildmanager,
Baz, BazData, BazCpdData, BazBucket, get_baz_buildmanager,
@@ -925,6 +930,96 @@ def test_no_cache_spec(self):
self.assertNotIn('specifications', f)


class TestExternalResourcesIO(TestCase):

def setUp(self):
self.manager = get_foo_buildmanager()
self.path = get_temp_filepath()

foo1 = Foo('foo1', [1, 2, 3, 4, 5], "I am foo1", 17, 3.14)
foobucket = FooBucket('bucket1', [foo1])
self.foofile = FooFile(buckets=[foobucket])

with HDF5IO(self.path, manager=self.manager, mode='w') as io:
io.write(self.foofile)

def remove_er_files(self):
remove_test_file('./entities.tsv')
remove_test_file('./entity_keys.tsv')
remove_test_file('./objects.tsv')
remove_test_file('./object_keys.tsv')
remove_test_file('./keys.tsv')
remove_test_file('./files.tsv')
remove_test_file('./er.tsv')
remove_test_file('./er.zip')

def child_tsv(self, external_resources):
for child in external_resources.children:
df = child.to_dataframe()
df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)

def zip_child(self):
files = glob('*.tsv')
with zipfile.ZipFile('er.zip', 'w') as zipF:
for file in files:
zipF.write(file)

def test_io_read_external_resources(self):
er = ExternalResources()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=self.foofile,
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')

with HDF5IO(self.path, manager=self.manager, mode='r', external_resources_path='./') as io:
container = io.read()
self.assertIsInstance(io.external_resources, ExternalResources)
self.assertIsInstance(container.get_linked_resources(), ExternalResources)

self.remove_er_files()

def test_io_read_external_resources_file_warn(self):
er = ExternalResources()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=self.foofile,
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')

with HDF5IO(self.path, manager=self.manager, mode='r', external_resources_path='wrong_path') as io:
with self.assertWarns(Warning):
io.read()

self.remove_er_files()

def test_io_read_external_resources_value_warn(self):
er = ExternalResources()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=self.foofile,
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')

self.child_tsv(external_resources=er)

df = er.entities.to_dataframe()
df.at[0, ('keys_idx')] = 10 # Change keys_idx 0 to 10
df.to_csv('./entities.tsv', sep='\t', index=False)

self.zip_child()
with HDF5IO(self.path, manager=self.manager, mode='r', external_resources_path='./') as io:
with self.assertWarns(Warning):
io.read()

self.remove_er_files()

class TestMultiWrite(TestCase):

def setUp(self):