Skip to content

Commit

Permalink
Update HERD for user defined zip file (#941)
Browse files Browse the repository at this point in the history
  • Loading branch information
mavaylon1 committed Aug 15, 2023
1 parent 8376a6a commit 92915c2
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 67 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Increase default chunk size for `GenericDataChunkIterator` from 1 MB to 10 MB. @bendichter, @rly [#925](https://github.com/hdmf-dev/hdmf/pull/925)
- Added the magic `__reduce__` method as well as two private semi-abstract helper methods to enable pickling of the `GenericDataChunkIterator`. @codycbakerphd [#924](https://github.com/hdmf-dev/hdmf/pull/924)
- Added Dynamic Enumerations and Schemasheets support to `TermSet`. @mavaylon1 [#923](https://github.com/hdmf-dev/hdmf/pull/923)
- Updated `HERD` to support a user-defined file name for the `HERD` zip file. @mavaylon1 [#941](https://github.com/hdmf-dev/hdmf/pull/941)

## HDMF 3.8.1 (July 25, 2023)

Expand Down
12 changes: 6 additions & 6 deletions docs/gallery/plot_external_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -311,18 +311,18 @@ def __init__(self, **kwargs):
# ------------------------------------------------------
# :py:class:`~hdmf.common.resources.HERD` is written as a zip file that
# contains the individual tables, each saved as a tsv file.
# The user provides the path, which contains the name of the directory.
# The user provides the path to the zip file, including the file name.

er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

###############################################################################
# Read HERD
# ------------------------------------------------------
# Users can read :py:class:`~hdmf.common.resources.HERD` from the tsv format
# by providing the path to the directory.
# Users can read :py:class:`~hdmf.common.resources.HERD` from the zip file
# by providing the path to the file itself.

er_read = HERD.from_norm_tsv(path='./')
os.remove('./er.zip')
er_read = HERD.from_zip(path='./HERD.zip')
os.remove('./HERD.zip')

###############################################################################
# Using TermSet with HERD
Expand Down
4 changes: 2 additions & 2 deletions src/hdmf/backends/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def read(self, **kwargs):
if self.herd_path is not None:
from hdmf.common import HERD
try:
self.herd = HERD.from_norm_tsv(path=self.herd_path)
self.herd = HERD.from_zip(path=self.herd_path)
if isinstance(container, HERDManager):
container.link_resources(herd=self.herd)
except FileNotFoundError:
Expand All @@ -84,7 +84,7 @@ def write(self, **kwargs):
if self.herd_path is not None:
herd = container.get_linked_resources()
if herd is not None:
herd.to_norm_tsv(path=self.herd_path)
herd.to_zip(path=self.herd_path)
else:
msg = "Could not find linked HERD. Container was still written to IO source."
warn(msg)
Expand Down
31 changes: 18 additions & 13 deletions src/hdmf/common/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,19 +836,20 @@ def to_dataframe(self, **kwargs):
# return the result
return result_df

@docval({'name': 'path', 'type': str, 'doc': 'path of the folder tsv file to write'})
def to_norm_tsv(self, **kwargs):
@docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
def to_zip(self, **kwargs):
"""
Write the tables in HERD to individual tsv files.
Write the tables in HERD to zipped tsv files.
"""
path = kwargs['path']
files = [path+child.name+'.tsv' for child in self.children]
zip_file = kwargs['path']
directory = os.path.dirname(zip_file)

files = [os.path.join(directory, child.name)+'.tsv' for child in self.children]
for i in range(len(self.children)):
df = self.children[i].to_dataframe()
df.to_csv(files[i], sep='\t', index=False)

with zipfile.ZipFile('er.zip', 'w') as zipF:
with zipfile.ZipFile(zip_file, 'w') as zipF:
for file in files:
zipF.write(file)

Expand All @@ -857,13 +858,17 @@ def to_norm_tsv(self, **kwargs):
os.remove(file)

@classmethod
@docval({'name': 'path', 'type': str, 'doc': 'path of the folder containing the tsv files to read'},
returns="HERD loaded from TSV", rtype="HERD")
def from_norm_tsv(cls, **kwargs):
path = kwargs['path']
with zipfile.ZipFile(path+'/er.zip', 'r') as zip:
zip.extractall(path)
tsv_paths = glob(path+'/*')
@docval({'name': 'path', 'type': str, 'doc': 'The path to the zip file.'})
def from_zip(cls, **kwargs):
"""
Method to read in zipped tsv files to populate HERD.
"""
zip_file = kwargs['path']
directory = os.path.dirname(zip_file)

with zipfile.ZipFile(zip_file, 'r') as zip:
zip.extractall(directory)
tsv_paths = glob(directory+'/*')

for file in tsv_paths:
file_name = os.path.basename(file)
Expand Down
61 changes: 30 additions & 31 deletions tests/unit/common/test_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,16 @@ def remove_er_files(self):
remove_test_file('./object_keys.tsv')
remove_test_file('./keys.tsv')
remove_test_file('./files.tsv')
remove_test_file('./er.tsv')
remove_test_file('./er.zip')
remove_test_file('./HERD.zip')

def child_tsv(self, external_resources):
for child in external_resources.children:
df = child.to_dataframe()
df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)

def zip_child(self):
def zip_child(self, zip_file):
files = glob('*.tsv')
with zipfile.ZipFile('er.zip', 'w') as zipF:
with zipfile.ZipFile(zip_file, 'w') as zipF:
for file in files:
zipF.write(file)

Expand Down Expand Up @@ -590,159 +589,159 @@ def test_get_obj_entities_attribute(self):

pd.testing.assert_frame_equal(df, expected_df)

def test_to_and_from_norm_tsv(self):
def test_to_and_from_zip(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

er_read = HERD.from_norm_tsv(path='./')
er_read = HERD.from_zip(path='./HERD.zip')
HERD.assert_external_resources_equal(er_read, er, check_dtype=False)

self.remove_er_files()

def test_to_and_from_norm_tsv_entity_value_error(self):
def test_to_and_from_zip_entity_value_error(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(external_resources=er)

df = er.entities.to_dataframe()
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./entities.tsv', sep='\t', index=False)

self.zip_child()
self.zip_child(zip_file='HERD.zip')

with self.assertRaises(ValueError):
_ = HERD.from_norm_tsv(path='./')
_ = HERD.from_zip(path='./HERD.zip')

self.remove_er_files()

def test_to_and_from_norm_tsv_entity_key_value_error_key(self):
def test_to_and_from_zip_entity_key_value_error_key(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(external_resources=er)

df = er.entity_keys.to_dataframe()
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./entity_keys.tsv', sep='\t', index=False)

self.zip_child()
self.zip_child(zip_file='HERD.zip')

with self.assertRaises(ValueError):
_ = HERD.from_norm_tsv(path='./')
_ = HERD.from_zip(path='./HERD.zip')

self.remove_er_files()

def test_to_and_from_norm_tsv_entity_key_value_error_entity(self):
def test_to_and_from_zip_entity_key_value_error_entity(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(external_resources=er)

df = er.entity_keys.to_dataframe()
df.at[0, ('entities_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./entity_keys.tsv', sep='\t', index=False)

self.zip_child()
self.zip_child(zip_file='HERD.zip')

with self.assertRaises(ValueError):
_ = HERD.from_norm_tsv(path='./')
_ = HERD.from_zip(path='./HERD.zip')

self.remove_er_files()

def test_to_and_from_norm_tsv_object_value_error(self):
def test_to_and_from_zip_object_value_error(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(external_resources=er)

df = er.objects.to_dataframe()
df.at[0, ('files_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./objects.tsv', sep='\t', index=False)

self.zip_child()
self.zip_child(zip_file='HERD.zip')

msg = "File_ID Index out of range in ObjectTable. Please check for alterations."
with self.assertRaisesWith(ValueError, msg):
_ = HERD.from_norm_tsv(path='./')
_ = HERD.from_zip(path='./HERD.zip')

self.remove_er_files()

def test_to_and_from_norm_tsv_object_keys_object_idx_value_error(self):
def test_to_and_from_zip_object_keys_object_idx_value_error(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(external_resources=er)

df = er.object_keys.to_dataframe()
df.at[0, ('objects_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./object_keys.tsv', sep='\t', index=False)

self.zip_child()
self.zip_child(zip_file='HERD.zip')

msg = "Object Index out of range in ObjectKeyTable. Please check for alterations."
with self.assertRaisesWith(ValueError, msg):
_ = HERD.from_norm_tsv(path='./')
_ = HERD.from_zip(path='./HERD.zip')

self.remove_er_files()

def test_to_and_from_norm_tsv_object_keys_key_idx_value_error(self):
def test_to_and_from_zip_object_keys_key_idx_value_error(self):
er = HERD()
data = Data(name="species", data=['Homo sapiens', 'Mus musculus'])
er.add_ref(file=HERDManagerContainer(name='file'),
container=data,
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(external_resources=er)

df = er.object_keys.to_dataframe()
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./object_keys.tsv', sep='\t', index=False)

self.zip_child()
self.zip_child(zip_file='HERD.zip')

msg = "Key Index out of range in ObjectKeyTable. Please check for alterations."
with self.assertRaisesWith(ValueError, msg):
_ = HERD.from_norm_tsv(path='./')
_ = HERD.from_zip(path='./HERD.zip')

self.remove_er_files()

Expand Down
27 changes: 12 additions & 15 deletions tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -950,17 +950,16 @@ def remove_er_files(self):
remove_test_file('./object_keys.tsv')
remove_test_file('./keys.tsv')
remove_test_file('./files.tsv')
remove_test_file('./er.tsv')
remove_test_file('./er.zip')
remove_test_file('./HERD.zip')

def child_tsv(self, herd):
for child in herd.children:
df = child.to_dataframe()
df.to_csv('./'+child.name+'.tsv', sep='\t', index=False)

def zip_child(self):
def zip_child(self, zip_file):
files = glob('*.tsv')
with zipfile.ZipFile('er.zip', 'w') as zipF:
with zipfile.ZipFile(zip_file, 'w') as zipF:
for file in files:
zipF.write(file)

Expand All @@ -972,13 +971,11 @@ def test_io_read_herd(self):
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')

with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
er.to_zip(path='./HERD.zip')
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
container = io.read()
self.assertIsInstance(io.herd, HERD)
self.assertIsInstance(container.get_linked_resources(), HERD)

self.remove_er_files()

def test_io_read_herd_file_warn(self):
Expand All @@ -989,7 +986,7 @@ def test_io_read_herd_file_warn(self):
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='wrong_path') as io:
with self.assertWarns(Warning):
Expand All @@ -1005,16 +1002,16 @@ def test_io_read_herd_value_warn(self):
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
er.to_norm_tsv(path='./')
er.to_zip(path='./HERD.zip')

self.child_tsv(herd=er)

df = er.entities.to_dataframe()
df.at[0, ('keys_idx')] = 10 # Change key_ix 0 to 10
df.to_csv('./entities.tsv', sep='\t', index=False)

self.zip_child()
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
self.zip_child(zip_file='HERD.zip')
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
with self.assertWarns(Warning):
io.read()

Expand All @@ -1031,10 +1028,10 @@ def test_io_write_herd(self):
entity_id='entity_id1',
entity_uri='entity1')

with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./') as io:
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./HERD.zip') as io:
io.write(self.foofile)

with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./') as io:
with HDF5IO(self.path, manager=self.manager, mode='r', herd_path='./HERD.zip') as io:
container = io.read()
self.assertIsInstance(io.herd, HERD)
self.assertIsInstance(container.get_linked_resources(), HERD)
Expand All @@ -1050,7 +1047,7 @@ def test_io_warn(self):
key='key1',
entity_id='entity_id1',
entity_uri='entity1')
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./') as io:
with HDF5IO(self.path, manager=self.manager, mode='w', herd_path='./HERD.zip') as io:
with self.assertWarns(Warning):
io.write(self.foofile)

Expand Down

0 comments on commit 92915c2

Please sign in to comment.