Expose AWS Region to HDF5IO (#1040)
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Ryan Ly <rly@lbl.gov>
3 people committed May 21, 2024
1 parent 6a0f9d8 commit c18d1b3
Showing 8 changed files with 159 additions and 17 deletions.
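In short, this commit adds an `aws_region` argument to `HDF5IO` (and the `get_hdf5io` convenience wrapper), which is forwarded to `h5py.File` when the `ros3` driver is in use. A minimal usage sketch, assembled from the new streaming tests added below:

```python
from hdmf.common import get_hdf5io

# S3 URL taken from the new tests in tests/unit/test_io_hdf5_streaming.py below.
s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

# aws_region is forwarded to h5py.File when driver="ros3".
with get_hdf5io(s3_path, "r", driver="ros3", aws_region="us-east-2") as io:
    container = io.read()
```

For plain `https://` S3 URLs the region is optional; the tests note that for `s3://` URIs with the ros3 driver, `aws_region` must be specified.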
8 changes: 4 additions & 4 deletions .github/workflows/run_all_tests.yml
@@ -197,7 +197,7 @@ jobs:
run: |
tox -e wheelinstall --installpkg dist/*.tar.gz
run-gallery-ros3-tests:
run-ros3-tests:
name: ${{ matrix.name }}
runs-on: ${{ matrix.os }}
defaults:
@@ -210,9 +210,9 @@
fail-fast: false
matrix:
include:
- { name: linux-gallery-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
- { name: windows-gallery-python3.12-ros3 , python-ver: "3.12", os: windows-latest }
- { name: macos-gallery-python3.12-ros3 , python-ver: "3.12", os: macos-latest }
- { name: linux-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
- { name: windows-python3.12-ros3 , python-ver: "3.12", os: windows-latest }
- { name: macos-python3.12-ros3 , python-ver: "3.12", os: macos-latest }
steps:
- name: Checkout repo with submodules
uses: actions/checkout@v4
55 changes: 55 additions & 0 deletions .github/workflows/run_coverage.yml
@@ -70,3 +70,58 @@ jobs:
file: ./coverage.xml
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

run-ros3-coverage:
name: ${{ matrix.name }}
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash -l {0} # necessary for conda
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.name }}
cancel-in-progress: true
strategy:
fail-fast: false
matrix:
include:
- { name: linux-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
steps:
- name: Checkout repo with submodules
uses: actions/checkout@v4
with:
submodules: 'recursive'
fetch-depth: 0 # tags are required to determine the version

- name: Set up Conda
uses: conda-incubator/setup-miniconda@v3
with:
auto-update-conda: true
activate-environment: ros3
environment-file: environment-ros3.yml
python-version: ${{ matrix.python-ver }}
channels: conda-forge
auto-activate-base: false
mamba-version: "*"

- name: Install run dependencies
run: |
pip install .
pip list
- name: Conda reporting
run: |
conda info
conda config --show-sources
conda list --show-channel-urls
- name: Run ros3 tests # TODO include gallery tests after they are written
run: |
pytest --cov --cov-report=xml --cov-report=term tests/unit/test_io_hdf5_streaming.py
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
fail_ci_if_error: true
file: ./coverage.xml
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
4 changes: 2 additions & 2 deletions .github/workflows/run_tests.yml
@@ -209,7 +209,7 @@ jobs:
--token ${{ secrets.BOT_GITHUB_TOKEN }} \
--re-upload
run-gallery-ros3-tests:
run-ros3-tests:
name: ${{ matrix.name }}
runs-on: ${{ matrix.os }}
defaults:
@@ -222,7 +222,7 @@
fail-fast: false
matrix:
include:
- { name: linux-gallery-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
- { name: linux-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
steps:
- name: Checkout repo with submodules
uses: actions/checkout@v4
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -8,6 +8,7 @@
- Updated `TermSetWrapper` to support validating a single field within a compound array. @mavaylon1 [#1061](https://github.com/hdmf-dev/hdmf/pull/1061)
- Updated testing to not install in editable mode and not run `coverage` by default. @rly [#1107](https://github.com/hdmf-dev/hdmf/pull/1107)
- Add `post_init_method` parameter when generating classes to perform post-init functionality, i.e., validation. @mavaylon1 [#1089](https://github.com/hdmf-dev/hdmf/pull/1089)
- Exposed `aws_region` to `HDF5IO` and downstream passes to `h5py.File`. @codycbakerphd [#1040](https://github.com/hdmf-dev/hdmf/pull/1040)
- Exposed `progress_bar_class` to the `GenericDataChunkIterator` for more custom control over display of progress while iterating. @codycbakerphd [#1110](https://github.com/hdmf-dev/hdmf/pull/1110)
- Updated loading, unloading, and getting the `TypeConfigurator` to support a `TypeMap` parameter. @mavaylon1 [#1117](https://github.com/hdmf-dev/hdmf/pull/1117)

38 changes: 30 additions & 8 deletions src/hdmf/backends/hdf5/h5tools.py
@@ -62,15 +62,21 @@ def can_read(path):
{'name': 'file', 'type': [File, "S3File", "RemFile"],
'doc': 'a pre-existing h5py.File, S3File, or RemFile object', 'default': None},
{'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
{
'name': 'aws_region',
'type': str,
'doc': 'If driver is ros3, then specify the aws region of the url.',
'default': None
},
{'name': 'herd_path', 'type': str,
'doc': 'The path to read/write the HERD file', 'default': None},)
def __init__(self, **kwargs):
"""Open an HDF5 file for IO.
"""
self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__))
path, manager, mode, comm, file_obj, driver, herd_path = popargs('path', 'manager', 'mode',
path, manager, mode, comm, file_obj, driver, aws_region, herd_path = popargs('path', 'manager', 'mode',
'comm', 'file', 'driver',
'herd_path',
'aws_region', 'herd_path',
kwargs)

self.__open_links = [] # keep track of other files opened from links in this file
@@ -91,6 +97,7 @@ def __init__(self, **kwargs):
elif isinstance(manager, TypeMap):
manager = BuildManager(manager)
self.__driver = driver
self.__aws_region = aws_region
self.__comm = comm
self.__mode = mode
self.__file = file_obj
@@ -116,6 +123,10 @@ def _file(self):
def driver(self):
return self.__driver

@property
def aws_region(self):
return self.__aws_region

@classmethod
def __check_path_file_obj(cls, path, file_obj):
if isinstance(path, Path):
@@ -133,13 +144,17 @@ def __check_path_file_obj(cls, path, file_obj):
return path

@classmethod
def __resolve_file_obj(cls, path, file_obj, driver):
def __resolve_file_obj(cls, path, file_obj, driver, aws_region=None):
"""Helper function to return a File when loading or getting namespaces from a file."""
path = cls.__check_path_file_obj(path, file_obj)

if file_obj is None:
file_kwargs = dict()
if driver is not None:
file_kwargs.update(driver=driver)

if aws_region is not None:
file_kwargs.update(aws_region=bytes(aws_region, "ascii"))
file_obj = File(path, 'r', **file_kwargs)
return file_obj

@@ -150,6 +165,8 @@ def __resolve_file_obj(cls, path, file_obj, driver):
{'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None},
{'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
{'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
{'name': 'aws_region', 'type': str, 'doc': 'If driver is ros3, then specify the aws region of the url.',
'default': None},
returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and "
"the included data types"),
rtype=dict)
@@ -162,10 +179,10 @@ def load_namespaces(cls, **kwargs):
:raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
"""
namespace_catalog, path, namespaces, file_obj, driver = popargs(
'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs)
namespace_catalog, path, namespaces, file_obj, driver, aws_region = popargs(
'namespace_catalog', 'path', 'namespaces', 'file', 'driver', 'aws_region', kwargs)

open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
open_file_obj = cls.__resolve_file_obj(path, file_obj, driver, aws_region=aws_region)
if file_obj is None: # need to close the file object that we just opened
with open_file_obj:
return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj)
@@ -214,6 +231,8 @@ def __check_specloc(cls, file_obj):
@docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None},
{'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None},
{'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
{'name': 'aws_region', 'type': str, 'doc': 'If driver is ros3, then specify the aws region of the url.',
'default': None},
returns="dict mapping names to versions of the namespaces in the file", rtype=dict)
def get_namespaces(cls, **kwargs):
"""Get the names and versions of the cached namespaces from a file.
@@ -227,9 +246,9 @@ def get_namespaces(cls, **kwargs):
:raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`.
"""
path, file_obj, driver = popargs('path', 'file', 'driver', kwargs)
path, file_obj, driver, aws_region = popargs('path', 'file', 'driver', 'aws_region', kwargs)

open_file_obj = cls.__resolve_file_obj(path, file_obj, driver)
open_file_obj = cls.__resolve_file_obj(path, file_obj, driver, aws_region=aws_region)
if file_obj is None: # need to close the file object that we just opened
with open_file_obj:
return cls.__get_namespaces(open_file_obj)
@@ -756,6 +775,9 @@ def open(self):
if self.driver is not None:
kwargs.update(driver=self.driver)

if self.driver == "ros3" and self.aws_region is not None:
kwargs.update(aws_region=bytes(self.aws_region, "ascii"))

self.__file = File(self.source, open_flag, **kwargs)

def close(self, close_links=True):
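For reference, `HDF5IO.open()` now performs the equivalent of the following direct `h5py` call. The ros3 file driver expects `aws_region` as bytes, which is why the code above encodes the string with `bytes(self.aws_region, "ascii")`. A sketch, assuming an h5py build with the ros3 driver enabled:

```python
import h5py

# What HDF5IO.open() does under the hood when driver="ros3" and aws_region is set:
# the region string is encoded to ASCII bytes before being handed to h5py.File.
f = h5py.File(
    "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991",
    mode="r",
    driver="ros3",
    aws_region=b"us-east-2",
)
```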
56 changes: 56 additions & 0 deletions tests/unit/test_io_hdf5_streaming.py
@@ -2,14 +2,17 @@
import os
import urllib.request
import h5py
import warnings

from hdmf.backends.hdf5.h5tools import HDF5IO
from hdmf.build import TypeMap, BuildManager
from hdmf.common import get_hdf5io, get_type_map
from hdmf.spec import GroupSpec, DatasetSpec, SpecNamespace, NamespaceBuilder, NamespaceCatalog
from hdmf.testing import TestCase
from hdmf.utils import docval, get_docval



class TestRos3(TestCase):
"""Test reading an HDMF file using HDF5 ROS3 streaming.
@@ -77,6 +80,8 @@ def setUp(self):

self.manager = BuildManager(type_map)

warnings.filterwarnings(action="ignore", message="Ignoring cached namespace .*")

def tearDown(self):
if os.path.exists(self.ns_filename):
os.remove(self.ns_filename)
@@ -89,6 +94,57 @@ def test_basic_read(self):
with get_hdf5io(s3_path, "r", manager=self.manager, driver="ros3") as io:
io.read()

def test_basic_read_with_aws_region(self):
s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

with get_hdf5io(s3_path, "r", manager=self.manager, driver="ros3", aws_region="us-east-2") as io:
io.read()

def test_basic_read_s3_with_aws_region(self):
# NOTE: if an s3 path is used with ros3 driver, aws_region must be specified
s3_path = "s3://dandiarchive/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

with get_hdf5io(s3_path, "r", manager=self.manager, driver="ros3", aws_region="us-east-2") as io:
io.read()
assert io.aws_region == "us-east-2"

def test_get_namespaces(self):
s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

namespaces = HDF5IO.get_namespaces(s3_path, driver="ros3")
self.assertEqual(namespaces, {'core': '2.3.0', 'hdmf-common': '1.5.0', 'hdmf-experimental': '0.1.0'})

def test_get_namespaces_with_aws_region(self):
s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

namespaces = HDF5IO.get_namespaces(s3_path, driver="ros3", aws_region="us-east-2")
self.assertEqual(namespaces, {'core': '2.3.0', 'hdmf-common': '1.5.0', 'hdmf-experimental': '0.1.0'})

def test_get_namespaces_s3_with_aws_region(self):
s3_path = "s3://dandiarchive/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

namespaces = HDF5IO.get_namespaces(s3_path, driver="ros3", aws_region="us-east-2")
self.assertEqual(namespaces, {'core': '2.3.0', 'hdmf-common': '1.5.0', 'hdmf-experimental': '0.1.0'})

def test_load_namespaces(self):
s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

HDF5IO.load_namespaces(self.manager.namespace_catalog, path=s3_path, driver="ros3")
assert set(self.manager.namespace_catalog.namespaces) == set(["core", "hdmf-common", "hdmf-experimental"])

def test_load_namespaces_with_aws_region(self):
s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

HDF5IO.load_namespaces(self.manager.namespace_catalog, path=s3_path, driver="ros3", aws_region="us-east-2")
assert set(self.manager.namespace_catalog.namespaces) == set(["core", "hdmf-common", "hdmf-experimental"])

def test_load_namespaces_s3_with_aws_region(self):
s3_path = "s3://dandiarchive/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

HDF5IO.load_namespaces(self.manager.namespace_catalog, path=s3_path, driver="ros3", aws_region="us-east-2")
assert set(self.manager.namespace_catalog.namespaces) == set(["core", "hdmf-common", "hdmf-experimental"])


# Util functions and classes to enable loading of the NWB namespace -- see pynwb/src/pynwb/spec.py


13 changes: 11 additions & 2 deletions tests/unit/utils_test/test_core_DataIO.py
@@ -4,6 +4,7 @@
from hdmf.container import Data
from hdmf.data_utils import DataIO
from hdmf.testing import TestCase
import warnings


class DataIOTests(TestCase):
@@ -36,7 +37,9 @@ def test_set_dataio(self):
dataio = DataIO()
data = np.arange(30).reshape(5, 2, 3)
container = Data('wrapped_data', data)
container.set_dataio(dataio)
msg = "Data.set_dataio() is deprecated. Please use Data.set_data_io() instead."
with self.assertWarnsWith(DeprecationWarning, msg):
container.set_dataio(dataio)
self.assertIs(dataio.data, data)
self.assertIs(dataio, container.data)

@@ -48,7 +51,13 @@ def test_set_dataio_data_already_set(self):
data = np.arange(30).reshape(5, 2, 3)
container = Data('wrapped_data', data)
with self.assertRaisesWith(ValueError, "cannot overwrite 'data' on DataIO"):
container.set_dataio(dataio)
with warnings.catch_warnings(record=True):
warnings.filterwarnings(
action='ignore',
category=DeprecationWarning,
message="Data.set_dataio() is deprecated. Please use Data.set_data_io() instead.",
)
container.set_dataio(dataio)

def test_dataio_options(self):
"""
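The tests above silence the `DeprecationWarning` now raised by `Data.set_dataio()`; its message points to `Data.set_data_io()` as the replacement. A hedged sketch of the newer call, assuming it takes the `DataIO` class and a kwargs dict as in recent hdmf releases:

```python
import numpy as np
from hdmf.container import Data
from hdmf.data_utils import DataIO

data = np.arange(30).reshape(5, 2, 3)
container = Data("wrapped_data", data)

# Assumed replacement for the deprecated container.set_dataio(DataIO()):
# set_data_io constructs the DataIO wrapper around the existing data itself.
container.set_data_io(DataIO, data_io_kwargs=dict())
```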
@@ -410,7 +410,6 @@ def test_progress_bar(self):

@unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed")
def test_progress_bar_class(self):
import tqdm

class MyCustomProgressBar(tqdm.tqdm):
def update(self, n: int = 1) -> Union[bool, None]:
