From c18d1b3d108ee8798c977a1c994fc528b4fa05b0 Mon Sep 17 00:00:00 2001 From: Cody Baker <51133164+CodyCBakerPhD@users.noreply.github.com> Date: Mon, 20 May 2024 22:09:54 -0400 Subject: [PATCH] Expose AWS Region to HDF5IO (#1040) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Ryan Ly --- .github/workflows/run_all_tests.yml | 8 +-- .github/workflows/run_coverage.yml | 55 ++++++++++++++++++ .github/workflows/run_tests.yml | 4 +- CHANGELOG.md | 1 + src/hdmf/backends/hdf5/h5tools.py | 38 ++++++++++--- tests/unit/test_io_hdf5_streaming.py | 56 +++++++++++++++++++ tests/unit/utils_test/test_core_DataIO.py | 13 ++++- .../test_core_GenericDataChunkIterator.py | 1 - 8 files changed, 159 insertions(+), 17 deletions(-) diff --git a/.github/workflows/run_all_tests.yml b/.github/workflows/run_all_tests.yml index def51537f..8df190d55 100644 --- a/.github/workflows/run_all_tests.yml +++ b/.github/workflows/run_all_tests.yml @@ -197,7 +197,7 @@ jobs: run: | tox -e wheelinstall --installpkg dist/*.tar.gz - run-gallery-ros3-tests: + run-ros3-tests: name: ${{ matrix.name }} runs-on: ${{ matrix.os }} defaults: @@ -210,9 +210,9 @@ jobs: fail-fast: false matrix: include: - - { name: linux-gallery-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest } - - { name: windows-gallery-python3.12-ros3 , python-ver: "3.12", os: windows-latest } - - { name: macos-gallery-python3.12-ros3 , python-ver: "3.12", os: macos-latest } + - { name: linux-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest } + - { name: windows-python3.12-ros3 , python-ver: "3.12", os: windows-latest } + - { name: macos-python3.12-ros3 , python-ver: "3.12", os: macos-latest } steps: - name: Checkout repo with submodules uses: actions/checkout@v4 diff --git a/.github/workflows/run_coverage.yml b/.github/workflows/run_coverage.yml index a72a05e73..bd2eeb921 100644 --- a/.github/workflows/run_coverage.yml +++ b/.github/workflows/run_coverage.yml @@ -70,3 +70,58 @@ jobs: file: ./coverage.xml env: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} + + run-ros3-coverage: + name: ${{ matrix.name }} + runs-on: ${{ matrix.os }} + defaults: + run: + shell: bash -l {0} # necessary for conda + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.name }} + cancel-in-progress: true + strategy: + fail-fast: false + matrix: + include: + - { name: linux-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest } + steps: + - name: Checkout repo with submodules + uses: actions/checkout@v4 + with: + submodules: 'recursive' + fetch-depth: 0 # tags are required to determine the version + + - name: Set up Conda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-update-conda: true + activate-environment: ros3 + environment-file: environment-ros3.yml + python-version: ${{ matrix.python-ver }} + channels: conda-forge + auto-activate-base: false + mamba-version: "*" + + - name: Install run dependencies + run: | + pip install . 
+          pip list
+
+      - name: Conda reporting
+        run: |
+          conda info
+          conda config --show-sources
+          conda list --show-channel-urls
+
+      - name: Run ros3 tests  # TODO include gallery tests after they are written
+        run: |
+          pytest --cov --cov-report=xml --cov-report=term tests/unit/test_io_hdf5_streaming.py
+
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v4
+        with:
+          fail_ci_if_error: true
+          file: ./coverage.xml
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 2723e03d0..5e0b3bff2 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -209,7 +209,7 @@ jobs:
           --token ${{ secrets.BOT_GITHUB_TOKEN }} \
           --re-upload

-  run-gallery-ros3-tests:
+  run-ros3-tests:
     name: ${{ matrix.name }}
     runs-on: ${{ matrix.os }}
     defaults:
@@ -222,7 +222,7 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { name: linux-gallery-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
+          - { name: linux-python3.12-ros3 , python-ver: "3.12", os: ubuntu-latest }
     steps:
       - name: Checkout repo with submodules
         uses: actions/checkout@v4
diff --git a/CHANGELOG.md b/CHANGELOG.md
index bf2134756..8a1d0b2c5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,7 @@
 - Updated `TermSetWrapper` to support validating a single field within a compound array. @mavaylon1 [#1061](https://github.com/hdmf-dev/hdmf/pull/1061)
 - Updated testing to not install in editable mode and not run `coverage` by default. @rly [#1107](https://github.com/hdmf-dev/hdmf/pull/1107)
 - Add `post_init_method` parameter when generating classes to perform post-init functionality, i.e., validation. @mavaylon1 [#1089](https://github.com/hdmf-dev/hdmf/pull/1089)
+- Exposed `aws_region` to `HDF5IO` so that it can be passed downstream to `h5py.File`. @codycbakerphd [#1040](https://github.com/hdmf-dev/hdmf/pull/1040)
 - Exposed `progress_bar_class` to the `GenericDataChunkIterator` for more custom control over display of progress while iterating. @codycbakerphd [#1110](https://github.com/hdmf-dev/hdmf/pull/1110)
 - Updated loading, unloading, and getting the `TypeConfigurator` to support a `TypeMap` parameter. @mavaylon1 [#1117](https://github.com/hdmf-dev/hdmf/pull/1117)
diff --git a/src/hdmf/backends/hdf5/h5tools.py b/src/hdmf/backends/hdf5/h5tools.py
index 05ce36e13..0604881bb 100644
--- a/src/hdmf/backends/hdf5/h5tools.py
+++ b/src/hdmf/backends/hdf5/h5tools.py
@@ -62,15 +62,21 @@ def can_read(path):
             {'name': 'file', 'type': [File, "S3File", "RemFile"],
              'doc': 'a pre-existing h5py.File, S3File, or RemFile object', 'default': None},
             {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None},
+            {
+                'name': 'aws_region',
+                'type': str,
+                'doc': 'If driver is ros3, then specify the aws region of the url.',
+                'default': None
+            },
             {'name': 'herd_path', 'type': str, 'doc': 'The path to read/write the HERD file', 'default': None},)
     def __init__(self, **kwargs):
         """Open an HDF5 file for IO.
""" self.logger = logging.getLogger('%s.%s' % (self.__class__.__module__, self.__class__.__qualname__)) - path, manager, mode, comm, file_obj, driver, herd_path = popargs('path', 'manager', 'mode', + path, manager, mode, comm, file_obj, driver, aws_region, herd_path = popargs('path', 'manager', 'mode', 'comm', 'file', 'driver', - 'herd_path', + 'aws_region', 'herd_path', kwargs) self.__open_links = [] # keep track of other files opened from links in this file @@ -91,6 +97,7 @@ def __init__(self, **kwargs): elif isinstance(manager, TypeMap): manager = BuildManager(manager) self.__driver = driver + self.__aws_region = aws_region self.__comm = comm self.__mode = mode self.__file = file_obj @@ -116,6 +123,10 @@ def _file(self): def driver(self): return self.__driver + @property + def aws_region(self): + return self.__aws_region + @classmethod def __check_path_file_obj(cls, path, file_obj): if isinstance(path, Path): @@ -133,13 +144,17 @@ def __check_path_file_obj(cls, path, file_obj): return path @classmethod - def __resolve_file_obj(cls, path, file_obj, driver): + def __resolve_file_obj(cls, path, file_obj, driver, aws_region=None): + """Helper function to return a File when loading or getting namespaces from a file.""" path = cls.__check_path_file_obj(path, file_obj) if file_obj is None: file_kwargs = dict() if driver is not None: file_kwargs.update(driver=driver) + + if aws_region is not None: + file_kwargs.update(aws_region=bytes(aws_region, "ascii")) file_obj = File(path, 'r', **file_kwargs) return file_obj @@ -150,6 +165,8 @@ def __resolve_file_obj(cls, path, file_obj, driver): {'name': 'namespaces', 'type': list, 'doc': 'the namespaces to load', 'default': None}, {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, + {'name': 'aws_region', 'type': str, 'doc': 'If driver is ros3, then specify the aws region of the url.', + 'default': None}, returns=("dict mapping the names of the loaded namespaces to a dict mapping included namespace names and " "the included data types"), rtype=dict) @@ -162,10 +179,10 @@ def load_namespaces(cls, **kwargs): :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. 
""" - namespace_catalog, path, namespaces, file_obj, driver = popargs( - 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', kwargs) + namespace_catalog, path, namespaces, file_obj, driver, aws_region = popargs( + 'namespace_catalog', 'path', 'namespaces', 'file', 'driver', 'aws_region', kwargs) - open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) + open_file_obj = cls.__resolve_file_obj(path, file_obj, driver, aws_region=aws_region) if file_obj is None: # need to close the file object that we just opened with open_file_obj: return cls.__load_namespaces(namespace_catalog, namespaces, open_file_obj) @@ -214,6 +231,8 @@ def __check_specloc(cls, file_obj): @docval({'name': 'path', 'type': (str, Path), 'doc': 'the path to the HDF5 file', 'default': None}, {'name': 'file', 'type': File, 'doc': 'a pre-existing h5py.File object', 'default': None}, {'name': 'driver', 'type': str, 'doc': 'driver for h5py to use when opening HDF5 file', 'default': None}, + {'name': 'aws_region', 'type': str, 'doc': 'If driver is ros3, then specify the aws region of the url.', + 'default': None}, returns="dict mapping names to versions of the namespaces in the file", rtype=dict) def get_namespaces(cls, **kwargs): """Get the names and versions of the cached namespaces from a file. @@ -227,9 +246,9 @@ def get_namespaces(cls, **kwargs): :raises ValueError: if both `path` and `file` are supplied but `path` is not the same as the path of `file`. """ - path, file_obj, driver = popargs('path', 'file', 'driver', kwargs) + path, file_obj, driver, aws_region = popargs('path', 'file', 'driver', 'aws_region', kwargs) - open_file_obj = cls.__resolve_file_obj(path, file_obj, driver) + open_file_obj = cls.__resolve_file_obj(path, file_obj, driver, aws_region=aws_region) if file_obj is None: # need to close the file object that we just opened with open_file_obj: return cls.__get_namespaces(open_file_obj) @@ -756,6 +775,9 @@ def open(self): if self.driver is not None: kwargs.update(driver=self.driver) + if self.driver == "ros3" and self.aws_region is not None: + kwargs.update(aws_region=bytes(self.aws_region, "ascii")) + self.__file = File(self.source, open_flag, **kwargs) def close(self, close_links=True): diff --git a/tests/unit/test_io_hdf5_streaming.py b/tests/unit/test_io_hdf5_streaming.py index d1c9d1ab3..d82c9c5c3 100644 --- a/tests/unit/test_io_hdf5_streaming.py +++ b/tests/unit/test_io_hdf5_streaming.py @@ -2,7 +2,9 @@ import os import urllib.request import h5py +import warnings +from hdmf.backends.hdf5.h5tools import HDF5IO from hdmf.build import TypeMap, BuildManager from hdmf.common import get_hdf5io, get_type_map from hdmf.spec import GroupSpec, DatasetSpec, SpecNamespace, NamespaceBuilder, NamespaceCatalog @@ -10,6 +12,7 @@ from hdmf.utils import docval, get_docval + class TestRos3(TestCase): """Test reading an HDMF file using HDF5 ROS3 streaming. 
@@ -77,6 +80,8 @@ def setUp(self):

         self.manager = BuildManager(type_map)

+        warnings.filterwarnings(action="ignore", message="Ignoring cached namespace .*")
+
     def tearDown(self):
         if os.path.exists(self.ns_filename):
             os.remove(self.ns_filename)
@@ -89,6 +94,57 @@ def test_basic_read(self):
         with get_hdf5io(s3_path, "r", manager=self.manager, driver="ros3") as io:
             io.read()

+    def test_basic_read_with_aws_region(self):
+        s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        with get_hdf5io(s3_path, "r", manager=self.manager, driver="ros3", aws_region="us-east-2") as io:
+            io.read()
+
+    def test_basic_read_s3_with_aws_region(self):
+        # NOTE: if an s3 path is used with ros3 driver, aws_region must be specified
+        s3_path = "s3://dandiarchive/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        with get_hdf5io(s3_path, "r", manager=self.manager, driver="ros3", aws_region="us-east-2") as io:
+            io.read()
+            assert io.aws_region == "us-east-2"
+
+    def test_get_namespaces(self):
+        s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        namespaces = HDF5IO.get_namespaces(s3_path, driver="ros3")
+        self.assertEqual(namespaces, {'core': '2.3.0', 'hdmf-common': '1.5.0', 'hdmf-experimental': '0.1.0'})
+
+    def test_get_namespaces_with_aws_region(self):
+        s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        namespaces = HDF5IO.get_namespaces(s3_path, driver="ros3", aws_region="us-east-2")
+        self.assertEqual(namespaces, {'core': '2.3.0', 'hdmf-common': '1.5.0', 'hdmf-experimental': '0.1.0'})
+
+    def test_get_namespaces_s3_with_aws_region(self):
+        s3_path = "s3://dandiarchive/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        namespaces = HDF5IO.get_namespaces(s3_path, driver="ros3", aws_region="us-east-2")
+        self.assertEqual(namespaces, {'core': '2.3.0', 'hdmf-common': '1.5.0', 'hdmf-experimental': '0.1.0'})
+
+    def test_load_namespaces(self):
+        s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        HDF5IO.load_namespaces(self.manager.namespace_catalog, path=s3_path, driver="ros3")
+        assert set(self.manager.namespace_catalog.namespaces) == set(["core", "hdmf-common", "hdmf-experimental"])
+
+    def test_load_namespaces_with_aws_region(self):
+        s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        HDF5IO.load_namespaces(self.manager.namespace_catalog, path=s3_path, driver="ros3", aws_region="us-east-2")
+        assert set(self.manager.namespace_catalog.namespaces) == set(["core", "hdmf-common", "hdmf-experimental"])
+
+    def test_load_namespaces_s3_with_aws_region(self):
+        s3_path = "s3://dandiarchive/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"
+
+        HDF5IO.load_namespaces(self.manager.namespace_catalog, path=s3_path, driver="ros3", aws_region="us-east-2")
+        assert set(self.manager.namespace_catalog.namespaces) == set(["core", "hdmf-common", "hdmf-experimental"])
+
+
 # Util functions and classes to enable loading of the NWB namespace -- see pynwb/src/pynwb/spec.py
diff --git a/tests/unit/utils_test/test_core_DataIO.py b/tests/unit/utils_test/test_core_DataIO.py
index 778dd2617..4c2ffac15 100644
--- a/tests/unit/utils_test/test_core_DataIO.py
+++ b/tests/unit/utils_test/test_core_DataIO.py
@@ -4,6 +4,7 @@
 from hdmf.container import Data
 from hdmf.data_utils import DataIO
 from hdmf.testing import TestCase
+import warnings


 class DataIOTests(TestCase):

@@ -36,7 +37,9 @@ def test_set_dataio(self):
         dataio = DataIO()
         data = np.arange(30).reshape(5, 2, 3)
         container = Data('wrapped_data', data)
-        container.set_dataio(dataio)
+        msg = "Data.set_dataio() is deprecated. Please use Data.set_data_io() instead."
+        with self.assertWarnsWith(DeprecationWarning, msg):
+            container.set_dataio(dataio)
         self.assertIs(dataio.data, data)
         self.assertIs(dataio, container.data)

@@ -48,7 +51,13 @@ def test_set_dataio_data_already_set(self):
         data = np.arange(30).reshape(5, 2, 3)
         container = Data('wrapped_data', data)
         with self.assertRaisesWith(ValueError, "cannot overwrite 'data' on DataIO"):
-            container.set_dataio(dataio)
+            with warnings.catch_warnings(record=True):
+                warnings.filterwarnings(
+                    action='ignore',
+                    category=DeprecationWarning,
+                    message="Data.set_dataio() is deprecated. Please use Data.set_data_io() instead.",
+                )
+                container.set_dataio(dataio)

     def test_dataio_options(self):
         """
diff --git a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py
index 2117eb6d0..cb1a727a4 100644
--- a/tests/unit/utils_test/test_core_GenericDataChunkIterator.py
+++ b/tests/unit/utils_test/test_core_GenericDataChunkIterator.py
@@ -410,7 +410,6 @@ def test_progress_bar(self):

     @unittest.skipIf(not TQDM_INSTALLED, "optional tqdm module is not installed")
     def test_progress_bar_class(self):
-        import tqdm

         class MyCustomProgressBar(tqdm.tqdm):
             def update(self, n: int = 1) -> Union[bool, None]:
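
For reference, a minimal usage sketch of the new `aws_region` option (the URL and region are taken from the streaming tests above; an h5py build with ROS3 support and a publicly readable object are assumed):

    from hdmf.backends.hdf5.h5tools import HDF5IO
    from hdmf.common import get_hdf5io

    # Example DANDI asset, as used in the streaming tests above.
    s3_path = "https://dandiarchive.s3.amazonaws.com/blobs/11e/c89/11ec8933-1456-4942-922b-94e5878bb991"

    # List the cached namespaces without reading the whole file. aws_region is
    # optional for an https URL but must be specified for an s3:// URL.
    print(HDF5IO.get_namespaces(s3_path, driver="ros3", aws_region="us-east-2"))

    # Stream and read the file; the manager argument is omitted for brevity
    # (the tests above build a BuildManager from the file's cached namespaces).
    with get_hdf5io(s3_path, "r", driver="ros3", aws_region="us-east-2") as io:
        assert io.aws_region == "us-east-2"  # exposed as a read-only property
        container = io.read()

In HDF5IO.open(), the region is forwarded only when the driver is "ros3", encoded as ASCII bytes (bytes(aws_region, "ascii")) before being handed to h5py.File; get_namespaces() and load_namespaces() accept the same keyword.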