From 7ccfa3ef9ff79014b55a38dd5533e3dc61dab682 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Mon, 23 Nov 2020 10:29:06 -0800 Subject: [PATCH 1/7] add can_be_local logic --- intake_xarray/netcdf.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/intake_xarray/netcdf.py b/intake_xarray/netcdf.py index 651bf5b..d59b7a3 100644 --- a/intake_xarray/netcdf.py +++ b/intake_xarray/netcdf.py @@ -54,6 +54,10 @@ def __init__(self, urlpath, chunks=None, combine=None, concat_dim=None, self.storage_options = storage_options or {} self.xarray_kwargs = xarray_kwargs or {} self._ds = None + if isinstance(self.urlpath, list): + self._can_be_local = fsspec.utils.can_be_local(self.urlpath[0]) + else: + self._can_be_local = fsspec.utils.can_be_local(self.urlpath) super(NetCDFSource, self).__init__(metadata=metadata, **kwargs) def _open_dataset(self): @@ -76,7 +80,11 @@ def _open_dataset(self): kwargs.update(concat_dim=self.concat_dim) else: _open_dataset = xr.open_dataset - url = fsspec.open_local(url, **self.storage_options) + + if self._can_be_local: + url = fsspec.open_local(self.urlpath, **self.storage_options) + else: + url = fsspec.open(self.urlpath, **self.storage_options) self._ds = _open_dataset(url, chunks=self.chunks, **kwargs) From 592855e579289757123722484c9be0a90e23acd6 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Mon, 23 Nov 2020 10:51:23 -0800 Subject: [PATCH 2/7] add basic remote read test --- intake_xarray/tests/test_remote.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/intake_xarray/tests/test_remote.py b/intake_xarray/tests/test_remote.py index a337535..885e1cf 100644 --- a/intake_xarray/tests/test_remote.py +++ b/intake_xarray/tests/test_remote.py @@ -50,6 +50,11 @@ def test_open_rasterio(data_server): da = source.to_dask() assert isinstance(da, xr.core.dataarray.DataArray) +def test_open_netcdf(data_server): + url = f'{data_server}/example_1.nc' + source = intake.open_netcdf(url, chunks={}) + da = source.to_dask() + assert isinstance(da, xr.core.dataarray.DataSet) # Remote catalogs with intake-server @pytest.fixture(scope='module') From c59f25b756489839f7feae0d18c875d25ad6d457 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Mon, 23 Nov 2020 18:14:42 -0800 Subject: [PATCH 3/7] add a bunch of tests --- .github/workflows/main.yaml | 6 +- ci/environment-network.yml | 19 ++++ ci/environment-py36.yml | 1 + ci/environment-py37.yml | 1 + ci/environment-py38.yml | 1 + ci/environment-py39.yml | 18 ++++ ...-upstream.yml => environment-upstream.yml} | 5 +- intake_xarray/netcdf.py | 6 +- intake_xarray/raster.py | 15 ++- intake_xarray/tests/test_network.py | 66 +++++++++++++ intake_xarray/tests/test_remote.py | 92 ++++++++++++++++++- 11 files changed, 215 insertions(+), 15 deletions(-) create mode 100644 ci/environment-network.yml create mode 100644 ci/environment-py39.yml rename ci/{environment-py37-upstream.yml => environment-upstream.yml} (79%) create mode 100644 intake_xarray/tests/test_network.py diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 75bc953..0850888 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -13,13 +13,13 @@ jobs: strategy: fail-fast: false matrix: - CONDA_ENV: [36, 37, 38, 37-upstream] + CONDA_ENV: [36, 37, 38, 39, upstream] steps: - name: Checkout uses: actions/checkout@v2 - name: Setup Miniconda - uses: conda-incubator/setup-miniconda@v1 + uses: conda-incubator/setup-miniconda@v2 with: auto-update-conda: true auto-activate-base: false @@ -35,4 +35,4 @@ jobs: - name: Run Tests shell: bash -l {0} run: | - pytest --verbose + pytest --verbose --ignore=intake_xarray/tests/test_network.py diff --git a/ci/environment-network.yml b/ci/environment-network.yml new file mode 100644 index 0000000..105c936 --- /dev/null +++ b/ci/environment-network.yml @@ -0,0 +1,19 @@ +name: test_env +channels: + - conda-forge +dependencies: + - python + - aiohttp + - fsspec + - gcsfs + - h5netcdf + - intake + - netcdf4 + - pip + - pydap + - pytest + - rasterio + - s3fs + - scikit-image + - xarray + - zarr diff --git a/ci/environment-py36.yml b/ci/environment-py36.yml index 2522677..a2c7100 100644 --- a/ci/environment-py36.yml +++ b/ci/environment-py36.yml @@ -4,6 +4,7 @@ channels: dependencies: - python=3.6 - aiohttp + - h5netcdf - intake - netcdf4 - pip diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml index 427f85f..bd83237 100644 --- a/ci/environment-py37.yml +++ b/ci/environment-py37.yml @@ -4,6 +4,7 @@ channels: dependencies: - python=3.7 - aiohttp + - h5netcdf - intake - netcdf4 - pip diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml index ef797e4..9c34345 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py38.yml @@ -4,6 +4,7 @@ channels: dependencies: - python=3.8 - aiohttp + - h5netcdf - intake - netcdf4 - pip diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml new file mode 100644 index 0000000..c5ece7a --- /dev/null +++ b/ci/environment-py39.yml @@ -0,0 +1,18 @@ +name: test_env +channels: + - conda-forge +dependencies: + - python=3.9 + - aiohttp + - h5netcdf + - intake + - netcdf4 + - pip + - pydap + - pytest + - rasterio + - scikit-image + - xarray + - zarr + - pip: + - rangehttpserver diff --git a/ci/environment-py37-upstream.yml b/ci/environment-upstream.yml similarity index 79% rename from ci/environment-py37-upstream.yml rename to ci/environment-upstream.yml index bd3f8be..f6f2120 100644 --- a/ci/environment-py37-upstream.yml +++ b/ci/environment-upstream.yml @@ -2,17 +2,18 @@ name: test_env channels: - conda-forge dependencies: - - python=3.7 + - python - aiohttp + - h5netcdf - netcdf4 - pip - pydap - pytest - rasterio - scikit-image - - xarray - zarr - pip: - git+https://github.com/intake/filesystem_spec.git - git+https://github.com/intake/intake.git + - git+https://github.com/pydata/xarray.git - rangehttpserver diff --git a/intake_xarray/netcdf.py b/intake_xarray/netcdf.py index d59b7a3..961095d 100644 --- a/intake_xarray/netcdf.py +++ b/intake_xarray/netcdf.py @@ -37,6 +37,8 @@ class NetCDFSource(DataSourceMixin, PatternMixin): Whether to treat the path as a pattern (ie. ``data_{field}.nc``) and create new coodinates in the output corresponding to pattern fields. If str, is treated as pattern to match on. Default is True. + xarray_kwargs: dict + Additional xarray kwargs for xr.open_dataset() or xr.open_mfdataset(). storage_options: dict If using a remote fs (whether caching locally or not), these are the kwargs to pass to that FS. @@ -84,8 +86,10 @@ def _open_dataset(self): if self._can_be_local: url = fsspec.open_local(self.urlpath, **self.storage_options) else: - url = fsspec.open(self.urlpath, **self.storage_options) + # https://github.com/intake/filesystem_spec/issues/476#issuecomment-732372918 + url = fsspec.open(self.urlpath, **self.storage_options).open() + print(kwargs) self._ds = _open_dataset(url, chunks=self.chunks, **kwargs) def _add_path_to_ds(self, ds): diff --git a/intake_xarray/raster.py b/intake_xarray/raster.py index 5b7292c..5a3262b 100644 --- a/intake_xarray/raster.py +++ b/intake_xarray/raster.py @@ -30,10 +30,10 @@ class RasterIOSource(DataSourceMixin, PatternMixin): - ``s3://data/landsat8_band{band}.tif`` - ``s3://data/{location}/landsat8_band{band}.tif`` - ``{{ CATALOG_DIR }}data/landsat8_{start_date:%Y%m%d}_band{band}.tif`` - chunks: int or dict + chunks: None or int or dict, optional Chunks is used to load the new dataset into dask arrays. ``chunks={}`` loads the dataset with dask using a single - chunk for all arrays. + chunk for all arrays. default `None` loads numpy arrays. path_as_pattern: bool or str, optional Whether to treat the path as a pattern (ie. ``data_{field}.tif``) and create new coodinates in the output corresponding to pattern @@ -41,7 +41,7 @@ class RasterIOSource(DataSourceMixin, PatternMixin): """ name = 'rasterio' - def __init__(self, urlpath, chunks, concat_dim='concat_dim', + def __init__(self, urlpath, chunks=None, concat_dim='concat_dim', xarray_kwargs=None, metadata=None, path_as_pattern=True, storage_options=None, **kwargs): self.path_as_pattern = path_as_pattern @@ -82,6 +82,7 @@ def _open_dataset(self): files = fsspec.open_local(self.urlpath, **self.storage_options) else: files = self.urlpath + #files = fsspec.open(self.urlpath, **self.storage_options).open() if isinstance(files, list): self._ds = self._open_files(files) else: @@ -115,11 +116,17 @@ def _get_schema(self): metadata[k] = v except TypeError: pass + + if hasattr(self._ds.data, 'npartitions'): + npart = self._ds.data.npartitions + else: + npart = None + self._schema = Schema( datashape=None, dtype=str(self._ds.dtype), shape=self._ds.shape, - npartitions=self._ds.data.npartitions, + npartitions=npart, extra_metadata=metadata) return self._schema diff --git a/intake_xarray/tests/test_network.py b/intake_xarray/tests/test_network.py new file mode 100644 index 0000000..4526b39 --- /dev/null +++ b/intake_xarray/tests/test_network.py @@ -0,0 +1,66 @@ +# Tests that read public data over the internet +import intake +import pytest +import xarray as xr +import s3fs +import gcsfs + +# RasterIOSource +def test_open_rasterio_http(): + prefix = 'https://landsat-pds.s3.us-west-2.amazonaws.com/L8/139/045' + image = 'LC81390452014295LGN00/LC81390452014295LGN00_B1.TIF' + url = f'{prefix}/{image}' + source = intake.open_rasterio(url, + chunks=dict(band=1)) + ds = source.to_dask() + assert isinstance(ds, xr.core.dataarray.DataArray) + + +def test_open_rasterio_s3(): + bucket = 's3://landsat-pds' + key = 'L8/139/045/LC81390452014295LGN00/LC81390452014295LGN00_B1.TIF' + url = f'{bucket}/{key}' + source = intake.open_rasterio(url, + chunks=dict(band=1), + storage_options = dict(anon=True)) + ds = source.to_dask() + assert isinstance(ds, xr.core.dataarray.DataArray) + + +# NETCDFSource +def test_open_netcdf_gs(): + bucket = 'gs://ldeo-glaciology' + key = 'bedmachine/BedMachineAntarctica_2019-11-05_v01.nc' + url = f'{bucket}/{key}' + source = intake.open_netcdf(url, + chunks=3000, + xarray_kwargs=dict(engine='h5netcdf'), + ) + ds = source.to_dask() + assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore) + assert isinstance(ds, xr.core.dataarray.Dataset) + +def test_open_netcdf_s3(): + bucket = 's3://its-live-data.jpl.nasa.gov' + key = 'icesat2/alt06/rel003/ATL06_20181230162257_00340206_003_01.h5' + url = f'{bucket}/{key}' + source = intake.open_netcdf(url, + xarray_kwargs=dict(group='gt1l/land_ice_segments', engine='h5netcdf'), + storage_options=dict(anon=True), + ) + ds = source.to_dask() + assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore) + assert isinstance(ds, xr.core.dataarray.Dataset) + + +def test_open_netcdf_s3_simplecache(): + bucket = 's3://its-live-data.jpl.nasa.gov' + key = 'icesat2/alt06/rel003/ATL06_20181230162257_00340206_003_01.h5' + url = f'simplecache::{bucket}/{key}' + source = intake.open_netcdf(url, + xarray_kwargs=dict(group='gt1l/land_ice_segments', engine='h5netcdf'), + storage_options=dict(s3={'anon': True}), + ) + ds = source.to_dask() + assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore) + assert isinstance(ds, xr.core.dataarray.Dataset) diff --git a/intake_xarray/tests/test_remote.py b/intake_xarray/tests/test_remote.py index 885e1cf..ab364a4 100644 --- a/intake_xarray/tests/test_remote.py +++ b/intake_xarray/tests/test_remote.py @@ -1,3 +1,4 @@ +import aiohttp import intake import os import pytest @@ -6,6 +7,8 @@ import time import xarray as xr import fsspec +import dask +import numpy PORT = 8425 # for intake-server tests here = os.path.abspath(os.path.dirname(__file__)) @@ -37,24 +40,103 @@ def data_server(): P.communicate() -def test_list(data_server): +def test_list_server_files(data_server): + test_files = ['RGB.byte.tif', 'example_1.nc', 'example_2.nc', 'little_green.tif', 'little_red.tif'] h = fsspec.filesystem("http") out = h.glob(data_server + '/') assert len(out) > 0 - assert data_server+'/RGB.byte.tif' in out - + assert set([data_server+'/'+x for x in test_files]).issubset(set(out)) +# REMOTE GEOTIFF def test_open_rasterio(data_server): + url = f'{data_server}/RGB.byte.tif' + source = intake.open_rasterio(url) + da = source.read() + assert isinstance(da, xr.core.dataarray.DataArray) + assert isinstance(da.data, numpy.ndarray) + + +def test_open_rasterio_dask(data_server): url = f'{data_server}/RGB.byte.tif' source = intake.open_rasterio(url, chunks={}) da = source.to_dask() assert isinstance(da, xr.core.dataarray.DataArray) + assert isinstance(da.data, dask.array.core.Array) + + +def test_read_rasterio(data_server): + url = f'{data_server}/RGB.byte.tif' + source = intake.open_rasterio(url, chunks={}) + da = source.read() + assert da.attrs['crs'] == '+init=epsg:32618' + assert da.attrs['AREA_OR_POINT'] == 'Area' + assert da.dtype == 'uint8' + assert da.isel(band=2,x=300,y=500).values == 129 + + +def test_open_rasterio_auth(data_server): + url = f'{data_server}/RGB.byte.tif' + auth = dict(client_kwargs={'auth': aiohttp.BasicAuth('USER', 'PASS')}) + # NOTE: if url startswith 'https' use 'https' instead of 'http' for storage_options + source = intake.open_rasterio(url, + storage_options=dict(http=auth)) + source_auth = source.storage_options['http'].get('client_kwargs').get('auth') + assert isinstance(source_auth, aiohttp.BasicAuth) + +def test_open_rasterio_simplecache(data_server): + url = f'simplecache::{data_server}/RGB.byte.tif' + source = intake.open_rasterio(url, chunks={}) + da = source.to_dask() + assert isinstance(da, xr.core.dataarray.DataArray) + + +def test_open_rasterio_pattern(data_server): + url = [data_server+'/'+x for x in ('little_red.tif', 'little_green.tif')] + source = intake.open_rasterio(url, + path_as_pattern='{}/little_{color}.tif', + concat_dim='color', + chunks={}) + da = source.to_dask() + assert isinstance(da, xr.core.dataarray.DataArray) + assert set(da.color.data) == set(['red', 'green']) + assert da.shape == (2, 3, 64, 64) + + +# REMOTE NETCDF / HDF def test_open_netcdf(data_server): url = f'{data_server}/example_1.nc' + source = intake.open_netcdf(url) + ds = source.to_dask() + assert isinstance(ds, xr.core.dataset.Dataset) + assert isinstance(ds.temp.data, numpy.ndarray) + + +def test_read_netcdf(data_server): + url = f'{data_server}/example_1.nc' + source = intake.open_netcdf(url) + ds = source.read() + assert ds['rh'].isel(lat=0,lon=0,time=0).values.dtype == 'float32' + assert ds['rh'].isel(lat=0,lon=0,time=0).values == 0.5 + + +def test_open_netcdf_dask(data_server): + url = f'{data_server}/next_example_1.nc' + source = intake.open_netcdf(url, chunks={}, + xarray_kwargs=dict(engine='h5netcdf')) + ds = source.to_dask() + assert isinstance(ds._file_obj, xr.backends.h5netcdf_.H5NetCDFStore) + assert isinstance(ds, xr.core.dataset.Dataset) + assert isinstance(ds.temp.data, dask.array.core.Array) + + +def test_open_netcdf_simplecache(data_server): + url = f'simplecache::{data_server}/example_1.nc' source = intake.open_netcdf(url, chunks={}) - da = source.to_dask() - assert isinstance(da, xr.core.dataarray.DataSet) + ds = source.to_dask() + assert isinstance(ds, xr.core.dataset.Dataset) + assert isinstance(ds.temp.data, dask.array.core.Array) + # Remote catalogs with intake-server @pytest.fixture(scope='module') From 25a9255f1e59a964d9e5247dc2b86e1ecbea9a74 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Mon, 23 Nov 2020 18:21:34 -0800 Subject: [PATCH 4/7] fix upstream test env --- .github/workflows/main.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 0850888..ee08977 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -13,7 +13,7 @@ jobs: strategy: fail-fast: false matrix: - CONDA_ENV: [36, 37, 38, 39, upstream] + CONDA_ENV: [py36, py37, py38, py39, upstream] steps: - name: Checkout uses: actions/checkout@v2 @@ -24,7 +24,7 @@ jobs: auto-update-conda: true auto-activate-base: false activate-environment: test_env - environment-file: ci/environment-py${{ matrix.CONDA_ENV }}.yml + environment-file: ci/environment-${{ matrix.CONDA_ENV }}.yml - name: Development Install Intake-Xarray shell: bash -l {0} From 7d3670de4f15e3adc809f21b8d4da7da5f8a91a2 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 24 Nov 2020 11:25:57 -0800 Subject: [PATCH 5/7] add local s3 tests --- ci/environment-network.yml | 19 ----- ci/environment-py36.yml | 4 + ci/environment-py37.yml | 4 + ci/environment-py38.yml | 4 + ci/environment-py39.yml | 4 + ci/environment-upstream.yml | 4 + intake_xarray/netcdf.py | 1 - intake_xarray/raster.py | 1 + intake_xarray/tests/test_remote.py | 133 ++++++++++++++++++++++++----- 9 files changed, 131 insertions(+), 43 deletions(-) delete mode 100644 ci/environment-network.yml diff --git a/ci/environment-network.yml b/ci/environment-network.yml deleted file mode 100644 index 105c936..0000000 --- a/ci/environment-network.yml +++ /dev/null @@ -1,19 +0,0 @@ -name: test_env -channels: - - conda-forge -dependencies: - - python - - aiohttp - - fsspec - - gcsfs - - h5netcdf - - intake - - netcdf4 - - pip - - pydap - - pytest - - rasterio - - s3fs - - scikit-image - - xarray - - zarr diff --git a/ci/environment-py36.yml b/ci/environment-py36.yml index a2c7100..a0eead0 100644 --- a/ci/environment-py36.yml +++ b/ci/environment-py36.yml @@ -4,6 +4,8 @@ channels: dependencies: - python=3.6 - aiohttp + - flask + - gcsfs - h5netcdf - intake - netcdf4 @@ -11,8 +13,10 @@ dependencies: - pydap - pytest - rasterio + - s3fs - scikit-image - xarray - zarr - pip: - rangehttpserver + - moto[s3] diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml index bd83237..ccd455e 100644 --- a/ci/environment-py37.yml +++ b/ci/environment-py37.yml @@ -4,6 +4,8 @@ channels: dependencies: - python=3.7 - aiohttp + - flask + - gcsfs - h5netcdf - intake - netcdf4 @@ -11,8 +13,10 @@ dependencies: - pydap - pytest - rasterio + - s3fs - scikit-image - xarray - zarr - pip: - rangehttpserver + - moto[s3] diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml index 9c34345..4e839b9 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py38.yml @@ -4,6 +4,8 @@ channels: dependencies: - python=3.8 - aiohttp + - flask + - gcsfs - h5netcdf - intake - netcdf4 @@ -11,8 +13,10 @@ dependencies: - pydap - pytest - rasterio + - s3fs - scikit-image - xarray - zarr - pip: - rangehttpserver + - moto[s3] diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml index c5ece7a..36cbba9 100644 --- a/ci/environment-py39.yml +++ b/ci/environment-py39.yml @@ -4,6 +4,8 @@ channels: dependencies: - python=3.9 - aiohttp + - flask + - gcsfs - h5netcdf - intake - netcdf4 @@ -11,8 +13,10 @@ dependencies: - pydap - pytest - rasterio + - s3fs - scikit-image - xarray - zarr - pip: - rangehttpserver + - moto[s3] diff --git a/ci/environment-upstream.yml b/ci/environment-upstream.yml index f6f2120..97d81d7 100644 --- a/ci/environment-upstream.yml +++ b/ci/environment-upstream.yml @@ -4,12 +4,15 @@ channels: dependencies: - python - aiohttp + - flask + - gcsfs - h5netcdf - netcdf4 - pip - pydap - pytest - rasterio + - s3fs - scikit-image - zarr - pip: @@ -17,3 +20,4 @@ dependencies: - git+https://github.com/intake/intake.git - git+https://github.com/pydata/xarray.git - rangehttpserver + - moto[s3] diff --git a/intake_xarray/netcdf.py b/intake_xarray/netcdf.py index 961095d..76d4e0b 100644 --- a/intake_xarray/netcdf.py +++ b/intake_xarray/netcdf.py @@ -89,7 +89,6 @@ def _open_dataset(self): # https://github.com/intake/filesystem_spec/issues/476#issuecomment-732372918 url = fsspec.open(self.urlpath, **self.storage_options).open() - print(kwargs) self._ds = _open_dataset(url, chunks=self.chunks, **kwargs) def _add_path_to_ds(self, ds): diff --git a/intake_xarray/raster.py b/intake_xarray/raster.py index 5a3262b..7ad33c9 100644 --- a/intake_xarray/raster.py +++ b/intake_xarray/raster.py @@ -81,6 +81,7 @@ def _open_dataset(self): if self._can_be_local: files = fsspec.open_local(self.urlpath, **self.storage_options) else: + # pass URLs to delegate remote opening to rasterio library files = self.urlpath #files = fsspec.open(self.urlpath, **self.storage_options).open() if isinstance(files, list): diff --git a/intake_xarray/tests/test_remote.py b/intake_xarray/tests/test_remote.py index ab364a4..a6c31d7 100644 --- a/intake_xarray/tests/test_remote.py +++ b/intake_xarray/tests/test_remote.py @@ -1,3 +1,4 @@ +# Tests for intake-server, local HTTP file server, local "S3" object server import aiohttp import intake import os @@ -9,8 +10,8 @@ import fsspec import dask import numpy +import s3fs -PORT = 8425 # for intake-server tests here = os.path.abspath(os.path.dirname(__file__)) cat_file = os.path.join(here, 'data', 'catalog.yaml') DIRECTORY = os.path.join(here, 'data') @@ -40,7 +41,7 @@ def data_server(): P.communicate() -def test_list_server_files(data_server): +def test_http_server_files(data_server): test_files = ['RGB.byte.tif', 'example_1.nc', 'example_2.nc', 'little_green.tif', 'little_red.tif'] h = fsspec.filesystem("http") out = h.glob(data_server + '/') @@ -48,25 +49,16 @@ def test_list_server_files(data_server): assert set([data_server+'/'+x for x in test_files]).issubset(set(out)) # REMOTE GEOTIFF -def test_open_rasterio(data_server): +def test_http_open_rasterio(data_server): url = f'{data_server}/RGB.byte.tif' source = intake.open_rasterio(url) - da = source.read() - assert isinstance(da, xr.core.dataarray.DataArray) - assert isinstance(da.data, numpy.ndarray) - - -def test_open_rasterio_dask(data_server): - url = f'{data_server}/RGB.byte.tif' - source = intake.open_rasterio(url, chunks={}) da = source.to_dask() assert isinstance(da, xr.core.dataarray.DataArray) - assert isinstance(da.data, dask.array.core.Array) -def test_read_rasterio(data_server): +def test_http_read_rasterio(data_server): url = f'{data_server}/RGB.byte.tif' - source = intake.open_rasterio(url, chunks={}) + source = intake.open_rasterio(url) da = source.read() assert da.attrs['crs'] == '+init=epsg:32618' assert da.attrs['AREA_OR_POINT'] == 'Area' @@ -74,7 +66,15 @@ def test_read_rasterio(data_server): assert da.isel(band=2,x=300,y=500).values == 129 -def test_open_rasterio_auth(data_server): +def test_http_open_rasterio_dask(data_server): + url = f'{data_server}/RGB.byte.tif' + source = intake.open_rasterio(url, chunks={}) + da = source.to_dask() + assert isinstance(da, xr.core.dataarray.DataArray) + assert isinstance(da.data, dask.array.core.Array) + + +def test_http_open_rasterio_auth(data_server): url = f'{data_server}/RGB.byte.tif' auth = dict(client_kwargs={'auth': aiohttp.BasicAuth('USER', 'PASS')}) # NOTE: if url startswith 'https' use 'https' instead of 'http' for storage_options @@ -84,14 +84,14 @@ def test_open_rasterio_auth(data_server): assert isinstance(source_auth, aiohttp.BasicAuth) -def test_open_rasterio_simplecache(data_server): +def test_http_read_rasterio_simplecache(data_server): url = f'simplecache::{data_server}/RGB.byte.tif' source = intake.open_rasterio(url, chunks={}) da = source.to_dask() assert isinstance(da, xr.core.dataarray.DataArray) -def test_open_rasterio_pattern(data_server): +def test_http_read_rasterio_pattern(data_server): url = [data_server+'/'+x for x in ('little_red.tif', 'little_green.tif')] source = intake.open_rasterio(url, path_as_pattern='{}/little_{color}.tif', @@ -104,7 +104,7 @@ def test_open_rasterio_pattern(data_server): # REMOTE NETCDF / HDF -def test_open_netcdf(data_server): +def test_http_open_netcdf(data_server): url = f'{data_server}/example_1.nc' source = intake.open_netcdf(url) ds = source.to_dask() @@ -112,7 +112,7 @@ def test_open_netcdf(data_server): assert isinstance(ds.temp.data, numpy.ndarray) -def test_read_netcdf(data_server): +def test_http_read_netcdf(data_server): url = f'{data_server}/example_1.nc' source = intake.open_netcdf(url) ds = source.read() @@ -120,7 +120,7 @@ def test_read_netcdf(data_server): assert ds['rh'].isel(lat=0,lon=0,time=0).values == 0.5 -def test_open_netcdf_dask(data_server): +def test_http_read_netcdf_dask(data_server): url = f'{data_server}/next_example_1.nc' source = intake.open_netcdf(url, chunks={}, xarray_kwargs=dict(engine='h5netcdf')) @@ -130,7 +130,7 @@ def test_open_netcdf_dask(data_server): assert isinstance(ds.temp.data, dask.array.core.Array) -def test_open_netcdf_simplecache(data_server): +def test_http_read_netcdf_simplecache(data_server): url = f'simplecache::{data_server}/example_1.nc' source = intake.open_netcdf(url, chunks={}) ds = source.to_dask() @@ -138,9 +138,96 @@ def test_open_netcdf_simplecache(data_server): assert isinstance(ds.temp.data, dask.array.core.Array) +# S3 +#based on: https://github.com/dask/s3fs/blob/master/s3fs/tests/test_s3fs.py +test_bucket_name = "test" +PORT_S3 = 8001 +#endpoint_uri = f"http://localhost:{PORT_S3}/" +endpoint_uri = "http://127.0.0.1:%s" % PORT_S3 +test_files = ['RGB.byte.tif', 'example_1.nc'] + +@pytest.fixture() +def s3_base(): + # writable local S3 system + import shlex + import subprocess + + proc = subprocess.Popen(shlex.split("moto_server s3 -p %s" % PORT_S3)) + + timeout = 5 + while timeout > 0: + try: + r = requests.get(endpoint_uri) + if r.ok: + break + except: + pass + timeout -= 0.1 + time.sleep(0.1) + yield + proc.terminate() + proc.wait() + +@pytest.fixture() +def s3(s3_base): + ''' anonymous access local s3 bucket for testing ''' + from botocore.session import Session + session = Session() + client = session.create_client("s3", endpoint_url=endpoint_uri) + client.create_bucket(Bucket=test_bucket_name, ACL="public-read") + + for file_name in [os.path.join(DIRECTORY,x) for x in test_files]: + with open(file_name, 'rb') as f: + data = f.read() + key = os.path.basename(file_name) + client.put_object(Bucket=test_bucket_name, Key=key, Body=data) + + # Make sure cache not being used + s3fs.S3FileSystem.clear_instance_cache() + s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri}) + s3.invalidate_cache() + yield + + +def test_s3_list_files(s3): + s3 = s3fs.S3FileSystem(anon=True, client_kwargs={"endpoint_url": endpoint_uri}) + files = s3.ls(test_bucket_name) + assert len(files) > 0 + assert set([test_bucket_name+'/'+x for x in test_files]).issubset(set(files)) + + +def test_s3_read_rasterio(s3): + # Lots of GDAL Environment variables needed for this to work ! + # https://gdal.org/user/virtual_file_systems.html#vsis3-aws-s3-files + os.environ['AWS_NO_SIGN_REQUEST']='YES' + os.environ['AWS_S3_ENDPOINT'] = endpoint_uri.lstrip('http://') + os.environ['AWS_VIRTUAL_HOSTING']= 'FALSE' + os.environ['AWS_HTTPS']= 'NO' + os.environ['GDAL_DISABLE_READDIR_ON_OPEN']='EMPTY_DIR' + os.environ['CPL_CURL_VERBOSE']='YES' + url = f's3://{test_bucket_name}/RGB.byte.tif' + source = intake.open_rasterio(url) + da = source.read() + assert da.attrs['crs'] == '+init=epsg:32618' + assert da.attrs['AREA_OR_POINT'] == 'Area' + assert da.dtype == 'uint8' + assert da.isel(band=2,x=300,y=500).values == 129 + + +def test_s3_read_netcdf(s3): + url = f's3://{test_bucket_name}/example_1.nc' + s3options = dict(client_kwargs={"endpoint_url": endpoint_uri}) + source = intake.open_netcdf(url, + storage_options=s3options) + ds = source.read() + assert ds['rh'].isel(lat=0,lon=0,time=0).values.dtype == 'float32' + assert ds['rh'].isel(lat=0,lon=0,time=0).values == 0.5 + + # Remote catalogs with intake-server @pytest.fixture(scope='module') def intake_server(): + PORT = 8002 command = ['intake-server', '-p', str(PORT), cat_file] try: P = subprocess.Popen(command) @@ -159,7 +246,7 @@ def intake_server(): P.communicate() -def test_remote_netcdf(intake_server): +def test_intake_server_netcdf(intake_server): cat_local = intake.open_catalog(cat_file) cat = intake.open_catalog(intake_server) assert 'xarray_source' in cat @@ -175,7 +262,7 @@ def test_remote_netcdf(intake_server): cat_local.xarray_source.read()).all() -def test_remote_tiff(intake_server): +def test_intake_server_tiff(intake_server): pytest.importorskip('rasterio') cat_local = intake.open_catalog(cat_file) cat = intake.open_catalog(intake_server) From c85ed4212c3e64386b85959b5fba353a63ef3642 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 24 Nov 2020 11:49:36 -0800 Subject: [PATCH 6/7] use localhost for s3 test, boto3 from conda-forge --- ci/environment-py36.yml | 2 +- ci/environment-py37.yml | 2 +- ci/environment-py38.yml | 2 +- ci/environment-py39.yml | 2 +- ci/environment-upstream.yml | 2 +- intake_xarray/tests/test_remote.py | 3 +-- 6 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ci/environment-py36.yml b/ci/environment-py36.yml index a0eead0..58ae5ab 100644 --- a/ci/environment-py36.yml +++ b/ci/environment-py36.yml @@ -4,8 +4,8 @@ channels: dependencies: - python=3.6 - aiohttp + - boto3 - flask - - gcsfs - h5netcdf - intake - netcdf4 diff --git a/ci/environment-py37.yml b/ci/environment-py37.yml index ccd455e..72a3ddf 100644 --- a/ci/environment-py37.yml +++ b/ci/environment-py37.yml @@ -4,8 +4,8 @@ channels: dependencies: - python=3.7 - aiohttp + - boto3 - flask - - gcsfs - h5netcdf - intake - netcdf4 diff --git a/ci/environment-py38.yml b/ci/environment-py38.yml index 4e839b9..a0fe59b 100644 --- a/ci/environment-py38.yml +++ b/ci/environment-py38.yml @@ -4,8 +4,8 @@ channels: dependencies: - python=3.8 - aiohttp + - boto3 - flask - - gcsfs - h5netcdf - intake - netcdf4 diff --git a/ci/environment-py39.yml b/ci/environment-py39.yml index 36cbba9..a43536a 100644 --- a/ci/environment-py39.yml +++ b/ci/environment-py39.yml @@ -4,8 +4,8 @@ channels: dependencies: - python=3.9 - aiohttp + - boto3 - flask - - gcsfs - h5netcdf - intake - netcdf4 diff --git a/ci/environment-upstream.yml b/ci/environment-upstream.yml index 97d81d7..b8177fc 100644 --- a/ci/environment-upstream.yml +++ b/ci/environment-upstream.yml @@ -4,8 +4,8 @@ channels: dependencies: - python - aiohttp + - boto3 - flask - - gcsfs - h5netcdf - netcdf4 - pip diff --git a/intake_xarray/tests/test_remote.py b/intake_xarray/tests/test_remote.py index a6c31d7..f9c43ce 100644 --- a/intake_xarray/tests/test_remote.py +++ b/intake_xarray/tests/test_remote.py @@ -142,8 +142,7 @@ def test_http_read_netcdf_simplecache(data_server): #based on: https://github.com/dask/s3fs/blob/master/s3fs/tests/test_s3fs.py test_bucket_name = "test" PORT_S3 = 8001 -#endpoint_uri = f"http://localhost:{PORT_S3}/" -endpoint_uri = "http://127.0.0.1:%s" % PORT_S3 +endpoint_uri = "http://localhost:%s" % PORT_S3 test_files = ['RGB.byte.tif', 'example_1.nc'] @pytest.fixture() From dc2ec1f9cae6a05fcf6bb1c4fbba910d1ba4f8a4 Mon Sep 17 00:00:00 2001 From: Scott Henderson Date: Tue, 24 Nov 2020 12:06:03 -0800 Subject: [PATCH 7/7] add fake aws credentials --- intake_xarray/tests/test_remote.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/intake_xarray/tests/test_remote.py b/intake_xarray/tests/test_remote.py index f9c43ce..8fa8a42 100644 --- a/intake_xarray/tests/test_remote.py +++ b/intake_xarray/tests/test_remote.py @@ -167,8 +167,18 @@ def s3_base(): proc.terminate() proc.wait() + +@pytest.fixture(scope='function') +def aws_credentials(): + """Mocked AWS Credentials for moto.""" + os.environ['AWS_ACCESS_KEY_ID'] = 'testing' + os.environ['AWS_SECRET_ACCESS_KEY'] = 'testing' + os.environ['AWS_SECURITY_TOKEN'] = 'testing' + os.environ['AWS_SESSION_TOKEN'] = 'testing' + + @pytest.fixture() -def s3(s3_base): +def s3(s3_base, aws_credentials): ''' anonymous access local s3 bucket for testing ''' from botocore.session import Session session = Session()