In [1]:
from pynhd import NLDI, WaterData
import fsspec
import boto3
import geopandas as gpd
import math
import xarray as xr

In [5]:
fs_write = fsspec.filesystem(
    's3',
    profile='osn-renc',  ## This is the profile name you configured above.
    client_kwargs={'endpoint_url': 'https://renc.osn.xsede.org'}
)

In [3]:
def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])

# Read data into geopandas dataframe
Data from https://www.sciencebase.gov/catalog/item/60cb5edfd34e86b938a373f4
Copied to OSN pod

In [9]:
gdb_file = 'zip+https://renc.osn.xsede.org/rsignellbucket2/hytest/wbd/WBD_National_GDB.zip'
huc12_basins = gpd.read_file(gdb_file, layer='WBDHU12')

In [None]:
#huc12_basins.plot()

In [None]:
#huc12_basins.head()

# Write Geoparquet

In [None]:
fname='rsignellbucket2/nwc/huc12/huc12.geoparquet'
with fs_write.open(fname, mode='wb') as f:
    huc12_basins.to_parquet(f)

In [6]:
fs_write.ls('rsignellbucket2/nwc/huc12/')

['rsignellbucket2/nwc/huc12/huc12.geoparquet',
 'rsignellbucket2/nwc/huc12/huc12.gpkg']

In [8]:
print(f'size of huc12 geoparquet: {convert_size(fs_write.size(fname))}')

size of huc12 geoparquet: 1.89 GB


In [None]:
with fs_write.open(fname, mode='rb') as f:
    huc12_df = gpd.read_parquet(f)

In [None]:
huc12_df.head()

In [None]:
huc12_df.plot()

# Write geopackage

In [5]:
fname='rsignellbucket2/nwc/huc12/huc12.gpkg'

In [8]:
print(f'size of huc12 geopackage: {convert_size(fs_write.size(fname))}')

size of huc12 geopackage: 270.02 MB


In [9]:
with fs_write.open(fname, mode='wb') as f:
    huc12_basins.to_file(f, layer='huc12', driver="GPKG")

In [10]:
fs_write.ls('rsignellbucket2/nwc/huc12/')

['rsignellbucket2/nwc/huc12/huc12.geoparquet',
 'rsignellbucket2/nwc/huc12/huc12.gpkg']

In [11]:
print(f'size of huc12 geopackage: {convert_size(fs_write.size(fname))}')

size of huc12 geopackage: 2.09 GB


In [None]:
with fs_write.open(fname, mode='rb') as f:
    huc12_basins = gpd.read_file(f, layer='huc12', driver="GPKG")  

In [25]:
huc12_basins['HUC12'].head()

0    031401030101
1    031401030102
2    031401030103
3    031401030104
4    031401030201
Name: HUC12, dtype: object

In [None]:
huc12_basins.plot()

# Experimental - Write geospatial time series data to geoparquet

Read in zarr time series data and check size

In [10]:
zarr_path = 'rsignellbucket2/nwc/nhm_prms_v1_1_gridmet/nhm_prms_v1_1_gridmet.zarr'
m = fs_write.get_mapper(zarr_path)
nhm_ds = xr.open_dataset(m, engine='zarr', consolidated=True, chunks={})
# ds.sel(date='1983-01-01', huc12='010100020101').load()
nhm_ds

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray
"Array Chunk Bytes 271.78 MiB 1.06 MiB Shape (432, 82460) (27, 5154) Dask graph 256 chunks in 2 graph layers Data type float64 numpy.ndarray",82460  432,

Unnamed: 0,Array,Chunk
Bytes,271.78 MiB,1.06 MiB
Shape,"(432, 82460)","(27, 5154)"
Dask graph,256 chunks in 2 graph layers,256 chunks in 2 graph layers
Data type,float64 numpy.ndarray,float64 numpy.ndarray


In [6]:
print(f'size of NHM zarr time series: {convert_size(fs_write.du(zarr_path))}')

size of NHM zarr time series: 1.46 GB


pull out just a slice of the time series (this is just to experiment, we don't need the whole dataset) and convert to a dataframe to zip up with huc12 basins geodataframe - this should be 12 data points (monthly)

In [11]:
nhm_df = nhm_ds.sel(time=slice('2013-01-01','2013-02-28')).to_dataframe()

zip up with geospatial data

In [12]:
huc12_basins_nhm_ts = huc12_basins.merge(nhm_df.reset_index(), left_on='HUC12', right_on='huc12')

In [17]:
huc12_basins_nhm_ts = huc12_basins_nhm_ts.drop('HUC12', axis=1)

In [18]:
huc12_basins_nhm_ts.where('huc12'=='031401030101')

Unnamed: 0,TNMID,METASOURCEID,SOURCEDATADESC,SOURCEORIGINATOR,SOURCEFEATUREID,LOADDATE,GNIS_ID,AREAACRES,AREASQKM,STATES,...,geometry,time,huc12,gwres_flow,hru_actet,hru_lateral_flow,hru_ppt,hru_storage,sroff,ssres_flow
0,{B1EF0C55-72ED-4FF6-A3BA-97A87C6A6C47},,,,,2013-01-18 07:07:56+00:00,,12663.63,51.25,AL,...,"MULTIPOLYGON (((-86.15784 31.42164, -86.15783 ...",2013-01-01,31401030101,52.06,61.75,62.43,86.94,820.51,4.37,6.02
1,{B1EF0C55-72ED-4FF6-A3BA-97A87C6A6C47},,,,,2013-01-18 07:07:56+00:00,,12663.63,51.25,AL,...,"MULTIPOLYGON (((-86.15784 31.42164, -86.15783 ...",2013-02-01,31401030101,54.26,62.38,149.89,582.18,1190.41,82.59,13.05
2,{F0D9874D-52BA-4FDC-A5E6-E259B627764D},,,,,2013-01-18 07:07:56+00:00,,37030.62,149.86,AL,...,"MULTIPOLYGON (((-86.18406 31.53503, -86.18406 ...",2013-01-01,31401030102,52.25,62.17,62.64,87.36,826.74,4.38,6.0
3,{F0D9874D-52BA-4FDC-A5E6-E259B627764D},,,,,2013-01-18 07:07:56+00:00,,37030.62,149.86,AL,...,"MULTIPOLYGON (((-86.18406 31.53503, -86.18406 ...",2013-02-01,31401030102,54.3,62.73,148.98,584.14,1199.16,82.88,11.79
4,{2E0CB201-5672-45B5-8CA7-A60070122697},,,,,2013-01-18 07:07:56+00:00,,26011.73,105.27,AL,...,"MULTIPOLYGON (((-86.29029 31.27059, -86.29089 ...",2013-01-01,31401030103,52.25,62.32,62.63,87.44,826.8,4.41,6.0


In [19]:
huc12_basins_nhm_ts.tail()

Unnamed: 0,TNMID,METASOURCEID,SOURCEDATADESC,SOURCEORIGINATOR,SOURCEFEATUREID,LOADDATE,GNIS_ID,AREAACRES,AREASQKM,STATES,...,geometry,time,huc12,gwres_flow,hru_actet,hru_lateral_flow,hru_ppt,hru_storage,sroff,ssres_flow
164919,{DEB7E9C4-9C1C-415A-A192-55345DAC2D38},{E21A2042-4A75-4209-8C85-079C6CD97C0E},,,,2019-04-02 10:07:21+00:00,,11615.54,47.01,WI,...,"MULTIPOLYGON (((-87.51307 44.87002, -87.51292 ...",2013-02-01,40301020402,24.22,14.34,55.11,90.36,131.9,4.86,26.01
164920,{1F644392-2D8A-4C26-9E90-848513EDFDCB},{E21A2042-4A75-4209-8C85-079C6CD97C0E},,,,2019-04-02 10:07:21+00:00,,5006.61,20.26,WI,...,"MULTIPOLYGON (((-87.39776 44.89996, -87.39755 ...",2013-01-01,40301020111,14.2,3.58,29.87,54.25,75.86,2.3,13.36
164921,{1F644392-2D8A-4C26-9E90-848513EDFDCB},{E21A2042-4A75-4209-8C85-079C6CD97C0E},,,,2019-04-02 10:07:21+00:00,,5006.61,20.26,WI,...,"MULTIPOLYGON (((-87.39776 44.89996, -87.39755 ...",2013-02-01,40301020111,27.74,14.81,53.7,78.66,86.03,2.57,23.38
164922,{8FCD3BC7-DB14-42DB-99E3-B5A521110A8F},{3BE315E6-ED72-4D29-BD78-CFF342F864E7},,,,2019-05-23 13:17:43+00:00,,30635.06,123.98,MN,...,"MULTIPOLYGON (((-90.17875 48.08237, -90.17745 ...",2013-01-01,40101010201,6.19,15.81,7.13,50.59,107.21,0.0,0.94
164923,{8FCD3BC7-DB14-42DB-99E3-B5A521110A8F},{3BE315E6-ED72-4D29-BD78-CFF342F864E7},,,,2019-05-23 13:17:43+00:00,,30635.06,123.98,MN,...,"MULTIPOLYGON (((-90.17875 48.08237, -90.17745 ...",2013-02-01,40101010201,3.77,17.81,3.77,32.65,118.28,0.0,0.0


save to geoparquet and check file size

In [20]:
fname='rsignellbucket2/asnyder/huc12_nhm_ts.geoparquet'
with fs_write.open(fname, mode='wb') as f:
    huc12_basins_nhm_ts.to_parquet(f)

In [21]:
fname='rsignellbucket2/asnyder/huc12_nhm_ts.geoparquet'
print(f'size of huc12 NHM time series geoparquet: {convert_size(fs_write.size(fname))}')

size of huc12 NHM time series geoparquet: 2.43 GB


# Experimental - Write geospatial time series data to geopackage

In [18]:
fname='rsignellbucket2/nwc/huc12/huc12_nhm_ts.gpkg'
with fs_write.open(fname, mode='wb') as f:
    huc12_basins_nhm_ts.to_file(f, layer='huc12', driver="GPKG") 

In [19]:
fname='rsignellbucket2/nwc/huc12/huc12_nhm_ts.gpkg'
print(f'size of huc12 NHM time series geoparquet: {convert_size(fs_write.size(fname))}')

size of huc12 NHM time series geoparquet: 3.27 GB


# Try half the data - 1month

## geoparquet

In [None]:
nhm_df = nhm_ds.sel(time=slice('2013-01-01','2013-01-31')).to_dataframe()
huc12_basins_nhm_ts = huc12_basins.merge(nhm_df.reset_index(), left_on='HUC12', right_on='huc12')

In [None]:
fname='rsignellbucket2/asnyder/huc12_nhm_ts_half.geoparquet'
with fs_write.open(fname, mode='wb') as f:
    huc12_basins_nhm_ts.to_parquet(f)

In [None]:
print(f'size of huc12 NHM time series geoparquet: {convert_size(fs_write.size(fname))}')

## geopackage

In [None]:

fname='rsignellbucket2/nwc/huc12/huc12_nhm_ts_half.gpkg'
with fs_write.open(fname, mode='wb') as f:
    huc12_basins_nhm_ts.to_file(f, layer='huc12', driver="GPKG") 

In [None]:
print(f'size of huc12 NHM time series geoparquet: {convert_size(fs_write.size(fname))}')