In [None]:
from pynhd import NLDI, WaterData
import fsspec
import boto3
import geopandas as gpd
import math
import xarray as xr

In [None]:
# S3-compatible filesystem handle for the internal bucket on the OSN endpoint.
fs_hytest_internal = fsspec.filesystem(
    "s3",
    # AWS profile name for the bucket you are writing data to
    profile="osn-hytest-internal",
    client_kwargs=dict(endpoint_url="https://usgs.osn.mghpcc.org"),
)

In [None]:
# S3-compatible filesystem handle for the 'osn-hytest' profile on the OSN endpoint.
fs_hytest = fsspec.filesystem(
    "s3",
    # AWS profile name for the bucket you are writing data to
    profile="osn-hytest",
    client_kwargs=dict(endpoint_url="https://usgs.osn.mghpcc.org"),
)

In [None]:
def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Parameters
    ----------
    size_bytes : int or float
        Non-negative number of bytes.

    Returns
    -------
    str
        ``"0B"`` when ``size_bytes`` is zero, otherwise ``"<value> <unit>"``
        with the value rounded to two decimals (e.g. ``"1.5 KB"``).

    Raises
    ------
    ValueError
        If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        # math.log would fail with an opaque "math domain error"; be explicit.
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes!r}")
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size_bytes, 1024)))
    # Clamp the unit index: fractional sizes (< 1 byte) would otherwise give a
    # negative index that silently wraps around to "YB", and sizes of 1024**9
    # or more would index past the end of the tuple.
    i = min(max(i, 0), len(size_name) - 1)
    p = math.pow(1024, i)
    s = round(size_bytes / p, 2)
    return "%s %s" % (s, size_name[i])

# Read HUC12 data into geopandas dataframe
Data from https://www.sciencebase.gov/catalog/item/60cb5edfd34e86b938a373f4, copied to the OSN pod.

In [None]:
# option 1 - read the HUC12 layer ('WBDHU12') from the national WBD geodatabase
# NOTE(review): no bbox/mask is passed to read_file, so this loads every HUC12
# polygon in the layer, not only those in the Delaware River Basin - confirm intent.
# note - reading this file requires >4 GB of memory
gdb_file = 'zip+https://usgs.osn.mghpcc.org/hytest/wbd/WBD_National_GDB.zip'
huc12_basins = gpd.read_file(gdb_file, layer='WBDHU12')

In [None]:
# # option 2 - get all HUC12 basins in the Delaware River Basin - using NLDI - this is a very large query for NLDI, not recommended
# # USGS gage 01482100 Delaware River at Del Mem Bridge at Wilmington De
# gage_id = '01482100'
# nldi = NLDI()
# del_basins = nldi.get_basins(gage_id)
# huc12_basins_del = WaterData('wbd12').bygeom(del_basins.geometry[0])
# #huc12_basins = WaterData('huc12')
# huc12_basins_del.plot()

In [None]:
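# # option 3 - get all HUC12 basins in a bbox that encompasses the DRB (note: the bbox used below is far larger than the DRB, spanning longitudes -124.78 to -66.95 - confirm intent)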
# # bbox (west, south, east, north)
# #huc12_basins = WaterData('wbd12').bybox((-124.7844079, 24.7433195, -66.9513812, 49.3457868), box_crs=4326, sort_attr='huc12')
# huc12_basins = WaterData('wbd12').bybox((-124.7844079, 24.7433195, -66.9513812, 49.3457868), box_crs=4326)

In [None]:
# Plot the HUC12 basin geometries held in huc12_basins.
huc12_basins.plot()

In [None]:
#huc12_basins.head()

# Write HUC12 Geoparquet

In [None]:
# Destination object key (bucket/path) for the HUC12 GeoParquet file.
# NOTE: `fname` is reassigned in later sections, so run cells in order.
fname='hytest/wbd/huc12/huc12.geoparquet'

In [None]:
# Write the GeoDataFrame as Parquet through a binary handle opened on the
# fsspec filesystem; the handle is closed when the `with` block exits.
with fs_hytest.open(fname, mode='wb') as f:
    huc12_basins.to_parquet(f)

In [None]:
# List the destination prefix to confirm the object was written.
fs_hytest.ls('hytest/wbd/huc12/')

In [None]:
# Report the stored object's size in human-readable units.
print(f'size of huc12 geoparquet: {convert_size(fs_hytest.size(fname))}')

In [None]:
# Read the Parquet file back into a separate GeoDataFrame (huc12_df) so the
# round trip can be inspected without touching huc12_basins.
with fs_hytest.open(fname, mode='rb') as f:
    huc12_df = gpd.read_parquet(f)

In [None]:
# Preview the first rows of the GeoDataFrame read back from Parquet.
huc12_df.head()

In [None]:
# Plot the geometries read back from Parquet.
huc12_df.plot()

# Write geopackage

In [None]:
# Destination object key (bucket/path) for the HUC12 GeoPackage file.
# NOTE: this rebinds `fname`, which earlier cells used for the GeoParquet path.
fname='hytest/wbd/huc12/huc12.gpkg'

In [None]:
# Write the GeoDataFrame as a GeoPackage layer named 'huc12' through an
# fsspec binary handle.
# NOTE(review): confirm the installed geopandas I/O engine supports writing
# GPKG to a file-like object (as opposed to a local path).
with fs_hytest.open(fname, mode='wb') as f:
    huc12_basins.to_file(f, layer='huc12', driver="GPKG")

In [None]:
# List the destination prefix to confirm the GeoPackage was written.
fs_hytest.ls('hytest/wbd/huc12/')

In [None]:
# Report the stored GeoPackage's size in human-readable units.
print(f'size of huc12 geopackage: {convert_size(fs_hytest.size(fname))}')

In [None]:
# Read the 'huc12' layer back from the GeoPackage.
# NOTE: this rebinds huc12_basins to the round-tripped copy; every later cell
# that uses huc12_basins (including the merge with the NHM time series)
# operates on this version rather than the one read from the geodatabase.
with fs_hytest.open(fname, mode='rb') as f:
    huc12_basins = gpd.read_file(f, layer='huc12', driver="GPKG")  

In [None]:
# Preview the 'HUC12' column, which is used later as the merge key.
huc12_basins['HUC12'].head()

In [None]:
# Plot the geometries read back from the GeoPackage.
huc12_basins.plot()

# Experimental - Write geospatial time series data to geoparquet

Read in zarr time series data and check size

In [None]:
# Open the NHM zarr store on the internal bucket via an fsspec key/value mapper.
zarr_path = 'hytest-internal/nhm_prms_v1_1_gridmet/nhm_prms_v1_1_gridmet.zarr'
m = fs_hytest_internal.get_mapper(zarr_path)
# consolidated=False: do not look for consolidated zarr metadata.
# chunks={}: open lazily with dask-backed arrays rather than loading eagerly.
nhm_ds = xr.open_dataset(m, engine='zarr', consolidated=False, chunks={})
# ds.sel(date='1983-01-01', huc12='010100020101').load()
# Display the dataset summary as the cell output.
nhm_ds

In [None]:
# Report the space used under the zarr store path (du) in human-readable units.
print(f'size of NHM zarr time series: {convert_size(fs_hytest_internal.du(zarr_path))}')

Pull out just a slice of the time series (this is only an experiment; we don't need the whole dataset) and convert it to a dataframe to join with the HUC12 basins geodataframe. This was expected to give 12 data points (monthly); note, however, that the slice below only spans 2013-01-01 to 2013-02-28.

In [None]:
# Select the time steps labelled 2013-01-01 through 2013-02-28 and convert the
# selection to a pandas DataFrame.
nhm_df = nhm_ds.sel(time=slice('2013-01-01','2013-02-28')).to_dataframe()

zip up with geospatial data

In [None]:
# Join basin geometries to the time-series rows on the HUC12 code (merge's
# default join type is inner). reset_index() moves the index of nhm_df into
# ordinary columns - presumably including 'huc12', the right-hand key; verify.
huc12_basins_nhm_ts = huc12_basins.merge(nhm_df.reset_index(), left_on='HUC12', right_on='huc12')

In [None]:
# Drop the left-hand merge key; after the merge the code is also carried by
# the 'huc12' column. errors='ignore' keeps this cell idempotent: re-running it
# once 'HUC12' is already gone no longer raises KeyError.
huc12_basins_nhm_ts = huc12_basins_nhm_ts.drop(columns='HUC12', errors='ignore')

In [None]:
# Display the merged basins + time-series table as the cell output.
huc12_basins_nhm_ts

In [None]:
# Show the merged rows for a single basin, selected by its 'huc12' code.
huc12_basins_nhm_ts.loc[huc12_basins_nhm_ts['huc12']=='031401030101']

save to geoparquet and check file size

In [None]:
# Write the merged table as Parquet to the internal bucket through an fsspec
# binary handle.
fname='hytest-internal/nhm_prms_v1_1_gridmet/huc12_nhm_ts.geoparquet'
with fs_hytest_internal.open(fname, mode='wb') as f:
    huc12_basins_nhm_ts.to_parquet(f)

In [None]:
# fname is set again here so this cell can run without the write cell above.
fname='hytest-internal/nhm_prms_v1_1_gridmet/huc12_nhm_ts.geoparquet'
# Report the stored object's size in human-readable units.
print(f'size of huc12 NHM time series geoparquet: {convert_size(fs_hytest_internal.size(fname))}')

# Experimental - Write geospatial time series data to geopackage

In [None]:
# Write the merged table as a GeoPackage layer named 'huc12' to the internal
# bucket through an fsspec binary handle.
# NOTE(review): confirm the installed geopandas I/O engine supports writing
# GPKG to a file-like object (as opposed to a local path).
fname='hytest-internal/nhm_prms_v1_1_gridmet/huc12_nhm_ts.gpkg'
with fs_hytest_internal.open(fname, mode='wb') as f:
    huc12_basins_nhm_ts.to_file(f, layer='huc12', driver="GPKG") 

In [None]:
# fname is set again here so this cell can run without the write cell above.
fname='hytest-internal/nhm_prms_v1_1_gridmet/huc12_nhm_ts.gpkg'
# Report the stored GeoPackage's size. The label previously said "geoparquet"
# (copy-paste from the Parquet section) even though this is the .gpkg file.
print(f'size of huc12 NHM time series geopackage: {convert_size(fs_hytest_internal.size(fname))}')