In [1]:
from datetime import datetime, timedelta
import os
import netCDF4
import numpy as np
import bisect

# Overview of flexible data-loading functionality

1) Below you'll find the function I currently use to subset the data.
All this would subsetting in space & time and more (like averaging currents over time instead of the current temporal slides) should be done via the C3 platform by dynamically loading the hindcast data from the server and/or the 
archive of forecast files. And then just return the right current matrices and grids back for use in the simulator.

2) A sketch of how the function may look like in the end
Of course we can start a lot simpler without any of the averaging/interpolation yet =)

Thanks a lot for looking into this for us, that helps a lot!

# 1) Current implementation of subsetting from nc file example

In [None]:
from statistics import mean
def get_current_data_subset(nc_file, x_0, x_T, deg_around_x0_xT_box, fixed_time=None,
                            temporal_stride=1, temp_horizon_in_h=None):
    """ Function to read a subset of the nc_file current data bounded by a box spanned by the x_0 and x_T points.
    Inputs:
        nc_file                 full path to nc file
        x_0                     [lat, lon, charge, timestamp in POSIX]
        x_T                     [lon, lat] goal locations
        deg_around_x0_xT_box    float, buffer around the box in degrees
        fixed_time              if None returns time-varying currents, 
                                otherwise datetime object of the fixed time -> returns ocean current grid at or before time
                                the time of x_0 is then ignored
        temporal_stride         int, if a stride of the temporal values is used (every temporal_stride hours)
        temp_horizon            if None: all available time of the file will be provided
                                otherwise float, maximum temp_horizon to look ahead of x_0 time in hours
                                
    Outputs:
        grids_dict              dict containing x_grid, y_grid, t_grid, fixed_time_idx
        u_data                  [T, Y, X] matrix of the ocean currents in x direction in m/s
        v_data                  [T, Y, X] matrix of the ocean currents in y direction in m/s
        
    """
    
    f = netCDF4.Dataset(nc_file)

    # extract positiond & start_time for the indexing
    x_0_pos = x_0[:2]
    x_0_posix_time = x_0[3]
    x_T = x_T[:2]

    # Step 1: get the grids
    xgrid = f.variables['lon'][:]
    print(0,xgrid[0])
    print(1,xgrid[1])
    print(42,xgrid[42])
    print(len(xgrid)-1,xgrid[len(xgrid)-1])
    print(xgrid[1]-xgrid[0])
    print(xgrid[42]-xgrid[41])
    print(xgrid[540]-xgrid[539])
    print(len(xgrid))
    ygrid = f.variables['lat'][:]
    print(ygrid[0])
    print(ygrid[len(ygrid)-1])
    print(ygrid[1]-ygrid[0])
    print(ygrid[42]-ygrid[41])
    print(len(ygrid))
    diffs= [
        xgrid[i+1]-xgrid[i]
        for i in range(0,len(xgrid)-2)
    ]
    print(mean(diffs))
    
    
    t_grid = f.variables['time'][:] # not this is in hours from HYCOM data!
    
    # this is needed because the time origin in hindcast and forecase nc files is different. Very handcrafted.
    try:
        time_origin = datetime.strptime(f.variables['time'].__dict__['time_origin'] + ' +0000',
                                    '%Y-%m-%d %H:%M:%S %z')
    except:
        time_origin = datetime.strptime(f.variables['time'].__dict__['units'] + ' +0000',
                                                 'hours since %Y-%m-%d %H:%M:%S.000 UTC %z')

    # Step 2: find the sub-setting
    # find the lat & lon sub-set bounds
    lon_bnds = [min(x_0_pos[0], x_T[0]) - deg_around_x0_xT_box, max(x_0_pos[0], x_T[0]) + deg_around_x0_xT_box]
    lat_bnds = [min(x_0_pos[1], x_T[1]) - deg_around_x0_xT_box, max(x_0_pos[1], x_T[1]) + deg_around_x0_xT_box]

    # get the respective indices from the grids
    ygrid_inds = np.where((ygrid > lat_bnds[0]) & (ygrid < lat_bnds[1]))[0]
    xgrid_inds = np.where((xgrid > lon_bnds[0]) & (xgrid < lon_bnds[1]))[0]

    # for time indexing transform to POSIX time
    abs_t_grid = [(time_origin + timedelta(hours=X)).timestamp() for X in t_grid.data]
    
    # get the idx of the value left of the demanded time (for interpolation function)
    t_start_idx = bisect.bisect_right(abs_t_grid, x_0_posix_time) - 1
    if t_start_idx == len(abs_t_grid) - 1 or t_start_idx == -1:
        raise ValueError("Requested subset time is outside of the nc4 file.")

    # get the max time if provided as input
    if temp_horizon_in_h is None:   # all data provided
        t_end_idx = len(abs_t_grid)-1
    else:
        t_end_idx = bisect.bisect_right(abs_t_grid, x_0_posix_time + temp_horizon_in_h*3600.)
        if t_end_idx == len(abs_t_grid):
            raise ValueError("nc4 file does not contain requested temporal horizon.")

    # fixed time logic if necessary
    if fixed_time is None:
        slice_for_time_dim = np.s_[t_start_idx:(t_end_idx+1):temporal_stride]
        fixed_time_idx = None
    else:
        fixed_time_idx = bisect.bisect_right(abs_t_grid, fixed_time.timestamp()) - 1
        slice_for_time_dim = np.s_[fixed_time_idx]

    # Step 2: extract data
    # raw water_u is [tdim, zdim, ydim, xdim]
    if len(f.variables['water_u'].shape) == 4:  # if there is a depth dimension in the dataset
        u_data = f.variables['water_u'][slice_for_time_dim, 0, ygrid_inds, xgrid_inds]
        v_data = f.variables['water_v'][slice_for_time_dim, 0, ygrid_inds, xgrid_inds]
    # raw water_u is [tdim, ydim, xdim]
    elif len(f.variables['water_u'].shape) == 3:  # if there is no depth dimension in the dataset
        u_data = f.variables['water_u'][slice_for_time_dim, ygrid_inds, xgrid_inds]
        v_data = f.variables['water_v'][slice_for_time_dim, ygrid_inds, xgrid_inds]
    else:
        raise ValueError("Current data in nc file has neither 3 nor 4 dimensions. Check file.")

    # create dict to output
    grids_dict = {'x_grid': xgrid[xgrid_inds], 'y_grid': ygrid[ygrid_inds],
                  't_grid': abs_t_grid[slice_for_time_dim], 'fixed_time_idx': fixed_time_idx}

    # log what data has been subsetted
    if fixed_time is None:
        print("Subsetted data from {start} to {end} in {n_steps} time steps of {time:.2f} hour(s) resolution".format(
            start=datetime.utcfromtimestamp(grids_dict['t_grid'][0]).strftime('%Y-%m-%d %H:%M:%S UTC'),
            end=datetime.utcfromtimestamp(grids_dict['t_grid'][-1]).strftime('%Y-%m-%d %H:%M:%S UTC'),
            n_steps=len(grids_dict['t_grid']), time=(grids_dict['t_grid'][1] - grids_dict['t_grid'][0])/3600.))
    else:
        print("Subsetted data to fixed time at: {time}".format(
            time=datetime.utcfromtimestamp(grids_dict['t_grid'][0]).strftime('%Y-%m-%d %H:%M:%S UTC')))

    #TODO: we replace the masked array with fill value 0 because otherwise interpolation doesn't work.
    # Though that means we cannot anymore detect if we're on land or not (need a way to do that/detect stranding)
    # not sure yet if we'll do it in the simulator or where.
    # return grids_dict, u_data.filled(fill_value=0.), v_data.filled(fill_value=0.)
    return grids_dict, u_data, v_data

# Download hindcast file using C3 Helper Utility
The specialize C3 type called `HindcastArchive` has a helper function called `downloadLocal`. The procedure for using this is:
    * Get an instance of the `HindcastArchive` type corresponding to a hindcast year.
    * Construct a set of options using the `HycomSubsetOptions` type.
    * call the `downloadLocal` member function using the aformention instane, options plus the desired local path.

In [3]:
# Make sure there are records for HindcastArchive
import requests
import xmltodict
url = "https://tds.hycom.org/thredds/catalogs/GOMu0.04/expt_90.1m000.xml"
with requests.get(url) as r:
        doc = xmltodict.parse(r.text)
for ds in doc['catalog']['dataset']['dataset']['dataset']:
    if 'ALL' in ds['@name'] or 'FMRC' in ds['@name']: # restrict to Hindcast
        continue
    har = c3.HindcastArchive(
        **{
            "id": ds['@ID'],
            "dataset": {"id": "GOMu0.04_901m000_FMRC_1.0.1"},
            "name":ds['@ID'],
            "description": ds['@name'],
            "urlPath": ds['@urlPath']
            
          }
    ).upsert()

In [4]:
# Convenience function to download Hindcast file to local path
def downloadHindcast(da, opts, localPath='/tmp'):
    # Generate the thredds query url
    url = c3.HycomUtil.createThreddsUrl(da.urlPath, opts)
    print(url)
    # Download the file to c3 storage
    if opts.timeRange.start == opts.timeRange.end:
        filename = da.id + '-' + opts.timeRange.start.strftime("%Y-%m-%dT%H:%M:%SZ") + '.nc'
    else:
        filename = da.id + '-' + opts.timeRange.start.strftime("%Y-%m-%dT%H:%M:%SZ") + '-' + opts.timeRange.end.strftime("%Y-%m-%dT%H:%M:%SZ") + '.nc'
    print(f"downloading {filename}...")
    extPath = c3.HycomUtil.downloadToExternal(url, filename, 'hycom-tmp')
    # Move file to "local" jupyter pod filesystem
    ret1 = c3.Client.copyFilesToLocalClient(extPath, localPath)
    ret2 = c3.FileSystem.inst().deleteFiles(extPath, True)
    return localPath + '/' + filename

In [None]:
# Retrive a hindcast file
da = c3.HindcastArchive.get("GOMu0.04-expt_90.1m000-2021") # Get the 2021 Archive record
# Set options for single file download
opts = c3.HycomSubsetOptions(
    **{
        "timeRange": {
            "start": datetime(2021,6,1,12),
            "end": datetime(2021,6,6,12)
        },
        "vars": "water_u,water_v"
    }
)

hindcast_file = downloadHindcast(da, opts)
print(hindcast_file)
ds = netCDF4.Dataset(hindcast_file)
print(ds)

https://ncss.hycom.org/thredds/ncss/GOMu0.04/expt_90.1m000/data/hindcasts/2021?var=water_u&var=water_v&disableLLSubset=on&disableProjSubset=on&horizStride=1&timeStride=1&vertStride=1&addLatLon=false&accept=netcdf4&time_start=2021-06-01T12%3A00%3A00Z&time_end=2021-06-06T12%3A00%3A00Z
downloading GOMu0.04-expt_90.1m000-2021-2021-06-01T12:00:00Z-2021-06-06T12:00:00Z.nc...


# Execute Current Subsetting

In [22]:
print(datetime.fromtimestamp(1622549410))

2021-06-01 12:10:10


In [48]:
# example execution

# settings
hindcast_file = '2021_06_1-05_hourly.nc4'
x_0 = [-88.0, 25.0, 1, 1622549410.0]  # lon, lat, battery, posix_time
x_T = [-88.0, 26.3]
deg_around_x0_xT_box = 0.5
fixed_time = None
temporal_stride = 1

# function call
grids_dict, u_data, v_data = get_current_data_subset(hindcast_file,
                                                     x_0, x_T,
                                                     deg_around_x0_xT_box,
                                                     fixed_time,
                                                     temporal_stride)

0 -98.0
1 -97.96002197265625
42 -96.32000732421875
540 -76.4000244140625
0.03997802734375
0.03997802734375
0.03997802734375
541
18.1200008392334
31.920000076293945
0.03999900817871094
0.03999900817871094
346
0.03999999547048933
Subsetted data from 2021-06-01 12:00:00 UTC to 2021-06-06 11:00:00 UTC in 120 time steps of 1.00 hour(s) resolution


# 2) Sketch of how C3 function interface might look like

In [None]:
def get_current_data_subset_from_c3(
    t_interval, temp_res_in_h,
    lat_interval, lat_res_in_deg,
    lon_interval, lon_res_in_deg,
    depth_interval_to_avg_over):
    
    """ Function to get a subset of current data via the C3 data integration.
    
    Inputs:
        t_interval              if time-varying: [t_0, t_T] in POSIX time
                                where t_0 and t_T are the start and end timestamps respectively
                                if fixed_time:   [fixed_timestamp] in POSIX
        temp_res_in_h           which temporal resolution the time-axis should have
                                e.g. if temp_res_in_h = 1, t_grid = [t_0, t_0 + 3600s, ... t_T]
                                if temp_res_in_h = 5,      t_grid = [t_0, t_0 + 5*3600s, ... t_T]
                                if temp_res_in_h = 0.5,      t_grid = [t_0, t_0 + 1800s, ... t_T]
                                => so either averaging or interpolation needs to be done in the backend
        lat_interval            [y_lower, y_upper] in degrees
        lat_res_in_deg          which spatial resolution in y direction in degrees
                                e.g. if lat_res_in_deg = 1, y_grid = [y_lower, y_lower + 1, ... y_upper]
                                 => so either averaging or interpolation needs to be done in the backend
        lon_interval            [x_lower, x_upper] in degrees
        lon_res_in_deg          which spatial resolution in x direction in degrees
                                e.g. if lon_res_in_deg = 1, x_grid = [x_lower, x_lower + 1, ... x_upper]
                                 => so either averaging or interpolation needs to be done in the backend
        depth_interval_to_avg_over
                   b             Interval to average over the current dimension in meters
                                e.g. [0, 10] then the currents are averaged over the depth 0-10m.
                                
    Outputs:
        grids_dict              dict containing x_grid, y_grid, t_grid
        u_data                  [T, Y, X] matrix of the ocean currents in x direction in m/s
        v_data                  [T, Y, X] matrix of the ocean currents in y direction in m/s
    """
    
    # some C3 magic =)
    
    return grids_dict, u_data, v_data

In [21]:
help(c3.LatLong)

In [34]:
#Generate seed data for latlong pairs
import json
x0 = -98.0
dx = 0.03997802734375
y0 = 18.1200008392334
dy = 0.03999900817871094
seed = [
    {
        "id": "GOMu0.04"+'_'+str(i)+'-'+str(j),
        "i":i,
        "j":j,
        "lng": x0 + i*dx,
        "lat": y0 + j*dy,
        "pair": c3.LatLong(
            **{
               "latitude": x0 + i*dx,
               "longitude": y0 + j*dy
            }
        ).toJson()
        
    }
    for i in range(541)
        for j in range(346)
]
with open('seed.json','w') as f:
    json.dump(seed,f)

In [52]:
import json
import csv
nc_file = '2021_06_1-05_hourly.nc4'
f = netCDF4.Dataset(nc_file)
xgrid = f.variables['lon'][:]
for i in range(0,len(xgrid)):
    if i == 0:
        print('id,dataset,index,longitude')
    print('GOMu0.04_'+str(i)+',GOMu0.04/expt_90.1m000,'+str(i)+','+str(xgrid[i]))

id,dataset,index,longitude
GOMu0.04_0,GOMu0.04/expt_90.1m000,0,-98.0
GOMu0.04_1,GOMu0.04/expt_90.1m000,1,-97.96002197265625
GOMu0.04_2,GOMu0.04/expt_90.1m000,2,-97.91998291015625
GOMu0.04_3,GOMu0.04/expt_90.1m000,3,-97.8800048828125
GOMu0.04_4,GOMu0.04/expt_90.1m000,4,-97.8399658203125
GOMu0.04_5,GOMu0.04/expt_90.1m000,5,-97.79998779296875
GOMu0.04_6,GOMu0.04/expt_90.1m000,6,-97.760009765625
GOMu0.04_7,GOMu0.04/expt_90.1m000,7,-97.719970703125
GOMu0.04_8,GOMu0.04/expt_90.1m000,8,-97.67999267578125
GOMu0.04_9,GOMu0.04/expt_90.1m000,9,-97.6400146484375
GOMu0.04_10,GOMu0.04/expt_90.1m000,10,-97.5999755859375
GOMu0.04_11,GOMu0.04/expt_90.1m000,11,-97.55999755859375
GOMu0.04_12,GOMu0.04/expt_90.1m000,12,-97.52001953125
GOMu0.04_13,GOMu0.04/expt_90.1m000,13,-97.47998046875
GOMu0.04_14,GOMu0.04/expt_90.1m000,14,-97.44000244140625
GOMu0.04_15,GOMu0.04/expt_90.1m000,15,-97.4000244140625
GOMu0.04_16,GOMu0.04/expt_90.1m000,16,-97.3599853515625
GOMu0.04_17,GOMu0.04/expt_90.1m000,17,-97.32000732421