## File Processing

In [1]:
# Fetch every HindcastFile record whose status is "downloaded".
files = c3.HindcastFile.fetch(dict(include="this", filter='status=="downloaded"'))
# Bare last expression so the notebook renders the fetch result.
files

c3.FetchResult<HindcastFile>(
 objs=c3.Arry<HindcastFile>([c3.HindcastFile(
         id='007e2816-3905-434b-b957-f7b0d0b03f46',
         name='GOMu0.04-expt_90.1m000-2021-2021-09-15T04:00:00Z.nc',
         meta=c3.Meta(
                tenantTagId=150,
                tenant='dev',
                tag='tc01d',
                created=datetime.datetime(2021, 10, 15, 16, 9, 43, tzinfo=datetime.timezone.utc),
                createdBy='dadams@illinois.edu',
                updated=datetime.datetime(2021, 10, 15, 16, 11, 12, tzinfo=datetime.timezone.utc),
                updatedBy='worker',
                timestamp=datetime.datetime(2021, 10, 15, 16, 11, 12, tzinfo=datetime.timezone.utc),
                fetchInclude='[]',
                fetchType='HindcastFile'),
         version=3,
         hindcastArchive=c3.HindcastArchive(
                           id='7a187ef2-d88d-4658-b19f-451aef87535f'),
         subsetOptions=c3.HycomSubsetOptions(
                         timeRange=c3.TimeRan

In [56]:
# Prototype for processing hindcast files
from datetime import datetime, timedelta

def process(this, max_points_per_axis=2):
    """Process a single Hindcast NetCDF file into SurfaceHindcastData records.

    Opens the NetCDF file behind ``this.file.url``, reads the first time step of
    ``water_u`` / ``water_v`` at index 0 of the second axis, builds one
    ``c3.SurfaceHindcastData`` record per (lat index, lon index) pair and upserts
    all records in a single batch.  The file is closed again even when reading
    or upserting fails.

    Args:
        this: Object exposing ``this.file.url`` (a HindcastFile in this notebook).
        max_points_per_axis (int or None): How many leading latitude and
            longitude indices to process.  Defaults to 2, the prototype limit;
            ``None`` processes the whole grid.

    Notes:
        - Latitude/longitude are plain integer indices into the file for now.
          Not yet done: determine the offset of each index from the
          subsetOptions of this file.
        - Aggregation in space is not handled here.
    """
    file_url = this.file.url
    hycom_file = c3.HycomUtil.nc_open(file_url)
    try:
        # Absolute time of the first time step: the 'time' variable stores hours
        # relative to its 'time_origin' attribute, which is parsed as UTC.
        time_var = hycom_file.variables['time']
        time_origin = datetime.strptime(time_var.__dict__['time_origin'] + ' +0000',
                                        '%Y-%m-%d %H:%M:%S %z')
        time = time_origin + timedelta(hours=float(time_var[:][0]))

        # Integer index lists for now (see Notes).
        latitudes = range(len(hycom_file['lat']))
        longitudes = range(len(hycom_file['lon']))

        # Read each variable once instead of once per record.
        water_u = hycom_file.variables['water_u'][:].data
        water_v = hycom_file.variables['water_v'][:].data

        # One record per lat-long index pair; the pair of indices forms the
        # string parent id.
        data_records = [
            c3.SurfaceHindcastData(
                **{
                    'start': time,
                    'parent': 'HNDCST_SRFC_' + str(i) + '-' + str(j),
                    'name': 'water_u',  # variable
                    'for': time,        # timestamp
                    'water_u': water_u[0, 0, i, j],
                    'water_v': water_v[0, 0, i, j]
                }
            )
            for i in latitudes[:max_points_per_axis]
            for j in longitudes[:max_points_per_axis]
        ]
        # upsert to data store
        c3.SurfaceHindcastData.upsertBatch(data_records)
    finally:
        # close the file ds, url -- also on failure, so the dataset is not leaked
        c3.HycomUtil.nc_close(ds=hycom_file, url=file_url)
    
    # aggregation in space is possible but a bit more complex

In [60]:
# Fetch every file of one archive (with the file url included) ...
files = c3.HindcastFile.fetch(dict(
    include="this,file.file.url",
    filter='hindcastArchive=="ef9fdcb6-050e-4986-8fda-4868c9a67db7"',
)).objs

# ... and process them one at a time, printing the running index as progress.
cntr = -1
for cntr, file in enumerate(files):
    print(cntr)
    process(file)
# leave cntr holding the number of processed files
cntr += 1
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167


## Time Check
It should not be necessary to introspect the file for time, since the time information that was used to request the file is stored in the db record.  Below is a verification of this assumption.

In [11]:
from datetime import datetime, timedelta
# Pick the first downloaded file of a known archive
arid = 'ef9fdcb6-050e-4986-8fda-4868c9a67db7'
ar = c3.HindcastArchive.get(arid)
file = c3.HindcastFile.fetch(
    spec=dict(filter=f'hindcastArchive=="{arid}" && status=="downloaded"')
).objs[0]

# Read the time information straight from the NetCDF file
ds = c3.HycomUtil.nc_open(file.file.url)
# the 'time_origin' attribute carries no zone, so '+0000' is appended before parsing
time_origin = datetime.strptime(
    ds.variables['time'].__dict__['time_origin'] + ' +0000',
    '%Y-%m-%d %H:%M:%S %z',
)
print(f"time origin: {time_origin}")
# first entry of the 'time' variable, interpreted as hours after the origin
time = time_origin + timedelta(hours=ds.variables['time'][:][0])
print(f"time: {time}")
c3.HycomUtil.nc_close(ds, file.file.url)

# Rebuild the expected times from the DB record (subsetOptions) for comparison
timeRange = file.subsetOptions.timeRange
timeStep = timedelta(hours=file.subsetOptions.timeStride)


def gentimes():
    """Yield every time from timeRange.start up to and including timeRange.end, timeStep apart."""
    current = timeRange.start
    while True:
        if not current <= timeRange.end:
            return
        yield current
        current += timeStep


times = list(gentimes())
print(f"db times: {times}")

time origin: 2000-01-01 00:00:00+00:00
time: 2021-10-01 12:00:00+00:00
db times: [datetime.datetime(2021, 10, 1, 12, 0)]


## Lat Long Check

In [45]:

# Grab a file from an archive
arid = 'ef9fdcb6-050e-4986-8fda-4868c9a67db7'
ar = c3.HindcastArchive.get(arid)
file = c3.HindcastFile.fetch(spec={
    'include': "this,hindcastArchive.hindcast.dataset.geospatialCoverage,hindcastArchive.hindcast.dataset.geospatialResolution",
    'filter': 'hindcastArchive=="'+arid+'" && status=="downloaded"'
}).objs[0]

# Open the netCDF dataset, read the longitude grid, and close the dataset again
# (same open/close pairing as in the time check).
# NOTE(review): assumes `[:]` returns an in-memory copy that stays valid after nc_close.
ds = c3.HycomUtil.nc_open(file.file.url)
try:
    xgrid = ds.variables['lon'][:]
finally:
    c3.HycomUtil.nc_close(ds, file.file.url)
print(f"From file: {xgrid[0:3]}")
print(f"Diffs from file: {[xgrid[1]-xgrid[0],xgrid[2]-xgrid[1]]}")
print(f"From subsetOptions: {file.subsetOptions.geospatialCoverage.start.longitude}")

# Compute the expected offset of the file's first longitude within the full dataset grid
xres = file.hindcastArchive.hindcast.dataset.geospatialResolution.lonResolution
big_xgrid_start = file.hindcastArchive.hindcast.dataset.geospatialCoverage.start.longitude
big_xgrid_end = file.hindcastArchive.hindcast.dataset.geospatialCoverage.end.longitude
xsteps = (xgrid[0] - big_xgrid_start)/xres
# Round to the nearest grid step: xsteps is not an exact integer (38.02... in the
# recorded run), and truncation would pick the wrong index for values like 37.99.
xstart_offset = int(round(xsteps))
print (xsteps)
print (xstart_offset)

# Cross check location in the HycomXGrid type.
# The filter previously referenced an undefined name `diff`; the index to look up
# is the offset computed above.
big_xgrid = c3.HycomXGrid.fetch(spec={
    'order': "ascending(index)",
    'filter': f"index=={xstart_offset}"
}).objs
for x in big_xgrid:
    print(x.longitude)


From file: [-96.47998047 -96.44000244 -96.40002441]
Diffs from file: [0.03997802734375, 0.03997802734375]
From subsetOptions: -96.5
38.02137404580153
38
-96.47998046875


In [1]:
# Grab a file from an archive
arid = 'ef9fdcb6-050e-4986-8fda-4868c9a67db7'
ar = c3.HindcastArchive.get(arid)
file = c3.HindcastFile.fetch(spec={
    'include': "this,hindcastArchive.hindcast.dataset.geospatialCoverage,hindcastArchive.hindcast.dataset.geospatialResolution",
    'filter': 'hindcastArchive=="'+arid+'" && status=="downloaded"'
}).objs[0]

# Open the netCDF dataset, read the longitude grid, and close the dataset again
# so the handle is not leaked (same open/close pairing as in the time check).
# NOTE(review): assumes `[:]` returns an in-memory copy that stays valid after nc_close.
ds = c3.HycomUtil.nc_open(file.file.url)
try:
    xgrid = ds.variables['lon'][:]
finally:
    c3.HycomUtil.nc_close(ds, file.file.url)

# Find the HycomXGrid entry whose longitude is closest to the file's first
# longitude: order by absolute distance and keep only the best match.
big_xgrid = c3.HycomXGrid.fetch(spec={
    'order': "ascending(abs("+str(xgrid[0])+"-longitude))",
    'limit': 1
}).objs
for x in big_xgrid:
    print(x.index,x.longitude, abs(xgrid[0] - x.longitude))

38 -96.47998046875 0.0


## Use EvalMetricsSourceSpec

In [123]:
from datetime import datetime
import numpy as np

def getDataSubset(imin,jmin,imax,jmax,metric,start,end,batchSize=100):
    """Returns a subset for the given metric data from the given start and end time.

    Args:
        imin (int): Minimum longitude from lat-long pair indices
        jmin (int): Minimum latitude from lat-long pair indices
        imax (int): Maximum longitude from lat-long pair indices
        jmax (int): Maximum latitude from lat-long pair indices
        metric (str): Metric to be extracted from the dataset
        start (datetime): Start time of the subset
        end (datetime): End time of the subset
        batchSize (int): Number of lat-long pairs to be extracted in each batch

    Returns:
        array: Numpy array of shape (hours, imax-imin+1, jmax-jmin+1), indexed as
        [time, i, j] -- i.e. [time, lon, lat] under the index convention above.
        Cells for which the stream delivers no pair stay 0.

    Raises:
        ValueError: If ``end`` lies before ``start``.

    Notes:
        - Currently limited to hour resolution
        - Only handles a single metric
        - ``start``/``end`` are sent to the metric engine as dates ("%Y-%m-%d"),
          while the number of time steps is the whole hours of ``end - start``;
          NOTE(review): the two presumably only agree for midnight-aligned
          inputs -- confirm.
        - The data indexing is not efficient since the indices must be parsed from the
        lat-long pair ids.  If the HycomLatLong pair table was ordered by i,j, then
        the pairs would get extracted in i,j order and there would be no need to parse.
        this could be done by defining a new id that is a simple ascending integer.

    """
    if end < start:
        raise ValueError("end must not be before start")

    my_spec = c3.EvalMetricsSpec(
            filter = "i>="+str(imin)+" && i<="+str(imax)+" && j>="+str(jmin)+" && j<="+str(jmax),
            limit = -1,
            expressions = [metric],
            start = start.strftime("%Y-%m-%d"),
            end = end.strftime("%Y-%m-%d"),
            interval = "HOUR"
        )
    # Evaluate the Spec using EvalSourceSpec which returns a stream of numpy arrays
    sourceType = c3.TypeRef(typeName="HycomLatLongPair")
    em_source_spec = c3.EvalMetricsSourceSpec.createNdArraySourceSpec(my_spec, sourceType)
    try:
        em_source_spec.batchExport()
        stream_spec = c3.BatchStreamSpec(batchSize=batchSize)
        source_stream = em_source_spec.toStream(stream_spec)

        # Process the data in to a (time,long,lat) data array
        nt = int((end - start).total_seconds() // 3600)  # total duration in whole hours
        data = np.zeros((nt, imax-imin+1, jmax-jmin+1))
        while source_stream.hasNext():
            batch = source_stream.next()
            # Transpose time series: batch.data is read as [pair, 0, time]
            for row, pair_id in enumerate(batch.indices[0]):
                # the i and j indices are parsed from the "<prefix>_<i>-<j>" pair id
                istr, jstr = pair_id.split('_')[1].split('-')
                i = int(istr) - imin
                j = int(jstr) - jmin
                for ti in range(len(batch.indices[2])):
                    data[ti, i, j] = batch.data[row, 0, ti]
    finally:
        # Always release the exported snapshot and the persisted spec, also when
        # exporting or streaming fails.
        try:
            # Cleans up any persisted files storing snapshot of data source
            em_source_spec.cleanUp()
        finally:
            # Removes spec from database
            em_source_spec.remove()
    return data

In [128]:
# One week (2021-10-01 .. 2021-10-08) of "TestAverageWaterU" for the index window 81..90 in both i and j.
udata = getDataSubset(
    81, 81,                  # imin, jmin
    90, 90,                  # imax, jmax
    "TestAverageWaterU",     # metric
    datetime(2021, 10, 1),   # start
    datetime(2021, 10, 8),   # end
)
# Bare last expression so the notebook shows the resulting array shape.
udata.shape

(168, 10, 10)

## Current implementation of subsetting from nc file example

In [23]:
from statistics import mean
from datetime import datetime, timedelta
import os
import netCDF4
import numpy as np
import bisect
def get_current_data_subset(nc_file, x_0, x_T, deg_around_x0_xT_box, fixed_time=None,
                            temporal_stride=1, temp_horizon_in_h=None):
    """ Function to read a subset of the nc_file current data bounded by a box spanned by the x_0 and x_T points.
    Inputs:
        nc_file                 full path to nc file
        x_0                     [lon, lat, charge, timestamp in POSIX]
        x_T                     [lon, lat] goal locations
        deg_around_x0_xT_box    float, buffer around the box in degrees
        fixed_time              if None returns time-varying currents,
                                otherwise datetime object of the fixed time -> returns ocean current grid at or before time
                                the time of x_0 (and temp_horizon_in_h) is then ignored
        temporal_stride         int, if a stride of the temporal values is used (every temporal_stride hours)
        temp_horizon_in_h       if None: all available time of the file will be provided
                                otherwise float, maximum temp_horizon to look ahead of x_0 time in hours

    Outputs:
        grids_dict              dict containing x_grid, y_grid, t_grid, fixed_time_idx
                                t_grid is a list of POSIX timestamps (a single POSIX timestamp if fixed_time is given)
        u_data                  [T, Y, X] matrix of the ocean currents in x direction in m/s
        v_data                  [T, Y, X] matrix of the ocean currents in y direction in m/s
                                (both are indexed with one single time index if fixed_time is given)

    Raises:
        ValueError              if the requested time (x_0 time, temporal horizon or fixed_time) is not
                                covered by the file, or if water_u has neither 3 nor 4 dimensions

    """

    # extract position & start_time for the indexing
    x_0_pos = x_0[:2]
    x_0_posix_time = x_0[3]
    x_T = x_T[:2]

    # The context manager closes the dataset on every exit path, including the
    # ValueErrors raised below.
    with netCDF4.Dataset(nc_file) as f:
        # Step 1: get the grids
        xgrid = f.variables['lon'][:]
        ygrid = f.variables['lat'][:]
        t_grid = f.variables['time'][:]  # note: this is in hours from HYCOM data!

        # this is needed because the time origin in hindcast and forecast nc files is different. Very handcrafted.
        time_attrs = f.variables['time'].__dict__
        try:
            time_origin = datetime.strptime(time_attrs['time_origin'] + ' +0000',
                                            '%Y-%m-%d %H:%M:%S %z')
        except (KeyError, TypeError, ValueError):
            # no (parsable) 'time_origin' attribute -> fall back to the 'units' attribute
            time_origin = datetime.strptime(time_attrs['units'] + ' +0000',
                                            'hours since %Y-%m-%d %H:%M:%S.000 UTC %z')

        # Step 2: find the sub-setting
        # find the lat & lon sub-set bounds
        lon_bnds = [min(x_0_pos[0], x_T[0]) - deg_around_x0_xT_box, max(x_0_pos[0], x_T[0]) + deg_around_x0_xT_box]
        lat_bnds = [min(x_0_pos[1], x_T[1]) - deg_around_x0_xT_box, max(x_0_pos[1], x_T[1]) + deg_around_x0_xT_box]

        # get the respective indices from the grids
        ygrid_inds = np.where((ygrid > lat_bnds[0]) & (ygrid < lat_bnds[1]))[0]
        xgrid_inds = np.where((xgrid > lon_bnds[0]) & (xgrid < lon_bnds[1]))[0]
        print(xgrid_inds)

        # for time indexing transform to POSIX time
        abs_t_grid = [(time_origin + timedelta(hours=float(X))).timestamp() for X in t_grid.data]

        if fixed_time is None:
            # get the idx of the value left of the demanded time (for interpolation function)
            t_start_idx = bisect.bisect_right(abs_t_grid, x_0_posix_time) - 1
            if t_start_idx == len(abs_t_grid) - 1 or t_start_idx == -1:
                raise ValueError("Requested subset time is outside of the nc4 file.")

            # get the max time if provided as input
            if temp_horizon_in_h is None:   # all data provided
                t_end_idx = len(abs_t_grid)-1
            else:
                t_end_idx = bisect.bisect_right(abs_t_grid, x_0_posix_time + temp_horizon_in_h*3600.)
                if t_end_idx == len(abs_t_grid):
                    raise ValueError("nc4 file does not contain requested temporal horizon.")

            slice_for_time_dim = np.s_[t_start_idx:(t_end_idx+1):temporal_stride]
            fixed_time_idx = None
        else:
            # fixed time logic: the time of x_0 is ignored, only fixed_time has to be covered
            fixed_time_idx = bisect.bisect_right(abs_t_grid, fixed_time.timestamp()) - 1
            if fixed_time_idx == -1:
                # without this check index -1 would silently select the LAST time step
                raise ValueError("Requested fixed time is before the start of the nc4 file.")
            slice_for_time_dim = np.s_[fixed_time_idx]

        # Step 3: extract data
        water_u = f.variables['water_u']
        water_v = f.variables['water_v']
        n_dims = len(water_u.shape)
        # raw water_u is [tdim, zdim, ydim, xdim]
        if n_dims == 4:  # if there is a depth dimension in the dataset
            u_data = water_u[slice_for_time_dim, 0, ygrid_inds, xgrid_inds]
            v_data = water_v[slice_for_time_dim, 0, ygrid_inds, xgrid_inds]
        # raw water_u is [tdim, ydim, xdim]
        elif n_dims == 3:  # if there is no depth dimension in the dataset
            u_data = water_u[slice_for_time_dim, ygrid_inds, xgrid_inds]
            v_data = water_v[slice_for_time_dim, ygrid_inds, xgrid_inds]
        else:
            raise ValueError("Current data in nc file has neither 3 nor 4 dimensions. Check file.")

        x_subset = xgrid[xgrid_inds]
        y_subset = ygrid[ygrid_inds]

    # create dict to output
    grids_dict = {'x_grid': x_subset, 'y_grid': y_subset,
                  't_grid': abs_t_grid[slice_for_time_dim], 'fixed_time_idx': fixed_time_idx}

    def _format_utc(posix_time):
        # time_origin was parsed with an explicit +0000 offset, so its tzinfo is UTC
        return datetime.fromtimestamp(posix_time, tz=time_origin.tzinfo).strftime('%Y-%m-%d %H:%M:%S UTC')

    # log what data has been subsetted
    if fixed_time is None:
        t_subset = grids_dict['t_grid']
        # a large temporal_stride can leave a single time step; a resolution is then undefined
        resolution_in_h = (t_subset[1] - t_subset[0])/3600. if len(t_subset) > 1 else float('nan')
        print("Subsetted data from {start} to {end} in {n_steps} time steps of {time:.2f} hour(s) resolution".format(
            start=_format_utc(t_subset[0]),
            end=_format_utc(t_subset[-1]),
            n_steps=len(t_subset), time=resolution_in_h))
    else:
        # t_grid is one single POSIX timestamp in the fixed-time case
        print("Subsetted data to fixed time at: {time}".format(
            time=_format_utc(grids_dict['t_grid'])))

    #TODO: we replace the masked array with fill value 0 because otherwise interpolation doesn't work.
    # Though that means we cannot anymore detect if we're on land or not (need a way to do that/detect stranding)
    # not sure yet if we'll do it in the simulator or where.
    # return grids_dict, u_data.filled(fill_value=0.), v_data.filled(fill_value=0.)
    return grids_dict, u_data, v_data

In [19]:
# Grab a file from an archive
arid = 'ef9fdcb6-050e-4986-8fda-4868c9a67db7'
ar = c3.HindcastArchive.get(arid)
file = c3.HindcastFile.fetch(spec={
    'include': "this",
    'filter': 'hindcastArchive=="'+arid+'" && status=="downloaded"'
}).objs[0]

# Open to netCDF dataset
ds = c3.HycomUtil.nc_open(file.file.url)
!ls /tmp

GOMu0.04-expt_90.1m000-2021-2021-01-01T12:00:00Z-2021-01-02T12:00:00Z.nc
GOMu0.04-expt_90.1m000-2021-2021-01-01T12:00:00Z.nc
GOMu0.04-expt_90.1m000-2021-2021-01-03T12:00:00Z.nc
GOMu0.04-expt_90.1m000-2021-2021-05-01T12:00:00Z-2021-01-05T12:00:00Z.nc
GOMu0.04-expt_90.1m000-2021-2021-10-01T12:00:00Z.nc


In [24]:
# settings
hindcast_file = '/tmp/GOMu0.04-expt_90.1m000-2021-2021-10-01T12:00:00Z.nc'
# x_0 is [lon, lat, battery, posix_time]; 1622549410.0 is 2021-06-01 12:10:10 UTC,
# while the file name above is stamped 2021-10-01T12:00:00Z.
x_0 = [-88.0, 25.0, 1, 1622549410.0]
x_T = [-88.0, 26.3]  # [lon, lat]
deg_around_x0_xT_box = 0.5
fixed_time = None
temporal_stride = 1

# function call, every argument passed by name
grids_dict, u_data, v_data = get_current_data_subset(
    nc_file=hindcast_file,
    x_0=x_0,
    x_T=x_T,
    deg_around_x0_xT_box=deg_around_x0_xT_box,
    fixed_time=fixed_time,
    temporal_stride=temporal_stride,
)

[200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217
 218 219 220 221 222 223 224]


ValueError: Requested subset time is outside of the nc4 file.