In [1]:
from datetime import datetime, timedelta
import time
import netCDF4
import numpy as np
import bisect
import math

In [2]:
#Reference function
def get_current_data_subset(nc_file, x_0, x_T, deg_around_x0_xT_box, fixed_time=None,
                            temporal_stride=1, temp_horizon_in_h=None
):
    """ Function to read a subset of the nc_file current data bounded by a box spanned by the x_0 and x_T points.
    Inputs:
        nc_file                 full path to nc file
        x_0                     [lon, lat, charge, timestamp in POSIX]
        x_T                     [lon, lat] goal locations
        deg_around_x0_xT_box    float, buffer around the box in degrees
        fixed_time              if None returns time-varying currents, 
                                otherwise datetime object of the fixed time -> returns ocean current grid at or before time
                                the time of x_0 is then ignored
        temporal_stride         int, if a stride of the temporal values is used (every temporal_stride hours)
        temp_horizon            if None: all available time of the file will be provided
                                otherwise float, maximum temp_horizon to look ahead of x_0 time in hours
                                
    Outputs:
        grids_dict              dict containing x_grid, y_grid, t_grid, fixed_time_idx
        u_data                  [T, Y, X] matrix of the ocean currents in x direction in m/s
        v_data                  [T, Y, X] matrix of the ocean currents in y direction in m/s
        
    """
    
    f = netCDF4.Dataset(nc_file)

    # extract positiond & start_time for the indexing
    x_0_pos = x_0[:2]
    x_0_posix_time = x_0[3]
    x_T = x_T[:2]

    # Step 1: get the grids
    xgrid = f.variables['lon'][:]
    ygrid = f.variables['lat'][:]
    t_grid = f.variables['time'][:] # not this is in hours from HYCOM data!
    
    # this is needed because the time origin in hindcast and forecase nc files is different. Very handcrafted.
    try:
        time_origin = datetime.strptime(f.variables['time'].__dict__['time_origin'] + ' +0000',
                                        '%Y-%m-%d %H:%M:%S %z')
    except:
        time_origin = datetime.strptime(f.variables['time'].__dict__['units'] + ' +0000',
                                                 'hours since %Y-%m-%d %H:%M:%S.000 UTC %z')

    # Step 2: find the sub-setting
    # find the lat & lon sub-set bounds
    lon_bnds = [min(x_0_pos[0], x_T[0]) - deg_around_x0_xT_box, max(x_0_pos[0], x_T[0]) + deg_around_x0_xT_box]
    lat_bnds = [min(x_0_pos[1], x_T[1]) - deg_around_x0_xT_box, max(x_0_pos[1], x_T[1]) + deg_around_x0_xT_box]

    # get the respective indices from the grids
    ygrid_inds = np.where((ygrid >= lat_bnds[0]) & (ygrid <= lat_bnds[1]))[0]
    xgrid_inds = np.where((xgrid >= lon_bnds[0]) & (xgrid <= lon_bnds[1]))[0]

    # for time indexing transform to POSIX time
    abs_t_grid = [(time_origin + timedelta(hours=X)).timestamp() for X in t_grid.data]
    
    # get the idx of the value left of the demanded time (for interpolation function)
    t_start_idx = bisect.bisect_right(abs_t_grid, x_0_posix_time) - 1
    if t_start_idx == len(abs_t_grid) - 1 or t_start_idx == -1:
        raise ValueError("Requested subset time is outside of the nc4 file.")

    # get the max time if provided as input
    if temp_horizon_in_h is None:   # all data provided
        t_end_idx = len(abs_t_grid)-1
    else:
        t_end_idx = bisect.bisect_right(abs_t_grid, x_0_posix_time + temp_horizon_in_h*3600.)
        if t_end_idx == len(abs_t_grid):
            raise ValueError("nc4 file does not contain requested temporal horizon.")

    # fixed time logic if necessary
    if fixed_time is None:
        slice_for_time_dim = np.s_[t_start_idx:(t_end_idx+1):temporal_stride]
        fixed_time_idx = None
    else:
        fixed_time_idx = bisect.bisect_right(abs_t_grid, fixed_time.timestamp()) - 1
        slice_for_time_dim = np.s_[fixed_time_idx]

    # Step 2: extract data
    # raw water_u is [tdim, zdim, ydim, xdim]
    if len(f.variables['water_u'].shape) == 4:  # if there is a depth dimension in the dataset
        u_data = f.variables['water_u'][slice_for_time_dim, 0, ygrid_inds, xgrid_inds]
        v_data = f.variables['water_v'][slice_for_time_dim, 0, ygrid_inds, xgrid_inds]
    # raw water_u is [tdim, ydim, xdim]
    elif len(f.variables['water_u'].shape) == 3:  # if there is no depth dimension in the dataset
        u_data = f.variables['water_u'][slice_for_time_dim, ygrid_inds, xgrid_inds]
        v_data = f.variables['water_v'][slice_for_time_dim, ygrid_inds, xgrid_inds]
    else:
        raise ValueError("Current data in nc file has neither 3 nor 4 dimensions. Check file.")

    # create dict to output
    grids_dict = {'x_grid': xgrid[xgrid_inds], 'y_grid': ygrid[ygrid_inds],
                  't_grid': abs_t_grid[slice_for_time_dim], 'fixed_time_idx': fixed_time_idx}

    # log what data has been subsetted
    if fixed_time is None:
        print("Subsetted data from {start} to {end} in {n_steps} time steps of {time:.2f} hour(s) resolution".format(
            start=datetime.utcfromtimestamp(grids_dict['t_grid'][0]).strftime('%Y-%m-%d %H:%M:%S UTC'),
            end=datetime.utcfromtimestamp(grids_dict['t_grid'][-1]).strftime('%Y-%m-%d %H:%M:%S UTC'),
            n_steps=len(grids_dict['t_grid']), time=(grids_dict['t_grid'][1] - grids_dict['t_grid'][0])/3600.))
    else:
        print("Subsetted data to fixed time at: {time}".format(
            time=datetime.utcfromtimestamp(grids_dict['t_grid'][0]).strftime('%Y-%m-%d %H:%M:%S UTC')))

    #TODO: we replace the masked array with fill value 0 because otherwise interpolation doesn't work.
    # Though that means we cannot anymore detect if we're on land or not (need a way to do that/detect stranding)
    # not sure yet if we'll do it in the simulator or where.
    return grids_dict, u_data.filled(fill_value=0.), v_data.filled(fill_value=0.)

In [35]:
filter_string

'start>="2021-09-02" && end<="2021-09-03T23:00:00.000" && status=="downloaded"'

In [36]:
start = datetime.utcfromtimestamp(t_interval[0])
end = datetime.utcfromtimestamp(t_interval[1])
# Step 1.2: Getting correct range of nc files from database
filter_string = 'start>=' + '"'+ start.strftime("%Y-%m-%d") + '"' + \
                ' && end<=' + '"' + end.strftime("%Y-%m-%d") + "T23:00:00.000" + '"'\
                ' && status==' + '"' + 'downloaded' + '"'
objs_list = c3.HindcastFile.fetch({'filter':filter_string, "order": "start"}).objs

# some basic sanity checks
if objs_list is None:
    raise ValueError("No files in the database for the selected t_interval")
if len(objs_list) != (end - start).days + 1:
    raise ValueError("DB Query didn't return the expected number of files (one per day), check DB and code.")

In [20]:
start_date

datetime.datetime(2021, 9, 2, 12, 0)

In [27]:
(end_date - start_date).days

1

In [11]:
end_date

'2021-09-03'

In [9]:
len(objs_list)

2

In [38]:
#C3 file based function
def get_current_data_subset_from_c3_file(
    t_interval, #temp_res_in_h,   ----> separate function
    lat_interval, #lat_res_in_deg,
    lon_interval, #lon_res_in_deg,
    #depth_interval_to_avg_over
):
    
    # scipy.interpolate.interp1d
    
    """ Function to get a subset of current data via the C3 data integration.
    
    Inputs:
        t_interval              if time-varying: [t_0, t_T] in POSIX time
                                where t_0 and t_T are the start and end timestamps respectively
                                if fixed_time:   [fixed_timestamp] in POSIX
        temp_res_in_h           which temporal resolution the time-axis should have
                                e.g. if temp_res_in_h = 1, t_grid = [t_0, t_0 + 3600s, ... t_T]
                                if temp_res_in_h = 5,      t_grid = [t_0, t_0 + 5*3600s, ... t_T]
                                if temp_res_in_h = 0.5,      t_grid = [t_0, t_0 + 1800s, ... t_T]
                                => so either averaging or interpolation needs to be done in the backend
        lat_interval            [y_lower, y_upper] in degrees
        lat_res_in_deg          which spatial resolution in y direction in degrees
                                e.g. if lat_res_in_deg = 1, y_grid = [y_lower, y_lower + 1, ... y_upper]
                                 => so either averaging or interpolation needs to be done in the backend
        lon_interval            [x_lower, x_upper] in degrees
        lon_res_in_deg          which spatial resolution in x direction in degrees
                                e.g. if lon_res_in_deg = 1, x_grid = [x_lower, x_lower + 1, ... x_upper]
                                 => so either averaging or interpolation needs to be done in the backend
        depth_interval_to_avg_over
                                Interval to average over the current dimension in meters
                                e.g. [0, 10] then the currents are averaged over the depth 0-10m.
                                
    Outputs:
        grids_dict              dict containing x_grid, y_grid, t_grid
        u_data                  [T, Y, X] matrix of the ocean currents in x direction in m/s
        v_data                  [T, Y, X] matrix of the ocean currents in y direction in m/s
    """
    
    # Step 1: get required file references and data from C3 file DB
    # Step 1.1: Getting time and formatting for the db query
    start = datetime.utcfromtimestamp(t_interval[0])
    end = datetime.utcfromtimestamp(t_interval[1])

    # Step 1.2: Getting correct range of nc files from database
    filter_string = 'start>=' + '"'+ start.strftime("%Y-%m-%d") + '"' + \
                    ' && end<=' + '"' + end.strftime("%Y-%m-%d") + "T23:00:00.000" + '"'\
                    ' && status==' + '"' + 'downloaded' + '"'
    objs_list = c3.HindcastFile.fetch({'filter':filter_string, "order": "start"}).objs

    # some basic sanity checks
    if objs_list is None:
        raise ValueError("No files in the database for the selected t_interval")
    if len(objs_list) != (end - start).days + 1:
        raise ValueError("DB Query didn't return the expected number of files (one per day), check DB and code.")
    
    # Step 1.3: extract url and start list from the query results
    urls_list = [obj.file.url for obj in objs_list]
    start_list = [obj.start for obj in objs_list]
    
    # Step 2: Prepare the stacking loop by getting the x, y grids and subsetting indices in x, y 
    # Note: these stay constant across files in this case where all files have same lat-lon range
    
    # Step 2.1: open the file and get the x and y grid
    f = c3.HycomUtil.nc_open(urls_list[0])
    xgrid = f.variables['lon'][:].data
    ygrid = f.variables['lat'][:].data
    
    # Step 2.2: get the respective indices of the lat, lon subset from the file grids
    ygrid_inds = np.where((ygrid >= lat_interval[0]) & (ygrid <= lat_interval[1]))[0]
    xgrid_inds = np.where((xgrid >= lon_interval[0]) & (xgrid <= lon_interval[1]))[0]
    
    # Step 2.3 initialze t_grid stacking variable
    full_t_grid = []

    # Step 3: iterate over all files in order and stack the current data and absolute t_grids
    for idx in range(len(start_list)):
        # Step 3.0: load the current data file
        f = c3.HycomUtil.nc_open(urls_list[idx])
        # set the default start and end time
        start_hr, end_hr = 0, 24
        
        # Step 3.1: do the time-subsetting
        #Case 1: file is first -- get data from the file from the hour before or at t_0
        if idx == 0:
            start_hr = math.floor((t_interval[0] - start_list[idx].timestamp())/3600)
        #Case 2: file is last -- get data from file until or after the hour t_T
        if idx == len(start_list)-1:
            end_hr = math.ceil((t_interval[1] - start_list[idx].timestamp())/3600)+1

        # Step 3.2: extract data from the file
        u_data = f.variables['water_u'][start_hr:end_hr, 0, ygrid_inds, xgrid_inds]
        v_data = f.variables['water_v'][start_hr:end_hr, 0, ygrid_inds, xgrid_inds]

        # Step 3.3: stack the sub-setted abs_t_grid and current data
        full_t_grid = full_t_grid + [start_list[idx].timestamp() + i*3600 for i in range(start_hr, end_hr)]
        
        if idx == 0:
            full_u_data = u_data
            full_v_data = v_data
        else:
            full_u_data = np.concatenate((full_u_data, u_data), axis=0)
            full_v_data = np.concatenate((full_v_data, v_data), axis=0)

    # Step 4: create dict to output
    grids_dict = {'x_grid': xgrid[xgrid_inds], 'y_grid': ygrid[ygrid_inds], 't_grid': full_t_grid}
    
    # Step 5: # log what data has been subsetted
    print("Subsetted data from {start} to {end} in {n_steps} time steps of {time:.2f} hour(s) resolution".format(
        start=datetime.utcfromtimestamp(grids_dict['t_grid'][0]).strftime('%Y-%m-%d %H:%M:%S UTC'),
        end=datetime.utcfromtimestamp(grids_dict['t_grid'][-1]).strftime('%Y-%m-%d %H:%M:%S UTC'),
        n_steps=len(grids_dict['t_grid']), time=(grids_dict['t_grid'][1] - grids_dict['t_grid'][0])/3600.))

    # Step 6: return the grids_dict and the stacked data
    # TODO: currently, we just do fill_value =0 but then we can't detect if we're on land. 
    # We need a way to do that in the simulator, doing it via the currents could be one way.
    return grids_dict, full_u_data.filled(fill_value=0.), full_v_data.filled(fill_value=0.)

In [6]:
#C3 data based function
def get_current_data_subset_from_c3_database(t_interval, lat_interval, lon_interval, 
                                             metric="TestAverageWaterU", interval="HOUR"
):
    """
    Returns a subset for the given metric data from the given start and end time. 
    This version does not using Streaming and so may be slower (?)
 
    Args:
        t_interval (float): [t_0, t_T] in POSIX time where t_0 and t_T are the start and end timestamps respectively
        lat_interval (float tuple): [y_lower, y_upper] in degrees
        lon_interval (float tuple): [x_lower, x_upper] in degrees
        
        metric (str): Metric to be extracted from the dataset   
        interval (string): frequency of datapoints to output (only works for "HOUR" right now!)
 
    Returns:
        array: Numpy array of the subsetted data ordered as [time,lat,lon]
 
    Notes:
        - Currently designed/tested for hour resolution
        - Only tested for single metric
        - Comments included with times for different sections of the function. These times are not averaged 
          (only one run used) and are there to present an idea of the runtime of different sections of the code.
          NOTE: significnat time variance when fetching from c3
    """
    ###PRE-QUERY SECTION###
    #takes 0.1423 seconds for 1x1 lat-lon, 1 day
    #convert times to datetime
    start_time = datetime.fromtimestamp(t_interval[0])
    
    #+ interval_len because EvalMetrics exclusive for end time (NOTE: modify to work for other intervals)
    end_time = datetime.fromtimestamp(t_interval[1] + 3600)
    

    #filter for query
    filter = "lat>={} && lat<={} && lon>={} && lon<={}".format(lat_interval[0], lat_interval[1], 
                                                               lon_interval[0], lon_interval[1])

    #get lat, lon dimensions
    objs_list = c3.HycomLatLongPair.fetch(spec={"include": "id, lat, lon", "filter": filter, "limit": -1}).objs
    lat_dim =  len(np.unique([obj.lat for obj in objs_list]))
    lon_dim = len(np.unique([obj.lon for obj in objs_list]))

    ###QUERY SECTION###
    #takes 17.17 seconds for 1x1 lat-lon, 1 day
    # Query the server for EvalMetrics data
    my_spec = c3.EvalMetricsSpec(
                filter = filter,
                limit = -1,
                expressions = [metric],
                start = start_time.strftime("%Y-%m-%dT%H:00:00"),
                end = end_time.strftime("%Y-%m-%dT%H:00:00"),
                interval = interval
            )

    evalMetricsResult = c3.HycomLatLongPair.evalMetrics(spec=my_spec)
    
    ###POST_QUERY SECTION###
    #takes 0.02296 seconds for 1x1 lat-lon, 1 day
    
    #calculate number of discrete time points 
    #(currently calculates number of hours, need to modify to work for different intervals)
    duration = end_time - start_time
    duration_in_s = duration.total_seconds()
    num_times = int(divmod(duration_in_s, 3600)[0])

    #extract data into array
    keys = sorted([key for key in evalMetricsResult.result])
    arr = np.zeros(shape=(num_times,len(keys)))
    for i in range(len(keys)):
        arr[:, i] = np.array(evalMetricsResult.result[keys[i]]["TestAverageWaterU"].m_data, dtype="float32")

    arr = np.swapaxes(arr.reshape(num_times, lon_dim, lat_dim), 1, 2)
    
    return arr

# Testing behavior of EvalMetricSpec (candidate for deletion)

In [9]:
lon_interval = [-90,-89]
lat_interval = [24, 25]
t_interval = [datetime(2021, 9, 2, 12, 0).timestamp(), datetime(2021, 9, 3, 12, 0).timestamp()]
metric="TestAverageWaterU"
interval="HOUR"

###PRE-QUERY SECTION###
#takes 0.1423 seconds for 1x1 lat-lon, 1 day
#convert times to datetime
start_time = datetime.fromtimestamp(t_interval[0])

#+interval_len because EvalMetrics exclusive for end time (NOTE: modify to work for other intervals)
end_time = datetime.fromtimestamp(t_interval[1] + 3600) 


#filter for query
filter = "lat>={} && lat<={} && lon>={} && lon<={}".format(lat_interval[0], lat_interval[1], 
                                                           lon_interval[0], lon_interval[1])

#get lat, lon dimensions
objs_list = c3.HycomLatLongPair.fetch(spec={"include": "id, lat, lon", "filter": filter}).objs
lat_dim =  len(np.unique([obj.lat for obj in objs_list]))
lon_dim = len(np.unique([obj.lon for obj in objs_list]))

my_spec = c3.EvalMetricsSpec(
                filter = filter,
                limit = 10,
                expressions = [metric],
                start = start_time.strftime("%Y-%m-%dT%H:00:00"),
                end = end_time.strftime("%Y-%m-%dT%H:00:00"),
                interval = interval
            )

evalMetricsResult = c3.HycomLatLongPair.evalMetrics(spec=my_spec)

In [None]:
# let's check for the point 24, -90 the values in the file

In [21]:
# evalMetricsResult.result["GOMu0.04_200-147"]

In [26]:
# Note: these values are the same as in the DB so the query and function work correctly..
np.array(evalMetricsResult.result["GOMu0.04_200-147"]["TestAverageWaterU"].m_data)

array([-0.42000002, -0.40900001, -0.40700001, -0.40700001, -0.40300003,
       -0.39600003, -0.38100001, -0.36500001, -0.34500003, -0.32800001,
       -0.31800002, -0.32000002, -0.33500001, -0.36300001, -0.40200001,
       -0.44700003, -0.49300003, -0.54200006, -0.59600002, -0.64100003,
       -0.66700006, -0.67200005, -0.65200001, -0.61000001, -0.59100002])

In [27]:
# => that means, the data processing didn't work correctly?

# Generate reference arrays

In [4]:
hindcast_file = 'Sanity_check_data.nc4'

In [5]:
# settings for directly from subsetted file and old approach
x_0 = [-90.0, 24.0, 1, datetime(2021, 9, 2, 12, 0).timestamp()]  # lon, lat, battery, posix_time
x_T = [-89.0, 25.0]
deg_around_x0_xT_box = 0.
fixed_time = None
temporal_stride = 1

file_grid_dict, solution_1day_1x1, file_v_water = get_current_data_subset(hindcast_file,
                                                  x_0, x_T,
                                                  deg_around_x0_xT_box,
                                                  fixed_time,
                                                  temporal_stride,
                                                  temp_horizon_in_h=None)

FileNotFoundError: [Errno 2] No such file or directory: b'Sanity_check_data.nc4'

# Sanity Check

In [39]:
lon_interval = [-90,-89]
lat_interval = [24, 25]
t_interval = [datetime(2021, 9, 2, 12, 0).timestamp(), datetime(2021, 9, 3, 12, 0).timestamp()]

_, c3_file_u_data, _ = get_current_data_subset_from_c3_file(t_interval, lat_interval, lon_interval)
# c3_database_u_data = get_current_data_subset_from_c3_database(t_interval, lat_interval, lon_interval)

Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-03 12:00:00 UTC in 25 time steps of 1.00 hour(s) resolution


In [51]:
print("c3_file function correct:", np.all(c3_file_u_data==solution_1day_1x1))
print("c3_database function correct:", np.all(c3_database_u_data==solution_1day_1x1))

c3_file function correct: True
c3_database function correct: False


In [114]:
np.mean(c3_database_u_data)

-0.35630895178772637

In [115]:
np.mean(solution_1day_1x1)

0.118622966

# Individual lat-lon point sanity check

In [52]:
# Get the numbers from file-based approach
i, j = 0, 0
t_idx = 0
print("lon ", file_grid_dict['x_grid'][i])
print("lat ", file_grid_dict['y_grid'][j])
print(datetime.fromtimestamp(file_grid_dict['t_grid'][t_idx]))
# File-based output matrix is [T, Y, X]
# across time for the -90, 24 array!
solution_1day_1x1[t_idx:,i,j]

lon  -90.0
lat  24.0
2021-09-02 12:00:00


array([0.25      , 0.294     , 0.34500003, 0.397     , 0.437     ,
       0.46300003, 0.47200003, 0.46800002, 0.44700003, 0.41300002,
       0.372     , 0.335     , 0.305     , 0.284     , 0.279     ,
       0.28500003, 0.293     , 0.296     , 0.28300002, 0.25300002,
       0.20700002, 0.16000001, 0.134     , 0.133     , 0.165     ],
      dtype=float32)

In [54]:
# Get the numbers from c3 pipeline based approach
lon_interval = [-90,-89]
lat_interval = [24, 25]
t_interval = [datetime(2021, 9, 2, 12, 0).timestamp(), datetime(2021, 9, 3, 12, 0).timestamp()]
metric="TestAverageWaterU"
interval="HOUR"

start_time = datetime.fromtimestamp(t_interval[0])
end_time = datetime.fromtimestamp(t_interval[1] + 3600) 
#filter for query
filter = "lat>={} && lat<={} && lon>={} && lon<={}".format(lat_interval[0], lat_interval[1], 
                                                           lon_interval[0], lon_interval[1])
#get lat, lon dimensions
objs_list = c3.HycomLatLongPair.fetch(spec={"include": "id, lat, lon", "filter": filter}).objs
lat_dim =  len(np.unique([obj.lat for obj in objs_list]))
lon_dim = len(np.unique([obj.lon for obj in objs_list]))

my_spec = c3.EvalMetricsSpec(
                filter = filter,
                limit = 10,
                expressions = [metric],
                start = start_time.strftime("%Y-%m-%dT%H:00:00"),
                end = end_time.strftime("%Y-%m-%dT%H:00:00"),
                interval = interval
            )

evalMetricsResult = c3.HycomLatLongPair.evalMetrics(spec=my_spec)

print(np.array(evalMetricsResult.result["GOMu0.04_147-200"]["TestAverageWaterU"].m_data))

KeyError: 'GOMu0.04_147-200'

In [None]:
# Note: these values are the same as in the DB so the query and function work correctly..

# Run timing tests
fixed 1x1 lat-lon grid, varied time frame

In [5]:
num_trials = 6
timeframe = 5

#these are of shape (num_trials, timeframe)
file_times_1x1 = np.zeros(shape=(num_trials, timeframe))
database_times_1x1 = np.zeros(shape=(num_trials, timeframe))

lon_interval = [-90,-89]
lat_interval = [24, 25]

for n in range(num_trials):
    print("\n***Trial {}***".format(n))
    for i in range(timeframe):
        t_interval = [datetime(2021, 9, 2, 12, 0).timestamp(), datetime(2021, 9, 3 + i, 12, 0).timestamp()]

        print("get data using files for {} days".format(i+1))
        start = time.time()
        _, c3_file_u_data, _ = get_current_data_subset_from_c3_file(t_interval, lat_interval, lon_interval)
        end = time.time()
        file_times_1x1[n][i] = end - start
        
        print("get data using database for {} days\n".format(i+1))
        start = time.time()
        c3_database_u_data = get_current_data_subset_from_c3_database(t_interval, lat_interval, lon_interval)
        end = time.time()
        database_times_1x1[n][i] = end - start
          


***Trial 0***
get data using files for 1 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-03 12:00:00 UTC in 25 time steps of 1.00 hour(s) resolution
get data using database for 1 days

get data using files for 2 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-04 12:00:00 UTC in 49 time steps of 1.00 hour(s) resolution
get data using database for 2 days

get data using files for 3 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
get data using database for 3 days

get data using files for 4 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-06 12:00:00 UTC in 97 time steps of 1.00 hour(s) resolution
get data using database for 4 days

get data using files for 5 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-07 12:00:00 UTC in 121 time steps of 1.00 hour(s) resolution
get data using database for 5 days


***Trial 1***
get data using files for 1 days
Subsetted data from 20

In [6]:
file_var = np.var(file_times_1x1, axis=0)
file_mean = np.mean(file_times_1x1, axis=0)
database_var = np.var(database_times_1x1, axis=0)
database_mean = np.mean(database_times_1x1, axis=0)

print("file:\n\tmean: {}\n\tvar: {}\n\ndatabase:\n\tmean: {}\n\tvar: {}".format(file_mean, file_var, database_mean, database_var))

file:
	mean: [1.8538355  2.31808281 2.89785949 3.69447128 4.02903986]
	var: [0.04045876 0.02098448 0.0314754  0.08742363 0.06771048]

database:
	mean: [18.34618036 18.04307504 17.7885772  18.62299089 18.09564626]
	var: [1.04474556 0.57995429 0.33370975 0.31703722 1.30089722]


fixed 2x2 lat-lon grid, varied time frame

In [7]:
num_trials = 6
timeframe = 5

#these are of shape (num_trials, timeframe)
file_times_2x2 = np.zeros(shape=(num_trials, timeframe))
database_times_2x2 = np.zeros(shape=(num_trials, timeframe))

lon_interval = [-90,-88]
lat_interval = [24, 26]

for n in range(num_trials):
    print("\n***Trial {}***".format(n))
    for i in range(timeframe):
        t_interval = [datetime(2021, 9, 2, 12, 0).timestamp(), datetime(2021, 9, 3 + i, 12, 0).timestamp()]

        print("get data using files for {} days".format(i+1))
        start = time.time()
        _, c3_file_u_data, _ = get_current_data_subset_from_c3_file(t_interval, lat_interval, lon_interval)
        end = time.time()
        file_times_2x2[n][i] = end - start
        
        print("get data using database for {} days\n".format(i+1))
        start = time.time()
        c3_database_u_data = get_current_data_subset_from_c3_database(t_interval, lat_interval, lon_interval)
        end = time.time()
        database_times_2x2[n][i] = end - start
        
        if n==0:
            print("sanity check: file output shape is {}".format(c3_file_u_data.shape))
            print("sanity check: database output shape is {}".format(c3_database_u_data.shape))
        


***Trial 0***
get data using files for 1 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-03 12:00:00 UTC in 25 time steps of 1.00 hour(s) resolution
get data using database for 1 days

sanity check: file output shape is (25, 51, 51)
sanity check: database output shape is (25, 51, 51)
get data using files for 2 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-04 12:00:00 UTC in 49 time steps of 1.00 hour(s) resolution
get data using database for 2 days

sanity check: file output shape is (49, 51, 51)
sanity check: database output shape is (49, 51, 51)
get data using files for 3 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
get data using database for 3 days

sanity check: file output shape is (73, 51, 51)
sanity check: database output shape is (73, 51, 51)
get data using files for 4 days
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-06 12:00:00 UTC in 97 time steps of 1.00 hour(s) 

In [8]:
file_var = np.var(file_times_2x2, axis=0)
file_mean = np.mean(file_times_2x2, axis=0)
database_var = np.var(database_times_2x2, axis=0)
database_mean = np.mean(database_times_2x2, axis=0)

print("file:\n\tmean: {}\n\tvar: {}\n\ndatabase:\n\tmean: {}\n\tvar: {}".format(file_mean, file_var, database_mean, database_var))

file:
	mean: [1.70359941 2.27754378 4.35405966 5.6173619  3.96298444]
	var: [1.11089262e-02 6.74401598e-03 8.34421237e+00 2.12287547e+01
 3.41539796e-02]

database:
	mean: [70.64219558 74.12515362 69.7883081  71.59635059 73.95672381]
	var: [ 8.25638858  7.52758677 12.77819794  9.44663634  2.33355941]


varied lat-lon grid size, fixed time scale of 3 days

In [10]:
num_trials = 5
gridsize_range = 6

#these are of shape (num_trials, gridsize_range)
file_times_3_days = np.zeros(shape=(num_trials, gridsize_range))
database_times_3_days = np.zeros(shape=(1, gridsize_range))

lon_intervals = [[-90,-89], [-90,-88], [-90,-87], [-91,-87], [-92,-87], [-93,-87]]
lat_intervals = [[24, 25], [24, 26], [24, 27], [23, 27], [22, 27], [22, 28]]
t_interval = [datetime(2021, 9, 2, 12, 0).timestamp(), datetime(2021, 9, 5, 12, 0).timestamp()]

for n in range(num_trials):
    print("\n***Trial {}***".format(n))
    for i in range(gridsize_range):
        lon_interval = lon_intervals[i]
        lat_interval = lat_intervals[i]
        print("get data using files for {}x{}".format(i+1, i+1))
        start = time.time()
        _, c3_file_u_data, _ = get_current_data_subset_from_c3_file(t_interval, lat_interval, lon_interval)
        end = time.time()
        file_times_3_days[n][i] = end - start
        
        if n==0:
            print("sanity check: output shape is {}".format(c3_file_u_data.shape))
        
        
for n in range(1):
    print("\n***Trial {}***".format(n))
    for i in range(gridsize_range):    
        lon_interval = lon_intervals[i]
        lat_interval = lat_intervals[i]
        print("get data using database for {}x{}\n".format(i+1, i+1))
        start = time.time()
        c3_database_u_data = get_current_data_subset_from_c3_database(t_interval, lat_interval, lon_interval)
        end = time.time()
        database_times_3_days[n][i] = end - start
        
        if n==0:
            print("sanity check: output shape is {}".format(c3_database_u_data.shape))
        


***Trial 0***
get data using files for 1x1
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
sanity check: output shape is (73, 26, 26)
get data using files for 2x2
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
sanity check: output shape is (73, 51, 51)
get data using files for 3x3
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
sanity check: output shape is (73, 76, 76)
get data using files for 4x4
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
sanity check: output shape is (73, 101, 101)
get data using files for 5x5
Subsetted data from 2021-09-02 12:00:00 UTC to 2021-09-05 12:00:00 UTC in 73 time steps of 1.00 hour(s) resolution
sanity check: output shape is (73, 126, 126)
get data using files for 6x6
Subsetted data fr

Json request to /api/1/dev/tc01d/HycomLatLongPair?action=evalMetrics failed with response ServerResponse(statusCode=504, content='<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n<hr><center>Microsoft-Azure-Application-Gateway/v2</center>\r\n</body>\r\n</html>\r\n', headers={'Server': 'Microsoft-Azure-Application-Gateway/v2', 'Date': 'Sat, 13 Nov 2021 23:03:13 GMT', 'Content-Type': 'text/html', 'Content-Length': '193', 'Connection': 'keep-alive'})


RuntimeError: Json request to /api/1/dev/tc01d/HycomLatLongPair?action=evalMetrics failed with response ServerResponse(statusCode=504, content='<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n<hr><center>Microsoft-Azure-Application-Gateway/v2</center>\r\n</body>\r\n</html>\r\n', headers={'Server': 'Microsoft-Azure-Application-Gateway/v2', 'Date': 'Sat, 13 Nov 2021 23:03:13 GMT', 'Content-Type': 'text/html', 'Content-Length': '193', 'Connection': 'keep-alive'})

In [16]:
file_var = np.var(file_times_3_days, axis=0)
file_mean = np.mean(file_times_3_days, axis=0)
database_var = np.var(database_times_3_days, axis=0)
database_mean = np.mean(database_times_3_days, axis=0)

print("file:\n\tmean: {}\n\tvar: {}\n\ndatabase:\n\tmean: {}\n\tvar: {}".format(file_mean, file_var, database_mean, database_var))

file:
	mean: [2.83002944 2.78199716 2.61506758 2.74448647 3.03419528 2.84414096]
	var: [0.02529586 0.01998459 0.00340468 0.00780639 0.61341426 0.01870923]

database:
	mean: [  16.83094835   69.68803072  178.31722736  286.96635199 1427.42446494
    0.        ]
	var: [0. 0. 0. 0. 0. 0.]


In [18]:
#save data
import pickle
d = {"file_times_1x1": file_times_1x1, "file_times_2x2": file_times_2x2, 
     "database_times_1x1": database_times_1x1, "database_times_2x2": database_times_2x2, 
     "file_times_3_days": file_times_3_days, "database_times_3_days": database_times_3_days}

filename = "test_output.txt"
f = open(filename,'wb')

pickle.dump(d,f)
f.close()

In [19]:
#load data
f = open(filename,'rb')
new_d = pickle.load(f)
f.close()