# access thredds

Download sliced netcdf data from Thredds

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import os
import time
import numpy as np
import xarray as xr

import src.utils as utils
from src.thredds_utils import ThreddsCode, slice_dataset, retrieve_dataset
from src.parcels_utils import xr_dataset_to_fieldset

In [None]:
def get_latest_span(delta):
    """Return the ``(start, end)`` time span of length ``delta`` ending now.

    Args:
        delta: amount of time to look back from the current hour
            (e.g. a ``np.timedelta64``).

    Returns:
        A 2-tuple ``(now - delta, now)`` where ``now`` is the current time
        at hour resolution.
    """
    # Times are in GMT and the data is recorded hourly, so work in whole hours.
    end = np.datetime64("now", "h")
    start = end - delta
    return start, end


def get_time_slice(time_range):
    """Convert a time range into a ``slice`` of ``np.datetime64`` bounds.

    Args:
        time_range: either ``(start, stop)`` or ``(start, stop, step)``.
            ``start`` and ``stop`` are anything ``np.datetime64`` accepts;
            ``step`` is passed through unchanged as the slice step (an
            integer number of hours, per the original author's note).

    Returns:
        ``slice(np.datetime64(start), np.datetime64(stop)[, step])``.

    Raises:
        ValueError: if ``time_range`` does not have exactly 2 or 3 elements.
            Previously this case fell through and silently returned ``None``.
    """
    if len(time_range) not in (2, 3):
        raise ValueError(
            f"time_range must have 2 or 3 elements, got {len(time_range)}"
        )
    # `step` is an empty list for the 2-element form, so the slice step stays None.
    start, stop, *step = time_range
    return slice(np.datetime64(start), np.datetime64(stop), *step)
    

def get_regs_year(year, name, lat_rng, lon_rng):
    """Build month-by-month region specs for ``year`` at each USWC hourly resolution.

    For every calendar month of ``year`` one spec dict is produced per
    resolution code (1 km, 2 km, 6 km), covering the first hour of the
    month's first day through hour 23 of its last day.

    Args:
        year: integer year to cover.
        name: prefix for each region name; the month is appended as
            ``"{name}_{month}"``.
        lat_rng: latitude range stored under ``"lat_range"``.
        lon_rng: longitude range stored under ``"lon_range"``.

    Returns:
        A list of dicts with the keys ``name``, ``code``, ``time_range``,
        ``lat_range``, ``lon_range`` and ``include_range_endpoints``.

    NOTE(review): the three resolutions of a given month all get the same
    ``name``; confirm this is intended if names are used as file names.
    """
    resolutions = (
        ThreddsCode.USWC_1KM_HOURLY,
        ThreddsCode.USWC_2KM_HOURLY,
        ThreddsCode.USWC_6KM_HOURLY,
    )
    one_month = np.timedelta64(1, "M")
    last_hour_offset = np.timedelta64(23, "h")

    specs = []
    for month in np.arange(str(year), str(year + 1), dtype="datetime64[M]"):
        # Every day in this month, so the span ends on the month's real last day.
        month_days = np.arange(month, month + one_month, dtype="datetime64[D]")
        first_day, last_day = month_days[0], month_days[-1]
        span = (np.datetime64(first_day, "h"), last_day + last_hour_offset)
        specs.extend(
            {
                "name": f"{name}_{month}",
                "code": res,
                "time_range": span,
                "lat_range": lat_rng,
                "lon_range": lon_rng,
                # Only the 1 km product is sliced without its range endpoints.
                "include_range_endpoints": not (res == ThreddsCode.USWC_1KM_HOURLY),
            }
            for res in resolutions
        )
    return specs

### format of region_data stuff

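Each entry of `region_data` is a dict with the keys `name`, `code` (the Thredds resolution code), `time_range`, `lat_range`, `lon_range`, `include_range_endpoints` (whether to expand the range to include its endpoints), and optionally `generate_mask`. Older, commented-out entries still use the tuple form (name, resolution, time range, lat range, lon range, expand range).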

### about tj_sample

The purpose of tj_sample is to provide a quick and dirty way to sample the Thredds data at many different times in order to find the positions where data exists. Data from close time ranges could all have the same holes, and we would never know whether data was supposed to be there in the first place.

so tj_sample is generated for the sole purpose of creating a mask showing where data shouldn't exist.

## data masks

Where is there data? Not every timestep of HFR data is complete, so we need to know which NaN points were supposed to have data and which ones were never meant to have data.

A good way to find this out is to take several slices of data over a long period of time and check the coverage at each timestamp. This is the easiest way to see the true coverage of HFR.

In [None]:
# Regions to download. Each active entry is a dict with the keys:
#   name                     label for the region; also used as the output file name
#   code                     ThreddsCode passed to slice_dataset
#   time_range               (start, end) or (start, end, step)
#   lat_range                (min, max) latitude
#   lon_range                (min, max) longitude
#   include_range_endpoints  passed to slice_dataset as `inclusive`
#   generate_mask            optional; when True a "<name>_mask" dataset is also built
region_data = [
#     {
#         "name": "tj_sample",
#         "code": ThreddsCode.USWC_1KM_HOURLY,
#         "time_range": ("2019-01-01T00", "2021-01-01T00", 300),
#         "lat_range": (32.11093, 32.73124),
#         "lon_range": (-117.565, -116.9924),
#         "include_range_endpoints": False
#     },
    # {
    #     "name": "hunington_latest",
    #     "code": ThreddsCode.USWC_6KM_HOURLY,
    #     "time_range": ("2021-10-01T00", "2021-10-10T00"),
    #     "lat_range": (32, 34.2),
    #     "lon_range": (-119, -117.4),
    #     "include_range_endpoints": True
    # },
    {
        "name": "tj_plume_1km_2020-03",
        "code": ThreddsCode.USWC_1KM_HOURLY,
        "time_range": ("2020-03-09T01:00", "2020-03-14T01:00"),
        "lat_range": (32.11093, 32.73124),
        "lon_range": (-117.565, -116.9924),
        "include_range_endpoints": True,
        "generate_mask": True
    },
    # {
    #   "name": "hycom_mwbproj",
    #   "code": ThreddsCode.DATA_HYCOMFORE,
    #   "time_range": (np.datetime64("now") - np.timedelta64(1, "D"), np.datetime64("now") + np.timedelta64(7, "D")),
    #   "lat_range": (22, 45),
    #   "lon_range": (273, 295),
    #   "include_range_endpoints": False
    # },
    # stuff below here hasn't been updated with the new format; it still uses the old tuple form:
    # (name, region code, time range, lat range, lon range, include domain endpoints)
#     ("tj_plume", ThreddsCode.USWC_1KM_HOURLY, ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), False),
#     ("tj_plume", ThreddsCode.USWC_2KM_HOURLY, ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), True),
#     ("tj_plume", ThreddsCode.USWC_6KM_HOURLY, ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), True),
#     ("tijuana_river", ThreddsCode.USWC_1KM_HOURLY, ("2020-06-16T21", "2020-06-23T21"), (32.528, 32.71), (-117.29, -117.11), False),
#     ("tijuana_river", ThreddsCode.USWC_2KM_HOURLY, ("2020-06-16T21", "2020-06-23T21"), (32.524, 32.75), (-117.32, -117.09), False),
#     ("tijuana_river", ThreddsCode.USWC_6KM_HOURLY, ("2020-06-16T21", "2020-06-23T21"), (32.35, 32.80), (-117.33, -116.9), False),
#     ("tijuana_river_small", ThreddsCode.USWC_1KM_HOURLY, ("2020-06-16T21", "2020-06-23T21"), (32.528, 32.6), (-117.19, -117.11), False)
#     ("tijuana_river_now", ThreddsCode.USWC_1KM_HOURLY, get_latest_span(np.timedelta64(300, "D")), (32.528, 32.71), (-117.29, -117.11), False),
#     ("tijuana_river_now", ThreddsCode.USWC_2KM_HOURLY, get_latest_span(np.timedelta64(300, "D")), (32.524, 32.75), (-117.32, -117.09), False),
#     ("tijuana_river_now", ThreddsCode.USWC_6KM_HOURLY, ("2019-09-28T21:00", "2020-07-24T20"), (32.35, 32.80), (-117.33, -116.9), False),
#     ("missing_buoy", ThreddsCode.USWC_6KM_HOURLY, ("2021-01-29T05", "2021-02-02T16"), (33.15, 33.778072), (-118.697986, -117.6), False)
]


# for rd in get_regs_year(2020, "tj_plume", (32.11093, 32.73124), (-117.565, -116.9924)):
#     region_data.append(rd)

In [None]:
# Number of evenly spaced timesteps sampled across the full record when
# building a coverage mask; should be enough of a sample to show coverage.
MASK_SAMPLE_COUNT = 50

# Download every configured region; each result is stored as
# {"name": ..., "dataset": ...} in `regions` for the save step.
regions = []
for rd in region_data:
    dataset = slice_dataset(
        rd["code"], rd["time_range"], rd["lat_range"],
        rd["lon_range"], inclusive=rd["include_range_endpoints"]
    )
    new_reg = {"name": rd["name"], "dataset": dataset}
    regions.append(new_reg)
    print(f"region {new_reg['name']} data megabytes: {new_reg['dataset'].nbytes / 1024 / 1024}")
    if rd.get("generate_mask", False):
        # Automatically generate a mask: sample the same lat/lon box across
        # the whole time extent of the source dataset.
        full = retrieve_dataset(rd["code"])
        time_min = full["time"].min()
        time_max = full["time"].max()
        hours = (time_max - time_min) / np.timedelta64(1, "h")
        # Clamp to 1 so a record shorter than MASK_SAMPLE_COUNT hours does not
        # produce a step of 0, which is not a usable stride.
        step = max(1, int(hours / MASK_SAMPLE_COUNT))
        mask = slice_dataset(
            rd["code"], time_range=(time_min.values, time_max.values, step),
            lat_range=rd["lat_range"], lon_range=rd["lon_range"],
            inclusive=rd["include_range_endpoints"]
        )
        new_reg = {"name": f"{rd['name']}_mask", "dataset": mask}
        regions.append(new_reg)
        print(f"region {new_reg['name']} data megabytes: {new_reg['dataset'].nbytes / 1024 / 1024}")

In [None]:
# Write every downloaded region to "<name>.nc" in the current netcdf directory.
# The target directory does not depend on the region, so resolve it once.
save_dir = utils.create_path(utils.CURRENT_NETCDF_DIR)
for r in regions:
    filename = f"{r['name']}.nc"
    target = save_dir / filename
    # save file
    r["dataset"].to_netcdf(target)
    print(f"saved to {target}")
print("done")