# access thredds

Download sliced NetCDF data from a THREDDS server.

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import os
import time
import numpy as np
import xarray as xr

from pyplume.constants import FIELD_NETCDF_DIR
from pyplume.dataloaders import slice_dataset, dataset_to_fieldset
from pyplume import utils
from pyplume.thredds_data import retrieve_dataset, retrieve_dataloader

In [None]:
def get_latest_span(delta):
    """Return the ``(start, end)`` time span of length ``delta`` ending now.

    Args:
        delta: amount of time to look back from the current hour
            (e.g. ``np.timedelta64(300, "D")``).

    Returns:
        A 2-tuple ``(now - delta, now)`` where ``now`` is the current time
        at hour resolution.
    """
    # GMT, truncated to the hour because the data is recorded hourly
    span_end = np.datetime64("now", "h")
    span_start = span_end - delta
    return span_start, span_end

def get_regs_year(year, name, lat_rng, lon_rng, datasrc="THREDDS_HFRNET_UCSD"):
    """Build one region entry per month and per USWC resolution for a year.

    Each returned dict has the keys that the download loop in this file
    reads (``name``, ``datasrc``, ``code``, ``time_range``, ``lat_range``,
    ``lon_range``, ``include_range_endpoints``).

    Args:
        year: calendar year (int) to generate monthly regions for.
        name: base name of the region; the resolution and month are appended,
            e.g. ``"tj_plume"`` -> ``"tj_plume_1km_2020-03"``.
        lat_rng: latitude range tuple, stored as-is under ``lat_range``.
        lon_rng: longitude range tuple, stored as-is under ``lon_range``.
        datasrc: data source identifier stored under ``datasrc``.

    Returns:
        A list of region dicts, ordered by month and then by resolution
        (1km, 2km, 6km).
    """
    # (dataset code, tag used in the region name)
    resolutions = (
        ("USWC_1KM_HOURLY", "1km"),
        ("USWC_2KM_HOURLY", "2km"),
        ("USWC_6KM_HOURLY", "6km"),
    )
    regions = []
    months = np.arange(str(year), str(year + 1), dtype="datetime64[M]")
    for m in months:
        days = np.arange(m, m + np.timedelta64(1, "M"), dtype="datetime64[D]")
        # span from the first hour of the first day to the last hour of the last day
        timerng = (np.datetime64(days[0], "h"), days[-1] + np.timedelta64(23, "h"))
        for code, res_tag in resolutions:
            regions.append({
                # the resolution tag keeps the three datasets of a month from
                # sharing one name (and therefore one output file)
                "name": f"{name}_{res_tag}_{m}",
                "datasrc": datasrc,
                "code": code,
                "time_range": timerng,
                "lat_range": lat_rng,
                "lon_range": lon_rng,
                # endpoints are excluded only for the 1 km dataset
                "include_range_endpoints": code != "USWC_1KM_HOURLY",
            })
    return regions

### format of region_data stuff

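Each region is a dict with the keys `name`, `datasrc`, `code` (the dataset code, which includes the resolution), `time_range`, `lat_range`, `lon_range`, `include_range_endpoints`, and optionally `generate_mask`. Older entries that have not been converted yet use the legacy tuple form: (name, resolution, time range, lat range, lon range, expand range)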

### about tj_sample

`tj_sample` is a quick-and-dirty way to sample the THREDDS data at many different times to find out where data exists. Data from nearby times could all share the same holes, so from a short time range alone we would never know whether data was supposed to be there in the first place.

So `tj_sample` is generated for the sole purpose of creating a mask showing where data shouldn't exist.

## data masks

Where is there data? Not every timestep of HFR data is complete, so we need to know which NaN points were supposed to have data and which were never meant to have any.

A good way to find this out is to take several slices of data over a long period of time and check the coverage of each timestamp. This is the easiest way to see the true coverage of HFR.

In [None]:
# Region reference: every active entry is a dict with the keys
#   name                     base name of the files written for the region
#   datasrc                  data source identifier
#   code                     dataset code within that source
#   time_range               time bounds, given as (start, end)
#   lat_range / lon_range    coordinate bounds, given as (lower, upper)
#   include_range_endpoints  forwarded as `inclusive` when retrieving the data
#   generate_mask            optional; when True a mask file is saved as well
region_data = [
    # --- archived examples in the current dict format ---
    # {
    #     "name": "tj_sample",
    #     "datasrc": "THREDDS_HFRNET_UCSD",
    #     "code": "USWC_1KM_HOURLY",
    #     "time_range": ("2019-01-01T00", "2021-01-01T00", 300),
    #     "lat_range": (32.11093, 32.73124),
    #     "lon_range": (-117.565, -116.9924),
    #     "include_range_endpoints": False
    # },
    # {
    #     "name": "hunington_latest",
    #     "datasrc": "THREDDS_HFRNET_UCSD",
    #     "code": "USWC_6KM_HOURLY",
    #     "time_range": ("2021-10-01T00", "2021-10-10T00"),
    #     "lat_range": (32, 34.2),
    #     "lon_range": (-119, -117.4),
    #     "include_range_endpoints": True
    # },
    # {
    #     "name": "tj_plume_1km_2020-03",
    #     "datasrc": "THREDDS_HFRNET_UCSD",
    #     "code": "USWC_1KM_HOURLY",
    #     "time_range": ("2020-03-09T01:00", "2020-03-14T01:00"),
    #     "lat_range": (32.11093, 32.73124),
    #     "lon_range": (-117.565, -116.9924),
    #     "include_range_endpoints": True,
    #     "generate_mask": True
    # },
    # {
    #     "name": "hycom_mwbproj",
    #     "datasrc": "THREDDS_HYCOM",
    #     "code": "FMRC_HYCOM",
    #     "time_range": (np.datetime64("now") - np.timedelta64(1, "D"), np.datetime64("now") + np.timedelta64(7, "D")),
    #     "lat_range": (22, 45),
    #     "lon_range": (273, 295),
    #     "include_range_endpoints": False
    # },
    # {
    #     "name": "tj_plume_1km_2022-09",
    #     "datasrc": "THREDDS_HFRNET_UCSD",
    #     "code": "USWC_1KM_HOURLY",
    #     "time_range": ("2022-09-01T00:00", "2022-09-30T23:00"),
    #     "lat_range": (32.11093, 32.73124),
    #     "lon_range": (-117.565, -116.9924),
    #     "include_range_endpoints": True,
    #     "generate_mask": True
    # },
    # --- the only region currently enabled ---
    {
        "name": "hycom_hurrhenri",
        "datasrc": "THREDDS_HYCOM",
        "code": "GLOBAL_HINDCAST",
        "time_range": ("2021-08-21T12:00", "2021-08-23T18:00"),
        "lat_range": (38.162201, 41.520008),
        "lon_range": (284.290368, 290.276249),
        "include_range_endpoints": True,
        "generate_mask": False,
    },
    # --- legacy tuple entries: these haven't been updated to the dict format ---
    # ("tj_plume", "USWC_1KM_HOURLY", ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), False),
    # ("tj_plume", "USWC_2KM_HOURLY", ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), True),
    # ("tj_plume", "USWC_6KM_HOURLY", ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), True),
    # ("tijuana_river", "USWC_1KM_HOURLY", ("2020-06-16T21", "2020-06-23T21"), (32.528, 32.71), (-117.29, -117.11), False),
    # ("tijuana_river", "USWC_2KM_HOURLY", ("2020-06-16T21", "2020-06-23T21"), (32.524, 32.75), (-117.32, -117.09), False),
    # ("tijuana_river", "USWC_6KM_HOURLY", ("2020-06-16T21", "2020-06-23T21"), (32.35, 32.80), (-117.33, -116.9), False),
    # ("tijuana_river_small", "USWC_1KM_HOURLY", ("2020-06-16T21", "2020-06-23T21"), (32.528, 32.6), (-117.19, -117.11), False)
    # ("tijuana_river_now", "USWC_1KM_HOURLY", get_latest_span(np.timedelta64(300, "D")), (32.528, 32.71), (-117.29, -117.11), False),
    # ("tijuana_river_now", "USWC_2KM_HOURLY", get_latest_span(np.timedelta64(300, "D")), (32.524, 32.75), (-117.32, -117.09), False),
    # ("tijuana_river_now", "USWC_6KM_HOURLY", ("2019-09-28T21:00", "2020-07-24T20"), (32.35, 32.80), (-117.33, -116.9), False),
    # ("missing_buoy", "USWC_6KM_HOURLY", ("2021-01-29T05", "2021-02-02T16"), (33.15, 33.778072), (-118.697986, -117.6), False)
]


# for rd in get_regs_year(2020, "tj_plume", (32.11093, 32.73124), (-117.565, -116.9924)):
#     region_data.append(rd)

In [None]:
# Download every configured region and write it into the NetCDF field directory.
save_dir = utils.get_dir(FIELD_NETCDF_DIR)
regions = []
for rd in region_data:
    print(rd)
    # the dataloader is used as a context manager for the duration of the saves
    with retrieve_dataloader(
        rd["datasrc"],
        rd["code"],
        time_range=rd["time_range"],
        lat_range=rd["lat_range"],
        lon_range=rd["lon_range"],
        inclusive=rd["include_range_endpoints"],
    ) as dl:
        nc_path = save_dir / f"{rd['name']}.nc"
        dl.save(nc_path)
        # the mask is optional: it is only written when the entry asks for it
        if rd.get("generate_mask", False):
            mask_path = save_dir / f"{rd['name']}_mask.npy"
            dl.save_mask(mask_path, num_samples=50)