# access thredds

Get specific regions of the current data from THREDDS manually, instead of using the auto-generated regions.

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
import os
import time
import numpy as np
import xarray as xr

import utils
from parcels_utils import xr_dataset_to_fieldset

In [None]:
# OPeNDAP endpoints of the hourly US West Coast HFR datasets; only the
# resolution token differs between the three URLs.
_HFRNET_URL_TEMPLATE = (
    "http://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/USWC/{res}/hourly/RTV/"
    "HFRADAR_US_West_Coast_{res}_Resolution_Hourly_RTV_best.ncd"
)
dataset_url_6kmhourly = _HFRNET_URL_TEMPLATE.format(res="6km")
dataset_url_2kmhourly = _HFRNET_URL_TEMPLATE.format(res="2km")
dataset_url_1kmhourly = _HFRNET_URL_TEMPLATE.format(res="1km")

In [None]:
# NOTE: despite its name, this value is passed as the dask chunk *length*
# along the "time" dimension, not as a number of chunks.
num_chunks = 50

# Map each resolution key to the remote dataset it should be read from.
_url_by_resolution = {
    utils.DATA_6KM: dataset_url_6kmhourly,
    utils.DATA_2KM: dataset_url_2kmhourly,
    utils.DATA_1KM: dataset_url_1kmhourly,
}

# Open every remote dataset, chunked along time, keyed by resolution.
thredds_data = {
    resolution: xr.open_dataset(url, chunks={"time": num_chunks})
    for resolution, url in _url_by_resolution.items()
}

In [None]:
def get_region(data):
    """Turn one ``region_data`` entry into a region description dict.

    ``data`` is indexed positionally as
    ``(name, resolution, time range, lat range, lon range, expand range)``.

    When the last element is truthy, the lat/lon ranges are passed through
    ``utils.expand_coord_rng`` together with the coordinate values of the
    dataset for that resolution; otherwise they are used as given.

    Returns a dict with the keys ``name``, ``category``, ``time`` (a slice
    built by ``get_time_slice``), ``lat``, ``lon`` and ``domain`` (the
    S/N/W/E bounds taken from the lat/lon ranges).
    """
    category = data[1]
    time_window = get_time_slice(data[2])
    lat_bounds = data[3]
    lon_bounds = data[4]
    if data[5]:
        grid = thredds_data[category]
        lat_bounds = utils.expand_coord_rng(lat_bounds, grid["lat"].values)
        lon_bounds = utils.expand_coord_rng(lon_bounds, grid["lon"].values)
    return {
        "name": data[0],
        "category": category,
        "time": time_window,
        "lat": lat_bounds,
        "lon": lon_bounds,
        "domain": {
            "S": lat_bounds[0],
            "N": lat_bounds[1],
            "W": lon_bounds[0],
            "E": lon_bounds[1],
        },
    }

def get_latest_span(delta):
    """Return ``(start, end)`` where ``end`` is now and ``start`` is ``end - delta``.

    ``end`` is the current time at hour resolution, since the data is
    recorded hourly (GMT). ``delta`` is subtracted from it as given.
    """
    end = np.datetime64("now", "h")
    start = end - delta
    return (start, end)


def get_time_slice(time_range):
    """Build a ``slice`` of ``np.datetime64`` values from a time range.

    Parameters:
        time_range: a sequence of either ``(start, end)`` or
            ``(start, end, step)``. ``start`` and ``end`` are anything
            ``np.datetime64`` accepts; ``step`` is an integer step size in
            hours and is passed through unchanged.

    Returns:
        ``slice(start, end)`` or ``slice(start, end, step)``.

    Raises:
        ValueError: if ``time_range`` does not have 2 or 3 elements.
            Previously this case silently returned ``None``, which only
            failed later when the slice's ``.start`` was accessed.
    """
    if len(time_range) == 2:
        start, end = time_range
        return slice(np.datetime64(start), np.datetime64(end))
    if len(time_range) == 3:
        # step size is an integer in hours
        start, end, step = time_range
        return slice(np.datetime64(start), np.datetime64(end), step)
    raise ValueError(
        "time_range must be (start, end) or (start, end, step), "
        f"got {len(time_range)} elements"
    )
    

def get_regs_year(year, name, lat_rng, lon_rng):
    """Generate month-by-month region entries covering one calendar year.

    For every month of ``year`` three entries are produced, in the order
    1 km, 2 km, 6 km. Each entry has the ``region_data`` layout
    ``(name, resolution, time range, lat range, lon range, expand range)``,
    is named ``"{name}_{month}"``, and spans from the first hour of the
    month's first day to hour 23 of its last day. The expand flag is False
    for the 1 km entry and True for the other two.
    """
    # (resolution key, expand flag) for each entry emitted per month
    variants = (
        (utils.DATA_1KM, False),
        (utils.DATA_2KM, True),
        (utils.DATA_6KM, True),
    )
    one_month = np.timedelta64(1, "M")
    last_hour_offset = np.timedelta64(23, "h")

    entries = []
    for month in np.arange(str(year), str(year + 1), dtype="datetime64[M]"):
        month_days = np.arange(month, month + one_month, dtype="datetime64[D]")
        span = (np.datetime64(month_days[0], "h"), month_days[-1] + last_hour_offset)
        label = f"{name}_{month}"
        entries.extend(
            (label, resolution, span, lat_rng, lon_rng, expand)
            for resolution, expand in variants
        )
    return entries

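### format of region_data entries

Each entry is a tuple: (name, resolution, time range, lat range, lon range, expand range)

- name: name of the region, also used as the saved file name
- resolution: which dataset to read from (one of the `utils.DATA_*` keys)
- time range: (start, end) or (start, end, step), where step is an integer number of hours
- lat range: (south, north)
- lon range: (west, east)
- expand range: if True, the lat/lon ranges are passed through `utils.expand_coord_rng` before use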

### about tj_sample

The purpose of tj_sample is to provide a quick and dirty way to sample the THREDDS data at many different times, in order to find the positions where data exists. Data from close time ranges could all have the same holes, so from those alone we would never know whether data was supposed to be there in the first place.

So tj_sample is generated for the sole purpose of creating a mask showing where data shouldn't exist.

In [None]:
# Regions to download. Each entry follows the layout
#   (name, resolution, time range, lat range, lon range, expand range)
# Uncomment an entry to include that region.
region_data = [
    # ("tj_sample", utils.DATA_1KM, ("2020-01-10T00", "2020-08-11T00", 200), (32.11093, 32.73124), (-117.565, -116.9924), False),
    # ("tj_plume", utils.DATA_1KM, ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), False),
    # ("tj_plume", utils.DATA_2KM, ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), True),
    # ("tj_plume", utils.DATA_6KM, ("2020-08-01T01", "2020-08-14T13"), (32.11093, 32.73124), (-117.565, -116.9924), True),
    # ("tijuana_river", utils.DATA_1KM, ("2020-06-16T21", "2020-06-23T21"), (32.528, 32.71), (-117.29, -117.11), False),
    # ("tijuana_river", utils.DATA_2KM, ("2020-06-16T21", "2020-06-23T21"), (32.524, 32.75), (-117.32, -117.09), False),
    # ("tijuana_river", utils.DATA_6KM, ("2020-06-16T21", "2020-06-23T21"), (32.35, 32.80), (-117.33, -116.9), False),
    # ("tijuana_river_small", utils.DATA_1KM, ("2020-06-16T21", "2020-06-23T21"), (32.528, 32.6), (-117.19, -117.11), False),
    # ("tijuana_river_now", utils.DATA_1KM, get_latest_span(np.timedelta64(300, "D")), (32.528, 32.71), (-117.29, -117.11), False),
    # ("tijuana_river_now", utils.DATA_2KM, get_latest_span(np.timedelta64(300, "D")), (32.524, 32.75), (-117.32, -117.09), False),
    # ("tijuana_river_now", utils.DATA_6KM, ("2019-09-28T21:00", "2020-07-24T20"), (32.35, 32.80), (-117.33, -116.9), False),
    (
        "missing_buoy",
        utils.DATA_6KM,
        ("2021-01-29T05", "2021-02-02T16"),
        (33.15, 33.778072),
        (-118.697986, -117.6),
        False,
    ),
]


# for rd in get_regs_year(2020, "tj_plume", (32.11093, 32.73124), (-117.565, -116.9924)):
#     region_data.append(rd)

In [None]:
# Resolve every region_data entry and attach the matching subset of the
# remote dataset to it.
regions = []
for spec in region_data:
    region = get_region(spec)
    window = region["time"]
    # a window that starts now or later has no data yet, so skip it
    if window.start >= np.datetime64("now", "h"):
        print(f"data starting from time {window.start} is in the future")
        continue
    source = thredds_data[region["category"]]
    lat_window = slice(region["lat"][0], region["lat"][1])
    lon_window = slice(region["lon"][0], region["lon"][1])
    subset = source.sel(time=window, lat=lat_window, lon=lon_window)
    region["dataset"] = subset
    regions.append(region)
    size_mb = subset.nbytes / 1024 / 1024
    print(f"region {region['name']} data megabytes: {size_mb}")

In [None]:
# Write each selected region to a netCDF file named after the region, in
# the directory that belongs to its resolution.
for region in regions:
    resolution_dir = utils.CURRENT_NETCDF_DIR / utils.filename_dict[region["category"]]
    save_dir = utils.create_path(resolution_dir)
    target = save_dir / f"{region['name']}.nc"
    region["dataset"].to_netcdf(target)
    print(f"saved to {target}")
print("done")