In [8]:
# Import standard libraries
from pathlib import Path
from datetime import datetime

# Import third-party libraries
import numpy as np
import pandas as pd
import scipy as sp
import xarray as xr


In [9]:
# Notebook-wide constants.

# Directory holding the input files, resolved against the current working directory.
DATA_DIR = Path.cwd().joinpath("data/")

# Standard depth levels (presumably metres -- TODO confirm against the data source).
STANDARD_DEPTHS = np.array(
    [0, 10, 20, 30, 50, 75, 100, 125, 150, 200, 250, 300, 400, 500]
)

In [10]:
def concat_data_array(
    param_name: str,
    sdate: str,
    edate: str,
    data_dir: "Path | str | None" = None,
) -> np.ndarray:
    """Concatenate one column from the monthly ``sooList_YYYYMM.json`` files.

    Files are visited in chronological order for every calendar month from the
    month of ``sdate`` through the month of ``edate`` (both inclusive). Months
    whose file does not exist are skipped. Selection is by month only: the day
    component of the bounds does not filter rows inside a file.

    Parameters
    ----------
    param_name : str
        Name of the column to extract from each file.
    sdate, edate : str
        Range bounds formatted as ``"%Y-%m-%d"``.
    data_dir : Path or str, optional
        Directory containing the JSON files. Defaults to the notebook-level
        ``DATA_DIR`` constant.

    Returns
    -------
    np.ndarray
        The column values of all files joined end to end, with empty strings
        and ``None`` replaced by NaN. An empty array is returned when no file
        is found (including when ``edate`` precedes ``sdate``).

    Raises
    ------
    ValueError
        If ``sdate`` or ``edate`` does not match ``"%Y-%m-%d"``.
    KeyError
        If a file that exists does not contain ``param_name``.
    """
    root = DATA_DIR if data_dir is None else Path(data_dir)
    start = datetime.strptime(sdate, "%Y-%m-%d")
    end = datetime.strptime(edate, "%Y-%m-%d")

    # Number the months consecutively so the range can start and stop mid-year.
    first_month = start.year * 12 + (start.month - 1)
    last_month = end.year * 12 + (end.month - 1)

    chunks = []
    for ordinal in range(first_month, last_month + 1):
        year, month_zero_based = divmod(ordinal, 12)
        path = root / f"sooList_{year}{month_zero_based + 1:02d}.json"
        if not path.is_file():
            continue
        frame = pd.read_json(path)
        column = frame[param_name].replace(["", None], np.nan)
        chunks.append(column.values)

    if chunks:
        return np.concatenate(chunks)
    return np.array([])


In [11]:
sdate = "1968-01-01"
edate = "2024-12-31"


def _load_columns(param_names, sdate, edate):
    """Read every monthly file once and collect several columns in one pass.

    Visits ``DATA_DIR / "sooList_YYYYMM.json"`` for every month of every year
    from the year of ``sdate`` through the year of ``edate``, skipping files
    that do not exist. Each file is parsed a single time and all requested
    columns are taken from it, instead of re-reading the whole archive once
    per column.

    Returns a dict mapping each name in ``param_names`` to a NumPy array of
    that column's values across all files, with empty strings and ``None``
    replaced by NaN; the array is empty when no file was found.
    """
    chunks = {name: [] for name in param_names}
    first_year = datetime.strptime(sdate, "%Y-%m-%d").year
    last_year = datetime.strptime(edate, "%Y-%m-%d").year
    for year in range(first_year, last_year + 1):
        for month in range(1, 13):
            path = DATA_DIR / f"sooList_{year}{month:02d}.json"
            if not path.is_file():
                continue
            frame = pd.read_json(path)
            for name in param_names:
                chunks[name].append(frame[name].replace(["", None], np.nan).values)
    return {
        name: np.concatenate(parts) if parts else np.array([])
        for name, parts in chunks.items()
    }


_columns = _load_columns(
    ("wtr_tmp", "wtr_dep", "obs_dtm", "lon", "lat", "sal", "dox"), sdate, edate
)
temperature = _columns["wtr_tmp"]
depth = _columns["wtr_dep"]
obs_time = _columns["obs_dtm"]
longitude = _columns["lon"]
latitude = _columns["lat"]
salinity = _columns["sal"]
dissolved_oxygen = _columns["dox"]


KeyboardInterrupt: 

In [None]:


def save_sparse_to_netcdf_spatiotemporal(
    df: pd.DataFrame, filename: str, output_dir: "str | Path" = DATA_DIR
) -> None:
    """Write observations to a NetCDF4 file as a sparse list of indexed points.

    The file contains four coordinate axes (``t``, ``x``, ``y``, ``z``) holding
    the sorted unique non-missing values of the ``datetime``, ``longitude``,
    ``latitude`` and ``depth`` columns, and, along the ``n_obs`` dimension,
    one entry per retained row: its position on each axis (``t_idx``,
    ``x_idx``, ``y_idx``, ``z_idx``) and its ``wtr_tmp``, ``sal`` and ``dox``
    values as float32.

    A row is retained when all four coordinates are present and at least one
    of ``temperature``, ``salinity`` and ``dissolved_oxygen`` is present.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``datetime``, ``longitude``, ``latitude``,
        ``depth``, ``temperature``, ``salinity`` and ``dissolved_oxygen``.
    filename : str
        Name of the file to create. A bare file name (no directory part) is
        placed inside ``output_dir``; a name that carries a directory part,
        relative or absolute, is used as given.
    output_dir : str or Path
        Directory used for bare file names.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    # Function-scope import: the notebook's import cell does not bind ``nc``.
    import netCDF4 as nc

    coord_cols = ["datetime", "longitude", "latitude", "depth"]
    value_cols = ["temperature", "salinity", "dissolved_oxygen"]

    # Parse timestamps once; missing entries become NaT.
    times = pd.to_datetime(df["datetime"])

    # Coordinate axes. np.unique returns sorted values, and dropping missing
    # entries first keeps NaN / NaT out of the axes.
    t_vals = np.unique(times.dropna().to_numpy())
    x_vals = np.unique(df["longitude"].dropna().to_numpy(dtype=np.float64))
    y_vals = np.unique(df["latitude"].dropna().to_numpy(dtype=np.float64))
    z_vals = np.unique(df["depth"].dropna().to_numpy(dtype=np.float64))

    # Keep rows that are fully located and carry at least one measurement.
    has_coords = df[coord_cols].notna().all(axis=1)
    has_value = df[value_cols].notna().any(axis=1)
    keep = has_coords & has_value
    obs = df.loc[keep]

    # Each retained coordinate is present in its axis by construction, so
    # searchsorted on the sorted axis returns its exact position.
    t_idx = np.searchsorted(t_vals, times[keep].to_numpy()).astype(np.int32)
    x_idx = np.searchsorted(
        x_vals, obs["longitude"].to_numpy(dtype=np.float64)
    ).astype(np.int32)
    y_idx = np.searchsorted(
        y_vals, obs["latitude"].to_numpy(dtype=np.float64)
    ).astype(np.int32)
    z_idx = np.searchsorted(
        z_vals, obs["depth"].to_numpy(dtype=np.float64)
    ).astype(np.int32)
    n_obs = len(obs)

    # Resolve the destination: bare names go to output_dir.
    target = Path(filename)
    if target.parent == Path("."):
        target = Path(output_dir) / target
    target.parent.mkdir(parents=True, exist_ok=True)

    with nc.Dataset(str(target), "w", format="NETCDF4") as ds:
        ds.createDimension("n_obs", n_obs)
        ds.createDimension("t", len(t_vals))
        ds.createDimension("x", len(x_vals))
        ds.createDimension("y", len(y_vals))
        ds.createDimension("z", len(z_vals))

        # Coordinate variables; timestamps are stored as variable-length strings.
        time_var = ds.createVariable("t", str, ("t",))
        time_var[:] = np.array([str(stamp) for stamp in t_vals], dtype=object)
        ds.createVariable("x", "f4", ("x",))[:] = x_vals
        ds.createVariable("y", "f4", ("y",))[:] = y_vals
        ds.createVariable("z", "f4", ("z",))[:] = z_vals

        # Index variables: position of every observation on each axis.
        ds.createVariable("t_idx", "i4", ("n_obs",))[:] = t_idx
        ds.createVariable("x_idx", "i4", ("n_obs",))[:] = x_idx
        ds.createVariable("y_idx", "i4", ("n_obs",))[:] = y_idx
        ds.createVariable("z_idx", "i4", ("n_obs",))[:] = z_idx

        # Sparse data variables as float32; NaN marks a missing measurement.
        for var_name, column in (
            ("wtr_tmp", "temperature"),
            ("sal", "salinity"),
            ("dox", "dissolved_oxygen"),
        ):
            data_var = ds.createVariable(var_name, "f4", ("n_obs",), fill_value=np.nan)
            data_var[:] = obs[column].to_numpy(dtype=np.float32, na_value=np.nan)

        ds.title = "KODC Serial Oceanographic Observation Data"
        ds.history = "Created by script (sparse spatiotemporal representation)"

