In [1]:
import xarray as xr
import numpy as np
import pandas as pd
from dask.distributed import Client, wait
from dask import array as darr

from pathlib import Path

import hextraj

In [2]:
import cartopy
from matplotlib import pyplot as plt

## Parameters

We need to specify a few things for the trajectory data (location, chunks).

We also need to specify the hex projection. We choose a central position well within the NWE shelf region (3°W, 54.5°N) and a hex size (radius) of 10 km.

In [3]:
# parameters

# --- trajectory input ---
# Directory holding the trajectory output files.
data_path = "../../../output_data/010_lagrangian_experiment/"

# Chunk sizes for the "traj" and "obs" dimensions.  The underscore-prefixed
# and plain names carry identical values; both are kept.
_chunks_traj = chunks_traj = 20_000
_chunks_obs = chunks_obs = -1

# --- hex projection ---
# ~ center of NWE Shelf dataset (longitude, latitude)
lon_origin, lat_origin = -3.0, 54.5

hex_size_meters = 10_000  # radius of hexes

max_land_allowed = 0

## Open the trajectory data

In [4]:
# Collect, per release year, the sorted list of trajectory files whose name
# matches the 15 May start reference and the 10 m start-depth tag.
_release_years = [2019, 2020, 2021, 2022]

data_files = {}
for _year in _release_years:
    _candidates = Path(data_path).glob(f"*{_year:4d}-05-15*.nc")
    data_files[_year] = sorted(
        _path for _path in _candidates if "start-depth-meters-010" in _path.name
    )

# Quick sanity check: first file, number of files per year, last file.
print(data_files[2019][0])
for _year in _release_years:
    print(len(data_files[_year]))
print(data_files[2019][-1])

../../../output_data/010_lagrangian_experiment/north-sea_oysters_start-ref-2019-05-15T00:00:00_start-offset-000_RT-0028_N-000100000_seed-576251074_start-depth-meters-010.nc
122
122
122
73
../../../output_data/010_lagrangian_experiment/north-sea_oysters_start-ref-2019-05-15T00:00:00_start-offset-121_RT-0028_N-000100000_seed-723534670_start-depth-meters-010.nc


In [5]:
from tqdm import tqdm

In [None]:
# Keep only the files that xarray can actually open.  Files that fail to open
# are skipped (best effort), but the failure is reported so that dropped
# inputs do not go unnoticed.
data_files_valid = {}
for y in data_files.keys():
    _valid_files = []
    for f in tqdm(data_files[y]):
        try:
            # Open only to probe readability; the context manager closes the
            # file handle again right away instead of leaking one per file.
            with xr.open_dataset(f):
                pass
        except Exception as err:
            # `Exception` (not a bare `except`) so that KeyboardInterrupt /
            # SystemExit still stop the loop.  tqdm.write keeps the progress
            # bar intact while printing.
            tqdm.write(f"Skipping unreadable file {f}: {err!r}")
        else:
            _valid_files.append(f)
    data_files_valid[y] = _valid_files

100%|██████████| 122/122 [00:25<00:00,  4.70it/s]
100%|██████████| 122/122 [00:26<00:00,  4.62it/s]
 10%|▉         | 12/122 [00:02<00:20,  5.42it/s]

In [None]:
# From here on `data_files` refers to the filtered mapping (only files that
# could be opened).  This rebinds the name, so the unfiltered mapping is no
# longer reachable through `data_files`.
data_files = data_files_valid

In [None]:
# Open one lazily loaded dataset per year.  The files of a year are
# concatenated along a new "offset" dimension, in the order in which they are
# listed in data_files[y].
_ds_by_year = {
    y: xr.open_mfdataset(
        data_files[y],
        chunks={"traj": _chunks_traj, "obs": _chunks_obs},
        combine="nested",
        concat_dim="offset",
        # not sure what's wrong here, but we cannot write
        # zarr store in parallel if we decode:
        decode_cf=False,
    )
    for y in data_files.keys()
}

# Label the "offset" dimension with a 0-based integer coordinate.
# `.sizes` is used for the dimension length (mapping-style access through
# `Dataset.dims` is deprecated in recent xarray versions).
for y in _ds_by_year.keys():
    _n_offsets = _ds_by_year[y].sizes["offset"]
    _ds_by_year[y] = _ds_by_year[y].assign_coords(
        offset=("offset", list(range(_n_offsets)))
    )

# Stack the yearly datasets along a new "year" dimension (in the order of the
# dict).  NOTE(review): no coordinate is attached to "year", and years may
# have different numbers of offsets -- confirm that the resulting alignment /
# fill behaviour is what is wanted.
ds = xr.concat(_ds_by_year.values(), dim="year")

display(ds)
# nbytes is in bytes; divide by 2**30 so that the number matches the GiB label.
print(ds.nbytes / 2**30, "GiB")

## Dask

In [None]:
# Disabled alternative: create a client with explicit worker / thread / memory settings.
#client = Client(n_workers=4, threads_per_worker=4, memory_limit=32e9, ip="0.0.0.0")
# Create the distributed client from a scheduler file (path relative to this notebook).
client = Client(scheduler_file="../../../scheduler.json")
# Disabled: restart the cluster through the client (presumably to clear worker state -- confirm).
#client.restart()
# Bare expression so that the notebook displays the client.
client

In [None]:
# Clear the encoding on the dataset itself and on every data variable
# (presumably so that encodings inherited from the input files do not
# interfere with writing the store -- confirm).
ds.encoding = {}
for _data_var in ds.data_vars.values():
    _data_var.encoding = {}

In [21]:
%%time

# Write the combined dataset `ds` to a Zarr store.  mode="w" is passed to
# to_zarr (presumably replacing an existing store at this path -- confirm).
# The return value is bound to `_` and the trailing `;` suppresses cell output.
_ = ds.to_zarr(
    "../../../output_data/010_lagrangian_experiment_10m.zarr/",
    mode="w",
);

CPU times: user 5.25 s, sys: 669 ms, total: 5.92 s
Wall time: 7min 10s
