In [None]:
import os
from pathlib import Path

import xarray as xr
from dask.diagnostics import ProgressBar
from dask.distributed import Client, LocalCluster, progress

In [6]:
# Data folder: the "data" directory inside the parent of the current working
# directory, with symlinks resolved.
DIR_DATA = Path.cwd().parent.resolve() / "data"

# Input, scratch and output locations under the data folder.
DIR_SOURCE = DIR_DATA / "clean"
DIR_TEMP = DIR_DATA / "temp"
DIR_OUTPUT = DIR_DATA / "final"

# Create the writable folders up front; existing ones are left untouched.
DIR_TEMP.mkdir(parents=True, exist_ok=True)
DIR_OUTPUT.mkdir(parents=True, exist_ok=True)

# List the entries available in the source folder.
print(os.listdir(DIR_SOURCE))

['density.zarr', 'mixedlayer.zarr', 'chlorophyll.zarr', 'sst.zarr', 'currents.zarr', 'pic.zarr', 'ssh.zarr', 'poc.zarr', 'freshflux.zarr']


In [None]:
# Map each dataset name to its store at DIR_SOURCE/<name>.zarr. The tuple order
# is the iteration order of the resulting dict.
clean_files = {
    dataset: DIR_SOURCE / f"{dataset}.zarr"
    for dataset in (
        "chlorophyll",
        "pic",
        "poc",
        "density",
        "freshflux",
        "mixedlayer",
        "ssh",
        "currents",
        "sst",
    )
}

# Regridding target: a dataset holding only the coordinates of the sst store,
# loaded into memory before the store is closed.
with xr.open_zarr(clean_files['sst']) as sst_ds:
    target_template = xr.Dataset(coords=sst_ds.coords).load()



In [None]:
# Chunk layout applied to every regridded dataset before it is staged in DIR_TEMP.
REGRID_CHUNKS = {'time': 10, 'lat': 500, 'lon': 500}


def _compute_with_progress(client, delayed_write):
    """Run a delayed write on the cluster and block until it has finished.

    ``dask.diagnostics.ProgressBar`` only hooks into Dask's local schedulers,
    so it prints nothing while a distributed ``Client`` is active. The
    distributed ``progress`` helper is used instead.

    Parameters
    ----------
    client : dask.distributed.Client
        Client connected to the cluster that should execute the write.
    delayed_write : dask.delayed.Delayed
        Object returned by ``to_zarr`` / ``to_netcdf`` with ``compute=False``.

    Raises
    ------
    Exception
        Whatever exception the write raised on the cluster, re-raised by
        ``future.result()``.
    """
    future = client.compute(delayed_write)
    # The widget form of `progress` must be the last statement of a cell; the
    # text bar works from inside loops and blocks until the work is done.
    progress(future, notebook=False)
    # Surface any failure here instead of leaving it hidden in the future.
    future.result()


with LocalCluster(n_workers=5, threads_per_worker=2, memory_limit="5GiB") as cluster:
    with Client(cluster) as client:
        print("Dask client started:", client)

        regridded_paths = []

        for name, path in clean_files.items():
            temp_path = DIR_TEMP / f"{name}.zarr"

            with xr.open_zarr(path) as ds:
                if name == 'sst':
                    # target_template was built from the sst coordinates, so
                    # sst is already on the target grid: stage it unchanged.
                    staged_ds = ds
                else:
                    print(f"Processing and regridding '{name}'...")
                    regridded_ds = ds.interp_like(target_template, method='nearest')

                    # Drop the chunk encoding carried over from the source
                    # store so the written chunking follows REGRID_CHUNKS.
                    for var in regridded_ds.variables:
                        regridded_ds[var].encoding.pop('chunks', None)

                    staged_ds = regridded_ds.chunk(REGRID_CHUNKS)

                # The write must finish while the source store is still open.
                _compute_with_progress(
                    client,
                    staged_ds.to_zarr(temp_path, mode='w', compute=False),
                )

            regridded_paths.append(temp_path)

        # Merge all staged stores by coordinates and export one NetCDF file.
        with xr.open_mfdataset(regridded_paths, engine='zarr', combine='by_coords') as ocean:
            _compute_with_progress(
                client,
                ocean.to_netcdf(
                    DIR_OUTPUT / "oceanographic_data.nc",
                    mode='w',
                    engine='netcdf4',
                    compute=False,
                ),
            )

Dask client started: <Client: 'tcp://127.0.0.1:33865' processes=5 threads=10, memory=25.00 GiB>
Processing and regridding 'chlorophyll'...


This may cause some slowdown.
Consider loading the data with Dask directly
 or using futures or delayed objects to embed the data into the graph without repetition.
See also https://docs.dask.org/en/stable/best-practices.html#load-data-with-dask for more information.
2025-10-05 20:09:04,115 - tornado.application - ERROR - Uncaught exception GET /status/ws (127.0.0.1)
HTTPServerRequest(protocol='http', host='localhost:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='127.0.0.1')
Traceback (most recent call last):
  File "/home/isekar/.conda/envs/SpaceApps2025/lib/python3.13/site-packages/tornado/web.py", line 1848, in _execute
    result = await result
             ^^^^^^^^^^^^
  File "/home/isekar/.conda/envs/SpaceApps2025/lib/python3.13/site-packages/tornado/websocket.py", line 277, in get
    await self.ws_connection.accept_connection(self)
  File "/home/isekar/.conda/envs/SpaceApps2025/lib/python3.13/site-packages/tornado/websocket.py", line 890, in accept_connec