In [1]:
import os
from pathlib import Path

import xarray as xr
import numpy as np

In [2]:
# Data lives in a "data" folder that sits next to the directory the notebook
# is run from (i.e. <parent of the working directory>/data).
DIR_DATA = Path.cwd().parent.resolve() / "data"

# Input stores are read from data/processed; rewritten stores go to data/clean.
DIR_SOURCE = DIR_DATA / "processed"
DIR_OUTPUT = DIR_DATA / "clean"

# Create the output folder (and any missing parents); no error if it exists.
DIR_OUTPUT.mkdir(parents=True, exist_ok=True)

# Show which dataset folders are available as input.
print(os.listdir(DIR_SOURCE))

['SEA_SURFACE_HEIGHT_ALT_GRIDS_L4_2SATS_5DAY_6THDEG_V_JPL2205', 'ECCO_L4_DENS_STRAT_PRESS_05DEG_DAILY_V4R4', 'ECCO_L4_GEOMETRY_05DEG_V4R4', 'ECCO_L4_MIXED_LAYER_DEPTH_05DEG_DAILY_V4R4', 'ECCO_L4_SSH_05DEG_DAILY_V4R4', 'AQUA_MODIS_L3M_DAILY_4KM_POC', 'AQUA_MODIS_L3M_DAILY_4KM_CHLCONC', 'TOPP', 'OSCAR_L4_OC_FINAL_V2.0', 'AQUA_MODIS_L3M_DAILY_4KM_PIC', 'ECCO_L4_FRESH_FLUX_05DEG_DAILY_V4R4', 'OSTIA-UKMO-L4-GLOB-REP-v2.0']


In [3]:
# Short dataset key -> path of its zarr store under DIR_SOURCE.
# Each row is (key, collection folder, store name); insertion order is the
# order in which the datasets are processed later on.
source_files = {
    key: DIR_SOURCE / collection / store
    for key, collection, store in (
        ("chlorophyll", "AQUA_MODIS_L3M_DAILY_4KM_CHLCONC", "CHLOROPHYLL.zarr"),
        ("pic", "AQUA_MODIS_L3M_DAILY_4KM_PIC", "PIC.zarr"),
        ("poc", "AQUA_MODIS_L3M_DAILY_4KM_POC", "POC.zarr"),
        ("density", "ECCO_L4_DENS_STRAT_PRESS_05DEG_DAILY_V4R4", "DENSITY.zarr"),
        ("freshflux", "ECCO_L4_FRESH_FLUX_05DEG_DAILY_V4R4", "FRESHWATER_FLUX.zarr"),
        ("mixedlayer", "ECCO_L4_MIXED_LAYER_DEPTH_05DEG_DAILY_V4R4", "MIXED_LAYER_DEPTH.zarr"),
        ("ssh", "ECCO_L4_SSH_05DEG_DAILY_V4R4", "SSH_DAILY.zarr"),
        ("currents", "OSCAR_L4_OC_FINAL_V2.0", "CURRENTS_EDDIES.zarr"),
        ("sst", "OSTIA-UKMO-L4-GLOB-REP-v2.0", "SST.zarr"),
    )
}

In [4]:
# Chunk sizes (number of elements along each dimension) applied to every
# dataset before it is written to DIR_OUTPUT.
IDEAL_CHUNKS = {'time': 10, 'lat': 500, 'lon': 500}

# Rewrite each source store as <DIR_OUTPUT>/<name>.zarr with a uniform layout:
# float32 values, (time, lon, lat) dimension order and IDEAL_CHUNKS chunking.
for name, path in source_files.items():
    print(f"Processing and rewriting {name}...")

    ds = xr.open_zarr(path)

    if name == 'density':
        # Collapse the vertical axis: keep the levels whose Z label falls in
        # the -5 .. -100 range, then average over them.
        # NOTE(review): this is a plain (unweighted) mean over the selected
        # levels - confirm that is intended if the levels differ in thickness.
        ds = ds.sel(Z=slice(-5, -100)).mean(dim='Z')

    # Convert a cftime-based time axis to a pandas DatetimeIndex.
    # The check looks at the index type instead of matching 'cftime' in the
    # repr of the first value, so it does not rely on type-name strings and
    # does not raise on an empty time axis. `.get` yields None (and the
    # conversion is skipped) when the dataset has no index for 'time'.
    time_index = ds.indexes.get('time')
    if isinstance(time_index, xr.CFTimeIndex):
        # unsafe=True turns off the non-standard-calendar conversion warning.
        datetime_index = time_index.to_datetimeindex(unsafe=True)
        ds = ds.assign_coords(time=datetime_index)

    # Forget the chunk layout recorded in each variable's encoding so that it
    # cannot conflict with the rechunking applied below when writing.
    for variable in ds.variables.values():
        variable.encoding.pop('chunks', None)

    # Store everything as 32-bit floats.
    ds = ds.astype(np.float32)

    ds = ds.transpose('time', 'lon', 'lat')
    ds_rechunked = ds.chunk(IDEAL_CHUNKS)

    new_path = DIR_OUTPUT / f"{name}.zarr"

    # mode='w' replaces any store already present at new_path.
    ds_rechunked.to_zarr(new_path, mode='w', consolidated=True)

Processing and rewriting chlorophyll...




Processing and rewriting pic...




Processing and rewriting poc...




Processing and rewriting density...




Processing and rewriting freshflux...




Processing and rewriting mixedlayer...




Processing and rewriting ssh...




Processing and rewriting currents...


  datetime_index = ds.indexes['time'].to_datetimeindex(unsafe=True)


Processing and rewriting sst...


