In [1]:
%load_ext griblib.jupyter

import io
import asyncio
from pathlib import Path
from datetime import datetime, timedelta
import cfgrib
import netCDF4
from typing import Literal

import s3fs

import xarray as xr
import pandas as pd
import numpy as np

from griblib import extract

# Closed set of GMGSI product keys accepted by the helpers in this notebook.
# Each key is also used verbatim as a path component when S3 paths are built.
GMGSIProducts = Literal["GMGSI_LW", "GMGSI_SSR", "GMGSI_SW", "GMGSI_VIS", "GMGSI_WV"]

In [12]:
from typing import Iterable


# Maps each GMGSI product key to a "*_nc" name. Nothing in the visible cells
# reads this mapping -- presumably these are the file-name stems of the objects
# stored for each product; TODO confirm before relying on it.
BUCKETS = {
    "GMGSI_LW": "GLOBCOMPLIR_nc",
    "GMGSI_SSR": "GLOBCOMPSSR_nc",
    "GMGSI_SW": "GLOBCOMPSIR_nc",
    "GMGSI_VIS": "GLOBCOMPVIS_nc",
    "GMGSI_WV": "GLOBCOMPWV_nc",
}

# Anonymous (credential-free) S3 filesystem shared by the listing and download
# helpers defined in this cell.
fs = s3fs.S3FileSystem(anon=True)


def get_url_template(prod: GMGSIProducts):
    """Return the S3 URI of the root prefix for one GMGSI product.

    Args:
        prod: Product key, e.g. ``"GMGSI_LW"``.

    Returns:
        The string ``s3://noaa-gmgsi-pds/<prod>``, i.e. the same bucket and
        product prefix that ``daily_file_report`` lists from.
    """
    # In an s3:// URI the authority component *is* the bucket name, so the
    # HTTPS virtual-host suffix (".s3.amazonaws.com") must not appear here;
    # with it, clients would look for a bucket literally named
    # "noaa-gmgsi-pds.s3.amazonaws.com".
    return f"s3://noaa-gmgsi-pds/{prod}"


def daily_file_report(date: datetime, prod: GMGSIProducts = "GMGSI_LW") -> Iterable[str]:
    """Yield the object paths stored for one product on one calendar day.

    The day prefix ``s3://noaa-gmgsi-pds/<prod>/<YYYY>/<MM>/<DD>`` is listed,
    then every entry returned by that listing is listed in turn and its
    contents are yielded (presumably the first level is one folder per hour --
    TODO confirm against the bucket layout).

    Args:
        date: Day to report on; only its year, month and day are used.
        prod: Product key selecting the top-level prefix in the bucket.

    Yields:
        Paths exactly as returned by ``fs.ls`` for each second-level listing.
        Nothing is yielded when the day prefix does not exist.
    """
    day_prefix = f"s3://noaa-gmgsi-pds/{prod}/{date:%Y}/{date:%m}/{date:%d}"
    try:
        subpaths = fs.ls(day_prefix)
    except FileNotFoundError:
        # s3fs raises instead of returning an empty list when nothing lives
        # under a prefix. Treat a day without data as "no files" so callers
        # can skip it with a plain emptiness check.
        return
    for subpath in subpaths:
        yield from fs.ls(subpath)

def generate_objs(files: Iterable[str]):
    """Lazily turn remote file paths into xarray datasets, one per path.

    Each path is opened in binary mode through the shared ``fs`` filesystem,
    its full content is read into an in-memory buffer, and that buffer is
    opened with the ``h5netcdf`` engine using ``chunks={}``.

    Args:
        files: Paths that ``fs.open`` can resolve.

    Yields:
        The object returned by ``xr.open_dataset`` for each path, in input
        order.
    """
    for remote_path in files:
        with fs.open(remote_path, "rb") as remote_handle:
            payload = remote_handle.read()
            in_memory = io.BytesIO(payload)
            dataset = xr.open_dataset(in_memory, engine="h5netcdf", chunks={})
            # Yield while the remote handle is still open; it is closed when
            # the consumer asks for the next dataset.
            yield dataset

def main(
    start: str = "2022-01-01",
    end: str = "2022-01-02",
    prod: GMGSIProducts = "GMGSI_LW",
) -> None:
    """Print one time-concatenated dataset per day in an inclusive date range.

    For every day between ``start`` and ``end`` the day's files are listed,
    opened, concatenated along a ``time`` dimension, stripped of their ``lat``
    and ``lon`` variables, and printed. Days with no files are skipped.

    Args:
        start: First day to process (anything ``pd.date_range`` accepts).
        end: Last day to process, inclusive.
        prod: Product key forwarded to ``daily_file_report``.
    """
    # "D" is the canonical pandas alias for calendar-day frequency.
    for date in pd.date_range(start, end, freq="D"):
        files = tuple(daily_file_report(date, prod))
        if not files:
            continue
        ds = xr.concat(generate_objs(files), dim="time")
        # `drop_vars` is the supported spelling of the deprecated
        # `Dataset.drop`. The lat/lon variables are removed without being
        # materialised first: nothing in this function reads their values,
        # and computing them would force loading otherwise-lazy data.
        ds = ds.drop_vars(["lat", "lon"])
        print(ds)



# Run the cell's entry point; it prints one dataset summary per processed day.
main()

<xarray.Dataset>
Dimensions:  (time: 24, yc: 3000, xc: 4999)
Coordinates:
  * time     (time) datetime64[ns] 2022-01-01 ... 2022-01-01T23:00:00
Dimensions without coordinates: yc, xc
Data variables:
    data     (time, yc, xc) float32 dask.array<chunksize=(1, 3000, 4999), meta=np.ndarray>
Attributes:
    Conventions:          CF-1.4
    Source:               McIDAS Area File
    Satellite Sensor:     DERIVED DATA
    time_coverage_start:  2022-01-01T00:00:00
    instrument_name:      GLOBCOMPLIR
    history:              Sat Jan  1 00:38:22 2022: ncks -d xc,0,4998 tempo.n...
    NCO:                  netCDF Operators version 4.7.5 (Homepage = http://n...
<xarray.Dataset>
Dimensions:  (time: 24, yc: 3000, xc: 4999)
Coordinates:
  * time     (time) datetime64[ns] 2022-01-02 ... 2022-01-02T23:00:00
Dimensions without coordinates: yc, xc
Data variables:
    data     (time, yc, xc) float32 dask.array<chunksize=(1, 3000, 4999), meta=np.ndarray>
Attributes:
    Conventions:          CF-1.4
  