In [1]:
%load_ext jupyter_black
import nest_asyncio

# Apply nest_asyncio's patch once, before the rest of the notebook runs.
# NOTE(review): presumably needed so async-backed libraries (e.g. s3fs) can run
# inside the notebook's already-running event loop — confirm it is still required.
nest_asyncio.apply()

In [2]:
import io
from typing import Literal, Union
from datetime import datetime, timedelta

import s3fs
import pandas as pd
import numpy as np
import xarray as xr

from griblib.extract import get_url_template

# Names of the GMGSI products; each value is also a key of BUCKETS and is used
# to annotate the `prod` argument of `get_url_template`.
GMGSIProducts = Literal["GMGSI_LW", "GMGSI_SSR", "GMGSI_SW", "GMGSI_VIS", "GMGSI_WV"]
# A point in time given as a string, a `datetime`, or a `pd.Timestamp`.
# NOTE(review): not referenced by the code visible in this notebook.
TimeLike = Union[str, datetime, pd.Timestamp]

In [12]:
# Maps each GMGSI product key to its `GLOBCOMP*_nc` name.
# NOTE(review): not referenced elsewhere in the visible code; presumably the
# file-name prefix of that product's objects — confirm before relying on it.
BUCKETS = dict(
    GMGSI_LW="GLOBCOMPLIR_nc",
    GMGSI_SSR="GLOBCOMPSSR_nc",
    GMGSI_SW="GLOBCOMPSIR_nc",
    GMGSI_VIS="GLOBCOMPVIS_nc",
    GMGSI_WV="GLOBCOMPWV_nc",
)

# Shared filesystem handle used by the listing and reading helpers below.
# `anon=True` requests anonymous access, so no AWS credentials are passed here.
fs = s3fs.S3FileSystem(anon=True)

def get_url_template(prod: GMGSIProducts):
    """Return the S3 URL prefix under which ``prod`` is stored.

    Parameters
    ----------
    prod
        One of the ``GMGSIProducts`` keys, e.g. ``"GMGSI_LW"``.

    Returns
    -------
    str
        ``s3://noaa-gmgsi-pds/<prod>``, i.e. the same bucket and product prefix
        that ``daily_file_report`` lists.

    Notes
    -----
    This definition shadows the ``get_url_template`` imported from
    ``griblib.extract`` at the top of the notebook.
    """
    # The bucket is just ``noaa-gmgsi-pds``. The ``.s3.amazonaws.com`` suffix is
    # part of the HTTPS endpoint host name; inside an ``s3://`` URL it would be
    # read as part of the bucket name.
    return f"s3://noaa-gmgsi-pds/{prod}"

# NOTE: the "-3" limit this comment refers to (meant to cap output while testing
# and prevent memory overload) is not applied anywhere in the code below.
def daily_file_report(date: datetime, prod: "GMGSIProducts" = "GMGSI_LW"):
    """Yield every entry listed for ``prod`` on the calendar day of ``date``.

    Lists ``s3://noaa-gmgsi-pds/<prod>/<YYYY>/<MM>/<DD>`` with the module-level
    ``fs``, then lists each entry found there and yields those second-level
    entries one at a time.

    Parameters
    ----------
    date
        Day to list; only its year, month and day are used.
    prod
        Product prefix to list. Defaults to ``"GMGSI_LW"``, the value that was
        previously hard-coded.

    Yields
    ------
    Entries exactly as returned by ``fs.ls``.
    """
    day_prefix = f"s3://noaa-gmgsi-pds/{prod}/{date:%Y}/{date:%m}/{date:%d}"
    try:
        children = fs.ls(day_prefix)
    except FileNotFoundError:
        # A day with nothing published has no such prefix, and ``fs.ls`` raises
        # for it instead of returning an empty list. Treat that as "no files"
        # so a caller looping over a date range just gets nothing for that day.
        return
    for child in children:
        yield from fs.ls(child)

def generate(start: "TimeLike" = "2022-01-01", end: "TimeLike" = "2022-01-02"):
    """Yield one dataset per file listed for each day from ``start`` to ``end``.

    Parameters
    ----------
    start, end
        First and last day of the range, passed straight to ``pd.date_range``.
        The defaults reproduce the range that was previously hard-coded.

    Yields
    ------
    The object returned by ``xr.open_dataset(..., engine="h5netcdf")`` for each
    file reported by ``daily_file_report``, in listing order.
    """
    # "D" is the canonical alias for a calendar-day frequency; newer pandas
    # releases deprecate the lowercase spelling.
    for date in pd.date_range(start, end, freq="D"):
        # A day with no files yields nothing, so no explicit empty check is needed.
        for file in daily_file_report(date):
            with fs.open(file, "rb") as f:
                # Copy the whole object into memory so the dataset does not
                # depend on the remote handle, which is closed before yielding.
                payload = io.BytesIO(f.read())
            yield xr.open_dataset(payload, engine="h5netcdf")


def main():
    """Combine everything ``generate()`` yields into one object.

    The per-file datasets are handed to ``xr.concat`` and joined along the
    ``"time"`` dimension; the concatenated result is returned.
    """
    per_file_datasets = generate()
    combined = xr.concat(per_file_datasets, dim="time")
    return combined
# Run the whole pipeline when the cell executes and keep the result in `ds`.
ds = main()
# Bare last expression so the notebook displays `ds` as the cell output.
ds