In [1]:
%load_ext jupyter_black

In [94]:
from glob import glob
from typing import Iterable, Mapping
import xarray as xr
import pandas as pd
import re
from datetime import datetime

In [92]:
def extract_times(s: pd.Series) -> pd.Series:
    df = pd.concat([s, s.str.extract(r"(?P<valid_time>\d*)\.(?P<forecast_hour>\d*)$")], axis=1)
    valid_times = pd.to_datetime(df.valid_time, format="%Y%m%d%H")
    fcst_hours = pd.to_timedelta(df["forecast_hour"].astype(int), unit="h")
    s.index = valid_times + fcst_hours
    return s

grib_files = pd.Series(glob("/media/external/data/GLOBAL.grib2.*")).pipe(extract_times)
grib_files

2022-06-04 06:00:00    /media/external/data/GLOBAL.grib2.2022060400.0006
2022-06-04 12:00:00    /media/external/data/GLOBAL.grib2.2022060400.0012
2022-06-04 00:00:00    /media/external/data/GLOBAL.grib2.2022060400.0000
2022-06-04 18:00:00    /media/external/data/GLOBAL.grib2.2022060400.0018
dtype: object

In [151]:
import re
from datetime import datetime
import IPython
from IPython import display

variables = ["{0}_P0_L100_GLL0".format(x) for x in ("TMP", "RH", "UGRD", "VGRD", "HGT")]


def grib_concat(grib_files: Mapping[datetime, str], variables: list[str]) -> xr.Dataset:
    def generate():
        for timestamp, file in grib_files.items():
            ds: xr.Dataset = xr.open_dataset(file, engine="pynio")
            yield ds.expand_dims({"validTime": [timestamp.value]})[variables]
            # pyino prints a bunch of stupid warning
            display.clear_output()

    return xr.concat(generate(), dim="validTime")


ds = grib_concat(grib_files, variables=variables).rename(
    {
        "lv_ISBL0": "Pa",
        "lat_0": "lat",
        "lon_0": "lon",
        "TMP_P0_L100_GLL0": "TEMP",
        "RH_P0_L100_GLL0": "RH",
        "UGRD_P0_L100_GLL0": "U-WIND",
        "VGRD_P0_L100_GLL0": "V-WIND",
        "HGT_P0_L100_GLL0": "GPH",
    }
)
ds

In [152]:
df = ds.to_dataframe()
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TEMP,RH,U-WIND,V-WIND,GPH
validTime,Pa,lat,lon,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1654322400000000000,20000.0,90.0,0.0,226.497543,2.714223,4.310657,3.127881,11662.580078
1654322400000000000,20000.0,90.0,0.5,226.497543,2.714223,4.310657,3.127881,11662.580078
1654322400000000000,20000.0,90.0,1.0,226.497543,2.714223,4.310657,3.127881,11662.580078
1654322400000000000,20000.0,90.0,1.5,226.497543,2.714223,4.310657,3.127881,11662.580078
1654322400000000000,20000.0,90.0,2.0,226.497543,2.714223,4.310657,3.127881,11662.580078
...,...,...,...,...,...,...,...,...
1654365600000000000,100000.0,-90.0,357.5,242.958542,98.460106,-2.473158,-4.475143,163.594727
1654365600000000000,100000.0,-90.0,358.0,242.958542,98.460106,-2.473158,-4.475143,163.594727
1654365600000000000,100000.0,-90.0,358.5,242.958542,98.460106,-2.473158,-4.475143,163.594727
1654365600000000000,100000.0,-90.0,359.0,242.958542,98.460106,-2.473158,-4.475143,163.594727


In [95]:
from typing import Callable, Iterable
import requests
from bs4 import BeautifulSoup
import time
import numpy as np

class Urls:
    def __init__(self, urls:Iterable[str]):
        self._urls = np.array(urls)
        
    def __repr__(self):
        return self._urls.__repr__()

    def download(self, save_to="/media/external/data/", wait:int=10):
        for url in self._urls[:2]:
            local_filename = save_to + url.split("/")[-1]
            with requests.get(url, stream=True) as r:
                r.raise_for_status()
                with open(local_filename, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        f.write(chunk)
            time.sleep(60 * wait)
            
def read_urls(func:Callable[[],str]):
    url, path_opt = func()
    def inner():
        # read the base url
        r = requests.get(url)
        r.raise_for_status()

        soup = BeautifulSoup(r.content)
        latests_run = url + soup.find_all("a")[-1].text

        r = requests.get(latests_run)
        r.raise_for_status()
        soup = BeautifulSoup(r.content)
        all_gribs:list[str] = [latests_run + a.text for a in soup.find_all("a")[1:]]

        if path_opt is None:
            return Urls(all_gribs)
            
        r = requests.get([g for g in all_gribs if g.endswith(path_opt)][0])
        r.raise_for_status()
        alpha = BeautifulSoup(r.content).find_all("a")
        return Urls([latests_run + path_opt +  a.text  for a in alpha  if a.text.endswith(".grib2")])


    return inner

@read_urls
def galwem()->Urls:
    return "https://nomads.ncep.noaa.gov/pub/data/nccf/com/557ww/prod/", None

@read_urls
def hrrr(loc="conus")->Urls:
    """loc = conus or alasia"""
    return "https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/", loc+"/"

hrrr()#.download(wait=1)

array(['https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/hrrr.20220604/conus/hrrr.t00z.wrfnatf00.grib2',
       'https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/hrrr.20220604/conus/hrrr.t00z.wrfnatf01.grib2',
       'https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/hrrr.20220604/conus/hrrr.t00z.wrfnatf02.grib2',
       ...,
       'https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/hrrr.20220604/conus/hrrr.t16z.wrfsubhf16.grib2',
       'https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/hrrr.20220604/conus/hrrr.t16z.wrfsubhf17.grib2',
       'https://nomads.ncep.noaa.gov/pub/data/nccf/com/hrrr/prod/hrrr.20220604/conus/hrrr.t16z.wrfsubhf18.grib2'],
      dtype='<U103')