In [10]:
%load_ext jupyter_black

In [11]:
import re
from itertools import islice
from pathlib import Path
from shutil import copyfileobj
from typing import Iterator, Optional, Union
from urllib.parse import urljoin

from requests import Session
from bs4 import BeautifulSoup
from bs4.element import SoupStrainer

In [130]:
class ApacheNode:
    """
    A single tree node in the Apache directory page served by the NOAA NOMADS
    site; for more details see https://nomads.ncep.noaa.gov/

    It's used to look up TSRAGR data. TSRAGR is the Terminal Aerodrome Forecast
    (TAF) code for a thunderstorm with hail.

    Attributes:
        session: the ``requests`` session stored for use by subclasses.
        url: URL of the node, stored exactly as given.
        name: label of the node; when not supplied it is the last path segment
            of ``url`` (one trailing slash, if present, is ignored).
    """

    def __init__(self, session: Session, url: str, name: Optional[str] = None) -> None:
        self.session = session
        self.url = url

        if name is not None:
            self.name = name
        else:
            # Drop at most one trailing "/" so that "…/prod/" is named "prod",
            # then keep whatever follows the last remaining "/".
            trimmed = url[:-1] if url.endswith("/") else url
            self.name = trimmed.rpartition("/")[2]

    def __repr__(self) -> str:
        """Show the node as its URL."""
        return self.url


class ApacheFile(ApacheNode):
    """A file entry of an Apache directory listing that can be saved locally."""

    def download(self, save_to: Path) -> None:
        """
        Stream the file at ``self.url`` to ``save_to / self.name``.

        Progress is reported on stdout ("Downloading <name>... saved").

        Args:
            save_to: directory the file is written into. It is not created
                here, so it has to exist already.

        Raises:
            requests.HTTPError: when the server answers with an error status
                (raised by ``response.raise_for_status()``).
            OSError: when the local file cannot be opened or written.
        """
        # No newline is printed yet, so flush explicitly; otherwise the message
        # may only show up once the transfer has finished.
        print(f"Downloading {self.name}...", end=" ", flush=True)
        local_filename = save_to / self.name

        with self.session.get(self.url, stream=True) as response:
            response.raise_for_status()
            # ``response.raw`` is the undecoded urllib3 stream. Without this flag
            # a response sent with Content-Encoding gzip/deflate would be written
            # to disk still compressed.
            response.raw.decode_content = True
            with local_filename.open("wb") as f:
                copyfileobj(response.raw, f)

        print("saved")


class ApacheDir(ApacheNode):
    """
    A directory entry of an Apache directory listing.

    Nothing is cached: every iteration over :meth:`children` (and therefore
    :meth:`inav`, :meth:`iterfiles` and :meth:`iterdir`) requests the listing
    page again.
    """

    # Restrict HTML parsing to the <pre> element of the page.
    pre_strainer = SoupStrainer(name="pre")

    # Text has at least one character and cannot contain Parent Directory
    link_pattern = re.compile(
        "(?i)"  # ignore case
        "^"  # string start
        "(?:"  # non-capturing group
        "(?!parent directory)"  # negative lookahead: don't match 'parent directory'
        "."  # match any one character
        ")+"  # match one or more of the above chars
        "$"  # string end
    )

    def __init__(self, session: Session, url: str, name: Optional[str] = None) -> None:
        """Store the node, forcing ``url`` to end with a slash."""
        # urljoin() replaces the last path segment of a base URL that lacks a
        # trailing slash, so directory URLs always carry one.
        if not url.endswith("/"):
            url += "/"
        super().__init__(session, url, name)

    def children(self) -> Iterator[ApacheNode]:
        """
        Fetch the listing page and yield one node per linked entry.

        An entry becomes an :class:`ApacheDir` when the text following its link
        ends with "-", otherwise an :class:`ApacheFile`. Links whose text
        contains "Parent Directory" are ignored. A page without a ``<pre>``
        element yields nothing.

        Raises:
            requests.HTTPError: when the listing request returns an error status.
        """
        with self.session.get(self.url) as response:
            response.raise_for_status()
            soup = BeautifulSoup(
                markup=response.text, features="lxml", parse_only=self.pre_strainer
            )
        pre = soup.pre
        if pre is None:
            # No <pre> block: there is no listing on this page to walk.
            return

        # ``string=`` is the current name of the filter formerly called ``text=``.
        anchors = pre.find_all(name="a", string=self.link_pattern, recursive=False)

        for anchor in anchors:
            child_name = anchor.get("href")
            # Anchors without a target cannot be followed. Query-only hrefs (for
            # example column-sort links) resolve back to this same directory, so
            # they are never real children.
            if not child_name or child_name.startswith("?"):
                continue
            child_url = urljoin(self.url, child_name)

            trailer = anchor.next_sibling
            if isinstance(trailer, str):
                # The text after the link ends with "-" for directories.
                is_dir = trailer.strip().endswith("-")
            else:
                # No text follows the link (last node, or a tag comes next):
                # fall back to the trailing slash of the href.
                is_dir = child_name.endswith("/")

            child_type = ApacheDir if is_dir else ApacheFile
            yield child_type(self.session, child_url, child_name)

    def navto(self, *args: str) -> "ApacheDir":
        """
        Return the directory reached by following ``args`` below this one.

        The segments are joined with "/" and resolved against ``self.url``; the
        new node is named after the last segment. No request is made. Called
        without arguments it returns a node for this same directory.
        """
        if not args:
            return ApacheDir(self.session, url=self.url, name=self.name)
        url = urljoin(self.url, "/".join(args))
        return ApacheDir(self.session, url=url, name=args[-1])

    def inav(self, index: int) -> "ApacheNode":
        """
        Return the child at position ``index`` of the listing.

        Raises:
            ValueError: when ``index`` is negative or past the last child.
        """
        (child,) = islice(self.children(), index, index + 1)
        return child

    def iterfiles(self) -> Iterator["ApacheFile"]:
        """Yield only the children that are files."""
        for node in self.children():
            if isinstance(node, ApacheFile):
                yield node

    def iterdir(self) -> Iterator["ApacheDir"]:
        """Yield only the children that are directories."""
        for node in self.children():
            if isinstance(node, ApacheDir):
                yield node

In [142]:
import re
import datetime
import pandas as pd


class NotAvaliable(Exception):
    """
    Exception type declared for this notebook; nothing in the visible code
    raises it yet.

    NOTE(review): the name looks like a misspelling of "NotAvailable"; it is
    kept unchanged so existing references keep working.
    """


def daily_download(
    base_url: str,
    path_opts: str,
    target_day: Optional[int] = None,
    save_to: Optional[Path] = None,
) -> None:
    """
    Walk the dated sub-directories below ``base_url``/``path_opts`` and pick the
    files whose valid time falls on ``target_day``.

    A sub-directory is considered when its URL ends in a ``YYYYMMDD/`` stamp
    whose day-of-month equals ``target_day``. Only the day-of-month is compared,
    so the same day of another month also matches. For URLs containing "hrrr"
    the files are read from the ``conus`` sub-directory instead.

    Args:
        base_url: root URL of the Apache listing.
        path_opts: path below ``base_url`` that holds the dated directories.
        target_day: day of the month (1-31) to select; ``None`` means the
            current day in UTC.
        save_to: directory to download matching files into (created when
            missing). When ``None`` the listings are only walked and nothing is
            written.

    Raises:
        requests.HTTPError: when a listing or a download returns an error status.
        ValueError: when a file URL does not end in four digits (see note below).
    """
    if target_day is None:
        target_day = datetime.datetime.now(datetime.timezone.utc).day

    if save_to is not None:
        save_to.mkdir(parents=True, exist_ok=True)

    with Session() as session:
        ragr = ApacheDir(session, url=base_url)

        for path in ragr.navto(path_opts).iterdir():
            # Directories that are not named with a date stamp are skipped
            # instead of aborting the whole walk.
            stamp = re.search(r"(\d+)/$", path.url)
            if stamp is None:
                continue
            try:
                date = datetime.datetime.strptime(stamp.group(1), "%Y%m%d")
            except ValueError:
                continue

            if date.day != target_day:
                continue

            if "hrrr" in path.url:
                path = path.navto("conus")

            for file in path.iterfiles():
                # NOTE(review): assumes every file URL ends in a four-digit hour
                # offset; any other name raises ValueError here - confirm
                # against the real listing.
                time_delta = datetime.timedelta(hours=int(file.url[-4:]))
                valid_time = date + time_delta
                if valid_time.day == target_day and save_to is not None:
                    file.download(save_to)


if __name__ == "__main__":
    # Select today's day-of-month in UTC. datetime.utcnow() is deprecated, so
    # the timezone-aware equivalent is used; the resulting day is the same.
    daily_download(
        "https://nomads.ncep.noaa.gov/pub/data",
        "nccf/com/557ww/prod",
        target_day=datetime.datetime.now(datetime.timezone.utc).day,
    )