# Extract

In [1]:
%load_ext jupyter_black

In [1]:
import os
import shutil
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterator, Optional
from urllib.error import HTTPError

import pandas as pd
from requests import Session

# ``__name__`` is not a real path; ``os.path.abspath`` simply joins it onto the
# current working directory, so ``parents[1]`` is the directory *above* the
# working directory. This stands in for ``__file__``-based discovery.
_anchor = Path(os.path.abspath(__name__))
root = _anchor.parents[1]
root

PosixPath('/workspaces/MMM-Py')

In [3]:
def __iterurls(baseurl: str) -> Iterator[str]:
    """Yield the URL of every ``MergedReflectivityQC`` entry listed at *baseurl*.

    The page at *baseurl* is parsed with ``pandas.read_html`` and must contain
    exactly one table (the single-element unpack raises ``ValueError``
    otherwise). Each non-null value of that table's ``Name`` column containing
    ``MergedReflectivityQC`` is appended to *baseurl* and yielded.
    """
    (listing,) = pd.read_html(baseurl)
    names = listing["Name"].dropna()
    is_reflectivity = names.str.contains("MergedReflectivityQC")
    for name in names[is_reflectivity]:
        yield baseurl + name


def __iterfiles(
    files: pd.Series, input_dt: datetime, max_seconds: int
) -> Iterator[str]:
    """Yield the file names whose embedded timestamp is close to *input_dt*.

    Args:
        files: File names, each expected to contain a ``YYYYMMDD-HHMMSS``
            stamp (matched by ``\\d{8}-\\d{6}``).
        input_dt: Reference time. It must be comparable with the naive
            timestamps parsed from the names, i.e. timezone-naive.
        max_seconds: Largest allowed absolute difference, in seconds, between
            a file's timestamp and *input_dt* (inclusive).

    Yields:
        The entries of *files* within the window, in their original order.
        Names without a parseable timestamp are skipped.
    """
    if files.empty:
        # Nothing to filter; also avoids the ``.str`` accessor failing on an
        # empty, non-string Series.
        return

    # ``expand=False`` always returns a Series. The previous
    # ``extract(...).squeeze()`` collapsed a single-row result to a scalar,
    # which then broke the ``.dt`` access below.
    stamps = pd.to_datetime(
        files.str.extract(r"(\d{8}-\d{6})", expand=False),
        format="%Y%m%d-%H%M%S",
        errors="coerce",
    )
    time_delta = (input_dt - stamps).abs()
    # Unparseable names give NaT -> NaN seconds, which compares False here.
    yield from files[time_delta.dt.total_seconds() <= max_seconds]


def download_files(
    save_to: Path,
    *,
    input_dt: Optional[datetime] = None,
    max_seconds: int = 300,
) -> None:
    """Download the reflectivity files whose timestamp is near *input_dt*.

    Every ``MergedReflectivityQC`` directory listed under the MRMS 3DRefl
    index is scanned, and each file whose embedded timestamp lies within
    *max_seconds* of *input_dt* is streamed into *save_to*.

    Args:
        save_to: Destination directory; created (with parents) if missing.
        input_dt: Naive reference time. Defaults to the current UTC time,
            evaluated on each call rather than once at definition time.
        max_seconds: Largest allowed absolute difference in seconds.

    Files whose request returns an HTTP error status are skipped.
    """
    # Imported locally under an alias because the module-level ``HTTPError``
    # is urllib's, which ``Response.raise_for_status`` never raises.
    from requests.exceptions import HTTPError as RequestsHTTPError

    if input_dt is None:
        # Naive UTC "now", kept naive so it stays comparable with the naive
        # timestamps parsed from the file names.
        input_dt = datetime.now(timezone.utc).replace(tzinfo=None)

    baseurl = "http://mrms.ncep.noaa.gov/data/3DRefl/"
    save_to.mkdir(parents=True, exist_ok=True)
    with Session() as session:
        for url in __iterurls(baseurl):
            (html,) = pd.read_html(url, skiprows=[1, 2, 3], parse_dates=True)

            for file in __iterfiles(html["Name"].dropna(), input_dt, max_seconds):
                try:
                    # The context manager releases the streamed connection
                    # back to the pool even if the copy fails midway.
                    with session.get(url + file, stream=True) as r:
                        r.raise_for_status()
                        with (save_to / file).open("wb") as fileout:
                            shutil.copyfileobj(r.raw, fileout)
                except (HTTPError, RequestsHTTPError):
                    continue


if __name__ == "__main__":
    # Fetch into the ``data`` directory under ``root`` using the default
    # reference time and tolerance window.
    download_files(save_to=root / "data")