# Extract

In [27]:
%load_ext jupyter_black

In [28]:
import os
import shutil
from pathlib import Path
from typing import Iterator
from datetime import datetime
from urllib.error import HTTPError
import gzip
from requests import Session
import pandas as pd

import xarray as xr

data = Path(os.path.abspath(__name__)).parents[1] / "data"
data

PosixPath('/workspaces/mmmpy/data')

In [29]:
def __iterurls(baseurl: str) -> Iterator[str]:
    (html,) = pd.read_html(baseurl)
    nodes = html["Name"].dropna()
    yield from baseurl + nodes[nodes.str.contains("MergedReflectivityQC")]


def __iterfiles(
    files: pd.Series, input_dt: datetime, max_seconds: int
) -> Iterator[str]:
    time_delta: pd.Series[datetime] = abs(
        input_dt - files.str.extract(r"(\d{8}-\d{6})").astype("datetime64[s]").squeeze()
    )
    yield from files[time_delta.dt.total_seconds() <= max_seconds]


def __make_archive(source: Path, destination: Path) -> None:
    base_name = destination.parent / destination.stem
    fmt = destination.suffix.replace(".", "")
    root_dir = source.parent
    base_dir = source.name
    shutil.make_archive(str(base_name), fmt, root_dir, base_dir)


def download_files(
    save_to: Path,
    *,
    input_dt: datetime = datetime.utcnow(),
    max_seconds: int = 300,
    archive: str = None
) -> None:
    baseurl = "http://mrms.ncep.noaa.gov/data/3DRefl/"
    if not save_to.exists():
        save_to.mkdir()
    with Session() as session:
        # iterating the first page provides the levels that are avaliable in the 3DRefl database
        for url in __iterurls(baseurl):
            # all of the levels pages are read to get the validtimes to each of the files and file url
            (html,) = pd.read_html(url, skiprows=[1, 2, 3], parse_dates=True)
            # then some logic to select only recent files
            for file in __iterfiles(html["Name"].dropna(), input_dt, max_seconds):
                try:
                    # a request is made to hit the file url
                    r = session.get(url + file, stream=True, headers={"accept": "gzip"})
                    r.raise_for_status()
                except HTTPError:
                    continue
                # the response object is decompressed
                with gzip.GzipFile(fileobj=r.raw, mode="rb") as fsrc:
                    # written to the local drive
                    with (save_to / file.removesuffix(".gz")).open("wb") as fdst:
                        shutil.copyfileobj(fsrc, fdst)
    if archive:
        # passing an archive argument will archive the files
        # this is useful for the git purposes
        __make_archive(save_to, save_to.with_suffix(archive))


if __name__ == "__main__":
    download_files(data / "MRMS_MergedReflectivity", archive=".gztar")

In [32]:
# cfgrib will generate an idx file the first time it read the dataset
# the idx speeds up the read time, wil alot of files the load time is slow
xr.open_mfdataset((data / "MRMS_MergedReflectivity").glob("*"), chunks={},engine="cfgrib")

ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(41) ignored
ECCODES ERROR   :  Trunca

In [17]:
import xarray as xr
file= tuple((root/"data").glob("*.grib2"))[0]
xr.open_dataset(file)

ECCODES ERROR   :  Truncating time: non-zero seconds(42) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(42) ignored


In [61]:
import xarray as xr

with tarfile.open(root / "data/test.gz.tar", "r:gz") as tar:
    ...
    # print(tar.())
    # print(tar.extractall())
d = tuple(Path("/workspaces/mmmpy/").glob("*.grib2"))
xr.open_dataset(d[3])

ERROR:cfgrib.messages:Can't create file '/workspaces/mmmpy/MRMS_MergedReflectivityQC_00.75_20220819-192841.grib2.923a8.idx'
Traceback (most recent call last):
  File "/opt/venv/lib/python3.10/site-packages/cfgrib/messages.py", line 261, in itervalues
    yield self.filestream.message_from_file(file, errors=errors)
  File "/opt/venv/lib/python3.10/site-packages/cfgrib/messages.py", line 328, in message_from_file
    return Message.from_file(file, offset, **kwargs)
  File "/opt/venv/lib/python3.10/site-packages/cfgrib/messages.py", line 102, in from_file
    raise EOFError("End of file: %r" % file)
EOFError: End of file: <_io.BufferedReader name='/workspaces/mmmpy/MRMS_MergedReflectivityQC_00.75_20220819-192841.grib2'>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/venv/lib/python3.10/site-packages/cfgrib/messages.py", line 523, in from_indexpath_or_filestream
    self = cls.from_fieldset(filestream, index_keys, compu

EOFError: No valid message found: '/workspaces/mmmpy/MRMS_MergedReflectivityQC_00.75_20220819-192841.grib2'

AttributeError: type object 'ArchPath' has no attribute '_flavour'

In [30]:
import xarray as xr

xr.open_mfdataset((root / "data/MRMS_MergedReflectivityQC").glob("*"))

ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Truncating time: non-zero seconds(40) ignored
ECCODES ERROR   :  Trunca

KeyboardInterrupt: 

In [21]:
data = root / "data"
import tarfile
import gzip

# data.glob("*.grib2.gz")
tar_archive = data / "MRMS_MergedReflectivityQC.grib2.gz.tar"
with tarfile.open(tar_archive, "w:gz") as fdst:
    for file in data.glob("*.grib2.gz"):
        fdst.add(file)

In [26]:
# for f in (data/ "MRMS_MergedReflectivityQC.gz.tar").glob("*.gz.tar"):
with tarfile.open((data / "MRMS_MergedReflectivityQC.gz.tar"), "r:gz") as fout:
    print(fout)

ReadError: not a gzip file