In [2]:
%load_ext jupyter_black

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [9]:
import griblib.probsevere as ps
from griblib.probsevere.typed import FeatureCollection
from pathlib import Path

In [10]:
# Call the packaged griblib.probsevere.download2parquet for the window
# 2022-03-01T00:00 to 2022-03-01T00:02, handing it ./probsevere-bucket as its path argument.
if __name__ == "__main__":
    ps.download2parquet(
        Path("./probsevere-bucket"),
        start="2022-03-01T00:00",
        end="2022-03-01T00:02",
    )

In [40]:
from pathlib import Path
from warnings import warn
from datetime import datetime
from typing import Callable, Union, Iterable, Iterator

import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.dataframe.core import DataFrame as DaskDataFrame
from geopandas import GeoDataFrame, GeoSeries
from requests import Session, HTTPError

from griblib.probsevere.typed import FeatureCollection

# strftime-style pattern for a single ProbSevere JSON file; the %Y/%m/%d/%H/%M directives are
# filled in per timestamp by calling ``.strftime`` on a date range with this template.
PROBSEVERE_URL_TEMPLATE = (
    "https://mtarchive.geol.iastate.edu/%Y/%m/%d/mrms/ncep/ProbSevere/MRMS_PROBSEVERE_%Y%m%d_%H%M00.json"
)


# Types accepted for the ``start`` / ``end`` bounds, which are forwarded to ``pd.date_range``.
TimeLike = Union[datetime, str, pd.Timestamp]


def __iterdaterange(
    start: TimeLike, end: TimeLike, *, freq: str = "2min"
) -> Iterator[tuple[pd.Timestamp, pd.DataFrame]]:
    """Yield the ProbSevere download URLs between ``start`` and ``end``, one day bin at a time.

    Parameters
    ----------
    start, end:
        Bounds forwarded unchanged to ``pd.date_range``.
    freq:
        Spacing between consecutive timestamps, as a ``pd.date_range`` frequency string.

    Yields
    ------
    tuple[pd.Timestamp, pd.DataFrame]
        The daily bin label and a frame indexed by timestamp with two columns: ``date``
        (the timestamps) and ``urls`` (``PROBSEVERE_URL_TEMPLATE`` rendered for each timestamp).
        Bins that contain no timestamps are skipped.
    """
    dr = pd.date_range(start=start, end=end, freq=freq)
    urls = dr.strftime(PROBSEVERE_URL_TEMPLATE)
    frame = pd.DataFrame({"date": dr, "urls": urls}).set_index(dr)
    # ``axis`` is deliberately not passed: 0 is already the default and the keyword is
    # deprecated on ``pd.Grouper`` in pandas 2.x.
    for day, group in frame.groupby(pd.Grouper(key="date", freq="D")):
        # a bin with no rows has nothing to download; yielding it would only hand the
        # caller an empty URL list
        if not group.empty:
            yield day, group


def __generate_from_features(session: Session, *, urls: Iterable[str]) -> Iterator[pd.DataFrame]:
    """Download each URL and yield one frame per non-empty feature collection.

    Parameters
    ----------
    session:
        The ``requests`` session used for every GET request.
    urls:
        URLs of the JSON feature collections to fetch.

    Yields
    ------
    GeoDataFrame
        The collection's features, plus a ``VALIDTIME`` column parsed from the collection's
        ``validTime`` field. URLs that fail to download are reported with a warning and
        skipped; collections without features are skipped silently.
    """
    # ``requests`` raises its own ConnectionError, which does NOT derive from the builtin
    # ``ConnectionError``; import it explicitly so network failures are actually caught.
    from requests.exceptions import ConnectionError as RequestsConnectionError

    for url in urls:
        try:
            # the context manager closes the streamed response (releasing the pooled
            # connection) even when the status check below raises
            with session.get(url, stream=True) as r:
                # a non-2xx status raises HTTPError and lands in the except block
                r.raise_for_status()
                fc: FeatureCollection = r.json()
        # if there was an error downloading, warn and move on to the next url
        except (ConnectionError, RequestsConnectionError, HTTPError):
            warn(f"error downloading {url}")
            continue

        features = fc["features"]
        # in the event no storms were recorded, continue
        if not features:
            continue

        df = GeoDataFrame.from_features(features)
        df["VALIDTIME"] = datetime.strptime(fc["validTime"], "%Y%m%d_%H%M%S %Z")
        yield df


def __geometry(
    df: GeoDataFrame,
) -> pd.DataFrame:
    """Return a copy of ``df`` with bounding-box and representative-point columns added.

    The columns of ``df.bounds`` are added under upper-cased names, and the x / y
    coordinates of ``df.representative_point()`` are added as ``X`` and ``Y``.
    The input frame is left untouched; the geometry column is kept on the result.
    """
    # work on a copy so the caller's frame is not modified as a side effect
    out = df.copy()
    bounds = df.bounds
    # to keep things consistent, uppercase all of the bounds column names
    out[bounds.columns.str.upper()] = bounds
    point = df.representative_point()
    out["X"] = point.x
    out["Y"] = point.y
    return out


def __dtypes(
    ddf: DaskDataFrame,
    *,
    float32_cols: list[str] = [
        "EBSHEAR",
        "MEANWIND_1-3kmAGL",
        "MESH",
        "VIL_DENSITY",
        "FLASH_DENSITY",
        "MOTION_EAST",
        "MOTION_SOUTH",
        "MAXLLAZ",
        "P98LLAZ",
        "P98MLAZ",
        "WETBULB_0C_HGT",
        "PWAT",
        "LJA",
        "MINX",
        "MINY",
        "MAXX",
        "MAXY",
        # names written by the current geometry step
        "X",
        "Y",
        # names used by the earlier centroid-based geometry step; kept for compatibility
        "CENTROID_X",
        "CENTROID_Y",
    ],
    int32_cols: list[str] = [
        "MLCIN",
    ],
    uint32_cols: list[str] = [
        "MUCAPE",
        "MLCAPE",
        "SRH01KM",
        "FLASH_RATE",
        "CAPE_M10M30",
        "SIZE",
        "ID",
    ],
    # 0 - 255
    uint8_cols: list[str] = [
        "PS",
    ],
) -> DaskDataFrame:
    """Return ``ddf`` with the listed columns cast to compact numeric dtypes.

    Parameters
    ----------
    ddf:
        Frame whose columns should be cast.
    float32_cols, int32_cols, uint32_cols, uint8_cols:
        Column names to cast to ``np.float32``, ``np.int32``, ``np.uint32`` and ``np.uint8``
        respectively. A listed column that is not present in ``ddf`` is skipped, so the
        same defaults work whether the frame carries ``X``/``Y`` or ``CENTROID_X``/``CENTROID_Y``.

    Returns
    -------
    A new frame with the casts applied; ``ddf`` itself is not modified.
    """
    present = set(ddf.columns)
    targets = (
        (float32_cols, np.float32),
        # 32-bit signed integer (``-2_147_483_648`` to ``2_147_483_647``)
        (int32_cols, np.int32),
        # 32-bit unsigned integer (``0`` to ``4_294_967_295``)
        (uint32_cols, np.uint32),
        # 8-bit unsigned integer (``0`` to ``255``)
        (uint8_cols, np.uint8),
    )
    # one {column: dtype} mapping lets ``astype`` do every cast in a single pass;
    # it rejects unknown keys, hence the membership filter
    dtype_map = {col: dtype for cols, dtype in targets for col in cols if col in present}
    return ddf.astype(dtype_map)


def __to_dask(df: pd.DataFrame, *, chunk_size: int) -> DaskDataFrame:
    """Convert ``df`` to a dask frame and run it through ``__dtypes``.

    ``chunk_size`` is forwarded to ``dd.from_pandas`` as its ``chunksize`` argument.
    """
    partitioned = dd.from_pandas(df, chunksize=chunk_size)
    return __dtypes(partitioned)  # type: ignore


def __name_function(time: datetime) -> Callable[[int], str]:
    """Build a file namer bound to the calendar day of ``time``.

    The returned callable takes a number ``n`` and produces ``"<n>-<YYYY-MM-DD>.pq"``.
    """
    day_label = time.strftime("%Y-%m-%d")

    def partition_filename(n: int) -> str:
        return str(n) + "-" + day_label + ".pq"

    return partition_filename


def download2parquet(
    path: Path,
    *,
    start: TimeLike,
    end: TimeLike,
    freq: str = "2min",
    chunk_size: int = 256,
) -> pd.DataFrame:
    """Download ProbSevere features between ``start`` and ``end`` and return them as one frame.

    Parameters
    ----------
    path:
        Target location for the parquet output. Currently unused: the dask / parquet
        write step is disabled in this version, so nothing is written to disk.
    start, end:
        Bounds of the download window, forwarded to ``__iterdaterange``.
    freq:
        Spacing between requested timestamps.
    chunk_size:
        Row count per dask partition for the disabled write step; currently unused.

    Returns
    -------
    pd.DataFrame
        The rows of every day in the window, with the bounds and ``X`` / ``Y`` columns
        added by ``__geometry`` and the columns in ``drop_columns`` removed. An empty
        frame is returned when nothing could be downloaded.
    """
    # TODO: re-enable the per-day write (``__to_dask`` followed by ``to_parquet`` using
    # ``__name_function``) once the in-memory result has been validated.
    drop_columns = ["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"]
    daily_frames: list[pd.DataFrame] = []
    with Session() as session:
        for timestamp, values in __iterdaterange(start, end, freq=freq):
            # download data; materialise the generator so an empty day can be detected
            # before ``pd.concat`` (which raises on an empty sequence)
            frames = list(__generate_from_features(session, urls=values["urls"]))
            if not frames:
                warn(f"no features downloaded for {timestamp:%Y-%m-%d}")
                continue
            daily_frames.append(
                pd.concat(frames)
                # wrangle the geometry
                .pipe(__geometry)
                .drop(columns=drop_columns)
            )
    # keep every day rather than only the last one, and never reference an unbound name
    # when the window produced no data at all
    if not daily_frames:
        return pd.DataFrame()
    return pd.concat(daily_frames)


# Smoke test of the notebook-local download2parquet over the window
# 2022-03-01T00:00 to 2022-03-01T00:02; the return value is kept so it can be inspected.
if __name__ == "__main__":
    df = download2parquet(
        Path("./probsevere-data-new"),
        start="2022-03-01T00:00",
        end="2022-03-01T00:02",
    )
# bare last expression: the notebook renders the returned frame as the cell output
df

Unnamed: 0,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,FLASH_DENSITY,...,MOTION_SOUTH,PS,ID,VALIDTIME,MINX,MINY,MAXX,MAXY,X,Y
0,1524,1270,0,51.4,36,17.9,0.92,2.08,36,1.25,...,6.203,88,136524,2022-03-01 00:00:29,-79.21,30.12,-79.07,30.28,-79.14,30.2
1,756,433,0,39.2,82,16.2,0.11,0.68,0,0.0,...,5.193,4,136565,2022-03-01 00:00:29,-80.32,28.02,-80.25,28.09,-80.285,28.055
2,384,112,-5,37.9,80,7.3,0.0,0.78,0,0.0,...,5.236,2,136592,2022-03-01 00:00:29,-80.95,29.99,-80.77,30.21,-80.855833,30.1
3,551,422,-4,33.3,24,18.0,0.08,0.94,0,0.02,...,5.148,2,136648,2022-03-01 00:00:29,-81.13,28.31,-81.03,28.41,-81.0775,28.365
4,10,0,-999,6.4,170,39.0,0.0,0.27,0,0.0,...,4.775,0,136649,2022-03-01 00:00:29,-123.76,47.47,-123.57,47.54,-123.66875,47.505
5,612,267,-27,41.0,65,3.2,0.08,1.32,0,0.0,...,6.624,3,136658,2022-03-01 00:00:29,-80.49,30.34,-80.43,30.4,-80.46375,30.365
6,665,395,0,39.6,57,16.6,0.0,0.75,0,0.0,...,4.886,2,136662,2022-03-01 00:00:29,-80.52,27.93,-80.45,28.0,-80.485,27.965
7,1,7,0,0.9,170,43.5,0.0,0.49,0,0.0,...,5.113,0,136666,2022-03-01 00:00:29,-123.12,47.57,-122.99,47.78,-123.054375,47.665
8,124,89,0,31.7,342,54.1,0.0,0.35,0,0.0,...,5.345,1,136667,2022-03-01 00:00:29,-123.05,46.19,-122.98,46.25,-123.0125,46.22
9,112,77,0,36.8,334,52.9,0.0,0.48,0,0.0,...,4.395,1,136668,2022-03-01 00:00:29,-122.75,46.46,-122.64,46.56,-122.7025,46.5


0      POINT (-79.14000 30.20000)
1      POINT (-80.28500 28.05500)
2      POINT (-80.85583 30.10000)
3      POINT (-81.07750 28.36500)
4     POINT (-123.66875 47.50500)
5      POINT (-80.46375 30.36500)
6      POINT (-80.48500 27.96500)
7     POINT (-123.05437 47.66500)
8     POINT (-123.01250 46.22000)
9     POINT (-122.70250 46.50000)
10     POINT (-79.65000 29.71500)
11     POINT (-78.11750 28.39500)
12    POINT (-121.66500 48.32000)
13     POINT (-80.65500 30.05000)
14     POINT (-73.74250 47.75500)
15    POINT (-123.50000 47.47000)
16     POINT (-80.29125 31.05500)
17     POINT (-80.57950 30.67000)
18     POINT (-80.48844 30.20500)
19     POINT (-78.15750 29.76500)
0      POINT (-79.14000 30.19500)
1      POINT (-80.27812 28.05500)
2      POINT (-80.84500 30.09500)
3      POINT (-81.06500 28.37000)
4     POINT (-123.63583 47.47500)
5      POINT (-80.47500 30.39500)
6      POINT (-80.46700 27.95500)
7     POINT (-123.09500 47.60000)
8     POINT (-123.00500 46.23000)
9     POINT (-

Unnamed: 0,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,FLASH_DENSITY,...,SIZE,MOTION_EAST,MOTION_SOUTH,PS,ID,VALIDTIME,MINX,MINY,MAXX,MAXY
0,1524,1270,0,51.4,36,17.9,0.92,2.08,36,1.25,...,201,6.493,6.203,88,136524,2022-03-01 00:00:29,-79.21,30.12,-79.07,30.28
1,756,433,0,39.2,82,16.2,0.11,0.68,0,0.0,...,55,8.784,5.193,4,136565,2022-03-01 00:00:29,-80.32,28.02,-80.25,28.09
2,384,112,-5,37.9,80,7.3,0.0,0.78,0,0.0,...,271,5.375,5.236,2,136592,2022-03-01 00:00:29,-80.95,29.99,-80.77,30.21
3,551,422,-4,33.3,24,18.0,0.08,0.94,0,0.02,...,83,6.686,5.148,2,136648,2022-03-01 00:00:29,-81.13,28.31,-81.03,28.41
4,10,0,-999,6.4,170,39.0,0.0,0.27,0,0.0,...,119,5.171,4.775,0,136649,2022-03-01 00:00:29,-123.76,47.47,-123.57,47.54
5,612,267,-27,41.0,65,3.2,0.08,1.32,0,0.0,...,40,5.167,6.624,3,136658,2022-03-01 00:00:29,-80.49,30.34,-80.43,30.4
6,665,395,0,39.6,57,16.6,0.0,0.75,0,0.0,...,54,7.01,4.886,2,136662,2022-03-01 00:00:29,-80.52,27.93,-80.45,28.0
7,1,7,0,0.9,170,43.5,0.0,0.49,0,0.0,...,235,3.402,5.113,0,136666,2022-03-01 00:00:29,-123.12,47.57,-122.99,47.78
8,124,89,0,31.7,342,54.1,0.0,0.35,0,0.0,...,40,4.166,5.345,1,136667,2022-03-01 00:00:29,-123.05,46.19,-122.98,46.25
9,112,77,0,36.8,334,52.9,0.0,0.48,0,0.0,...,101,6.96,4.395,1,136668,2022-03-01 00:00:29,-122.75,46.46,-122.64,46.56
