In [1]:
%load_ext jupyter_black

In [2]:
import os
import uuid
from pathlib import Path
from warnings import warn
from datetime import datetime
from typing import Callable, Union, TypedDict

import pandas as pd
import numpy as np
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster
from geopandas import GeoDataFrame
from requests import Session, HTTPError


# strftime template for one archived ProbSevere JSON file. The %Y/%m/%d/%H/%M
# directives are filled in per timestamp via DatetimeIndex.strftime in
# direct_to_parquet; the seconds field of the filename is fixed at "00".
PROBSEVERE_URL_TEMPLATE = (
    "https://mtarchive.geol.iastate.edu/%Y/%m/%d/mrms/ncep/ProbSevere/MRMS_PROBSEVERE_%Y%m%d_%H%M00.json"
)

# Absolute form of the relative output directory (resolved against the current working directory).
FILE_OUT_DIR = os.path.abspath("../../bucket")

# Types accepted for the start/end arguments of direct_to_parquet.
TimeLike = Union[datetime, str, pd.Timestamp]


class Feature(TypedDict):
    """Typed view of one entry in a FeatureCollection's ``features`` list.

    Only the ``properties`` key is declared here.
    """

    # property name -> numeric or string value
    properties: dict[str, Union[float, int, str]]


class FeatureCollection(TypedDict):
    """Typed view of the JSON document downloaded for one timestamp."""

    # timestamp string; direct_to_parquet parses it with "%Y%m%d_%H%M%S %Z"
    validTime: str
    # the individual features; an empty list causes the file to be skipped
    features: list[Feature]

In [3]:
# Start a local dask cluster and attach a client to it for the rest of the notebook.
cluster = LocalCluster()  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
# bare expression so the notebook renders the client summary as the cell output
client

2022-08-05 07:30:03,826 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-rzlwic6j', purging
2022-08-05 07:30:03,826 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-_lwwzptv', purging
2022-08-05 07:30:03,826 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-4pt3shpj', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 3
Total threads: 6,Total memory: 15.58 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:41841,Workers: 3
Dashboard: http://127.0.0.1:8787/status,Total threads: 6
Started: Just now,Total memory: 15.58 GiB

0,1
Comm: tcp://127.0.0.1:45613,Total threads: 2
Dashboard: http://127.0.0.1:42163/status,Memory: 5.19 GiB
Nanny: tcp://127.0.0.1:34493,
Local directory: /tmp/dask-worker-space/worker-8w298qxo,Local directory: /tmp/dask-worker-space/worker-8w298qxo

0,1
Comm: tcp://127.0.0.1:34847,Total threads: 2
Dashboard: http://127.0.0.1:45351/status,Memory: 5.19 GiB
Nanny: tcp://127.0.0.1:38571,
Local directory: /tmp/dask-worker-space/worker-0j8ccg64,Local directory: /tmp/dask-worker-space/worker-0j8ccg64

0,1
Comm: tcp://127.0.0.1:34323,Total threads: 2
Dashboard: http://127.0.0.1:40035/status,Memory: 5.19 GiB
Nanny: tcp://127.0.0.1:46133,
Local directory: /tmp/dask-worker-space/worker-1w7d6_sb,Local directory: /tmp/dask-worker-space/worker-1w7d6_sb


In [4]:
# Random UUID generated once each time this cell runs; __batch_id embeds it in
# every parquet filename it produces.
__uuid = uuid.uuid4()


def __batch_id(validtime: datetime) -> Callable[[int], str]:
    """Return a filename generator bound to one valid time.

    The returned callable takes an integer ``n`` and produces a parquet
    filename containing ``n``, ``validtime`` formatted to minute precision,
    and the module-level ``__uuid``.
    """

    def partition_filename(n: int) -> str:
        # ISO-8601 timestamp truncated to minutes
        minute_stamp = validtime.isoformat(timespec="minutes")
        return f"probsevere-{n}-{minute_stamp}-{__uuid}.parquet"

    return partition_filename


def __features(features: list[Feature]) -> pd.DataFrame:
    """Build a frame from a list of features via ``GeoDataFrame.from_features``."""
    frame = GeoDataFrame.from_features(features)
    return frame


def __bounds(
    df: pd.DataFrame,
    drop_columns: list[str] = ["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"],
) -> pd.DataFrame:
    """Append the geometry bounds as columns, then drop unwanted columns.

    Args:
        df: frame with a ``geometry`` column that exposes a ``.bounds`` attribute.
        drop_columns: columns removed after the bounds have been appended.
            The default list is shared between calls but is never mutated here.

    Returns:
        A new frame: the columns of ``df`` plus the bounds columns, minus
        ``drop_columns``.
    """
    geometry_bounds = df["geometry"].bounds
    # column-wise concatenation; rows are aligned on the index
    widened = pd.concat((df, geometry_bounds), axis=1)
    return widened.drop(columns=drop_columns)

In [5]:
def direct_to_parquet(
    path_dir: Path,
    start: TimeLike,
    end: TimeLike,
    freq: str = "2min",
    timeout: float = 30.0,
) -> None:
    """Scrape ProbSevere data from the iastate archive over a date range.

    One URL is generated per timestamp of ``pd.date_range(start, end, freq)``.
    Each successfully downloaded, non-empty file is converted to a frame and
    appended to the parquet dataset at ``path_dir``.

    Args:
        path_dir: directory of the parquet dataset to append to.
        start: first timestamp to request.
        end: last timestamp to request.
        freq: pandas frequency string giving the spacing between requests.
        timeout: seconds to wait on a request before giving up on that file.

    Warns:
        UserWarning: for every file that could not be downloaded or decoded;
            that file is skipped and the loop continues.
    """
    # Local import keeps this cell self-contained; requests is already a
    # dependency of the notebook. RequestException is the base class of
    # requests' HTTPError, ConnectionError and Timeout. The *builtin*
    # ConnectionError is unrelated to requests.exceptions.ConnectionError and
    # would let connection failures escape the handler below.
    from requests.exceptions import RequestException

    # create a DatetimeIndex from the function arguments
    timestamps = pd.date_range(start=start, end=end, freq=freq)
    # the session is used as a context manager so it is closed on exit
    with Session() as session:
        # format one url per timestamp using the url template
        for url in timestamps.strftime(PROBSEVERE_URL_TEMPLATE):
            try:
                # No stream=True: the whole body is read by .json() anyway, and
                # a streamed response whose body is never consumed (e.g. a 404)
                # is not released back to the connection pool.
                r = session.get(url, timeout=timeout)
                # a non-2xx status code raises HTTPError and triggers the except block
                r.raise_for_status()
                # decoding is inside the try so a malformed body skips this
                # file instead of aborting the whole scrape
                fc: FeatureCollection = r.json()
            # ValueError covers JSON decode errors raised by older requests versions
            except (RequestException, ValueError):
                warn(f"error downloading {url}")
                continue

            features = fc["features"]
            # in the event no storms were recorded, continue
            if not features:
                continue
            # NOTE(review): the float32 cast assumes every column left after
            # __bounds is numeric or numeric-like -- confirm against the feed.
            df = __features(features).pipe(__bounds).astype(np.float32)

            valid_time = datetime.strptime(fc["validTime"], "%Y%m%d_%H%M%S %Z")
            # set the valid time
            df["valid_time"] = valid_time

            # append to the dataset; filenames are generated per partition by __batch_id
            dd.from_pandas(df, chunksize=256).to_parquet(
                path_dir,
                engine="pyarrow",
                append=True,
                name_function=__batch_id(valid_time),
                ignore_divisions=True,
            )

In [6]:
if __name__ == "__main__":
    # NOTE: the first time this ran it collected files from 2022-03-01T00:00:00Z -> 2022-05-14T15:54:00Z
    # (that first run used start="2022-03-01T00:00:00Z"); this run resumes from where it stopped.
    direct_to_parquet(
        path_dir=Path(FILE_OUT_DIR),
        start="2022-05-14T15:54",
        end="2022-08-4T00:00",
    )

  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downloading {url}")
  warn(f"error downl

KeyboardInterrupt: 