In [19]:
%load_ext jupyter_black
import os
from glob import glob
from typing import NamedTuple

import pandas as pd
import numpy as np
from numpy.typing import NDArray
from datetime import datetime

import pandas as pd
import json
import numpy as np
import requests
from geopandas import GeoDataFrame
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger

# scheduler = BlockingScheduler()

NCEP_DATA = "https://mrms.ncep.noaa.gov/data"


def name_to_datetime(names: pd.Series) -> pd.DatetimeIndex:
    return pd.DatetimeIndex(names.str.replace("_", "T").str.extract(r"(\d*T\d*).json")[0]).rename("validTime")


def read_mrms(*args: str) -> pd.DataFrame:
    url = "/".join([NCEP_DATA, *args]) + "/?C=M;O=D"
    return pd.read_html(url)[0].dropna()


def read_probsevere() -> pd.DataFrame:
    df = read_mrms("ProbSevere", "PROBSEVERE")
    df.index = name_to_datetime(df.Name)
    return (NCEP_DATA + "/ProbSevere/PROBSEVERE/" + df["Name"]).rename("url")


dex = {"probsevere": ("ProbSevere", "PROBSEVERE")}


def extract_html(arg: str):
    arg = arg.upper()
    match arg:
        case "PROBSEVERE":
            return read_probsevere()
        case _:
            raise NotImplementedError


# def transform_to_dataframe():
#     ...
from typing import Mapping

files = extract_html("probsevere")


def to_dataframe(mrms_files: Mapping[pd.Timestamp, str]):
    def generate():
        for vt, url in mrms_files.items():
            for feat in requests.get(url).json()["features"]:
                props = feat["properties"]
                props["validTime"] = vt
                props["geometry"] = feat["geometry"]
                yield props

    ps = pd.DataFrame(generate()).set_index(["validTime", "ID"])

    ps["AVG_BEAM_HGT"] = ps["AVG_BEAM_HGT"].str.replace(r"[A-Za-z]", "", regex=True).apply(pd.eval)

    ps[["MAXRC_EMISS", "MAXRC_ICECF"]] = (
        ps[["MAXRC_EMISS", "MAXRC_ICECF"]]
        .stack()
        .str.extract(r"(?:\()([a-z]*)(?:\))")
        .replace({"weak": 1, "moderate": 2, "strong": 3})
        .fillna(0)
        .unstack(-1)
        .droplevel(0, axis=1)
    )
    # ps[[ps.columns != "geometry"]] = ps[ps.columns != "geometry"].astype(np.float32)
    ps.loc[:, ps.columns != "geometry"] = ps.loc[:, ps.columns != "geometry"].astype(np.float32)
    return ps


df = to_dataframe(files.tail())
df

The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


Unnamed: 0_level_0,Unnamed: 1_level_0,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,FLASH_DENSITY,...,WETBULB_0C_HGT,PWAT,CAPE_M10M30,LJA,SIZE,AVG_BEAM_HGT,MOTION_EAST,MOTION_SOUTH,PS,geometry
validTime,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-05-28 14:08:36,133672,3202.0,2329.0,-4.0,8.600000,49.0,31.799999,0.31,0.71,7.0,0.20,...,12.3,2.1,651.0,0.0,101.0,3.278688,7.619,-6.162,1.0,"{'type': 'Polygon', 'coordinates': [[[-74.63, ..."
2022-05-28 14:08:36,133697,3056.0,2257.0,-4.0,16.299999,39.0,29.200001,0.03,0.35,0.0,0.03,...,13.2,2.1,591.0,0.0,73.0,3.281818,6.693,-7.198,1.0,"{'type': 'Polygon', 'coordinates': [[[-74.49, ..."
2022-05-28 14:08:36,133866,3059.0,2427.0,-4.0,4.800000,22.0,29.100000,0.28,0.61,3.0,0.32,...,12.6,2.0,604.0,0.0,98.0,3.279026,8.630,-3.626,0.0,"{'type': 'Polygon', 'coordinates': [[[-76.13, ..."
2022-05-28 14:08:36,133947,931.0,0.0,-4.0,25.000000,257.0,21.200001,0.06,0.48,1.0,0.20,...,12.3,1.4,271.0,0.0,211.0,3.289474,6.064,-7.198,1.0,"{'type': 'Polygon', 'coordinates': [[[-95.4, 3..."
2022-05-28 14:08:36,133997,2211.0,1778.0,-4.0,7.500000,26.0,14.000000,0.11,1.28,0.0,0.05,...,10.5,1.4,574.0,0.0,255.0,3.274194,3.196,-3.516,0.0,"{'type': 'Polygon', 'coordinates': [[[-85.87, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-28 14:00:39,134535,1330.0,152.0,-551.0,39.200001,188.0,40.599998,0.00,0.40,0.0,0.00,...,11.8,1.2,240.0,0.0,47.0,3.269939,10.469,-0.942,3.0,"{'type': 'Polygon', 'coordinates': [[[-94.83, ..."
2022-05-28 14:00:39,134536,2930.0,2130.0,-4.0,5.800000,26.0,31.700001,0.00,0.27,0.0,0.03,...,13.1,2.2,574.0,0.0,56.0,3.280269,7.641,-3.603,0.0,"{'type': 'Polygon', 'coordinates': [[[-75.29, ..."
2022-05-28 14:00:39,134537,2646.0,2069.0,-4.0,14.700000,21.0,15.800000,0.00,0.48,0.0,0.00,...,13.0,1.9,607.0,0.0,63.0,3.258065,-0.283,-2.271,0.0,"{'type': 'Polygon', 'coordinates': [[[-80.95, ..."
2022-05-28 14:00:39,134538,2820.0,1966.0,-4.0,15.600000,21.0,15.000000,0.06,1.16,0.0,0.00,...,12.2,1.9,596.0,0.0,86.0,3.278846,2.092,-0.247,1.0,"{'type': 'Polygon', 'coordinates': [[[-81.49, ..."


In [5]:
from typing import List
class ProbSevereFeature(pydantic.BaseModel):...

class ProbSevere(pydantic.BaseModel):
    features:List[ProbSevereFeature]

In [46]:
import pydantic

ModuleNotFoundError: No module named 'pydantic'

In [43]:
df.to_parquet("test.parquet", engine="pyarrow")

In [45]:
pd.read_parquet("test.parquet", engine="pyarrow")

Unnamed: 0_level_0,Unnamed: 1_level_0,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,FLASH_DENSITY,...,WETBULB_0C_HGT,PWAT,CAPE_M10M30,LJA,SIZE,AVG_BEAM_HGT,MOTION_EAST,MOTION_SOUTH,PS,geometry
validTime,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-05-28 14:08:36,133672,3202,2329,-4,8.6,49,31.8,0.31,0.71,7,0.20,...,12.3,2.1,651,0.0,101,3.278689,7.619,-6.162,1,"{'coordinates': [[[-74.63 33.68], [-74.58 33..."
2022-05-28 14:08:36,133697,3056,2257,-4,16.3,39,29.2,0.03,0.35,0,0.03,...,13.2,2.1,591,0.0,73,3.281818,6.693,-7.198,1,"{'coordinates': [[[-74.49 34.59], [-74.45 34..."
2022-05-28 14:08:36,133866,3059,2427,-4,4.8,22,29.1,0.28,0.61,3,0.32,...,12.6,2.0,604,0.0,98,3.279026,8.63,-3.626,0,"{'coordinates': [[[-76.13 32.6 ], [-76.09 32..."
2022-05-28 14:08:36,133947,931,0,-4,25.0,257,21.2,0.06,0.48,1,0.20,...,12.3,1.4,271,0.0,211,3.289474,6.064,-7.198,1,"{'coordinates': [[[-95.4 36.6], [-95.34 36.6..."
2022-05-28 14:08:36,133997,2211,1778,-4,7.5,26,14.0,0.11,1.28,0,0.05,...,10.5,1.4,574,0.0,255,3.274194,3.196,-3.516,0,"{'coordinates': [[[-85.87 29.44], [-85.74 29..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-05-28 14:00:39,134535,1330,152,-551,39.2,188,40.6,0.00,0.40,0,0.00,...,11.8,1.2,240,0.0,47,3.269939,10.469,-0.942,3,"{'coordinates': [[[-94.83 41.9 ], [-94.82 41..."
2022-05-28 14:00:39,134536,2930,2130,-4,5.8,26,31.7,0.00,0.27,0,0.03,...,13.1,2.2,574,0.0,56,3.280269,7.641,-3.603,0,"{'coordinates': [[[-75.29 33.42], [-75.25 33..."
2022-05-28 14:00:39,134537,2646,2069,-4,14.7,21,15.8,0.00,0.48,0,0.00,...,13.0,1.9,607,0.0,63,3.258065,-0.283,-2.271,0,"{'coordinates': [[[-80.95 28.29], [-80.94 28..."
2022-05-28 14:00:39,134538,2820,1966,-4,15.6,21,15.0,0.06,1.16,0,0.00,...,12.2,1.9,596,0.0,86,3.278846,2.092,-0.247,1,"{'coordinates': [[[-81.49 28.25], [-81.45 28..."


In [17]:
from datetime import datetime

import pandas as pd
import json
import numpy as np
import requests
from geopandas import GeoDataFrame
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger

scheduler = BlockingScheduler()


def name_to_datetime(names: pd.Series) -> pd.DatetimeIndex:
    return pd.DatetimeIndex(names.str.replace("_", "T").str.extract(r"(\d*T\d*).json")[0]).rename("validTime")


def read_mrms(*args: str) -> pd.DataFrame:
    url = "/".join(["https://mrms.ncep.noaa.gov/data", *args]) + "/?C=M;O=D"
    return pd.read_html(url)[0].dropna()


def read_probsevere() -> pd.DataFrame:
    df = read_mrms("ProbSevere", "PROBSEVERE")
    df.index = name_to_datetime(df.Name)
    return ("https://mrms.ncep.noaa.gov/data/ProbSevere/PROBSEVERE/" + df["Name"]).rename("url")


def to_dataframe(urls: pd.Series, source="URL") -> pd.DataFrame:
    def generate():
        for vt, path in urls.items():
            if source == "URL":
                feat = requests.get(path).json()

            if source == "PATH":
                with open(path, mode="r", encoding="utf-8") as f:
                    feat = json.load(f)
            df = GeoDataFrame.from_features(feat["features"])
            df["validTime"] = vt
            yield df

    # concat the iterable files
    ps = pd.concat(generate(), ignore_index=True)
    ps = pd.concat([ps, ps["geometry"].bounds], axis=1)
    # convert string values to
    ps["AVG_BEAM_HGT"] = ps["AVG_BEAM_HGT"].str.replace(r"[A-Za-z]", "", regex=True).apply(pd.eval)

    ps[["MAXRC_EMISS", "MAXRC_ICECF"]] = (
        ps[["MAXRC_EMISS", "MAXRC_ICECF"]]
        .stack()
        .str.extract(r"(?:\()([a-z]*)(?:\))")
        .replace({"weak": 1, "moderate": 2, "strong": 3})
        .fillna(0)
        .unstack(-1)
        .droplevel(0, axis=1)
    )

    return (
        ps.set_index(["validTime", "ID", "minx", "miny", "maxx", "maxy"]).drop("geometry", axis=1).astype(np.float32)
    )


@scheduler.scheduled_job(IntervalTrigger(minutes=10))
def on_interval():
    print(f"begining interval scheduled task at {datetime.now()}\n")
    # the previously downloaded dataset
    # this should be updated to save the data into files based on the utc date
    df = pd.read_parquet("data/PROBSEVERE.parquet", engine="pyarrow")
    # READ FROM MRMS DATASET
    urls = read_probsevere()
    # LOCATE FILES THAT HAVE NOT BEEN DOWNLOADED
    data_to_get = urls.loc[~urls.index.unique("validTime").isin(df.index.unique("validTime"))]
    if not data_to_get.any():
        print("NO NEW FILES FOUND")
        return
    getting = data_to_get.tail(20)

    print(f"getting {len(getting)} files valid for", ", ".join(getting.index.astype(str).tolist()), "\n\n")
    # DOWNLOAD NEWFILES
    new_files = to_dataframe(getting, source="URL")
    # JOIN THE FILES
    pd.concat([df, new_files]).to_parquet("data/PROBSEVERE.parquet", engine="pyarrow")
    print("FILE SAVED")


if __name__ == "__main__":
    on_interval()
    scheduler.start()

begining interval scheduled task at 2022-05-29 11:24:35.621737

getting 20 files valid for 2022-05-28 14:38:40, 2022-05-28 14:36:39, 2022-05-28 14:34:42, 2022-05-28 14:32:38, 2022-05-28 14:30:38, 2022-05-28 14:28:40, 2022-05-28 14:26:42, 2022-05-28 14:24:39, 2022-05-28 14:22:40, 2022-05-28 14:20:41, 2022-05-28 14:18:42, 2022-05-28 14:16:38, 2022-05-28 14:14:39, 2022-05-28 14:12:40, 2022-05-28 14:10:37, 2022-05-28 14:08:36, 2022-05-28 14:06:38, 2022-05-28 14:04:39, 2022-05-28 14:02:41, 2022-05-28 14:00:39 




KeyboardInterrupt: 

In [15]:
# read_mrms("ProbSevere", "PROBSEVERE")
# read_probsevere()
extract_html("probsevere")

NameError: name 'df' is not defined