In [33]:
%load_ext jupyter_black

from datetime import datetime

import pandas as pd
import numpy as np
import requests


from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
# scheduler = BlockingScheduler()

NCEP_DATA = "https://mrms.ncep.noaa.gov/data"


def name_to_datetime(names: pd.Series) -> pd.DatetimeIndex:
    return pd.DatetimeIndex(names.str.replace("_", "T").str.extract(r"(\d*T\d*).json")[0]).rename("validTime")
    
def read_mrms(*args: str) -> pd.DataFrame:
    url = "/".join([NCEP_DATA, *args]) + "/?C=M;O=D"
    return pd.read_html(url)[0].dropna()

def read_probsevere() -> pd.DataFrame:
    df = read_mrms("ProbSevere", "PROBSEVERE")
    df.index = name_to_datetime(df.Name)
    return (NCEP_DATA + "/ProbSevere/PROBSEVERE/" + df["Name"]).rename("url")

def get_last_hours_data():
    s = read_probsevere()
    last_hour =datetime.utcnow() - pd.to_timedelta(1, unit="h")
    is_last_hour = (s.index.day == last_hour.day ) & (s.index.hour == last_hour.hour) 
    is_10_min_interval = (s.index.minute % 10 )== 0
    return s[is_last_hour & is_10_min_interval]




The jupyter_black extension is already loaded. To reload it, use:
  %reload_ext jupyter_black


In [47]:
from sys import stdout
import os, platform
from time import sleep
try:
    import IPython
    from IPython.display import clear_output
except ImportError:
    clear_output = lambda wait:None

def isnotebook():
    try:
        shell = IPython.get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True   # Jupyter notebook or qtconsole
        elif shell == 'TerminalInteractiveShell':
            return False  # Terminal running IPython
        else:
            return False  # Other type (?)
    except NameError:
        return False

for i in range(1,20):
    x = "."*(i%4)
    y =" " * (3-len(x))
    
    if isnotebook():
        clear_output()
    elif platform.system() == 'Windows':
        os.system('cls')
    else:
        os.system('clear')

    print(f"downloading[.{x+y}]", end =' ', flush = True)
    sleep(.5)

# 0%4

downloading[....] 

In [34]:
from typing import Mapping


def to_dataframe(mrms_files: Mapping[pd.Timestamp, str]) -> pd.DataFrame:
    def generate():
        for vt, url in mrms_files.items():
            features = requests.get(url).json()["features"]
            print(f"data collected for {vt}")
            for feat in features:
                props = feat["properties"]
                props["validTime"] = vt
                props["geometry"] = feat["geometry"]
                yield props

    ps = pd.DataFrame(generate()).set_index(["validTime", "ID"])

    ps["AVG_BEAM_HGT"] = ps["AVG_BEAM_HGT"].str.replace(r"[A-Za-z]", "", regex=True).apply(pd.eval)

    ps[["MAXRC_EMISS", "MAXRC_ICECF"]] = (
        ps[["MAXRC_EMISS", "MAXRC_ICECF"]]
        .stack()
        .str.extract(r"(?:\()([a-z]*)(?:\))")
        .replace({"weak": 1, "moderate": 2, "strong": 3})
        .fillna(0)
        .unstack(-1)
        .droplevel(0, axis=1)
    )
    # ps[[ps.columns != "geometry"]] = ps[ps.columns != "geometry"].astype(np.float32)
    ps.loc[:, ps.columns != "geometry"] = ps.loc[:, ps.columns != "geometry"].astype(np.float32)
    return ps

In [35]:
scheduler = BlockingScheduler()
template = "data/{0}.parquet"


@scheduler.scheduled_job(IntervalTrigger(hours=1))
def on_hour():
    last = get_last_hours_data()
    df = to_dataframe(last)
    file_name = template.format(datetime.now().strftime("%Y-%m-%d.HR%H"))
    df.to_parquet(file_name)
    print(f"file saved as {file_name}")





if __name__ == "__main__":
    on_hour()
    scheduler.start()

data collected for 2022-05-30 11:50:40
data collected for 2022-05-30 11:40:41
data collected for 2022-05-30 11:30:40
data collected for 2022-05-30 11:20:41
data collected for 2022-05-30 11:10:40
data collected for 2022-05-30 11:00:43
file saved as data/data/2022-05-30HR07.parquet.parquet


KeyboardInterrupt: 