In [1]:
%load_ext jupyter_black

# Transform a time series of JSON files into a DataFrame
- working with GeoPandas
- data types
- masking
- I/O with the Parquet file type


In [2]:
import json
from datetime import datetime
from glob import glob
from pathlib import Path
from typing import Any, Iterator, NewType
from warnings import warn

import geopandas as gpd
import numpy as np
import pandas as pd

# Distinct static type for a decoded JSON document whose top level is a
# string-keyed mapping (read below via its "features" and "validTime" keys).
# `typing.Any` is the correct value type; the builtin function `any` is not a type.
FeatureCollection = NewType("FeatureCollection", dict[str, Any])

In [3]:
def iterpaths(path: str) -> Iterator[Path]:
    """
    Lazily expand a glob pattern into ``Path`` objects.

    ``path`` is handed to ``glob`` unchanged; every match is wrapped in a
    ``Path`` and yielded in the order ``glob`` returned it.
    """
    for match in glob(path):
        yield Path(match)


def open_probsevere(globpath: str) -> pd.DataFrame:
    """
    Read every JSON file matching ``globpath`` and stack them into one frame.

    Each file is decoded with ``json.load``; its ``"features"`` entry is turned
    into a GeoDataFrame and its ``"validTime"`` string is parsed with the
    ``"%Y%m%d_%H%M%S %Z"`` format and stored in a ``validTime`` column.
    Files that fail JSON decoding are reported with a warning and skipped.

    Returns the concatenation of all per-file frames, indexed by
    ``("validTime", "ID")``.
    """

    def frames():
        # one frame per readable file; undecodable files only emit a warning
        for json_path in iterpaths(globpath):
            with json_path.open("rb") as stream:
                try:
                    collection: FeatureCollection = json.load(stream)
                except json.JSONDecodeError:
                    warn(f"json error decoding {json_path}")
                else:
                    frame = gpd.GeoDataFrame.from_features(collection["features"])
                    valid_time = datetime.strptime(collection["validTime"], "%Y%m%d_%H%M%S %Z")
                    # scalar assignment broadcasts the file's timestamp to every row
                    frame["validTime"] = valid_time
                    yield frame

    stacked = pd.concat(frames())
    return stacked.set_index(["validTime", "ID"])


# takes roughly 10 seconds to read and process all of the files
df = open_probsevere("/workspaces/griblib/archive/20/*.json")
# every non-geometry column comes back with the `object` dtype, so let's fix that
df.dtypes

geometry             geometry
MUCAPE                 object
MLCAPE                 object
MLCIN                  object
EBSHEAR                object
SRH01KM                object
MEANWIND_1-3kmAGL      object
MESH                   object
VIL_DENSITY            object
FLASH_RATE             object
FLASH_DENSITY          object
MAXLLAZ                object
P98LLAZ                object
P98MLAZ                object
MAXRC_EMISS            object
MAXRC_ICECF            object
WETBULB_0C_HGT         object
PWAT                   object
CAPE_M10M30            object
LJA                    object
SIZE                   object
AVG_BEAM_HGT           object
MOTION_EAST            object
MOTION_SOUTH           object
PS                     object
dtype: object

In [4]:
# the ["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"] columns hold string or geometry values and cannot be cast to float
# this demonstrates boolean mask indexing
mask = df.columns.isin(["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"])
# to use this method your mask should be the same shape as the thing you are going to mask
assert mask.shape == df.columns.shape
# the mask is just an array of bool values (True marks a column to leave alone)
mask

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False,  True, False, False, False])

In [5]:
# the mask can be inverted with the `~` (bitwise NOT) operator, so True now marks the columns to convert
~mask

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True])

In [6]:
# again, since our mask is the same shape as our columns, we can use it to select the columns we want to convert to floating point
float_cols = df.columns[~mask]
float_cols

Index(['MUCAPE', 'MLCAPE', 'MLCIN', 'EBSHEAR', 'SRH01KM', 'MEANWIND_1-3kmAGL',
       'MESH', 'VIL_DENSITY', 'FLASH_RATE', 'FLASH_DENSITY', 'MAXLLAZ',
       'P98LLAZ', 'P98MLAZ', 'WETBULB_0C_HGT', 'PWAT', 'CAPE_M10M30', 'LJA',
       'SIZE', 'MOTION_EAST', 'MOTION_SOUTH', 'PS'],
      dtype='object')

In [7]:
# cast only the selected columns to 32-bit floats, then re-check the dtypes
df[float_cols] = df[float_cols].astype(np.float32)
df.dtypes

geometry             geometry
MUCAPE                float32
MLCAPE                float32
MLCIN                 float32
EBSHEAR               float32
SRH01KM               float32
MEANWIND_1-3kmAGL     float32
MESH                  float32
VIL_DENSITY           float32
FLASH_RATE            float32
FLASH_DENSITY         float32
MAXLLAZ               float32
P98LLAZ               float32
P98MLAZ               float32
MAXRC_EMISS            object
MAXRC_ICECF            object
WETBULB_0C_HGT        float32
PWAT                  float32
CAPE_M10M30           float32
LJA                   float32
SIZE                  float32
AVG_BEAM_HGT           object
MOTION_EAST           float32
MOTION_SOUTH          float32
PS                    float32
dtype: object

In [8]:
# let's wrap all of that in a function
def to_float(
    df: pd.DataFrame, dont_float: list[str] = ["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"]
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with every column not listed in ``dont_float`` cast to ``float32``.

    Parameters
    ----------
    df
        Frame whose columns should be converted. It is not modified.
    dont_float
        Column names to leave untouched. Names that are not present in ``df``
        are simply ignored. The default list is only read, never mutated.

    Returns
    -------
    pd.DataFrame
        A new frame (same class as ``df.copy()`` produces) with the selected
        columns stored as ``np.float32``.

    Raises
    ------
    Whatever ``DataFrame.astype`` raises (e.g. ``ValueError``) when a selected
    column holds values that cannot be converted to float.
    """
    # Work on a copy: the original assigned into the caller's frame, so piping a
    # frame through this function silently changed the input as well.
    converted = df.copy()
    keep_mask = converted.columns.isin(dont_float)
    float_cols = converted.columns[~keep_mask]
    converted[float_cols] = converted[float_cols].astype(np.float32)
    return converted

In [9]:
# putting it all together in a single chained operation: load the JSON files, cast to float32, write a parquet file
open_probsevere("/workspaces/griblib/archive/20/*.json").pipe(to_float).to_parquet("probsevere.parquet")

In [28]:
# previously it took nearly 15 seconds to open a day's worth of data, format it and save the output file.
# the parquet file reads much faster: 0.2 seconds with pandas, 1.6 seconds with geopandas because of the geometry object creation
df: gpd.GeoDataFrame = gpd.read_parquet("probsevere.parquet")
df

Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,...,MAXRC_ICECF,WETBULB_0C_HGT,PWAT,CAPE_M10M30,LJA,SIZE,AVG_BEAM_HGT,MOTION_EAST,MOTION_SOUTH,PS
validTime,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-07-20 10:12:39,360483,"POLYGON ((-84.86000 36.23000, -84.83000 36.230...",1487.0,852.0,-22.0,32.500000,61.0,20.700001,0.12,0.97,1.0,...,0731Z 0.13/min (strong),10.7,1.7,286.0,0.0,380.0,8.51 kft / 2.60 km,9.813,-2.250,6.0
2022-07-20 10:12:39,360507,"POLYGON ((-83.38000 33.90000, -83.35000 33.900...",2172.0,1459.0,-25.0,36.700001,73.0,20.500000,0.30,2.23,4.0,...,,12.7,2.0,486.0,0.0,68.0,5.91 kft / 1.80 km,10.704,-1.752,9.0
2022-07-20 10:12:39,360611,"POLYGON ((-85.88000 37.17000, -85.84000 37.170...",2101.0,1437.0,-29.0,22.799999,95.0,16.900000,0.00,1.10,0.0,...,0906Z 0.09/min (strong),11.6,1.7,396.0,0.0,194.0,4.03 kft / 1.23 km,14.751,-4.079,2.0
2022-07-20 10:12:39,360845,"POLYGON ((-78.03000 45.45000, -78.00000 45.450...",723.0,372.0,-87.0,19.500000,101.0,25.100000,0.63,2.25,1.0,...,0831Z 0.02/min (weak),11.7,1.6,204.0,0.0,232.0,19.02 kft / 5.80 km,11.624,-8.151,5.0
2022-07-20 10:12:39,360923,"POLYGON ((-98.74000 35.52000, -98.71000 35.520...",455.0,28.0,0.0,33.500000,188.0,27.100000,0.04,0.83,4.0,...,,14.4,1.4,184.0,0.0,251.0,4.92 kft / 1.50 km,14.751,6.891,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-20 01:46:39,358602,"POLYGON ((-109.32000 30.84000, -109.28000 30.8...",4181.0,2374.0,-43.0,39.099998,47.0,6.200000,0.79,1.73,5.0,...,,12.2,1.5,798.0,0.0,61.0,10.47 kft / 3.19 km,-1.478,1.316,40.0
2022-07-20 01:46:39,358603,"POLYGON ((-110.02000 30.74000, -109.98000 30.7...",4011.0,2452.0,-22.0,38.099998,6.0,5.800000,0.62,1.26,24.0,...,,11.6,1.6,693.0,0.0,112.0,7.32 kft / 2.23 km,-3.420,-0.420,47.0
2022-07-20 01:46:39,358604,"POLYGON ((-85.58000 30.12000, -85.57000 30.110...",3169.0,2561.0,-4.0,22.200001,27.0,14.200000,0.11,0.57,0.0,...,,13.5,1.9,706.0,0.0,71.0,2.32 kft / 0.71 km,3.315,-1.290,1.0
2022-07-20 01:46:39,358605,"POLYGON ((-85.52000 29.43000, -85.45000 29.430...",3697.0,2857.0,-2.0,22.400000,14.0,13.200000,0.19,0.46,0.0,...,0131Z 0.0/min (weak),14.6,2.1,751.0,0.0,57.0,6.46 kft / 1.97 km,1.205,-3.020,5.0


In [34]:
# copy before adding columns so the new x/y columns go onto a fresh frame
df = df.copy()
# reduce each polygon to its centroid point
cent = df["geometry"].centroid
df["x"] = cent.x
df["y"] = cent.y
# append x/y to the existing (validTime, ID) index and drop the polygon column;
# the result is only displayed here, it is not assigned back to `df`
df.set_index(["x", "y"], append=True).drop(columns=["geometry"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,FLASH_DENSITY,...,MAXRC_ICECF,WETBULB_0C_HGT,PWAT,CAPE_M10M30,LJA,SIZE,AVG_BEAM_HGT,MOTION_EAST,MOTION_SOUTH,PS
validTime,ID,x,y,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2022-07-20 10:12:39,360483,-84.899289,36.107783,1487.0,852.0,-22.0,32.500000,61.0,20.700001,0.12,0.97,1.0,0.05,...,0731Z 0.13/min (strong),10.7,1.7,286.0,0.0,380.0,8.51 kft / 2.60 km,9.813,-2.250,6.0
2022-07-20 10:12:39,360507,-83.376789,33.858257,2172.0,1459.0,-25.0,36.700001,73.0,20.500000,0.30,2.23,4.0,0.96,...,,12.7,2.0,486.0,0.0,68.0,5.91 kft / 1.80 km,10.704,-1.752,9.0
2022-07-20 10:12:39,360611,-85.886047,37.073609,2101.0,1437.0,-29.0,22.799999,95.0,16.900000,0.00,1.10,0.0,0.00,...,0906Z 0.09/min (strong),11.6,1.7,396.0,0.0,194.0,4.03 kft / 1.23 km,14.751,-4.079,2.0
2022-07-20 10:12:39,360845,-78.085758,45.351800,723.0,372.0,-87.0,19.500000,101.0,25.100000,0.63,2.25,1.0,0.07,...,0831Z 0.02/min (weak),11.7,1.6,204.0,0.0,232.0,19.02 kft / 5.80 km,11.624,-8.151,5.0
2022-07-20 10:12:39,360923,-98.761126,35.378621,455.0,28.0,0.0,33.500000,188.0,27.100000,0.04,0.83,4.0,0.55,...,,14.4,1.4,184.0,0.0,251.0,4.92 kft / 1.50 km,14.751,6.891,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-20 01:46:39,358602,-109.306395,30.803673,4181.0,2374.0,-43.0,39.099998,47.0,6.200000,0.79,1.73,5.0,0.20,...,,12.2,1.5,798.0,0.0,61.0,10.47 kft / 3.19 km,-1.478,1.316,40.0
2022-07-20 01:46:39,358603,-109.973082,30.700735,4011.0,2452.0,-22.0,38.099998,6.0,5.800000,0.62,1.26,24.0,0.40,...,,11.6,1.6,693.0,0.0,112.0,7.32 kft / 2.23 km,-3.420,-0.420,47.0
2022-07-20 01:46:39,358604,-85.554434,30.078410,3169.0,2561.0,-4.0,22.200001,27.0,14.200000,0.11,0.57,0.0,0.00,...,,13.5,1.9,706.0,0.0,71.0,2.32 kft / 0.71 km,3.315,-1.290,1.0
2022-07-20 01:46:39,358605,-85.474146,29.395081,3697.0,2857.0,-2.0,22.400000,14.0,13.200000,0.19,0.46,0.0,0.00,...,0131Z 0.0/min (weak),14.6,2.1,751.0,0.0,57.0,6.46 kft / 1.97 km,1.205,-3.020,5.0
